From 96517682ef8b0bb4315ab548850a30df867e33fa Mon Sep 17 00:00:00 2001 From: River Li Date: Mon, 16 Dec 2024 13:36:30 +0800 Subject: [PATCH 01/17] [GPU] rope optimization (#27907) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Details: - Optimize rope opencl kernel to improve its performance - Test result shows it can improve RoPE performance about 50% in average.
batch=128, seq_length = 7 | base latency(ns) | optimized latency(ns) | latency decreased |   |   -- | -- | -- | -- | -- | -- rope_ref_5266667119713786613_0_0__sa, | 921352 | 872395 | 5.31% | RoPETestQwen7b | f32 rope_ref_2672092794364911740_0_0__sa, | 1724374 | 514790 | 70.15% | RoPETestChatGLM | f32 rope_ref_8061762790816124098_0_0__sa, | 633019 | 127186 | 79.91% | RoPETestQwen7b | f16 rope_ref_4392014836945391706_0_0__sa, | 629791 | 518749 | 17.63% | RoPETestLlama2 | f32 rope_ref_13829176589243505378_0_0__sa, | 870312 | 259583 | 70.17% | RoPETestChatGLM | f32 rope_ref_6813544162411765619_0_0__sa, | 749895 | 421875 | 43.74% | RoPETestChatGLM | f16 rope_ref_15054358246334082928_0_0__sa, | 637708 | 45208 | 92.91% | RoPETestFlux | f32 rope_ref_3898891400599565440_0_0__sa, | 378333 | 335937 | 11.21% | RoPETestRotateHalfWithoutTranspose | f32 rope_ref_18119704851383556529_0_0__sa, | 371250 | 208645 | 43.80% | RoPETestChatGLM | f16 rope_ref_17460680473512025171_0_0__sa, | 299166 | 98958 | 66.92% | RoPETestFlux | f16
![image](https://github.com/user-attachments/assets/4328b1a7-18ec-485f-abd0-b0fe16785854) ### Tickets: - *CVS-157438* --- .../subgraph_tests/rotary_pos_emb.cpp | 24 +- .../intel_gpu/src/graph/impls/ocl/rope.cpp | 2 +- .../kernel_selector/cl_kernels/rope_opt.cl | 445 ++++++++++++++++++ .../kernel_selector/cl_kernels/rope_ref.cl | 201 -------- .../kernels/rope/rope_kernel_base.cpp | 3 +- .../kernels/rope/rope_kernel_opt.cpp | 107 +++++ .../kernels/rope/rope_kernel_opt.h | 25 + .../kernels/rope/rope_kernel_ref.cpp | 35 -- .../kernels/rope/rope_kernel_ref.h | 20 - .../kernels/rope/rope_kernel_selector.cpp | 4 +- .../subgraph_tests/rotary_pos_emb.cpp | 30 +- .../subgraph/rotary_pos_emb.hpp | 50 +- .../src/subgraph/rotary_pos_emb.cpp | 193 ++++---- 13 files changed, 760 insertions(+), 379 deletions(-) create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_opt.cl delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.h delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.cpp delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.h diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp index 8cd8707e047878..20e77bb02897e5 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp @@ -9,51 +9,65 @@ namespace test { INSTANTIATE_TEST_SUITE_P(smoke_RoPETestLlama2StridedSlice, RoPETestLlama2StridedSlice, - ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestLlama2StridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLMStridedSlice, RoPETestChatGLMStridedSlice, - ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestChatGLMStridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestQwen7bStridedSlice, RoPETestQwen7bStridedSlice, ::testing::Combine(::testing::Values(true, false), + ::testing::Values(ov::element::f32), ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestQwen7bStridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestGPTJStridedSlice, RoPETestGPTJStridedSlice, ::testing::Combine(::testing::Values(true, false), + ::testing::Values(ov::element::f32), ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestGPTJStridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestLlama2Slice, RoPETestLlama2Slice, - ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestLlama2Slice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLMSlice, RoPETestChatGLMSlice, - ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestChatGLMSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestQwen7bSlice, RoPETestQwen7bSlice, ::testing::Combine(::testing::Values(true, false), + ::testing::Values(ov::element::f32), ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestQwen7bSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestGPTJSlice, RoPETestGPTJSlice, ::testing::Combine(::testing::Values(true, false), + ::testing::Values(ov::element::f32), ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestGPTJSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, RoPETestChatGLM2DRoPEStridedSlice, - ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_CPU)), RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName); } // namespace test diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp index d06e643c71ad18..1d2051461fd9c0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp @@ -6,7 +6,7 @@ #include "rope_inst.h" #include "rope/rope_kernel_selector.h" -#include "rope/rope_kernel_ref.h" +#include "rope/rope_kernel_opt.h" namespace cldnn { namespace ocl { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_opt.cl new file mode 100644 index 00000000000000..a5c71c84211c7e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_opt.cl @@ -0,0 +1,445 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/fetch_utils.cl" + +#define INPUT_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE) +#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE) + +#define UNPACK_FLOAT_VEC_1(outputv, input1, input2) \ + outputv.s0 = convert_float(input1.s0); \ + outputv.s1 = convert_float(input1.s2); \ + outputv.s2 = convert_float(input1.s4); \ + outputv.s3 = convert_float(input1.s6); \ + outputv.s4 = convert_float(input2.s0); \ + outputv.s5 = convert_float(input2.s2); \ + outputv.s6 = convert_float(input2.s4); \ + outputv.s7 = convert_float(input2.s6); + +#define UNPACK_FLOAT_VEC_2(outputv, input1, input2) \ + outputv.s0 = convert_float(input1.s1); \ + outputv.s1 = convert_float(input1.s3); \ + outputv.s2 = convert_float(input1.s5); \ + outputv.s3 = convert_float(input1.s7); \ + outputv.s4 = convert_float(input2.s1); \ + outputv.s5 = convert_float(input2.s3); \ + outputv.s6 = convert_float(input2.s5); \ + outputv.s7 = convert_float(input2.s7); + +#define UNPACK_HALF_VEC_1(outputv, input1) \ + outputv.s0 = convert_float(input1[0]); \ + outputv.s1 = convert_float(input1[2]); \ + outputv.s2 = convert_float(input1[4]); \ + outputv.s3 = convert_float(input1[6]); \ + outputv.s4 = convert_float(input1[8]); \ + outputv.s5 = convert_float(input1[10]); \ + outputv.s6 = convert_float(input1[12]); \ + outputv.s7 = convert_float(input1[14]); + +#define UNPACK_HALF_VEC_2(outputv, input1) \ + outputv.s0 = convert_float(input1[1]); \ + outputv.s1 = convert_float(input1[3]); \ + outputv.s2 = convert_float(input1[5]); \ + outputv.s3 = convert_float(input1[7]); \ + outputv.s4 = convert_float(input1[9]); \ + outputv.s5 = convert_float(input1[11]); \ + outputv.s6 = convert_float(input1[13]); \ + outputv.s7 = convert_float(input1[15]); + +#define UNPACK_HALF16_VEC_1(outputv, input1, input2) \ + outputv = (half16)(input1[0], \ + input1[2], \ + input1[4], \ + input1[6], \ + input1[8], \ + input1[10], \ + input1[12], \ + input1[14], \ + input2[0], \ + input2[2], \ + input2[4], \ + input2[6], \ + input2[8], \ + input2[10], \ + input2[12], \ + input2[14]); + +#define UNPACK_HALF16_VEC_2(outputv, input1, input2) \ + outputv = (half16)(input1[1], \ + input1[3], \ + input1[5], \ + input1[7], \ + input1[9], \ + input1[11], \ + input1[13], \ + input1[15], \ + input2[1], \ + input2[3], \ + input2[5], \ + input2[7], \ + input2[9], \ + input2[11], \ + input2[13], \ + input2[15]); + +#define PACK_HALF16_VEC_1(outputv, input1, input2) \ + outputv = (half16)(input1[0], \ + input2[0], \ + input1[1], \ + input2[1], \ + input1[2], \ + input2[2], \ + input1[3], \ + input2[3], \ + input1[4], \ + input2[4], \ + input1[5], \ + input2[5], \ + input1[6], \ + input2[6], \ + input1[7], \ + input2[7]); + +#define PACK_HALF16_VEC_2(outputv, input1, input2) \ + outputv = (half16)(input1[8], \ + input2[8], \ + input1[9], \ + input2[9], \ + input1[10], \ + input2[10], \ + input1[11], \ + input2[11], \ + input1[12], \ + input2[12], \ + input1[13], \ + input2[13], \ + input1[14], \ + input2[14], \ + input1[15], \ + input2[15]); + +#ifdef CHATGLM +KERNEL(rope_opt)( + OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, + const __global INPUT1_TYPE* cos_sin, + __global OUTPUT_TYPE* output) { +#if VEC_SIZE != 1 && VEC_SIZE != 8 && VEC_SIZE != 16 +# error "rope_opt.cl - VEC_SIZE must be one of {1, 8, 16}" +#endif + +#ifdef SUPPORT_2D_ROPE + const uint p = get_global_id(0) / HEAD_COUNT; + const uint h = get_global_id(0) % HEAD_COUNT; + const uint b = get_global_id(1); // sequence length + const uint rf = get_global_id(2); // max(HALF_ROTARY_NDIMS, HEAD_SIZE - ROTARY_NDIMS) + uint output_idx = OUTPUT_GET_INDEX(p, h, b, 0); +#else + const uint p = get_global_id(0); + const uint b = get_global_id(1); + const uint h = (uint)get_global_id(2) % HEAD_COUNT; + const uint rf = (uint)get_global_id(2) / HEAD_COUNT; + uint output_idx = OUTPUT_GET_INDEX(p, b, h, 0); +#endif + + uint r = rf < HALF_ROTARY_NDIMS ? rf * 2 * VEC_SIZE : 0; + uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf * 2 * VEC_SIZE : 0; + + uint input_idx = INPUT0_GET_INDEX(p, b, h * HEAD_SIZE, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; +#endif + + uint cos_sin_p = p < INPUT1_BATCH_NUM ? p : 0; + uint cos_sin_b = b < INPUT1_FEATURE_NUM ? b : 0; + uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_p, cos_sin_b, 0, 0); + +#if VEC_SIZE == 1 + float cosv = convert_float(cos_sin[cos_sin_idx + r]); + float sinv = convert_float(cos_sin[cos_sin_idx + r + 1]); + + float in1 = convert_float(input[input_idx + r]); + float in2 = convert_float(input[input_idx + r + 1]); + + output[output_idx + r] = TO_OUTPUT_TYPE(cosv * in1 - sinv * in2); + output[output_idx + r + 1] = TO_OUTPUT_TYPE(sinv * in1 + cosv * in2); + + #ifdef ENABLE_IO_COPY + output[output_idx + ROTARY_NDIMS + f] = input[input_idx + ROTARY_NDIMS + f]; + output[output_idx + ROTARY_NDIMS + f + 1] = input[input_idx + ROTARY_NDIMS + f + 1]; + #endif +#elif VEC_SIZE == 8 + INPUT_VEC_TYPE inv1 = *(INPUT_VEC_TYPE*)(input + input_idx + r); + INPUT_VEC_TYPE inv2 = *(INPUT_VEC_TYPE*)(input + input_idx + r + VEC_SIZE); + INPUT_VEC_TYPE cossinv1 = *(INPUT_VEC_TYPE*)(cos_sin + cos_sin_idx + r); + INPUT_VEC_TYPE cossinv2 = *(INPUT_VEC_TYPE*)(cos_sin + cos_sin_idx + r + VEC_SIZE); + + float8 in1, in2, cosv, sinv; + UNPACK_FLOAT_VEC_1(in1, inv1, inv2); + UNPACK_FLOAT_VEC_2(in2, inv1, inv2); + UNPACK_FLOAT_VEC_1(cosv, cossinv1, cossinv2); + UNPACK_FLOAT_VEC_2(sinv, cossinv1, cossinv2); + float8 out1 = cosv * in1 - sinv * in2; + float8 out2 = sinv * in1 + cosv * in2; + + *(float8*)(output + output_idx + r) = + (float8)(out1.s0, out2.s0, out1.s1, out2.s1, out1.s2, out2.s2, out1.s3, out2.s3); + *(float8*)(output + output_idx + r + VEC_SIZE) = + (float8)(out1.s4, out2.s4, out1.s5, out2.s5, out1.s6, out2.s6, out1.s7, out2.s7); + + #ifdef ENABLE_IO_COPY + *(float8*)(output + output_idx + ROTARY_NDIMS + f) = *(float8*)(input + input_idx + ROTARY_NDIMS + f); + *(float8*)(output + output_idx + ROTARY_NDIMS + f + VEC_SIZE) = + *(float8*)(input + input_idx + ROTARY_NDIMS + f + VEC_SIZE); + #endif +#elif VEC_SIZE == 16 + unroll_for(int i = 0; i < 2; i += 1) { + INPUT_VEC_TYPE inv = *(INPUT_VEC_TYPE*)(input + input_idx + r + i * VEC_SIZE); + INPUT_VEC_TYPE cossinv = *(INPUT_VEC_TYPE*)(cos_sin + cos_sin_idx + r + i * VEC_SIZE); + float8 in1, in2, cosv, sinv; + UNPACK_HALF_VEC_1(in1, inv); + UNPACK_HALF_VEC_2(in2, inv); + UNPACK_HALF_VEC_1(cosv, cossinv); + UNPACK_HALF_VEC_2(sinv, cossinv); + float8 out1 = cosv * in1 - sinv * in2; + float8 out2 = sinv * in1 + cosv * in2; + + unroll_for(int j = 0; j < 8; j += 1) { + output[output_idx + r + i * VEC_SIZE + 2 * j] = TO_OUTPUT_TYPE(out1[j]); + output[output_idx + r + i * VEC_SIZE + 2 * j + 1] = TO_OUTPUT_TYPE(out2[j]); + } + } + #ifdef ENABLE_IO_COPY + *(float8*)(output + output_idx + ROTARY_NDIMS + f) = *(float8*)(input + input_idx + ROTARY_NDIMS + f); + *(float8*)(output + output_idx + ROTARY_NDIMS + f + VEC_SIZE) = + *(float8*)(input + input_idx + ROTARY_NDIMS + f + VEC_SIZE); + #endif +#endif +} +#endif + +#ifdef QWEN +KERNEL(rope_opt)( + OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, + const __global INPUT1_TYPE* cos, + const __global INPUT2_TYPE* sin, + __global OUTPUT_TYPE* output) { + const uint b = get_global_id(0); + const uint p = get_global_id(1); + const uint h = (uint)get_global_id(2) * VEC_SIZE / HALF_ROTARY_NDIMS; + const uint r = ((uint)get_global_id(2) * VEC_SIZE) % HALF_ROTARY_NDIMS; + + uint input_idx = INPUT0_GET_INDEX(b, p, h * HEAD_SIZE, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; +#endif + + uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; + uint cos_sin_p = p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM < INPUT1_FEATURE_NUM + ? p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM + : 0; + uint cos_sin_h = h < INPUT1_SIZE_Y ? h : 0; + +#ifndef SIN_COS_HAVE_DYNAMIC_PADDINGS + uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_p, cos_sin_h, 0); + + uint cos_idx = cos_sin_idx; + uint sin_idx = cos_sin_idx; +#else + uint cos_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_p, cos_sin_h, 0); + uint sin_idx = INPUT2_GET_INDEX(cos_sin_b, cos_sin_p, cos_sin_h, 0); +#endif + + uint output_idx = OUTPUT_GET_INDEX(b, p, h, 0); + +#if VEC_SIZE == 1 + INPUT0_TYPE in1 = input[input_idx + r]; + INPUT0_TYPE in2 = input[input_idx + HALF_ROTARY_NDIMS + r]; + + output[output_idx + r] = cos[cos_idx + r] * in1 - sin[sin_idx + r] * in2; + + output[output_idx + HALF_ROTARY_NDIMS + r] = + cos[cos_idx + HALF_ROTARY_NDIMS + r] * in2 + sin[sin_idx + HALF_ROTARY_NDIMS + r] * in1; +#else + INPUT_VEC_TYPE in1 = *(INPUT_VEC_TYPE*)(input + input_idx + r); + INPUT_VEC_TYPE in2 = *(INPUT_VEC_TYPE*)(input + input_idx + HALF_ROTARY_NDIMS + r); + INPUT_VEC_TYPE cos1 = *(INPUT_VEC_TYPE*)(cos + cos_idx + r); + INPUT_VEC_TYPE cos2 = *(INPUT_VEC_TYPE*)(cos + cos_idx + HALF_ROTARY_NDIMS + r); + INPUT_VEC_TYPE sin1 = *(INPUT_VEC_TYPE*)(sin + sin_idx + r); + INPUT_VEC_TYPE sin2 = *(INPUT_VEC_TYPE*)(sin + sin_idx + HALF_ROTARY_NDIMS + r); + + OUTPUT_VEC_TYPE out1 = cos1 * in1 - sin1 * in2; + OUTPUT_VEC_TYPE out2 = cos2 * in2 + sin2 * in1; + + *(OUTPUT_VEC_TYPE*)(output + output_idx + r) = out1; + *(OUTPUT_VEC_TYPE*)(output + output_idx + HALF_ROTARY_NDIMS + r) = out2; +#endif +} +#endif + +#ifdef RotateHalf +KERNEL(rope_opt) +(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, + const __global INPUT1_TYPE* cos, + const __global INPUT2_TYPE* sin, +#ifdef ENABLE_GATHER + const __global INPUT3_TYPE* gather, +#endif + __global OUTPUT_TYPE* output) { + const uint b = get_global_id(0); + const uint h = get_global_id(1); + const uint p = ((uint)get_global_id(2) * VEC_SIZE) / HALF_ROTARY_NDIMS; + const uint r = ((uint)get_global_id(2) * VEC_SIZE) % HALF_ROTARY_NDIMS; + +#if ENABLE_TRANSPOSE + uint input_idx = INPUT0_GET_INDEX(b, p, h, 0); +#else + uint input_idx = INPUT0_GET_INDEX(b, h, p, 0); + #ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; + #endif +#endif + + uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; + uint cos_sin_h = h < INPUT1_FEATURE_NUM ? h : 0; + uint cos_sin_p = p; +#ifdef ENABLE_GATHER + uint gather_b = b < INPUT3_BATCH_NUM ? b : 0; + #if GATHER_RANK == 4 + uint gather_h = h < INPUT3_FEATURE_NUM ? h : 0; + uint gather_p = p < INPUT3_SIZE_Y ? p : 0; + uint gather_idx = INPUT3_GET_INDEX(gather_b, gather_h, gather_p, 0); + #else + uint gather_p = p < INPUT3_FEATURE_NUM ? p : 0; + uint gather_idx = INPUT3_GET_INDEX(gather_b, gather_p, 0, 0); + #endif + cos_sin_p = gather[gather_idx]; +#endif + cos_sin_p = cos_sin_p < INPUT1_SIZE_Y ? cos_sin_p : 0; + +#ifndef SIN_COS_HAVE_DYNAMIC_PADDINGS + uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); + + uint cos_idx = cos_sin_idx; + uint sin_idx = cos_sin_idx; +#else + uint cos_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); + uint sin_idx = INPUT2_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); +#endif + + uint output_idx = OUTPUT_GET_INDEX(b, h, p, 0); + +#if VEC_SIZE == 1 + INPUT0_TYPE in1 = input[input_idx + r]; + INPUT0_TYPE in2 = input[input_idx + HALF_ROTARY_NDIMS + r]; + + output[output_idx + r] = cos[cos_idx + r] * in1 - sin[sin_idx + r] * in2; + + output[output_idx + HALF_ROTARY_NDIMS + r] = + cos[cos_idx + HALF_ROTARY_NDIMS + r] * in2 + sin[sin_idx + HALF_ROTARY_NDIMS + r] * in1; +#else + INPUT_VEC_TYPE in1 = *(INPUT_VEC_TYPE*)(input + input_idx + r); + INPUT_VEC_TYPE in2 = *(INPUT_VEC_TYPE*)(input + input_idx + HALF_ROTARY_NDIMS + r); + INPUT_VEC_TYPE cos1 = *(INPUT_VEC_TYPE*)(cos + cos_idx + r); + INPUT_VEC_TYPE cos2 = *(INPUT_VEC_TYPE*)(cos + cos_idx + HALF_ROTARY_NDIMS + r); + INPUT_VEC_TYPE sin1 = *(INPUT_VEC_TYPE*)(sin + sin_idx + r); + INPUT_VEC_TYPE sin2 = *(INPUT_VEC_TYPE*)(sin + sin_idx + HALF_ROTARY_NDIMS + r); + + OUTPUT_VEC_TYPE out1 = cos1 * in1 - sin1 * in2; + OUTPUT_VEC_TYPE out2 = cos2 * in2 + sin2 * in1; + + *(OUTPUT_VEC_TYPE*)(output + output_idx + r) = out1; + *(OUTPUT_VEC_TYPE*)(output + output_idx + HALF_ROTARY_NDIMS + r) = out2; +#endif +} +#endif + +#ifdef RotateInterleaved +KERNEL(rope_opt)( + OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, + const __global INPUT1_TYPE* cos, + const __global INPUT2_TYPE* sin, + __global OUTPUT_TYPE* output) { +#if VEC_SIZE != 1 && VEC_SIZE != 8 && VEC_SIZE != 16 +# error "rope_opt.cl - VEC_SIZE must be one of {1, 8, 16}" +#endif + const uint b = get_global_id(0); + const uint h = get_global_id(1); + const uint p = ((uint)get_global_id(2) * VEC_SIZE) / HALF_ROTARY_NDIMS; + const uint r = 2 * (((uint)get_global_id(2) * VEC_SIZE) % HALF_ROTARY_NDIMS); + + uint input_idx = INPUT0_GET_INDEX(b, h, p, 0); + + uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; + uint cos_sin_h = h < INPUT1_FEATURE_NUM ? h : 0; + uint cos_sin_p = p < INPUT1_SIZE_Y ? p : 0; + +#ifndef SIN_COS_HAVE_DYNAMIC_PADDINGS + uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); + + uint cos_idx = cos_sin_idx; + uint sin_idx = cos_sin_idx; +#else + uint cos_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); + uint sin_idx = INPUT2_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); +#endif + + uint output_idx = OUTPUT_GET_INDEX(b, h, p, 0); + +#if VEC_SIZE == 1 + INPUT0_TYPE in1 = input[input_idx + r]; + INPUT0_TYPE in2 = input[input_idx + r + 1]; + + output[output_idx + r] = cos[cos_idx + r] * in1 - sin[sin_idx + r] * in2; + output[output_idx + r + 1] = cos[cos_idx + r + 1] * in2 + sin[sin_idx + r + 1] * in1; +#elif VEC_SIZE == 8 + INPUT_VEC_TYPE inv1 = *(INPUT_VEC_TYPE*)(input + input_idx + r); + INPUT_VEC_TYPE inv2 = *(INPUT_VEC_TYPE*)(input + input_idx + r + VEC_SIZE); + INPUT_VEC_TYPE cosv1 = *(INPUT_VEC_TYPE*)(cos + cos_idx + r); + INPUT_VEC_TYPE sinv1 = *(INPUT_VEC_TYPE*)(sin + sin_idx + r); + INPUT_VEC_TYPE cosv2 = *(INPUT_VEC_TYPE*)(cos + cos_idx + r + VEC_SIZE); + INPUT_VEC_TYPE sinv2 = *(INPUT_VEC_TYPE*)(sin + sin_idx + r + VEC_SIZE); + + float8 in1, in2, cos1, sin1, cos2, sin2; + UNPACK_FLOAT_VEC_1(in1, inv1, inv2); + UNPACK_FLOAT_VEC_2(in2, inv1, inv2); + UNPACK_FLOAT_VEC_1(cos1, cosv1, cosv2); + UNPACK_FLOAT_VEC_2(cos2, cosv1, cosv2); + UNPACK_FLOAT_VEC_1(sin1, sinv1, sinv2); + UNPACK_FLOAT_VEC_2(sin2, sinv1, sinv2); + + float8 out1 = cos1 * in1 - sin1 * in2; + float8 out2 = sin2 * in1 + cos2 * in2; + + *(float8*)(output + output_idx + r) = + (float8)(out1.s0, out2.s0, out1.s1, out2.s1, out1.s2, out2.s2, out1.s3, out2.s3); + *(float8*)(output + output_idx + r + VEC_SIZE) = + (float8)(out1.s4, out2.s4, out1.s5, out2.s5, out1.s6, out2.s6, out1.s7, out2.s7); +#elif VEC_SIZE == 16 + INPUT_VEC_TYPE inv1 = *(INPUT_VEC_TYPE*)(input + input_idx + r); + INPUT_VEC_TYPE inv2 = *(INPUT_VEC_TYPE*)(input + input_idx + r + VEC_SIZE); + INPUT_VEC_TYPE cosv1 = *(INPUT_VEC_TYPE*)(cos + cos_idx + r); + INPUT_VEC_TYPE sinv1 = *(INPUT_VEC_TYPE*)(sin + sin_idx + r); + INPUT_VEC_TYPE cosv2 = *(INPUT_VEC_TYPE*)(cos + cos_idx + r + VEC_SIZE); + INPUT_VEC_TYPE sinv2 = *(INPUT_VEC_TYPE*)(sin + sin_idx + r + VEC_SIZE); + + INPUT_VEC_TYPE in1, in2, cos1, sin1, cos2, sin2; + UNPACK_HALF16_VEC_1(in1, inv1, inv2); + UNPACK_HALF16_VEC_2(in2, inv1, inv2); + UNPACK_HALF16_VEC_1(cos1, cosv1, cosv2); + UNPACK_HALF16_VEC_2(cos2, cosv1, cosv2); + UNPACK_HALF16_VEC_1(sin1, sinv1, sinv2); + UNPACK_HALF16_VEC_2(sin2, sinv1, sinv2); + + half16 out1 = cos1 * in1 - sin1 * in2; + half16 out2 = sin2 * in1 + cos2 * in2; + + half16 outputv1, outputv2; + PACK_HALF16_VEC_1(outputv1, out1, out2); + PACK_HALF16_VEC_2(outputv2, out1, out2); + + *(half16*)(output + output_idx + r) = outputv1; + *(half16*)(output + output_idx + r + VEC_SIZE) = outputv2; +#endif +} +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl deleted file mode 100644 index d429916b46d69a..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "include/fetch_utils.cl" - -#ifdef CHATGLM -KERNEL(rope_ref)( - OPTIONAL_SHAPE_INFO_ARG - const __global INPUT0_TYPE* input, - const __global INPUT1_TYPE* cos_sin, - __global OUTPUT_TYPE* output) -{ -#ifdef SUPPORT_2D_ROPE - const uint p = get_global_id(0) / HEAD_COUNT; - const uint h = get_global_id(0) % HEAD_COUNT; - const uint b = get_global_id(1);//sequence length - const uint rf = get_global_id(2);//max(HALF_ROTARY_NDIMS, HEAD_SIZE - ROTARY_NDIMS) - uint output_idx = OUTPUT_GET_INDEX(p, h, b, 0); -#else - const uint p = get_global_id(0); - const uint b = get_global_id(1); - const uint h = (uint)get_global_id(2) % HEAD_COUNT; - const uint rf = (uint)get_global_id(2) / HEAD_COUNT; - uint output_idx = OUTPUT_GET_INDEX(p, b, h, 0); -#endif - - uint r = rf < HALF_ROTARY_NDIMS ? rf * 2 : 0; - uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf * 2 : 0; - - uint input_idx = INPUT0_GET_INDEX(p, b, h * HEAD_SIZE, 0); -#ifdef ENABLE_SLICE - input_idx += SLICED_FROM_START; -#endif - - uint cos_sin_p = p < INPUT1_BATCH_NUM ? p : 0; - uint cos_sin_b = b < INPUT1_FEATURE_NUM ? b : 0; - uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_p, cos_sin_b, 0, 0); - - float cosv = convert_float(cos_sin[cos_sin_idx + r]); - float sinv = convert_float(cos_sin[cos_sin_idx + r + 1]); - - float in1 = convert_float(input[input_idx + r]); - float in2 = convert_float(input[input_idx + r + 1]); - - output[output_idx + r] = TO_OUTPUT_TYPE(cosv * in1 - sinv * in2); - output[output_idx + r + 1] = TO_OUTPUT_TYPE(sinv * in1 + cosv * in2); - -#ifdef ENABLE_IO_COPY - output[output_idx + ROTARY_NDIMS + f] = input[input_idx + ROTARY_NDIMS + f]; - output[output_idx + ROTARY_NDIMS + f + 1] = input[input_idx + ROTARY_NDIMS + f + 1]; -#endif -} -#endif - -#ifdef QWEN -KERNEL(rope_ref)( - OPTIONAL_SHAPE_INFO_ARG - const __global INPUT0_TYPE* input, - const __global INPUT1_TYPE* cos, - const __global INPUT2_TYPE* sin, - __global OUTPUT_TYPE* output) -{ - const uint b = get_global_id(0); - const uint p = get_global_id(1); - const uint h = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; - const uint r = (uint)get_global_id(2) % HALF_ROTARY_NDIMS; - - uint input_idx = INPUT0_GET_INDEX(b, p, h * HEAD_SIZE, 0); -#ifdef ENABLE_SLICE - input_idx += SLICED_FROM_START; -#endif - - uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; - uint cos_sin_p = p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM < INPUT1_FEATURE_NUM ? p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM : 0; - uint cos_sin_h = h < INPUT1_SIZE_Y ? h : 0; - -#ifndef SIN_COS_HAVE_DYNAMIC_PADDINGS - uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_p, cos_sin_h, 0); - - uint cos_idx = cos_sin_idx; - uint sin_idx = cos_sin_idx; -#else - uint cos_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_p, cos_sin_h, 0); - uint sin_idx = INPUT2_GET_INDEX(cos_sin_b, cos_sin_p, cos_sin_h, 0); -#endif - - uint output_idx = OUTPUT_GET_INDEX(b, p, h, 0); - - INPUT0_TYPE in1 = input[input_idx + r]; - INPUT0_TYPE in2 = input[input_idx + HALF_ROTARY_NDIMS + r]; - - output[output_idx + r] = cos[cos_idx + r] * in1 - sin[sin_idx + r] * in2; - - output[output_idx + HALF_ROTARY_NDIMS + r] = cos[cos_idx + HALF_ROTARY_NDIMS + r] * in2 + - sin[sin_idx + HALF_ROTARY_NDIMS + r] * in1; -} -#endif - -#ifdef RotateHalf -KERNEL(rope_ref)( - OPTIONAL_SHAPE_INFO_ARG - const __global INPUT0_TYPE* input, - const __global INPUT1_TYPE* cos, - const __global INPUT2_TYPE* sin, -#ifdef ENABLE_GATHER - const __global INPUT3_TYPE* gather, -#endif - __global OUTPUT_TYPE* output) -{ - const uint b = get_global_id(0); - const uint h = get_global_id(1); - const uint p = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; - const uint r = (uint)get_global_id(2) % HALF_ROTARY_NDIMS; - -#if ENABLE_TRANSPOSE - uint input_idx = INPUT0_GET_INDEX(b, p, h, 0); -#else - uint input_idx = INPUT0_GET_INDEX(b, h, p, 0); -#ifdef ENABLE_SLICE - input_idx += SLICED_FROM_START; -#endif -#endif - - uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; - uint cos_sin_h = h < INPUT1_FEATURE_NUM ? h : 0; - uint cos_sin_p = p; -#ifdef ENABLE_GATHER - uint gather_b = b < INPUT3_BATCH_NUM ? b : 0; -#if GATHER_RANK == 4 - uint gather_h = h < INPUT3_FEATURE_NUM ? h : 0; - uint gather_p = p < INPUT3_SIZE_Y ? p : 0; - uint gather_idx = INPUT3_GET_INDEX(gather_b, gather_h, gather_p, 0); -#else - uint gather_p = p < INPUT3_FEATURE_NUM ? p : 0; - uint gather_idx = INPUT3_GET_INDEX(gather_b, gather_p, 0, 0); -#endif - cos_sin_p = gather[gather_idx]; -#endif - cos_sin_p = cos_sin_p < INPUT1_SIZE_Y ? cos_sin_p : 0; - -#ifndef SIN_COS_HAVE_DYNAMIC_PADDINGS - uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); - - uint cos_idx = cos_sin_idx; - uint sin_idx = cos_sin_idx; -#else - uint cos_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); - uint sin_idx = INPUT2_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); -#endif - - uint output_idx = OUTPUT_GET_INDEX(b, h, p, 0); - - INPUT0_TYPE in1 = input[input_idx + r]; - INPUT0_TYPE in2 = input[input_idx + HALF_ROTARY_NDIMS + r]; - - output[output_idx + r] = cos[cos_idx + r] * in1 - sin[sin_idx + r] * in2; - - output[output_idx + HALF_ROTARY_NDIMS + r] = cos[cos_idx + HALF_ROTARY_NDIMS + r] * in2 + - sin[sin_idx + HALF_ROTARY_NDIMS + r] * in1; -} -#endif - -#ifdef RotateInterleaved -KERNEL(rope_ref)( - OPTIONAL_SHAPE_INFO_ARG - const __global INPUT0_TYPE* input, - const __global INPUT1_TYPE* cos, - const __global INPUT2_TYPE* sin, - __global OUTPUT_TYPE* output) -{ - const uint b = get_global_id(0); - const uint h = get_global_id(1); - const uint p = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; - const uint r = 2 * ((uint)get_global_id(2) % HALF_ROTARY_NDIMS); - - uint input_idx = INPUT0_GET_INDEX(b, h, p, 0); - - uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; - uint cos_sin_h = h < INPUT1_FEATURE_NUM ? h : 0; - uint cos_sin_p = p < INPUT1_SIZE_Y ? p : 0; - -#ifndef SIN_COS_HAVE_DYNAMIC_PADDINGS - uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); - - uint cos_idx = cos_sin_idx; - uint sin_idx = cos_sin_idx; -#else - uint cos_idx = INPUT1_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); - uint sin_idx = INPUT2_GET_INDEX(cos_sin_b, cos_sin_h, cos_sin_p, 0); -#endif - - uint output_idx = OUTPUT_GET_INDEX(b, h, p, 0); - - INPUT0_TYPE in1 = input[input_idx + r]; - INPUT0_TYPE in2 = input[input_idx + r + 1]; - - output[output_idx + r] = cos[cos_idx + r] * in1 - sin[sin_idx + r] * in2; - output[output_idx + r + 1] = cos[cos_idx + r + 1] * in2 + sin[sin_idx + r + 1] * in1; -} -#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp index 98212254be9e3c..13df539c0c5508 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp @@ -70,7 +70,8 @@ RoPEKernelBase::DispatchData RoPEKernelBase::SetDefault(const rope_params& param if (params.is_qwen) { dispatchData.gws = {input.Batch().v, input.Feature().v, - params.head_cnt * std::max(params.rotary_ndims / 2ul, params.head_size - params.rotary_ndims)}; + params.head_cnt * + std::max(params.rotary_ndims / 2ul, params.head_size - params.rotary_ndims)}; } else if (params.is_chatglm) { if (params.support_2d_rope) { // input [batch_size, seq_length] diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.cpp new file mode 100644 index 00000000000000..40cf9094c458c3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "rope_kernel_opt.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +ParamsKey RoPEKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); + return k; +} + +RoPEKernelBase::DispatchData RoPEKernelOpt::SetDefault(const rope_params& params) const { + DispatchData dispatchData; + const auto& input = params.inputs[0]; + const auto& output = params.outputs[0]; + + std::vector> dims_by_gws = { + {Tensor::DataChannelName::BATCH}, + {Tensor::DataChannelName::FEATURE}, + {Tensor::DataChannelName::Y, Tensor::DataChannelName::X}}; + + size_t vec_size = GetVecSize(params); + if (params.is_qwen) { + auto count = params.head_cnt * std::max(params.rotary_ndims / 2ul, params.head_size - params.rotary_ndims); + dispatchData.gws = {input.Batch().v, input.Feature().v, count / vec_size}; + } else if (params.is_chatglm) { + if (params.support_2d_rope) { + // input [batch_size, seq_length] + // output [batch_size, head_count, seq_length, half_rotary_ndims] + dispatchData.gws = {input.Batch().v * params.head_cnt, + input.Feature().v, + params.rotary_ndims / 2ul / vec_size}; + } else { + dispatchData.gws = {input.Batch().v, + input.Feature().v, + params.head_cnt * (params.rotary_ndims / 2ul / vec_size)}; + } + } else { + dispatchData.gws = {output.Batch().v, output.Feature().v, output.Y().v * params.rotary_ndims / 2ul / vec_size}; + } + + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, + params.engineInfo, + input.GetLayout(), + output.GetLayout(), + dims_by_gws); + + return dispatchData; +} + +JitConstants RoPEKernelOpt::GetJitConstants(const rope_params& params, RoPEKernelBase::DispatchData dispatchData) const { + JitConstants jit = RoPEKernelBase::GetJitConstants(params, dispatchData); + + jit.AddConstant(MakeJitConstant("VEC_SIZE", GetVecSize(params))); + return jit; +} + +size_t RoPEKernelOpt::GetVecSize(const rope_params& params) const { + const auto& input = params.inputs[0]; + size_t vec_size = 1; + switch (input.GetDType()) { + case Datatype::F16: + vec_size = 16; + break; + case Datatype::F32: + vec_size = 8; + break; + default: + vec_size = 1; + break; + } + if (params.rotary_ndims % (2 * vec_size) != 0) + vec_size = 1; + + if (params.is_qwen) { + auto count = params.head_cnt * std::max(params.rotary_ndims / 2ul, params.head_size - params.rotary_ndims); + if (count % vec_size != 0) + vec_size = 1; + } + + return vec_size; +} + +KernelsData RoPEKernelOpt::GetKernelsData(const Params& params) const { + return GetCommonKernelsData(params); +} + +KernelsPriority RoPEKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_8; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.h new file mode 100644 index 00000000000000..91010628595d64 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_opt.h @@ -0,0 +1,25 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "rope_kernel_base.h" + +namespace kernel_selector { +class RoPEKernelOpt : public RoPEKernelBase { +public: + using Parent = RoPEKernelBase; + RoPEKernelOpt() : RoPEKernelBase("rope_opt") {} + virtual ~RoPEKernelOpt() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; +protected: + JitConstants GetJitConstants(const rope_params& params, DispatchData dispatchData) const override; + DispatchData SetDefault(const rope_params& params) const override; +private: + size_t GetVecSize(const rope_params& params) const; +}; +} // namespace kernel_selector \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.cpp deleted file mode 100644 index 5ec125ef6f083c..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "rope_kernel_ref.h" -#include "kernel_selector_utils.h" -#include - -namespace kernel_selector { -ParamsKey RoPEKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT32); - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); - - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableDifferentTypes(); - k.EnableDynamicShapesSupport(); - return k; -} - -KernelsData RoPEKernelRef::GetKernelsData(const Params& params) const { - return GetCommonKernelsData(params); -} - -KernelsPriority RoPEKernelRef::GetKernelsPriority(const Params& /*params*/) const { - return FORCE_PRIORITY_9; -} -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.h deleted file mode 100644 index ceea2a17720ad1..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_ref.h +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "rope_kernel_base.h" - -namespace kernel_selector { -class RoPEKernelRef : public RoPEKernelBase { -public: - using Parent = RoPEKernelBase; - RoPEKernelRef() : RoPEKernelBase("rope_ref") {} - virtual ~RoPEKernelRef() {} - - KernelsData GetKernelsData(const Params& params) const override; - KernelsPriority GetKernelsPriority(const Params& params) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_selector.cpp index e5436971b90c09..63063ad2af646b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_selector.cpp @@ -3,11 +3,11 @@ // #include "rope_kernel_selector.h" -#include "rope_kernel_ref.h" +#include "rope_kernel_opt.h" namespace kernel_selector { rope_kernel_selector::rope_kernel_selector() { - Attach(); + Attach(); } KernelsData rope_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp index 3f7fe91da86d93..98cb257776c388 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/rotary_pos_emb.cpp @@ -9,49 +9,65 @@ namespace test { INSTANTIATE_TEST_SUITE_P(smoke_RoPETestFlux, RoPETestFlux, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f16, ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestFlux::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, RoPETestChatGLMStridedSlice, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f16, ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestChatGLMStridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestQwen7b, RoPETestQwen7bStridedSlice, ::testing::Combine(::testing::Values(true, false), + ::testing::Values(ov::element::f16, ov::element::f32), ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestQwen7bStridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestLlama2, RoPETestLlama2StridedSlice, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestLlama2StridedSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestRotateHalfWithoutTranspose, RoPETestRotateHalfWithoutTranspose, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestRotateHalfWithoutTranspose::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, RoPETestChatGLMSlice, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f16, ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestChatGLMSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestQwen7b, RoPETestQwen7bSlice, ::testing::Combine(::testing::Values(true, false), + ::testing::Values(ov::element::f16, ov::element::f32), ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestQwen7bSlice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestLlama2, RoPETestLlama2Slice, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestLlama2Slice::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_RoPETestChatGLM, RoPETestChatGLM2DRoPEStridedSlice, - ::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::Combine( + ::testing::Values(ov::element::f16, ov::element::f32), + ::testing::Values(ov::test::utils::DEVICE_GPU)), RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName); diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp index 39cdb871710e64..4fd59740cac549 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/rotary_pos_emb.hpp @@ -9,21 +9,25 @@ namespace ov { namespace test { -class RoPETestFlux : public SubgraphBaseTest, public testing::WithParamInterface { +using rope_params = std::tuple; +using rope_params_2 = std::tuple; + +class RoPETestFlux : public SubgraphBaseTest, public testing::WithParamInterface { private: std::shared_ptr build_rope_flux(int batch, int seq_length, int num_head, - int ndims); + int ndims, + ov::element::Type element_type); protected: void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; -class RoPETestLlama2StridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { +class RoPETestLlama2StridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { private: std::shared_ptr buildROPE_Llama2(int batch, int seq_length, @@ -37,47 +41,48 @@ class RoPETestLlama2StridedSlice : public SubgraphBaseTest, public testing::With void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; -class RoPETestChatGLMStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { +class RoPETestChatGLMStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { private: - std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims); + std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims, ov::element::Type element_type); protected: ov::Tensor create_i32_tensor(const ov::Shape& shape, int start, int step = 1); void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; -class RoPETestQwen7bStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface> { +class RoPETestQwen7bStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { private: - std::shared_ptr buildROPE_QWen7b(bool specialReshape); + std::shared_ptr buildROPE_QWen7b(bool specialReshape, ov::element::Type element_type); protected: void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo>& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; -class RoPETestGPTJStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface> { +class RoPETestGPTJStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { private: std::shared_ptr buildROPE_GPTJ(int num_head, int hidden_dims, int rotary_dims, - bool hasShapeOf); + bool hasShapeOf, + ov::element::Type element_type); protected: void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo>& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; -class RoPETestRotateHalfWithoutTranspose : public SubgraphBaseTest, public testing::WithParamInterface { +class RoPETestRotateHalfWithoutTranspose : public SubgraphBaseTest, public testing::WithParamInterface { private: ov::Tensor create_i32_tensor(const ov::Shape& shape, int start, int step = 1); ov::OutputVector makeCosSinCache(int max_position_embeddings, int rotary_ndims); @@ -91,7 +96,7 @@ class RoPETestRotateHalfWithoutTranspose : public SubgraphBaseTest, public testi void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; class RoPETestLlama2Slice : public RoPETestLlama2StridedSlice { @@ -107,14 +112,14 @@ class RoPETestLlama2Slice : public RoPETestLlama2StridedSlice { class RoPETestChatGLMSlice : public RoPETestChatGLMStridedSlice { private: - std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims); + std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims, ov::element::Type element_type); protected: void SetUp() override; }; class RoPETestQwen7bSlice : public RoPETestQwen7bStridedSlice { private: - std::shared_ptr buildROPE_Qwen7b(bool specialReshape); + std::shared_ptr buildROPE_Qwen7b(bool specialReshape, ov::element::Type element_type); protected: void SetUp() override; }; @@ -124,21 +129,22 @@ class RoPETestGPTJSlice : public RoPETestGPTJStridedSlice { std::shared_ptr buildROPE_GPTJ(int num_head, int hidden_dims, int rotary_dims, - bool hasShapeOf); + bool hasShapeOf, + ov::element::Type element_type); protected: void SetUp() override; }; -class RoPETestChatGLM2DRoPEStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { +class RoPETestChatGLM2DRoPEStridedSlice : public SubgraphBaseTest, public testing::WithParamInterface { private: - std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims); + std::shared_ptr buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims, ov::element::Type element_type); protected: ov::Tensor create_i32_tensor(const ov::Shape& shape, int start, int step = 1); void generate_inputs(const std::vector& targetInputStaticShapes) override; void SetUp() override; public: - static std::string getTestCaseName(const testing::TestParamInfo& obj); + static std::string getTestCaseName(const testing::TestParamInfo& obj); }; } // namespace test diff --git a/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp b/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp index 1a078d9b49ebb7..ca8ecf39fb8276 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/rotary_pos_emb.cpp @@ -17,10 +17,11 @@ namespace test { std::shared_ptr RoPETestFlux::build_rope_flux(int batch, int seq_length, int num_head, - int ndims) { - auto x = std::make_shared(ov::element::f32, PartialShape{batch, num_head, seq_length, ndims}); - auto t_cos = std::make_shared(ov::element::f32, PartialShape{1, 1, seq_length, ndims}); - auto t_sin = std::make_shared(ov::element::f32, PartialShape{1, 1, seq_length, ndims}); + int ndims, + ov::element::Type element_type) { + auto x = std::make_shared(element_type, PartialShape{batch, num_head, seq_length, ndims}); + auto t_cos = std::make_shared(element_type, PartialShape{1, 1, seq_length, ndims}); + auto t_sin = std::make_shared(element_type, PartialShape{1, 1, seq_length, ndims}); auto x1_shape = makeConst(element::i64, ov::Shape({5}), {0, num_head, 0, -1, 2}); auto x1 = std::make_shared(x, x1_shape, true); @@ -28,7 +29,7 @@ std::shared_ptr RoPETestFlux::build_rope_flux(int batch, auto split_axis = makeConst(element::i64, ov::Shape(), {-1}); auto split = std::make_shared(x1, split_axis, 2); - auto minus_one = makeConst(element::f32, ov::Shape({}), {-1.0f}); + auto minus_one = makeConst(element_type, ov::Shape({}), {-1.0f}); auto x1_1_neg = std::make_shared(split->output(1), minus_one); auto x2 = std::make_shared(OutputVector{x1_1_neg->output(0), split->output(0)}, -1); @@ -68,9 +69,10 @@ void RoPETestFlux::generate_inputs(const std::vector& targetInputStat } void RoPETestFlux::SetUp() { - targetDevice = this->GetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 1; + const int batch = 128; const int seq_length = 7; const size_t max_position_embeddings = 2048; const size_t ndims = 128; @@ -82,17 +84,18 @@ void RoPETestFlux::SetUp() { {{1, 1, seq_length, ndims}, {{1, 1, seq_length, ndims}}} }; init_input_shapes(input_shapes); - function = build_rope_flux(batch, -1, num_head, ndims); + function = build_rope_flux(batch, -1, num_head, ndims, element_type); } -std::string RoPETestFlux::getTestCaseName(const testing::TestParamInfo& obj) { - std::string targetDevice = obj.param; +std::string RoPETestFlux::getTestCaseName(const testing::TestParamInfo& obj) { + std::string targetDevice; + ov::element::Type element_type; + std::tie(element_type, targetDevice) = obj.param; std::ostringstream result; - result << "targetDevice=" << targetDevice; + result << "targetDevice=" << targetDevice << ",element_type=" << element_type.to_string(); return result.str(); } - ov::OutputVector RoPETestLlama2StridedSlice::makeCosSinCache(int max_position_embeddings, int rotary_ndims) { std::vector lut_sin(max_position_embeddings * rotary_ndims, 0.0f); std::vector lut_cos(max_position_embeddings * rotary_ndims, 0.0f); @@ -229,9 +232,10 @@ void RoPETestLlama2StridedSlice::generate_inputs(const std::vector& t } void RoPETestLlama2StridedSlice::SetUp() { - targetDevice = this->GetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const size_t max_position_embeddings = 2048; const size_t ndims = 128; @@ -242,16 +246,18 @@ void RoPETestLlama2StridedSlice::SetUp() { function = buildROPE_Llama2(batch, seq_length, max_position_embeddings, num_head, ndims); } -std::string RoPETestLlama2StridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { - std::string targetDevice = obj.param; +std::string RoPETestLlama2StridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type element_type; + std::string targetDevice; + std::tie(element_type, targetDevice) = obj.param; std::ostringstream result; - result << "targetDevice=" << targetDevice; + result << "targetDevice=" << targetDevice << ",element_type=" << element_type.to_string(); return result.str(); } -std::shared_ptr RoPETestChatGLMStridedSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims) { - auto input = std::make_shared(ov::element::f32, PartialShape{-1, batch, 4096 + 256 + 256}); - auto cos_sin_cache = std::make_shared(ov::element::f32, PartialShape{32768, 32, 2}); +std::shared_ptr RoPETestChatGLMStridedSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims, ov::element::Type element_type) { + auto input = std::make_shared(element_type, PartialShape{-1, batch, 4096 + 256 + 256}); + auto cos_sin_cache = std::make_shared(element_type, PartialShape{32768, 32, 2}); auto position_ids = std::make_shared(ov::element::i32, PartialShape{-1, -1}); auto __module_transformer_index_67_Gather = @@ -371,29 +377,32 @@ void RoPETestChatGLMStridedSlice::generate_inputs(const std::vector& } void RoPETestChatGLMStridedSlice::SetUp() { - targetDevice = this->GetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const int num_head = 32; const int rotary_dims = 64; InputShape inpShape = {{-1, batch, 4096 + 256 + 256}, {{seq_length, batch, 4096 + 256 + 256}}}; init_input_shapes({inpShape}); - function = buildROPE_ChatGLM(batch, num_head, rotary_dims); + function = buildROPE_ChatGLM(batch, num_head, rotary_dims, element_type); } -std::string RoPETestChatGLMStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { - std::string targetDevice = obj.param; +std::string RoPETestChatGLMStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { + std::string targetDevice; + ov::element::Type element_type; + std::tie(element_type, targetDevice) = obj.param; std::ostringstream result; - result << "targetDevice=" << targetDevice; + result << "targetDevice=" << targetDevice << ",element_type=" << element_type.to_string(); return result.str(); } -std::shared_ptr RoPETestQwen7bStridedSlice::buildROPE_QWen7b(bool specialReshape) { - auto input = std::make_shared(ov::element::f32, PartialShape{-1, -1, 4096 + 4096 + 4096}); - auto cos_cache = std::make_shared(ov::element::f32, PartialShape{1, -1, 1, 128}); - auto sin_cache = std::make_shared(ov::element::f32, PartialShape{1, -1, 1, 128}); +std::shared_ptr RoPETestQwen7bStridedSlice::buildROPE_QWen7b(bool specialReshape, ov::element::Type element_type) { + auto input = std::make_shared(element_type, PartialShape{-1, -1, 4096 + 4096 + 4096}); + auto cos_cache = std::make_shared(element_type, PartialShape{1, -1, 1, 128}); + auto sin_cache = std::make_shared(element_type, PartialShape{1, -1, 1, 128}); auto ListUnpack_389_VariadicSplit = makeOP({input, 2, {4096, 4096, -1}}); auto view_Reshape = @@ -459,7 +468,7 @@ std::shared_ptr RoPETestQwen7bStridedSlice::buildROPE_QWen7b(bool spe 1, }), {-1}); - auto Constant_296840 = makeOP({Constant_296840_compressed}, {{"destination_type", "f32"}}); + auto Constant_296840 = makeOP({Constant_296840_compressed}, {{"destination_type", element_type.to_string()}}); auto neg_Multiply_499 = makeOP({ListUnpack_496_Squeeze_0, Constant_296840}, {{"auto_broadcast", "numpy"}}); auto ListUnpack_496_Squeeze = makeOP({ListUnpack_496_Split->output(0), -2}); @@ -507,32 +516,34 @@ void RoPETestQwen7bStridedSlice::generate_inputs(const std::vector& t void RoPETestQwen7bStridedSlice::SetUp() { bool specialReshape; - std::tie(specialReshape, targetDevice) = this->GetParam(); - const int batch = 2; + ov::element::Type element_type; + std::tie(specialReshape, element_type, targetDevice) = this->GetParam(); + const int batch = 128; const int seq_length = 7; InputShape inpShape = {{batch, -1, 4096 + 4096 + 4096}, {{batch, seq_length, 4096 + 4096 + 4096}}}; init_input_shapes({inpShape}); - function = buildROPE_QWen7b(specialReshape); + function = buildROPE_QWen7b(specialReshape, element_type); } -std::string RoPETestQwen7bStridedSlice::getTestCaseName( - const testing::TestParamInfo>& obj) { +std::string RoPETestQwen7bStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { bool specialReshape; + ov::element::Type element_type; std::string targetDevice; - std::tie(specialReshape, targetDevice) = obj.param; + std::tie(specialReshape, element_type, targetDevice) = obj.param; std::ostringstream result; result << "specialReshape=" << specialReshape << "_" - << "targetDevice=" << targetDevice; + << "targetDevice=" << targetDevice << "_element_type=" << element_type.to_string(); return result.str(); } std::shared_ptr RoPETestGPTJStridedSlice::buildROPE_GPTJ(int num_head, int hidden_dims, int rotary_dims, - bool hasShapeOf) { + bool hasShapeOf, + ov::element::Type element_type) { auto int32_max = std::numeric_limits::max(); - auto input = std::make_shared(ov::element::f32, PartialShape{-1, -1, num_head, hidden_dims}); - auto sincos = std::make_shared(ov::element::f32, PartialShape{-1, -1, rotary_dims}); + auto input = std::make_shared(element_type, PartialShape{-1, -1, num_head, hidden_dims}); + auto sincos = std::make_shared(element_type, PartialShape{-1, -1, rotary_dims}); auto slice_Slice_965 = makeOP({input, {0, 0, 0, 0}, {0, 0, 0, rotary_dims}, {1, 1, 1, 1}}, {{"begin_mask", {1, 1, 1, 0}}, @@ -555,7 +566,7 @@ std::shared_ptr RoPETestGPTJStridedSlice::buildROPE_GPTJ(int num_head } auto const_idx = makeConst(ov::element::i32, ov::Shape({static_cast(rotary_dims)}), gather_idx); - auto constant_155588 = makeConst(element::f32, + auto constant_155588 = makeConst(element_type, ov::Shape({ 1, 1, @@ -630,11 +641,11 @@ void RoPETestGPTJStridedSlice::generate_inputs(const std::vector& tar inputs.insert({funcInputs[1].get_node_shared_ptr(), t_cos_sin_cache}); } -std::string RoPETestGPTJStridedSlice::getTestCaseName( - const testing::TestParamInfo>& obj) { +std::string RoPETestGPTJStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { bool hasShapeOf; + ov::element::Type element_type; std::string targetDevice; - std::tie(hasShapeOf, targetDevice) = obj.param; + std::tie(hasShapeOf, element_type, targetDevice) = obj.param; std::ostringstream result; result << "hasShapeOf=" << hasShapeOf << "_" << "targetDevice=" << targetDevice; @@ -643,9 +654,10 @@ std::string RoPETestGPTJStridedSlice::getTestCaseName( void RoPETestGPTJStridedSlice::SetUp() { bool hasShapeOf; - std::tie(hasShapeOf, targetDevice) = this->GetParam(); + ov::element::Type element_type; + std::tie(hasShapeOf, element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const int num_head = 16; const int hidden_dims = 256; @@ -654,7 +666,7 @@ void RoPETestGPTJStridedSlice::SetUp() { InputShape input = {{batch, seq_length, num_head, hidden_dims}, {{batch, seq_length, num_head, hidden_dims}}}; InputShape sincos = {{batch, seq_length, rotary_dims}, {{batch, seq_length, rotary_dims}}}; init_input_shapes({input, sincos}); - function = buildROPE_GPTJ(num_head, hidden_dims, rotary_dims, hasShapeOf); + function = buildROPE_GPTJ(num_head, hidden_dims, rotary_dims, hasShapeOf, element_type); } ov::OutputVector RoPETestRotateHalfWithoutTranspose::makeCosSinCache(int max_position_embeddings, int rotary_ndims) { @@ -791,9 +803,10 @@ void RoPETestRotateHalfWithoutTranspose::generate_inputs(const std::vectorGetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const size_t max_position_embeddings = 2048; const size_t ndims = 128; @@ -804,17 +817,20 @@ void RoPETestRotateHalfWithoutTranspose::SetUp() { function = buildROPE_RotateHalfWithoutTranspose(batch, seq_length, max_position_embeddings, num_head, ndims); } -std::string RoPETestRotateHalfWithoutTranspose::getTestCaseName(const testing::TestParamInfo& obj) { - std::string targetDevice = obj.param; +std::string RoPETestRotateHalfWithoutTranspose::getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type element_type; + std::string targetDevice; + std::tie(element_type, targetDevice) = obj.param; std::ostringstream result; - result << "targetDevice=" << targetDevice; + result << "targetDevice=" << targetDevice <<"_element_type=" << element_type.to_string(); return result.str(); } void RoPETestLlama2Slice::SetUp() { - targetDevice = this->GetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const size_t max_position_embeddings = 2048; const size_t ndims = 128; @@ -879,21 +895,22 @@ std::shared_ptr RoPETestLlama2Slice::buildROPE_Llama2(int batch, } void RoPETestChatGLMSlice::SetUp() { - targetDevice = this->GetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const int num_head = 32; const int rotary_dims = 64; InputShape inpShape = {{-1, batch, 4096 + 256 + 256}, {{seq_length, batch, 4096 + 256 + 256}}}; init_input_shapes({inpShape}); - function = RoPETestChatGLMSlice::buildROPE_ChatGLM(batch, num_head, rotary_dims); + function = RoPETestChatGLMSlice::buildROPE_ChatGLM(batch, num_head, rotary_dims, element_type); } -std::shared_ptr RoPETestChatGLMSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims) { - auto input = std::make_shared(ov::element::f32, PartialShape{-1, batch, 4096 + 256 + 256}); - auto cos_sin_cache = std::make_shared(ov::element::f32, PartialShape{32768, 32, 2}); +std::shared_ptr RoPETestChatGLMSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims, ov::element::Type element_type) { + auto input = std::make_shared(element_type, PartialShape{-1, batch, 4096 + 256 + 256}); + auto cos_sin_cache = std::make_shared(element_type, PartialShape{32768, 32, 2}); auto position_ids = std::make_shared(ov::element::i32, PartialShape{-1, -1}); auto __module_transformer_index_67_Gather = @@ -962,18 +979,19 @@ std::shared_ptr RoPETestChatGLMSlice::buildROPE_ChatGLM(int batch, in void RoPETestQwen7bSlice::SetUp() { bool specialReshape; - std::tie(specialReshape, targetDevice) = this->GetParam(); - const int batch = 2; + ov::element::Type element_type; + std::tie(specialReshape, element_type, targetDevice) = this->GetParam(); + const int batch = 128; const int seq_length = 7; InputShape inpShape = {{batch, -1, 4096 + 4096 + 4096}, {{batch, seq_length, 4096 + 4096 + 4096}}}; init_input_shapes({inpShape}); - function = RoPETestQwen7bSlice::buildROPE_Qwen7b(specialReshape); + function = RoPETestQwen7bSlice::buildROPE_Qwen7b(specialReshape, element_type); } -std::shared_ptr RoPETestQwen7bSlice::buildROPE_Qwen7b(bool specialReshape) { - auto input = std::make_shared(ov::element::f32, PartialShape{-1, -1, 4096 + 4096 + 4096}); - auto cos_cache = std::make_shared(ov::element::f32, PartialShape{1, -1, 1, 128}); - auto sin_cache = std::make_shared(ov::element::f32, PartialShape{1, -1, 1, 128}); +std::shared_ptr RoPETestQwen7bSlice::buildROPE_Qwen7b(bool specialReshape, ov::element::Type element_type) { + auto input = std::make_shared(element_type, PartialShape{-1, -1, 4096 + 4096 + 4096}); + auto cos_cache = std::make_shared(element_type, PartialShape{1, -1, 1, 128}); + auto sin_cache = std::make_shared(element_type, PartialShape{1, -1, 1, 128}); auto ListUnpack_389_VariadicSplit = makeOP({input, 2, {4096, 4096, -1}}); auto view_Reshape = @@ -1013,7 +1031,7 @@ std::shared_ptr RoPETestQwen7bSlice::buildROPE_Qwen7b(bool specialRes 1, }), {-1}); - auto Constant_296840 = makeOP({Constant_296840_compressed}, {{"destination_type", "f32"}}); + auto Constant_296840 = makeOP({Constant_296840_compressed}, {{"destination_type", element_type.to_string()}}); auto neg_Multiply_499 = makeOP({ListUnpack_496_Squeeze_0, Constant_296840}, {{"auto_broadcast", "numpy"}}); auto ListUnpack_496_Squeeze = makeOP({ListUnpack_496_Split->output(0), -2}); @@ -1026,9 +1044,10 @@ std::shared_ptr RoPETestQwen7bSlice::buildROPE_Qwen7b(bool specialRes void RoPETestGPTJSlice::SetUp() { bool hasShapeOf; - std::tie(hasShapeOf, targetDevice) = this->GetParam(); + ov::element::Type element_type; + std::tie(hasShapeOf, element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const int num_head = 16; const int hidden_dims = 256; @@ -1037,16 +1056,17 @@ void RoPETestGPTJSlice::SetUp() { InputShape input = {{batch, seq_length, num_head, hidden_dims}, {{batch, seq_length, num_head, hidden_dims}}}; InputShape sincos = {{batch, seq_length, rotary_dims}, {{batch, seq_length, rotary_dims}}}; init_input_shapes({input, sincos}); - function = RoPETestGPTJSlice::buildROPE_GPTJ(num_head, hidden_dims, rotary_dims, hasShapeOf); + function = RoPETestGPTJSlice::buildROPE_GPTJ(num_head, hidden_dims, rotary_dims, hasShapeOf, element_type); } std::shared_ptr RoPETestGPTJSlice::buildROPE_GPTJ(int num_head, int hidden_dims, int rotary_dims, - bool hasShapeOf) { + bool hasShapeOf, + ov::element::Type element_type) { auto int32_max = std::numeric_limits::max(); - auto input = std::make_shared(ov::element::f32, PartialShape{-1, -1, num_head, hidden_dims}); - auto sincos = std::make_shared(ov::element::f32, PartialShape{-1, -1, rotary_dims}); + auto input = std::make_shared(element_type, PartialShape{-1, -1, num_head, hidden_dims}); + auto sincos = std::make_shared(element_type, PartialShape{-1, -1, rotary_dims}); auto slice_Slice_965 = makeOP({input, {0}, {rotary_dims}, {1}, {3}}); slice_Slice_965->set_friendly_name("slice_Slice_965"); @@ -1064,7 +1084,7 @@ std::shared_ptr RoPETestGPTJSlice::buildROPE_GPTJ(int num_head, } auto const_idx = makeConst(ov::element::i32, ov::Shape({static_cast(rotary_dims)}), gather_idx); - auto constant_155588 = makeConst(element::f32, + auto constant_155588 = makeConst(element_type, ov::Shape({ 1, 1, @@ -1107,9 +1127,9 @@ std::shared_ptr RoPETestGPTJSlice::buildROPE_GPTJ(int num_head, return std::make_shared(model_output, ov::ParameterVector{input, sincos}); } -std::shared_ptr RoPETestChatGLM2DRoPEStridedSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims) { - auto input = std::make_shared(ov::element::f32, PartialShape{batch, -1, 4096 + 256 + 256}); - auto cos_sin_cache = std::make_shared(ov::element::f32, PartialShape{32768, 32, 2}); +std::shared_ptr RoPETestChatGLM2DRoPEStridedSlice::buildROPE_ChatGLM(int batch, int head_cnt, int rotary_dims, ov::element::Type element_type) { + auto input = std::make_shared(element_type, PartialShape{batch, -1, 4096 + 256 + 256}); + auto cos_sin_cache = std::make_shared(element_type, PartialShape{32768, 32, 2}); auto position_ids = std::make_shared(ov::element::i32, PartialShape{-1, -1}); auto __module_transformer_index_67_Gather = @@ -1212,22 +1232,25 @@ void RoPETestChatGLM2DRoPEStridedSlice::generate_inputs(const std::vectorGetParam(); + ov::element::Type element_type; + std::tie(element_type, targetDevice) = this->GetParam(); - const int batch = 2; + const int batch = 128; const int seq_length = 7; const int num_head = 32; const int rotary_dims = 64; InputShape inpShape = {{batch, -1, 4096 + 256 + 256}, {{batch, seq_length, 4096 + 256 + 256}}}; init_input_shapes({inpShape}); - function = buildROPE_ChatGLM(-1, num_head, rotary_dims); + function = buildROPE_ChatGLM(-1, num_head, rotary_dims, element_type); } -std::string RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { - std::string targetDevice = obj.param; +std::string RoPETestChatGLM2DRoPEStridedSlice::getTestCaseName(const testing::TestParamInfo& obj) { + ov::element::Type element_type; + std::string targetDevice; + std::tie(element_type, targetDevice) = obj.param; std::ostringstream result; - result << "targetDevice=" << targetDevice; + result << "targetDevice=" << targetDevice << "_element_type=" << element_type.to_string(); return result.str(); } From 659ff0d1affbbcbe637df594d6dbb2344e28bff9 Mon Sep 17 00:00:00 2001 From: Georgy Krivoruchko Date: Mon, 16 Dec 2024 09:37:40 +0400 Subject: [PATCH 02/17] [ONNX] Fixed wrong type casting for u/int16 when it stored externally (#28016) ### Details: - Fixed wrong type casting for u/int16 - Added more tests for correctness validation of a values from external file ### Tickets: - 159142 --- .../onnx/frontend/src/core/tensor.hpp | 2 +- .../external_data_int16.prototxt | 75 ++++++++++++++++++ .../external_data/external_data_int4.prototxt | 65 ++++++++++++++++ .../external_data/external_data_int8.prototxt | 75 ++++++++++++++++++ .../external_data_uint16.prototxt | 75 ++++++++++++++++++ .../external_data_uint4.prototxt | 65 ++++++++++++++++ .../external_data_uint8.prototxt | 75 ++++++++++++++++++ .../onnx/tests/onnx_reader_external_data.cpp | 76 +++++++++++++++++++ 8 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt create mode 100644 src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt create mode 100644 src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt create mode 100644 src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt create mode 100644 src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt create mode 100644 src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt diff --git a/src/frontends/onnx/frontend/src/core/tensor.hpp b/src/frontends/onnx/frontend/src/core/tensor.hpp index af4d299f9d45e7..a63cdfd1906bb0 100644 --- a/src/frontends/onnx/frontend/src/core/tensor.hpp +++ b/src/frontends/onnx/frontend/src/core/tensor.hpp @@ -313,7 +313,7 @@ class Tensor { } else { buffer = ext_data.load_external_data(m_model_dir); } - return std::vector(buffer->get_ptr(), buffer->get_ptr() + buffer->size()); + return std::vector(buffer->get_ptr(), buffer->get_ptr() + (buffer->size() / sizeof(T))); } const void* get_data_ptr() const { diff --git a/src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt new file mode 100644 index 00000000000000..450e0d8ddc5325 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_int16.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 5 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "8" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 5 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 5 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 5 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt new file mode 100644 index 00000000000000..836d2fab8f0cb4 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_int4.prototxt @@ -0,0 +1,65 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "Y" + name: "add_node1" + op_type: "Identity" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 22 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "offset", + value: "2" + } + external_data { + key: "length", + value: "2" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 22 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 22 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 21 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt new file mode 100644 index 00000000000000..d741a7e8168530 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_int8.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 3 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "4" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 3 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt new file mode 100644 index 00000000000000..8d144ebc054e40 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_uint16.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 4 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "8" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 4 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 4 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 4 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt new file mode 100644 index 00000000000000..df60a23ba29929 --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_uint4.prototxt @@ -0,0 +1,65 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + output: "Y" + name: "add_node1" + op_type: "Identity" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 21 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "offset", + value: "2" + } + external_data { + key: "length", + value: "2" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 21 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 21 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 21 +} diff --git a/src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt b/src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt new file mode 100644 index 00000000000000..3f0cfa8406744e --- /dev/null +++ b/src/frontends/onnx/tests/models/external_data/external_data_uint8.prototxt @@ -0,0 +1,75 @@ +ir_version: 3 +producer_name: "OpenVINO ONNX Frontend" +graph { + node { + input: "A" + input: "B" + output: "Y" + name: "add_node1" + op_type: "Add" + } + name: "test_graph" + initializer { + dims: 2 + dims: 2 + data_type: 2 + name: "A" + external_data { + key: "location", + value: "tensors_data/tensor.data" + } + external_data { + key: "length", + value: "4" + } + data_location: 1 + } + input { + name: "A" + type { + tensor_type { + elem_type: 2 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 2 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 2 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 4 +} diff --git a/src/frontends/onnx/tests/onnx_reader_external_data.cpp b/src/frontends/onnx/tests/onnx_reader_external_data.cpp index 103099d08bced9..7d0fbbf6240f55 100644 --- a/src/frontends/onnx/tests/onnx_reader_external_data.cpp +++ b/src/frontends/onnx/tests/onnx_reader_external_data.cpp @@ -255,4 +255,80 @@ TEST_P(OnnxFeMmapFixture, onnx_external_data_in_constant_node) { test_case.run(); } +TEST_P(OnnxFeMmapFixture, onnx_external_data_int16) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_int16.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({-100}); + test_case.add_expected_output(Shape{2, 2}, {-100, 16156, -100, 16284}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_uint16) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_uint16.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({100}); + test_case.add_expected_output(Shape{2, 2}, {100, 16356, 100, 16484}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_int8) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_int8.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({-100}); + test_case.add_expected_output(Shape{2, 2}, {-100, 106, -100, -37}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_uint8) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_uint8.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_input({100}); + test_case.add_expected_output(Shape{2, 2}, {100, 100, 228, 163}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_int4) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_int4.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_expected_output(Shape{2, 2}, {static_cast(0x80), 0x3f}); + + test_case.run(); +} + +TEST_P(OnnxFeMmapFixture, onnx_external_data_uint4) { + const auto path = test::utils::getModelFromTestModelZoo(string(TEST_ONNX_MODELS_DIRNAME) + + "external_data/external_data_uint4.onnx"); + Core core; + core.set_property(enable_mmap(GetParam())); + const auto model = core.read_model(path); + auto test_case = test::TestCase(model); + test_case.add_expected_output(Shape{2, 2}, {0x80, 0x3f}); + + test_case.run(); +} + INSTANTIATE_TEST_SUITE_P(OnnxFeMMapReadModel, OnnxFeMmapFixture, ::testing::Bool()); From 2cbe4b7d7bb1b4408ff26c0cca26bb96e4e3d0d5 Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Mon, 16 Dec 2024 09:49:03 +0100 Subject: [PATCH 03/17] [CPU][DEBUG_CAPS] Fix average counters csv generation (#28068) csv file was generated by default by mistake --- src/plugins/intel_cpu/src/graph_dumper.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index 5a3a95362267fe..ffd58fdb162899 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -357,12 +357,16 @@ void average_counters(const Graph& graph) { * - _.csv * For example: 0_MyModel.csv */ + + const std::string& path = graph.getConfig().debugCaps.averageCountersPath; + + if (path.empty()) + return; + static int graphIndex = 0; + std::string fileName = path + "_" + std::to_string(graphIndex++) + ".csv"; std::ofstream file; - std::string fileName = - graph.getConfig().debugCaps.averageCountersPath + "_" + std::to_string(graphIndex++) + ".csv"; - file.open(fileName); // table structure is identical to the benchmark_app average_counters report From a66c456eb70d916049ce80ecac15b0a27ff9a949 Mon Sep 17 00:00:00 2001 From: Andrei Kashchikhin Date: Mon, 16 Dec 2024 09:42:28 +0000 Subject: [PATCH 04/17] [CI] [GHA] Adjust error message for 131918 (#28063) ### Tickets: - *131918* --- .github/scripts/workflow_rerun/errors_to_look_for.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/workflow_rerun/errors_to_look_for.json b/.github/scripts/workflow_rerun/errors_to_look_for.json index b1074fa43e7200..ad771e9d51f75d 100644 --- a/.github/scripts/workflow_rerun/errors_to_look_for.json +++ b/.github/scripts/workflow_rerun/errors_to_look_for.json @@ -8,7 +8,7 @@ "ticket": 135715 }, { - "error_text": "GnuTLS recv error", + "error_text": "error: RPC failed", "ticket": 131918 }, { From 6ccaa88abbc56ca079adf445c13f7d5e75f61b3e Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Mon, 16 Dec 2024 13:30:06 +0100 Subject: [PATCH 05/17] [DOCS] JsonTable-Sphinx Directivs (#28080) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../generative-ai-performance.rst | 7 +- .../openvino_sphinx_theme/__init__.py | 3 +- .../openvino_sphinx_theme/directives/code.py | 50 ++++++++++++ .../_static/js/openVinoDataTables.js | 79 +++++++++---------- 4 files changed, 91 insertions(+), 48 deletions(-) diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index cc51b1f5fac969..18b6bda5e4a185 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -8,7 +8,6 @@ The current data is as of OpenVINO 2024.4, 20 Nov. 2024. The tables below list the key performance indicators for inference on built-in GPUs. - .. raw:: html @@ -20,7 +19,7 @@ The tables below list the key performance indicators for inference on built-in G .. csv-table:: :class: modeldata stripe - :name: supportedModelsTableOv + :name: supportedModelsTable_V1 :header-rows: 1 :file: ../../_static/benchmarks_files/llm_models_9-288V.csv @@ -28,7 +27,7 @@ The tables below list the key performance indicators for inference on built-in G .. csv-table:: :class: modeldata stripe - :name: supportedModelsTableOv + :name: supportedModelsTable_V2 :header-rows: 1 :file: ../../_static/benchmarks_files/llm_models_7-258V.csv @@ -36,7 +35,7 @@ The tables below list the key performance indicators for inference on built-in G .. csv-table:: :class: modeldata stripe - :name: supportedModelsTableOv + :name: supportedModelsTable_V3 :header-rows: 1 :file: ../../_static/benchmarks_files/llm_models_7-155H.csv diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/__init__.py b/docs/openvino_sphinx_theme/openvino_sphinx_theme/__init__.py index 2696b134bbe9af..8cf187a8d89208 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/__init__.py +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/__init__.py @@ -7,7 +7,7 @@ from docutils.parsers import rst from pathlib import Path from sphinx.util import logging -from .directives.code import DoxygenSnippet, Scrollbox, Nodescrollbox, visit_scrollbox, depart_scrollbox +from .directives.code import DoxygenSnippet, Scrollbox, Nodescrollbox, visit_scrollbox, depart_scrollbox, DataTable SPHINX_LOGGER = logging.getLogger(__name__) @@ -99,6 +99,7 @@ def setup(app): app.add_html_theme('openvino_sphinx_theme', theme_path) rst.directives.register_directive('doxygensnippet', DoxygenSnippet) rst.directives.register_directive('scrollbox', Scrollbox) + rst.directives.register_directive('data-table', DataTable) app.add_node( Nodescrollbox, html=(visit_scrollbox, depart_scrollbox), diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py b/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py index b91b0d599ee28a..c3e0e81eec3b3a 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/directives/code.py @@ -12,6 +12,8 @@ import re import json +import csv + logger = logging.getLogger(__name__) @@ -136,3 +138,51 @@ def run(self): if self.content: self.state.nested_parse(self.content, self.content_offset, node) return [node] + + +class DataTable(Directive): + required_arguments = 0 + has_content = False + option_spec = {'header-rows': directives.nonnegative_int, + 'file': directives.path, + 'class': directives.unchanged, + 'name': directives.unchanged, + 'hidden': directives.unchanged + } + + def run(self) -> List[Node]: + current_directory = os.path.dirname(os.path.abspath(self.state.document.current_source)) + csv_file = os.path.normpath(os.path.join(current_directory, self.options['file'])) + if os.path.isfile(csv_file) is False: + self.warning("Cannot find the specified CSV file. " + "Please provide a correct path.") + csv_node = [] + with open(csv_file, 'r') as j: + csv_data = list(csv.reader(j)) + class_table_tag = ' class="' + "".join(c for c in str(self.options['class']) + '"') if 'class' in self.options is not None else "" + id_table_tag = ' id="' + "".join(c for c in str(self.options['name']) + '"') if 'name' in self.options is not None else "" + hidden_table_tag = ' data-columns-hidden="' + "".join(c for c in str(self.options['hidden']) + '"') if 'hidden' in self.options is not None else "" + csv_table_html = '' + head_rows = 0 + head_rows += self.options.get('header-rows', 0) + row_count = 0 + for row in csv_data[:head_rows]: + row_count += 1 + parity = "row-even" if row_count % 2 == 0 else "row-odd" + csv_table_html += '' + for value in row: + csv_table_html += '

%s

' % value + csv_table_html += '\n' + csv_table_html += '' + for row in csv_data[head_rows:]: + row_count += 1 + parity = "row-even" if row_count % 2 == 0 else "row-odd" + csv_table_html += '' + for value in row: + csv_table_html += '

%s

' % value + csv_table_html += '\n' + csv_table_html += "" + csv_table_html += '' + csv_node.append(nodes.raw(csv_table_html, csv_table_html, format="html")) + + return csv_node \ No newline at end of file diff --git a/docs/sphinx_setup/_static/js/openVinoDataTables.js b/docs/sphinx_setup/_static/js/openVinoDataTables.js index c65748065b2ad2..bd56a71533786c 100644 --- a/docs/sphinx_setup/_static/js/openVinoDataTables.js +++ b/docs/sphinx_setup/_static/js/openVinoDataTables.js @@ -1,48 +1,41 @@ $(document).ready(function () { - var pageTitle = document.title; - var columnDefs; - if(pageTitle.includes('Most Efficient Large Language Models for AI PC')) - { - columnDefs= [ - { "visible": false, "targets": [3,4,6] } - ] - } - else - { - columnDefs=[] - } + var columnDefs = []; - var table = $('table.modeldata').DataTable({ - responsive: true, - "autoWidth": false, - stateSave: true, - language: { - buttons: { - colvisRestore: "Show all columns" - } - }, - lengthMenu: [ - [10, 25, 50, -1], - ['10 rows', '25 rows', '50 rows', 'Show all rows'] - ], - "columnDefs": columnDefs, - layout: { - topStart: { - buttons: [ - 'pageLength', - { - extend: 'colvis', - postfixButtons: ['colvisRestore'], - }, - { - extend: 'print', - text: 'Print pdf', - exportOptions: { - columns: ':visible' + var tables = $('table.modeldata'); + for (let table of tables) { + var hidden = table.getAttribute('data-columns-hidden'); + columnDefs = [{ "visible": false, "targets": JSON.parse(hidden) }] + $(table).DataTable({ + responsive: true, + "autoWidth": false, + language: { + buttons: { + colvisRestore: "Restore default" + } + }, + lengthMenu: [ + [10, 25, 50, -1], + ['10 rows', '25 rows', '50 rows', 'Show all records'] + ], + "columnDefs": columnDefs, + layout: { + topStart: { + buttons: [ + 'pageLength', + { + extend: 'colvis', + postfixButtons: ['colvisRestore'], + }, + { + extend: 'print', + text: 'Print pdf', + exportOptions: { + columns: ':visible' + } } - } - ] + ] + } } - } - }); + }); + } }); \ No newline at end of file From a021d7ef921452d9b5e32337953a3c795ac9d27b Mon Sep 17 00:00:00 2001 From: Wanglei Shen Date: Mon, 16 Dec 2024 20:50:28 +0800 Subject: [PATCH 06/17] Support new windows platform (#27809) ### Details: - *support new windows platform which is a VM (VPS) running on a Hypervisor* - *using one stream on two sockets* ### Tickets: - *[issues-27581](https://github.com/openvinotoolkit/openvino/issues/27581)* --- src/inference/src/os/win/win_system_conf.cpp | 69 ++++++----- .../unit/cpu_map_parser/parser_windows.cpp | 54 +++++++++ .../intel_cpu/src/cpu_streams_calculation.cpp | 4 +- .../streams_info/streams_info_table_test.cpp | 109 ++++++++++-------- 4 files changed, 161 insertions(+), 75 deletions(-) diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp index a4129a80b599ba..97831207df73d5 100644 --- a/src/inference/src/os/win/win_system_conf.cpp +++ b/src/inference/src/os/win/win_system_conf.cpp @@ -92,6 +92,10 @@ void parse_processor_info_win(const char* base_ptr, _cores = 0; _blocked_cores = 0; + constexpr int initial_core_type = -1; + constexpr int group_with_2_cores = 2; + constexpr int group_with_4_cores = 4; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = NULL; auto MaskToList = [&](const KAFFINITY mask_input) { @@ -131,7 +135,7 @@ void parse_processor_info_win(const char* base_ptr, base_proc = _processors; } - if (2 == list_len) { + if (group_with_2_cores == list_len) { proc_info = cpu_init_line; proc_info[CPU_MAP_PROCESSOR_ID] = list[0] + base_proc; proc_info[CPU_MAP_NUMA_NODE_ID] = _sockets; @@ -162,7 +166,11 @@ void parse_processor_info_win(const char* base_ptr, proc_info[CPU_MAP_CORE_ID] = _cores; if ((_processors > group_start) && (_processors <= group_end)) { proc_info[CPU_MAP_CORE_TYPE] = group_type; - proc_info[CPU_MAP_GROUP_ID] = group_id; + if ((group_type == MAIN_CORE_PROC) && (group_end - group_start != 1)) { + proc_info[CPU_MAP_GROUP_ID] = group++; + } else { + proc_info[CPU_MAP_GROUP_ID] = group_id; + } if (group_id == CPU_BLOCKED) { proc_info[CPU_MAP_USED_FLAG] = CPU_BLOCKED; _blocked_cores++; @@ -178,7 +186,7 @@ void parse_processor_info_win(const char* base_ptr, } else if ((info->Relationship == RelationCache) && (info->Cache.Level == 2)) { MaskToList(info->Cache.GroupMask.Mask); - if (4 == list_len) { + if (group_with_4_cores == list_len) { if (_processors <= list[list_len - 1] + base_proc) { group_start = list[0]; group_end = list[list_len - 1]; @@ -190,38 +198,45 @@ void parse_processor_info_win(const char* base_ptr, _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id; _proc_type_table[0][EFFICIENT_CORE_PROC]++; } - } else if ((2 == list_len) && (-1 == _cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE])) { + } else if (group_with_2_cores == list_len) { + if (initial_core_type == _cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE]) { + if (_processors <= list[list_len - 1] + base_proc) { + group_start = list[0]; + group_end = list[list_len - 1]; + if (_proc_type_table[0][EFFICIENT_CORE_PROC] > 0) { + group_id = CPU_BLOCKED; + group_type = EFFICIENT_CORE_PROC; + _blocked_cores++; + } else { + group_id = group++; + group_type = MAIN_CORE_PROC; + } + } + for (int m = 0; m < _processors - list[0]; m++) { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] = group_type; + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id; + if (group_id == CPU_BLOCKED) { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED; + } else { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = NOT_USED; + _proc_type_table[0][MAIN_CORE_PROC]++; + } + } + } + } else { if (_processors <= list[list_len - 1] + base_proc) { group_start = list[0]; group_end = list[list_len - 1]; - if (_proc_type_table[0][EFFICIENT_CORE_PROC] > 0) { - group_id = CPU_BLOCKED; - group_type = EFFICIENT_CORE_PROC; - _blocked_cores++; - } else { - group_id = group++; - group_type = MAIN_CORE_PROC; - } + group_id = group; + group_type = MAIN_CORE_PROC; } for (int m = 0; m < _processors - list[0]; m++) { - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] = group_type; - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group_id; - if (group_id == CPU_BLOCKED) { - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = CPU_BLOCKED; - } else { - _cpu_mapping_table[list[m] + base_proc][CPU_MAP_USED_FLAG] = NOT_USED; + if (_cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] == initial_core_type) { + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; + _cpu_mapping_table[list[m] + base_proc][CPU_MAP_GROUP_ID] = group++; _proc_type_table[0][MAIN_CORE_PROC]++; } } - - } else if (1 == list_len) { - if ((_cpu_mapping_table.size() > list[0]) && - (_cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] == -1)) { - group_id = group++; - _cpu_mapping_table[list[0] + base_proc][CPU_MAP_CORE_TYPE] = MAIN_CORE_PROC; - _cpu_mapping_table[list[0] + base_proc][CPU_MAP_GROUP_ID] = group_id; - _proc_type_table[0][MAIN_CORE_PROC]++; - } } } } diff --git a/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp b/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp index 2c87b97562e975..280594b8ec897f 100644 --- a/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp +++ b/src/inference/tests/unit/cpu_map_parser/parser_windows.cpp @@ -1448,6 +1448,59 @@ WinCpuMapTestCase _2sockets_48cores = { "fffff0000"}, }; +WinCpuMapTestCase _2sockets_16cores = { + 16, + 2, + 2, + 16, + 0, + {{16, 16, 0, 0, -1, -1}, {8, 8, 0, 0, 0, 0}, {8, 8, 0, 0, 1, 1}}, + { + {0, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {1, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {2, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {3, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {4, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {5, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {6, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {7, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {8, 1, 1, 8, MAIN_CORE_PROC, 8, -1}, + {9, 1, 1, 9, MAIN_CORE_PROC, 9, -1}, + {10, 1, 1, 10, MAIN_CORE_PROC, 10, -1}, + {11, 1, 1, 11, MAIN_CORE_PROC, 11, -1}, + {12, 1, 1, 12, MAIN_CORE_PROC, 12, -1}, + {13, 1, 1, 13, MAIN_CORE_PROC, 13, -1}, + {14, 1, 1, 14, MAIN_CORE_PROC, 14, -1}, + {15, 1, 1, 15, MAIN_CORE_PROC, 15, -1}, + }, + {"0300000030000000000000000000000000000000000000000000000000000100ff00000000000000000000000000000000000000300000000" + "00000000000000000000000000000000000000000000100010000000000000000000000000000000200000038000000010840000080000002" + "0000000000000000000000000000000000000000000000ff00000000000000000000000000000002000000380000000108400000800000010" + "000000000000000000000000000000000000000000000ff000000000000000000000000000000020000003800000002044000000004000000" + "00000000000000000000000000000000000000000000ff0000000000000000000000000000000200000038000000031040000000000100000" + "0000000000000000000000000000000000000000000ff00000000000000000000000000000000000000300000000000000000000000000000" + "00000000000000000000000100020000000000000000000000000000000000000030000000000000000000000000000000000000000000000" + "00000010004000000000000000000000000000000000000003000000000000000000000000000000000000000000000000000010008000000" + "00000000000000000000000000000000300000000000000000000000000000000000000000000000000001001000000000000000000000000" + "00000000000000030000000000000000000000000000000000000000000000000000100200000000000000000000000000000000000000030" + "00000000000000000000000000000000000000000000000000010040000000000000000000000000000000000000003000000000000000000" + "00000000000000000000000000000000001008000000000000000000000000000000003000000300000000000000000000000000000000000" + "0000000000000000010000ff00000000000000000000000000000000000030000000000000000000000000000000000000000000000000000" + "10000010000000000000000000000000000020000003800000001084000008000000200000000000000000000000000000000000000000000" + "0000ff00000000000000000000000000000200000038000000010840000080000001000000000000000000000000000000000000000000000" + "000ff000000000000000000000000000002000000380000000204400000000400000000000000000000000000000000000000000000000000" + "00ff0000000000000000000000000000020000003800000003104000000000010000000000000000000000000000000000000000000000000" + "0ff00000000000000000000000000000000000030000000000000000000000000000000000000000000000000000100000200000000000000" + "00000000000000000000003000000000000000000000000000000000000000000000000000010000040000000000000000000000000000000" + "00000300000000000000000000000000000000000000000000000000001000008000000000000000000000000000000000000300000000000" + "00000000000000000000000000000000000000000100001000000000000000000000000000000000000030000000000000000000000000000" + "00000000000000000000000010000200000000000000000000000000000000000003000000000000000000000000000000000000000000000" + "00000001000040000000000000000000000000000000000000300000000000000000000000000000000000000000000000000001000080000" + "00000000000000000000000000100000030000000000000000000000000000000000000000000000000000000ffff00000000000000000000" + "00000000040000005000000001000100000000000000000000000000000000000000000010100000000000000000000000000000000000000" + "000000000000000000000000000000000000000ffff000000000000"}, +}; + WinCpuMapTestCase _1sockets_24cores_hyperthreading_set1 = { 32, 1, @@ -2328,6 +2381,7 @@ INSTANTIATE_TEST_SUITE_P(CPUMap, _2sockets_48cores_hyperthreading, _2sockets_36cores_hyperthreading, _2sockets_48cores, + _2sockets_16cores, _1sockets_24cores_hyperthreading_set1, _1sockets_24cores_hyperthreading_set2, _1sockets_22cores_hyperthreading, diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp index be6f5c4035d1ee..3af6a52d5f3342 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp @@ -205,13 +205,13 @@ std::vector> get_streams_info_table( current_socket_id = input_current_socket_id == -1 ? get_current_socket_id() : input_current_socket_id; if (input_threads > 0) { if (hint_model_distribution_policy.size() == 0) { + n_threads_per_stream = std::min(input_threads, proc_type_table[0][ALL_PROC]); + } else { for (auto& row : proc_socket_table) { if (current_socket_id == row[PROC_SOCKET_ID]) { n_threads_per_stream = std::min(input_threads, row[ALL_PROC]); } } - } else { - n_threads_per_stream = std::min(input_threads, proc_type_table[0][ALL_PROC]); } if (proc_type_table.size() == 1) { if ((n_threads_per_stream > proc_type_table[0][MAIN_CORE_PROC]) && diff --git a/src/plugins/intel_cpu/tests/unit/streams_info/streams_info_table_test.cpp b/src/plugins/intel_cpu/tests/unit/streams_info/streams_info_table_test.cpp index 57f9f5c5d72a14..7dc643c181df7e 100644 --- a/src/plugins/intel_cpu/tests/unit/streams_info/streams_info_table_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/streams_info/streams_info_table_test.cpp @@ -278,7 +278,11 @@ StreamsCalculationTestCase _2sockets_104cores_latency_socket_8 = { "LATENCY", {}, {{208, 104, 0, 104, -1, -1}, {104, 52, 0, 52, 0, 0}, {104, 52, 0, 52, 1, 1}}, - {{1, ALL_PROC, 104, 0, 0}, {0, MAIN_CORE_PROC, 52, 0, 0}, {0, HYPER_THREADING_PROC, 52, 0, 0}}, + {{1, ALL_PROC, 208, -1, -1}, + {0, MAIN_CORE_PROC, 52, 0, 0}, + {0, HYPER_THREADING_PROC, 52, 0, 0}, + {0, MAIN_CORE_PROC, 52, 1, 1}, + {0, HYPER_THREADING_PROC, 52, 1, 1}}, }; StreamsCalculationTestCase _2sockets_104cores_latency_socket_9 = { 1, @@ -329,18 +333,6 @@ StreamsCalculationTestCase _2sockets_104cores_latency_2 = { {{208, 104, 0, 104, -1, -1}, {104, 52, 0, 52, 0, 0}, {104, 52, 0, 52, 1, 1}}, {{1, MAIN_CORE_PROC, 20, 0, 0}}, }; -StreamsCalculationTestCase _2sockets_104cores_latency_3 = { - 1, - false, - 208, - 0, - 0, - 0, - "LATENCY", - {}, - {{208, 104, 0, 104, -1, -1}, {104, 52, 0, 52, 0, 0}, {104, 52, 0, 52, 1, 1}}, - {{1, ALL_PROC, 104, 0, 0}, {0, MAIN_CORE_PROC, 52, 0, 0}, {0, HYPER_THREADING_PROC, 52, 0, 0}}, -}; StreamsCalculationTestCase _2sockets_104cores_latency_4 = { 1, true, @@ -375,7 +367,11 @@ StreamsCalculationTestCase _2sockets_104cores_latency_6 = { "LATENCY", {}, {{208, 104, 0, 104, -1, -1}, {104, 52, 0, 52, 0, 0}, {104, 52, 0, 52, 1, 1}}, - {{1, ALL_PROC, 104, 0, 0}, {0, MAIN_CORE_PROC, 52, 0, 0}, {0, HYPER_THREADING_PROC, 52, 0, 0}}, + {{1, ALL_PROC, 208, -1, -1}, + {0, MAIN_CORE_PROC, 52, 0, 0}, + {0, HYPER_THREADING_PROC, 52, 0, 0}, + {0, MAIN_CORE_PROC, 52, 1, 1}, + {0, HYPER_THREADING_PROC, 52, 1, 1}}, }; StreamsCalculationTestCase _2sockets_104cores_tput_1 = { 1, @@ -749,7 +745,7 @@ StreamsCalculationTestCase _2sockets_48cores_latency_3 = { "LATENCY", {}, {{48, 48, 0, 0, -1, -1}, {24, 24, 0, 0, 0, 0}, {24, 24, 0, 0, 1, 1}}, - {{1, MAIN_CORE_PROC, 24, 0, 0}}, + {{1, ALL_PROC, 48, -1, -1}, {0, MAIN_CORE_PROC, 24, 0, 0}, {0, MAIN_CORE_PROC, 24, 1, 1}}, }; StreamsCalculationTestCase _2sockets_48cores_tput_1 = { 1, @@ -2038,7 +2034,7 @@ StreamsCalculationTestCase _2sockets_mock_latency_5 = { "LATENCY", {}, {{60, 60, 0, 0, -1, -1}, {20, 20, 0, 0, 1, 1}, {40, 40, 0, 0, 0, 0}}, - {{1, MAIN_CORE_PROC, 20, 1, 1}}, + {{1, ALL_PROC, 50, -1, -1}, {0, MAIN_CORE_PROC, 20, 1, 1}, {0, MAIN_CORE_PROC, 30, 0, 0}}, }; StreamsCalculationTestCase _2sockets_mock_latency_6 = { 1, @@ -2086,7 +2082,7 @@ StreamsCalculationTestCase _2sockets_mock_latency_9 = { "LATENCY", {}, {{60, 0, 60, 0, -1, -1}, {20, 0, 20, 0, 1, 1}, {40, 0, 40, 0, 0, 0}}, - {{1, EFFICIENT_CORE_PROC, 20, 1, 1}}, + {{1, ALL_PROC, 50, -1, -1}, {0, EFFICIENT_CORE_PROC, 20, 1, 1}, {0, EFFICIENT_CORE_PROC, 30, 0, 0}}, }; StreamsCalculationTestCase _2sockets_mock_latency_10 = { 1, @@ -2134,7 +2130,11 @@ StreamsCalculationTestCase _2sockets_mock_latency_13 = { "LATENCY", {}, {{60, 30, 0, 30, -1, -1}, {20, 10, 0, 10, 1, 1}, {40, 20, 0, 20, 0, 0}}, - {{1, ALL_PROC, 20, 1, 1}, {0, MAIN_CORE_PROC, 10, 1, 1}, {0, HYPER_THREADING_PROC, 10, 1, 1}}, + {{1, ALL_PROC, 50, -1, -1}, + {0, MAIN_CORE_PROC, 10, 1, 1}, + {0, HYPER_THREADING_PROC, 10, 1, 1}, + {0, MAIN_CORE_PROC, 20, 0, 0}, + {0, HYPER_THREADING_PROC, 10, 0, 0}}, }; StreamsCalculationTestCase _2sockets_mock_latency_14 = { 1, @@ -2198,7 +2198,10 @@ StreamsCalculationTestCase _2sockets_mock_latency_17 = { {80, 40, 0, 40, 0, 0}, {60, 30, 0, 30, 1, 1}, {40, 20, 0, 20, 2, 2}}, - {{1, ALL_PROC, 20, 3, 3}, {0, MAIN_CORE_PROC, 10, 3, 3}, {0, HYPER_THREADING_PROC, 10, 3, 3}}, + {{1, ALL_PROC, 50, -1, -1}, + {0, MAIN_CORE_PROC, 10, 3, 3}, + {0, HYPER_THREADING_PROC, 10, 3, 3}, + {0, MAIN_CORE_PROC, 30, 0, 0}}, }; StreamsCalculationTestCase _2sockets_mock_latency_18 = { 1, @@ -2278,7 +2281,10 @@ StreamsCalculationTestCase _2sockets_mock_latency_21 = { {30, 15, 0, 15, 3, 3}, {80, 40, 0, 40, 4, 4}, {60, 30, 0, 30, 5, 5}}, - {{1, ALL_PROC, 40, 6, 6}, {0, MAIN_CORE_PROC, 20, 6, 6}, {0, HYPER_THREADING_PROC, 20, 6, 6}}, + {{1, ALL_PROC, 50, -1, -1}, + {0, MAIN_CORE_PROC, 20, 6, 6}, + {0, HYPER_THREADING_PROC, 20, 6, 6}, + {0, MAIN_CORE_PROC, 10, 7, 7}}, }; StreamsCalculationTestCase _2sockets_mock_latency_22 = { 1, @@ -2418,31 +2424,15 @@ StreamsCalculationTestCase _2sockets_mock_latency_29 = { {60, 30, 0, 30, 1, 0}, {40, 20, 0, 20, 2, 1}, {20, 10, 0, 10, 3, 1}}, - {{1, ALL_PROC, 140, -1, 0}, - {0, MAIN_CORE_PROC, 40, 0, 0}, - {0, MAIN_CORE_PROC, 30, 1, 0}, - {0, HYPER_THREADING_PROC, 40, 0, 0}, - {0, HYPER_THREADING_PROC, 30, 1, 0}}, -}; -StreamsCalculationTestCase _2sockets_mock_latency_30 = { - 1, - false, - 200, - 0, - 0, - 0, - "LATENCY", - {}, - {{200, 100, 0, 100, -1, -1}, - {80, 40, 0, 40, 0, 0}, - {60, 30, 0, 30, 1, 0}, - {40, 20, 0, 20, 2, 1}, - {20, 10, 0, 10, 3, 1}}, - {{1, ALL_PROC, 140, -1, 0}, + {{1, ALL_PROC, 200, -1, -1}, {0, MAIN_CORE_PROC, 40, 0, 0}, {0, MAIN_CORE_PROC, 30, 1, 0}, {0, HYPER_THREADING_PROC, 40, 0, 0}, - {0, HYPER_THREADING_PROC, 30, 1, 0}}, + {0, HYPER_THREADING_PROC, 30, 1, 0}, + {0, MAIN_CORE_PROC, 20, 2, 1}, + {0, MAIN_CORE_PROC, 10, 3, 1}, + {0, HYPER_THREADING_PROC, 20, 2, 1}, + {0, HYPER_THREADING_PROC, 10, 3, 1}}, }; StreamsCalculationTestCase _2sockets_mock_latency_31 = { 1, @@ -2595,7 +2585,15 @@ StreamsCalculationTestCase _2sockets_mock_latency_35 = { {60, 30, 0, 30, 1, 1}, {40, 20, 0, 20, 2, 2}, {20, 10, 0, 10, 3, 3}}, - {{1, ALL_PROC, 20, 3, 3}, {0, MAIN_CORE_PROC, 10, 3, 3}, {0, HYPER_THREADING_PROC, 10, 3, 3}}, + {{1, ALL_PROC, 200, -1, -1}, + {0, MAIN_CORE_PROC, 10, 3, 3}, + {0, HYPER_THREADING_PROC, 10, 3, 3}, + {0, MAIN_CORE_PROC, 40, 0, 0}, + {0, MAIN_CORE_PROC, 30, 1, 1}, + {0, MAIN_CORE_PROC, 20, 2, 2}, + {0, HYPER_THREADING_PROC, 40, 0, 0}, + {0, HYPER_THREADING_PROC, 30, 1, 1}, + {0, HYPER_THREADING_PROC, 20, 2, 2}}, }; StreamsCalculationTestCase _2sockets_mock_latency_36 = { 1, @@ -2611,7 +2609,15 @@ StreamsCalculationTestCase _2sockets_mock_latency_36 = { {60, 30, 0, 30, 1, 1}, {40, 20, 0, 20, 2, 2}, {20, 10, 0, 10, 3, 3}}, - {{1, ALL_PROC, 20, 3, 3}, {0, MAIN_CORE_PROC, 10, 3, 3}, {0, HYPER_THREADING_PROC, 10, 3, 3}}, + {{1, ALL_PROC, 200, -1, -1}, + {0, MAIN_CORE_PROC, 10, 3, 3}, + {0, HYPER_THREADING_PROC, 10, 3, 3}, + {0, MAIN_CORE_PROC, 40, 0, 0}, + {0, MAIN_CORE_PROC, 30, 1, 1}, + {0, MAIN_CORE_PROC, 20, 2, 2}, + {0, HYPER_THREADING_PROC, 40, 0, 0}, + {0, HYPER_THREADING_PROC, 30, 1, 1}, + {0, HYPER_THREADING_PROC, 20, 2, 2}}, }; StreamsCalculationTestCase _2sockets_mock_latency_37 = { 1, @@ -2823,6 +2829,18 @@ StreamsCalculationTestCase _2sockets_mock_latency_50 = { {{208, 208, 0, 0, -1, -1}, {52, 52, 0, 0, 2, 1}, {52, 52, 0, 0, 3, 1}, {52, 52, 0, 0, 0, 0}, {52, 52, 0, 0, 1, 0}}, {{1, ALL_PROC, 80, -1, 1}, {0, MAIN_CORE_PROC, 52, 2, 1}, {0, MAIN_CORE_PROC, 28, 3, 1}}, }; +StreamsCalculationTestCase _2sockets_mock_latency_51 = { + 1, + false, + 16, + 0, + 0, + 0, + "LATENCY", + {}, + {{16, 16, 0, 0, -1, -1}, {8, 8, 0, 0, 0, 0}, {8, 8, 0, 0, 1, 1}}, + {{1, ALL_PROC, 16, -1, -1}, {0, MAIN_CORE_PROC, 8, 0, 0}, {0, MAIN_CORE_PROC, 8, 1, 1}}, +}; TEST_P(StreamsCalculationTests, StreamsCalculation) {} @@ -2848,7 +2866,6 @@ INSTANTIATE_TEST_SUITE_P(StreamsInfoTable, _2sockets_104cores_latency_socket_10, _2sockets_104cores_latency_1, _2sockets_104cores_latency_2, - _2sockets_104cores_latency_3, _2sockets_104cores_latency_4, _2sockets_104cores_latency_5, _2sockets_104cores_latency_6, @@ -2998,7 +3015,6 @@ INSTANTIATE_TEST_SUITE_P(StreamsInfoTable, _2sockets_mock_latency_27, _2sockets_mock_latency_28, _2sockets_mock_latency_29, - _2sockets_mock_latency_30, _2sockets_mock_latency_31, _2sockets_mock_latency_32, _2sockets_mock_latency_33, @@ -3019,6 +3035,7 @@ INSTANTIATE_TEST_SUITE_P(StreamsInfoTable, _2sockets_mock_latency_48, _2sockets_mock_latency_49, _2sockets_mock_latency_50, + _2sockets_mock_latency_51, _1sockets_mock_latency_1, _1sockets_mock_latency_2, _1sockets_mock_latency_3, From 313e39e50b98a057ffebf2f5c553636305923661 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 16 Dec 2024 14:03:41 +0100 Subject: [PATCH 07/17] [GHA] Explicitly enable wheel build in CI (#27999) ### Details: - Enable wheel packages in the all pipelines where we are expecting them, because it could be disabled silently if some of the requirements are missed on the environment ### Tickets: - *ticket-id* --- .github/workflows/linux_arm64.yml | 1 + .github/workflows/ubuntu_20.yml | 1 + .github/workflows/ubuntu_22.yml | 1 + .github/workflows/windows_vs2019_release.yml | 1 + 4 files changed, 4 insertions(+) diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index e1aaa886d631c7..255a30cbc88770 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -97,6 +97,7 @@ jobs: -DENABLE_SYSTEM_OPENCL=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCPACK_GENERATOR=TGZ \ + -DENABLE_WHEEL=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON Debian_Packages: diff --git a/.github/workflows/ubuntu_20.yml b/.github/workflows/ubuntu_20.yml index 63a1fab87d566f..ac00405ae71ed3 100644 --- a/.github/workflows/ubuntu_20.yml +++ b/.github/workflows/ubuntu_20.yml @@ -98,6 +98,7 @@ jobs: -DENABLE_SYSTEM_OPENCL=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCPACK_GENERATOR=TGZ \ + -DENABLE_WHEEL=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON CXX_Unit_Tests: diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 4fc93d73213f78..a32caecfbd073d 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -101,6 +101,7 @@ jobs: -DENABLE_SYSTEM_OPENCL=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCPACK_GENERATOR=TGZ \ + -DENABLE_WHEEL=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON Debian_Packages: diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index c42475fea9cd64..da526e57bed1ec 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -63,6 +63,7 @@ jobs: cmake-options: |- -G "Ninja Multi-Config" ` -DENABLE_PYTHON=ON ` + -DENABLE_WHEEL=ON ` -DENABLE_CPPLINT=OFF ` -DBUILD_SHARED_LIBS=ON ` -DENABLE_TESTS=ON ` From 63c30473363e85a833f09a0cded1ed66fb9f2c02 Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Mon, 16 Dec 2024 13:15:24 +0000 Subject: [PATCH 08/17] Private/tkrupa/cache encryption docs (#28035) ### Details: - Replacement for https://github.com/openvinotoolkit/openvino/pull/27993 (can't modify it, not from fork). - Docs for https://github.com/openvinotoolkit/openvino/pull/27912. Added separately because docs addition breaks CI (no GPU Plugin in Python API tests currently), ### Tickets: - CVS-158140 --------- Co-authored-by: Sebastian Golebiewski Co-authored-by: Pavel Durandin --- .../assets/snippets/ov_caching.cpp | 30 ++++++++++++++++++- .../articles_en/assets/snippets/ov_caching.py | 17 +++++++++++ .../model-caching-overview.rst | 22 ++++++++++++-- 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/docs/articles_en/assets/snippets/ov_caching.cpp b/docs/articles_en/assets/snippets/ov_caching.cpp index aa08a739261b81..b132df9ce6705b 100644 --- a/docs/articles_en/assets/snippets/ov_caching.cpp +++ b/docs/articles_en/assets/snippets/ov_caching.cpp @@ -90,6 +90,33 @@ auto compiled = core.compile_model(model, device, config); // Step 5: } } +void part5() { + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GPU"; + ov::Core core; // Step 1: create ov::Core object + core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching +//! [ov:caching:part5] +static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F}; +auto codec_xor = [&](const std::string& source_str) { + auto key_size = sizeof(codec_key); + int key_idx = 0; + std::string dst_str = source_str; + for (char& c : dst_str) { + c ^= codec_key[key_idx % key_size]; + key_idx++; + } + return dst_str; +}; +auto compiled = core.compile_model(modelPath, + device, + ov::cache_encryption_callbacks(ov::EncryptionCallbacks{codec_xor, codec_xor}), + ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); // Step 5: Compile model +//! [ov:caching:part5] + if (!compiled) { + throw std::runtime_error("error"); + } +} + int main() { try { part0(); @@ -97,7 +124,8 @@ int main() { part2(); part3(); part4(); + part5(); } catch (...) { } return 0; -} \ No newline at end of file +} diff --git a/docs/articles_en/assets/snippets/ov_caching.py b/docs/articles_en/assets/snippets/ov_caching.py index 57bd72f3f9b80b..894f3e02dd5b5b 100644 --- a/docs/articles_en/assets/snippets/ov_caching.py +++ b/docs/articles_en/assets/snippets/ov_caching.py @@ -59,3 +59,20 @@ def decrypt_base64(src): model = core.read_model(model=model_path) compiled_model = core.compile_model(model=model, device_name=device_name, config=config_cache) # ! [ov:caching:part4] + +# ! [ov:caching:part5] +import base64 + +def encrypt_base64(src): + return base64.b64encode(bytes(src, "utf-8")) + +def decrypt_base64(src): + return base64.b64decode(bytes(src, "utf-8")) + +core = ov.Core() +core.set_property({props.cache_dir: path_to_cache_dir}) +config_cache = {} +config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64] +config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE" +compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache) +# ! [ov:caching:part5] diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst index 181622ff55baf1..b3253f775bdb02 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-latency/model-caching-overview.rst @@ -139,7 +139,7 @@ To check in advance if a particular device supports model caching, your applicat Set "cache_encryption_callbacks" config option to enable cache encryption +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -If model caching is enabled, the model topology can be encrypted when saving to the cache and decrypted when loading from the cache. This property can currently be set only in ``compile_model``. +If model caching is enabled in the CPU Plugin, the model topology can be encrypted while it is saved to the cache and decrypted when it is loaded from the cache. Currently, this property can be set only in ``compile_model``. .. tab-set:: @@ -157,6 +157,24 @@ If model caching is enabled, the model topology can be encrypted when saving to :language: cpp :fragment: [ov:caching:part4] +If model caching is enabled in the GPU Plugin, the model topology can be encrypted while it is saved to the cache and decrypted when it is loaded from the cache. Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``. + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py + :language: py + :fragment: [ov:caching:part5] + + .. tab-item:: C++ + :sync: cpp + + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp + :language: cpp + :fragment: [ov:caching:part5] + .. important:: - Currently, this property is supported only by the CPU plugin. For other HW plugins, setting this property will not encrypt/decrypt the model topology in cache and will not affect performance. + Currently, this property is supported only by the CPU and GPU plugins. For other HW plugins, setting this property will not encrypt/decrypt the model topology in cache and will not affect performance. From 1dd6492710f61edbab3634ce16066ea14e9b7d80 Mon Sep 17 00:00:00 2001 From: Alicja Miloszewska Date: Mon, 16 Dec 2024 15:01:53 +0100 Subject: [PATCH 09/17] [Py OV] Add utils to openvino module (#28062) ### Details: - Move `runtime/utils/` to openvino - Add proxy locations in `openvino/runtime/utils/` - create proxy for `openvino/runtime/opset_utils.py`. It contained one internal function => `_get_node_factory`. Move its location to `openvino/utils/node_factory.py`. - Rename `utils.py` to `package_utils.py`. Import its functionality in `openvino/utils/__init__.py` so it's accessible for a user in the same way (except internal `_add_openvino_libs_to_search_path` and `_ClassPropertyDescriptor`) NOTE: on Windows `_add_openvino_libs_to_search_path` has to be called (and initialized) on top of `openvino.__init__`, especially before `_pyopenvino` imports. Part of `openvino.runtime` deprecation ### Tickets: [CVS-129450](https://jira.devtools.intel.com/browse/CVS-129450) --------- Signed-off-by: Alicja Miloszewska Co-authored-by: Anastasia Kuporosova --- src/bindings/python/src/openvino/__init__.py | 2 +- .../openvino/{utils.py => package_utils.py} | 4 ++-- .../src/openvino/runtime/opset_utils.py | 22 ------------------- .../openvino/runtime/opset_utils/__init__.py | 6 +++++ .../src/openvino/runtime/utils/__init__.py | 2 +- .../runtime/utils/broadcasting/__init__.py | 5 +++++ .../runtime/utils/data_helpers/__init__.py | 8 +++---- .../data_helpers/data_dispatcher/__init__.py | 20 +++++++++++++++++ .../utils/data_helpers/wrappers/__init__.py | 8 +++++++ .../runtime/utils/decorators/__init__.py | 13 +++++++++++ .../utils/input_validation/__init__.py | 10 +++++++++ .../runtime/utils/node_factory/__init__.py | 5 +++++ .../runtime/utils/reduction/__init__.py | 5 +++++ .../openvino/runtime/utils/types/__init__.py | 21 ++++++++++++++++++ .../python/src/openvino/utils/__init__.py | 12 ++++++++++ .../{runtime => }/utils/broadcasting.py | 9 +++----- .../openvino/utils/data_helpers/__init__.py | 8 +++++++ .../utils/data_helpers/data_dispatcher.py | 2 +- .../utils/data_helpers/wrappers.py | 0 .../{runtime => }/utils/decorators.py | 2 +- .../{runtime => }/utils/input_validation.py | 0 .../{runtime => }/utils/node_factory.py | 11 ++++++++-- .../openvino/{runtime => }/utils/reduction.py | 0 .../src/openvino/{runtime => }/utils/types.py | 2 +- tools/benchmark_tool/openvino/__init__.py | 2 +- tools/mo/openvino/__init__.py | 2 +- tools/openvino_dev/src/openvino/__init__.py | 2 +- tools/ovc/openvino/__init__.py | 2 +- 28 files changed, 140 insertions(+), 45 deletions(-) rename src/bindings/python/src/openvino/{utils.py => package_utils.py} (97%) delete mode 100644 src/bindings/python/src/openvino/runtime/opset_utils.py create mode 100644 src/bindings/python/src/openvino/runtime/opset_utils/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py create mode 100644 src/bindings/python/src/openvino/runtime/utils/types/__init__.py create mode 100644 src/bindings/python/src/openvino/utils/__init__.py rename src/bindings/python/src/openvino/{runtime => }/utils/broadcasting.py (87%) create mode 100644 src/bindings/python/src/openvino/utils/data_helpers/__init__.py rename src/bindings/python/src/openvino/{runtime => }/utils/data_helpers/data_dispatcher.py (99%) rename src/bindings/python/src/openvino/{runtime => }/utils/data_helpers/wrappers.py (100%) rename src/bindings/python/src/openvino/{runtime => }/utils/decorators.py (98%) rename src/bindings/python/src/openvino/{runtime => }/utils/input_validation.py (100%) rename src/bindings/python/src/openvino/{runtime => }/utils/node_factory.py (94%) rename src/bindings/python/src/openvino/{runtime => }/utils/reduction.py (100%) rename src/bindings/python/src/openvino/{runtime => }/utils/types.py (99%) diff --git a/src/bindings/python/src/openvino/__init__.py b/src/bindings/python/src/openvino/__init__.py index 69c678909b1c9e..e3e878564f3313 100644 --- a/src/bindings/python/src/openvino/__init__.py +++ b/src/bindings/python/src/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.utils import _add_openvino_libs_to_search_path + from openvino.package_utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass diff --git a/src/bindings/python/src/openvino/utils.py b/src/bindings/python/src/openvino/package_utils.py similarity index 97% rename from src/bindings/python/src/openvino/utils.py rename to src/bindings/python/src/openvino/package_utils.py index 9890ae9b3e6460..6aa3f3ed39b556 100644 --- a/src/bindings/python/src/openvino/utils.py +++ b/src/bindings/python/src/openvino/package_utils.py @@ -21,9 +21,9 @@ def _add_openvino_libs_to_search_path() -> None: if os.path.isdir(os.path.join(os.path.dirname(__file__), "libs")): # looking for the libs in the pip installation path. openvino_libs.append(os.path.join(os.path.dirname(__file__), "libs")) - elif os.path.isdir(os.path.join(os.path.dirname(__file__), "..", "..", "..", "Library", "bin")): + elif os.path.isdir(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, "Library", "bin")): # looking for the libs in the conda installation path - openvino_libs.append(os.path.join(os.path.dirname(__file__), "..", "..", "..", "Library", "bin")) + openvino_libs.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, "Library", "bin")) else: # setupvars.bat script set all libs paths to OPENVINO_LIB_PATHS environment variable. openvino_libs_installer = os.getenv("OPENVINO_LIB_PATHS") diff --git a/src/bindings/python/src/openvino/runtime/opset_utils.py b/src/bindings/python/src/openvino/runtime/opset_utils.py deleted file mode 100644 index 475750e71f87c5..00000000000000 --- a/src/bindings/python/src/openvino/runtime/opset_utils.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional -import numpy as np - -from openvino.runtime import Node -from openvino.runtime.utils.decorators import nameable_op -from openvino.runtime.utils.node_factory import NodeFactory -from openvino.runtime.utils.types import ( - as_node, - NodeInput, -) - - -def _get_node_factory(opset_version: Optional[str] = None) -> NodeFactory: - """Return NodeFactory configured to create operators from specified opset version.""" - if opset_version: - return NodeFactory(opset_version) - else: - return NodeFactory() diff --git a/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py b/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py new file mode 100644 index 00000000000000..6fb3e5f6f0c950 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset_utils/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +from openvino.utils.node_factory import _get_node_factory diff --git a/src/bindings/python/src/openvino/runtime/utils/__init__.py b/src/bindings/python/src/openvino/runtime/utils/__init__.py index 73399ccbed2598..8447e93a907277 100644 --- a/src/bindings/python/src/openvino/runtime/utils/__init__.py +++ b/src/bindings/python/src/openvino/runtime/utils/__init__.py @@ -4,4 +4,4 @@ """Generic utilities. Factor related functions out to separate files.""" -from openvino._pyopenvino.util import numpy_to_c, replace_node, replace_output_update_name +from openvino.utils import numpy_to_c, replace_node, replace_output_update_name diff --git a/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py b/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py new file mode 100644 index 00000000000000..3219f239f0ab44 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/broadcasting/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.broadcasting import get_broadcast_axes diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py index a46105efaaeadb..282547dd9df79a 100644 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/__init__.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.utils.data_helpers.data_dispatcher import _data_dispatch -from openvino.runtime.utils.data_helpers.wrappers import tensor_from_file -from openvino.runtime.utils.data_helpers.wrappers import _InferRequestWrapper -from openvino.runtime.utils.data_helpers.wrappers import OVDict +from openvino.utils.data_helpers.data_dispatcher import _data_dispatch +from openvino.utils.data_helpers.wrappers import tensor_from_file +from openvino.utils.data_helpers.wrappers import _InferRequestWrapper +from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py new file mode 100644 index 00000000000000..e0a2d022660dd3 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher/__init__.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +from openvino.utils.data_helpers.data_dispatcher import ContainerTypes +from openvino.utils.data_helpers.data_dispatcher import ScalarTypes +from openvino.utils.data_helpers.data_dispatcher import ValidKeys + +from openvino.utils.data_helpers.data_dispatcher import is_list_simple_type +from openvino.utils.data_helpers.data_dispatcher import get_request_tensor +from openvino.utils.data_helpers.data_dispatcher import value_to_tensor +from openvino.utils.data_helpers.data_dispatcher import to_c_style +from openvino.utils.data_helpers.data_dispatcher import normalize_arrays +from openvino.utils.data_helpers.data_dispatcher import create_shared +from openvino.utils.data_helpers.data_dispatcher import set_request_tensor +from openvino.utils.data_helpers.data_dispatcher import update_tensor +from openvino.utils.data_helpers.data_dispatcher import update_inputs +from openvino.utils.data_helpers.data_dispatcher import create_copied +from openvino.utils.data_helpers.data_dispatcher import _data_dispatch diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py new file mode 100644 index 00000000000000..22214fd24682da --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +from openvino.utils.data_helpers.wrappers import tensor_from_file +from openvino.utils.data_helpers.wrappers import _InferRequestWrapper +from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py b/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py new file mode 100644 index 00000000000000..bb0bac112d2c5f --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/decorators/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.decorators import _get_name +from openvino.utils.decorators import _set_node_friendly_name +from openvino.utils.decorators import nameable_op +from openvino.utils.decorators import unary_op +from openvino.utils.decorators import binary_op +from openvino.utils.decorators import custom_preprocess_function +from openvino.utils.decorators import MultiMethod +from openvino.utils.decorators import registry +from openvino.utils.decorators import overloading diff --git a/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py b/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py new file mode 100644 index 00000000000000..0b49e9ea33c40d --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/input_validation/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.input_validation import assert_list_of_ints +from openvino.utils.input_validation import _check_value +from openvino.utils.input_validation import check_valid_attribute +from openvino.utils.input_validation import check_valid_attributes +from openvino.utils.input_validation import is_positive_value +from openvino.utils.input_validation import is_non_negative_value diff --git a/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py b/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py new file mode 100644 index 00000000000000..945ea8deb7863c --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/node_factory/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.node_factory import NodeFactory diff --git a/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py b/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py new file mode 100644 index 00000000000000..a2fbff9e793dca --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/reduction/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.reduction import get_reduction_axes diff --git a/src/bindings/python/src/openvino/runtime/utils/types/__init__.py b/src/bindings/python/src/openvino/runtime/utils/types/__init__.py new file mode 100644 index 00000000000000..4f88d609988e8d --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/utils/types/__init__.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.types import TensorShape +from openvino.utils.types import NumericData +from openvino.utils.types import NumericType +from openvino.utils.types import ScalarData +from openvino.utils.types import NodeInput + +from openvino.utils.types import openvino_to_numpy_types_map +from openvino.utils.types import openvino_to_numpy_types_str_map +from openvino.utils.types import get_element_type +from openvino.utils.types import get_element_type_str +from openvino.utils.types import get_dtype +from openvino.utils.types import get_numpy_ctype +from openvino.utils.types import get_ndarray +from openvino.utils.types import get_shape +from openvino.utils.types import make_constant_node +from openvino.utils.types import as_node +from openvino.utils.types import as_nodes diff --git a/src/bindings/python/src/openvino/utils/__init__.py b/src/bindings/python/src/openvino/utils/__init__.py new file mode 100644 index 00000000000000..2ccc79d20cce84 --- /dev/null +++ b/src/bindings/python/src/openvino/utils/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Generic utilities. Factor related functions out to separate files.""" + +from openvino._pyopenvino.util import numpy_to_c, replace_node, replace_output_update_name + +from openvino.package_utils import get_cmake_path +from openvino.package_utils import deprecated +from openvino.package_utils import classproperty +from openvino.package_utils import deprecatedclassproperty diff --git a/src/bindings/python/src/openvino/runtime/utils/broadcasting.py b/src/bindings/python/src/openvino/utils/broadcasting.py similarity index 87% rename from src/bindings/python/src/openvino/runtime/utils/broadcasting.py rename to src/bindings/python/src/openvino/utils/broadcasting.py index 9fd13da7728e29..a950aea9bba820 100644 --- a/src/bindings/python/src/openvino/runtime/utils/broadcasting.py +++ b/src/bindings/python/src/openvino/utils/broadcasting.py @@ -3,14 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import List, Optional +from typing import Optional -from openvino.runtime import AxisSet, Node -from openvino.runtime.utils.types import ( - NodeInput, +from openvino.runtime import AxisSet +from openvino.utils.types import ( TensorShape, - get_dtype, - make_constant_node, ) log = logging.getLogger(__name__) diff --git a/src/bindings/python/src/openvino/utils/data_helpers/__init__.py b/src/bindings/python/src/openvino/utils/data_helpers/__init__.py new file mode 100644 index 00000000000000..282547dd9df79a --- /dev/null +++ b/src/bindings/python/src/openvino/utils/data_helpers/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.utils.data_helpers.data_dispatcher import _data_dispatch +from openvino.utils.data_helpers.wrappers import tensor_from_file +from openvino.utils.data_helpers.wrappers import _InferRequestWrapper +from openvino.utils.data_helpers.wrappers import OVDict diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py b/src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py rename to src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py index bce10c9c3774ef..d4db7cb07b629c 100644 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py +++ b/src/bindings/python/src/openvino/utils/data_helpers/data_dispatcher.py @@ -8,7 +8,7 @@ import numpy as np from openvino._pyopenvino import ConstOutput, Tensor, Type, RemoteTensor -from openvino.runtime.utils.data_helpers.wrappers import _InferRequestWrapper, OVDict +from openvino.utils.data_helpers.wrappers import _InferRequestWrapper, OVDict ContainerTypes = Union[dict, list, tuple, OVDict] ScalarTypes = Union[np.number, int, float] diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers.py b/src/bindings/python/src/openvino/utils/data_helpers/wrappers.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/utils/data_helpers/wrappers.py rename to src/bindings/python/src/openvino/utils/data_helpers/wrappers.py diff --git a/src/bindings/python/src/openvino/runtime/utils/decorators.py b/src/bindings/python/src/openvino/utils/decorators.py similarity index 98% rename from src/bindings/python/src/openvino/runtime/utils/decorators.py rename to src/bindings/python/src/openvino/utils/decorators.py index 98da1ba4389ef7..604c0745860bf6 100644 --- a/src/bindings/python/src/openvino/runtime/utils/decorators.py +++ b/src/bindings/python/src/openvino/utils/decorators.py @@ -7,7 +7,7 @@ from typing import Any, Callable, Dict, Optional, Union, get_origin, get_args from openvino.runtime import Node, Output -from openvino.runtime.utils.types import NodeInput, as_node, as_nodes +from openvino.utils.types import NodeInput, as_node, as_nodes def _get_name(**kwargs: Any) -> Node: diff --git a/src/bindings/python/src/openvino/runtime/utils/input_validation.py b/src/bindings/python/src/openvino/utils/input_validation.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/utils/input_validation.py rename to src/bindings/python/src/openvino/utils/input_validation.py diff --git a/src/bindings/python/src/openvino/runtime/utils/node_factory.py b/src/bindings/python/src/openvino/utils/node_factory.py similarity index 94% rename from src/bindings/python/src/openvino/runtime/utils/node_factory.py rename to src/bindings/python/src/openvino/utils/node_factory.py index 25daf739223dba..9841daaea4e818 100644 --- a/src/bindings/python/src/openvino/runtime/utils/node_factory.py +++ b/src/bindings/python/src/openvino/utils/node_factory.py @@ -2,9 +2,8 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import logging as log -from functools import partial, singledispatchmethod +from functools import singledispatchmethod from typing import Any, Dict, List, Optional, Union from pathlib import Path @@ -125,3 +124,11 @@ def _arguments_as_outputs(arguments: List[Union[Node, Output]]) -> List[Output]: else: outputs.extend(argument.outputs()) return outputs + + +def _get_node_factory(opset_version: Optional[str] = None) -> NodeFactory: + """Return NodeFactory configured to create operators from specified opset version.""" + if opset_version: + return NodeFactory(opset_version) + else: + return NodeFactory() diff --git a/src/bindings/python/src/openvino/runtime/utils/reduction.py b/src/bindings/python/src/openvino/utils/reduction.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/utils/reduction.py rename to src/bindings/python/src/openvino/utils/reduction.py diff --git a/src/bindings/python/src/openvino/runtime/utils/types.py b/src/bindings/python/src/openvino/utils/types.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/utils/types.py rename to src/bindings/python/src/openvino/utils/types.py index 52f1faf8e1e839..854cc0c7f6411d 100644 --- a/src/bindings/python/src/openvino/runtime/utils/types.py +++ b/src/bindings/python/src/openvino/utils/types.py @@ -11,7 +11,7 @@ from openvino.runtime.exceptions import OVTypeError from openvino.runtime import Node, Shape, Output, Type -from openvino.runtime.op import Constant +from openvino.op import Constant log = logging.getLogger(__name__) diff --git a/tools/benchmark_tool/openvino/__init__.py b/tools/benchmark_tool/openvino/__init__.py index 69c678909b1c9e..e3e878564f3313 100644 --- a/tools/benchmark_tool/openvino/__init__.py +++ b/tools/benchmark_tool/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.utils import _add_openvino_libs_to_search_path + from openvino.package_utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass diff --git a/tools/mo/openvino/__init__.py b/tools/mo/openvino/__init__.py index b015570964c520..46e35babdc9fad 100644 --- a/tools/mo/openvino/__init__.py +++ b/tools/mo/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.utils import _add_openvino_libs_to_search_path + from openvino.package_utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass diff --git a/tools/openvino_dev/src/openvino/__init__.py b/tools/openvino_dev/src/openvino/__init__.py index b015570964c520..46e35babdc9fad 100644 --- a/tools/openvino_dev/src/openvino/__init__.py +++ b/tools/openvino_dev/src/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.utils import _add_openvino_libs_to_search_path + from openvino.package_utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass diff --git a/tools/ovc/openvino/__init__.py b/tools/ovc/openvino/__init__.py index 69c678909b1c9e..e3e878564f3313 100644 --- a/tools/ovc/openvino/__init__.py +++ b/tools/ovc/openvino/__init__.py @@ -7,7 +7,7 @@ # Required for Windows OS platforms # Note: always top-level try: - from openvino.utils import _add_openvino_libs_to_search_path + from openvino.package_utils import _add_openvino_libs_to_search_path _add_openvino_libs_to_search_path() except ImportError: pass From 3af0f7ce7b062866460e290e7fe5396efef504d5 Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Mon, 16 Dec 2024 16:05:06 +0100 Subject: [PATCH 10/17] [DOCS] Updating release notes (#28083) Porting: #28078 Signed-off-by: sgolebiewski-intel --- docs/articles_en/about-openvino/release-notes-openvino.rst | 2 +- docs/articles_en/get-started/install-openvino.rst | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/articles_en/about-openvino/release-notes-openvino.rst b/docs/articles_en/about-openvino/release-notes-openvino.rst index a168d1c44a10c3..c6efb4f25dfb05 100644 --- a/docs/articles_en/about-openvino/release-notes-openvino.rst +++ b/docs/articles_en/about-openvino/release-notes-openvino.rst @@ -26,7 +26,7 @@ OpenVINO Release Notes What's new +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -* OpenVINO 2024.6 LTS release includes updates for enhanced stability and improved LLM performance. +* OpenVINO 2024.6 release includes updates for enhanced stability and improved LLM performance. * Introduced support for Intel® Arc™ B-Series Graphics (formerly known as Battlemage) * Memory optimizations implemented to improve the inference time memory and LLM performance on NPUs. * Improved LLM performance with GenAI API optimizations and bug fixes. diff --git a/docs/articles_en/get-started/install-openvino.rst b/docs/articles_en/get-started/install-openvino.rst index 68656e554145a4..ef9eca39aae388 100644 --- a/docs/articles_en/get-started/install-openvino.rst +++ b/docs/articles_en/get-started/install-openvino.rst @@ -23,11 +23,10 @@ Install OpenVINO™ 2024.6 -OpenVINO 2024.6, described here, is a Long-Term-Support version! +OpenVINO 2024.6, described here, is not a Long-Term-Support version! All currently supported versions are: -* 2025.0 (in development) -* 2024.6 (LTS) +* 2024.6 (development) * 2023.3 (LTS) From 05a6f4fa232c79fa065979a7540a705d666fc8cb Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Mon, 16 Dec 2024 16:11:54 +0100 Subject: [PATCH 11/17] [CPU] Throw when release_memory is called during inference (#27520) ### Details: This PR changes the behavior of the `CompiledModel::release_memory()` implementation in the CPU plugin for the situation when the method is being called concurrently with the other graph state modifying methods (e.g. graph initialization, inference, properties request). This is necessary to ensure thread safety and provide a clear defined behavior when the method is called concurrently. Also, the PR contains some refactoring of the Infer request implementation, aimed at decoupling the InferRequest implementation from the compiled model internals and providing a safer interface that ensures thread safe access to the CPU graph structures. --- src/plugins/intel_cpu/src/compiled_model.cpp | 9 +- src/plugins/intel_cpu/src/compiled_model.h | 76 ++++++- src/plugins/intel_cpu/src/graph.cpp | 19 +- src/plugins/intel_cpu/src/graph.h | 38 +++- src/plugins/intel_cpu/src/infer_request.cpp | 161 ++++++-------- src/plugins/intel_cpu/src/infer_request.h | 15 +- src/plugins/intel_cpu/src/nodes/composite.cpp | 4 +- src/plugins/intel_cpu/src/nodes/if.cpp | 24 +-- src/plugins/intel_cpu/src/nodes/lora.cpp | 4 +- src/plugins/intel_cpu/src/nodes/memory.hpp | 2 +- .../intel_cpu/src/nodes/tensoriterator.cpp | 12 +- .../concurent_release_memory.cpp | 198 ++++++++++++++++++ 12 files changed, 406 insertions(+), 156 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/concurent_release_memory.cpp diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index f0f9668ca3ae50..f81c7dbbced99d 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -184,7 +184,6 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { } std::shared_ptr CompiledModel::create_sync_infer_request() const { - m_numRequests++; return std::make_shared(std::static_pointer_cast(shared_from_this())); } @@ -344,8 +343,12 @@ void CompiledModel::export_model(std::ostream& modelStream) const { void CompiledModel::release_memory() { for (auto&& graph : m_graphs) { - GraphGuard::Lock graph_lock{graph}; - auto ctx = graph_lock._graph.getGraphContext(); + // try to lock mutex, since it may be already locked (e.g by an infer request) + std::unique_lock lock(graph._mutex, std::try_to_lock); + OPENVINO_ASSERT(lock.owns_lock(), + "Attempt to call release_memory() on a compiled model in a busy state. Please ensure that all " + "infer requests are completed before releasing memory."); + auto ctx = graph.getGraphContext(); ctx->getNetworkMemoryControl()->releaseMemory(); } } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index dc3735b4f3b63e..f7d2903b0526cf 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -20,6 +20,15 @@ namespace ov { namespace intel_cpu { class CompiledModel : public ov::ICompiledModel { +public: + struct GraphGuard : public Graph { + std::mutex _mutex; + struct Lock : public std::unique_lock { + explicit Lock(GraphGuard& graph) : std::unique_lock(graph._mutex), _graph(graph) {} + GraphGuard& _graph; + }; + }; + public: typedef std::shared_ptr Ptr; @@ -51,9 +60,13 @@ class CompiledModel : public ov::ICompiledModel { void release_memory() override; + std::string name() const { + return m_name; + } + private: std::shared_ptr create_sync_infer_request() const override; - friend class SyncInferRequest; + friend class CompiledModelHolder; const std::shared_ptr m_model; const std::shared_ptr m_plugin; @@ -66,13 +79,6 @@ class CompiledModel : public ov::ICompiledModel { Config m_cfg; mutable std::atomic_int m_numRequests = {0}; std::string m_name; - struct GraphGuard : public Graph { - std::mutex _mutex; - struct Lock : public std::unique_lock { - explicit Lock(GraphGuard& graph) : std::unique_lock(graph._mutex), _graph(graph) {} - GraphGuard& _graph; - }; - }; const bool m_loaded_from_cache; // WARNING: Do not use m_graphs directly. @@ -94,5 +100,59 @@ class CompiledModel : public ov::ICompiledModel { bool m_has_sub_compiled_models = false; }; +// This class provides safe access to the internal CompiledModel structures and helps to decouple SyncInferRequest and +// the CompiledModel internal structures +class CompiledModelHolder { +public: + CompiledModelHolder(std::shared_ptr compiled_model) + : m_compiled_model(std::move(compiled_model)) { + OPENVINO_ASSERT(!m_compiled_model->m_graphs.empty(), + "No graph was found in the compiled model: ", + m_compiled_model->name()); + m_graph = &(m_compiled_model->get_graph()._graph); + m_id = (m_compiled_model->m_numRequests)++; + } + + ~CompiledModelHolder() { + if (m_compiled_model) { + --(m_compiled_model->m_numRequests); + } + } + + CompiledModelHolder(const CompiledModelHolder&) = delete; + CompiledModelHolder& operator=(const CompiledModelHolder&) = delete; + + CompiledModelHolder(CompiledModelHolder&&) = default; + CompiledModelHolder& operator=(CompiledModelHolder&&) = default; + + const Graph& graph() const { + return *m_graph; + } + + CompiledModel::GraphGuard::Lock lock() { + auto lock = m_compiled_model->get_graph(); + m_graph = &(lock._graph); + OPENVINO_ASSERT(m_graph, "Graph ptr null check failed"); + return lock; + } + + std::string name() const { + return m_compiled_model->name(); + } + + std::shared_ptr compiled_model() const { + return m_compiled_model; + } + + int id() const { + return m_id; + } + +private: + std::shared_ptr m_compiled_model; + const Graph* m_graph; + int m_id; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 7fb5f512227cf9..fd6721ce4c83ad 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -1940,8 +1940,23 @@ std::shared_ptr Graph::dump() const { return dump_graph_as_ie_ngraph_net(*this); } -const std::unordered_map& Graph::getInternalStateNodes() const { - return m_context->getMemoryStatesRegister()->getMemoryStates(); +std::vector Graph::memoryStates() const { + std::vector resultVector; + + for (auto&& item : m_context->getMemoryStatesRegister()->getMemoryStates()) { + resultVector.emplace_back(item.second->makeState()); + } + return resultVector; +} + +void Graph::assignStates(const std::vector& states) { + auto&& inputStateNodes = m_context->getMemoryStatesRegister()->getMemoryStates(); + for (const auto& state : states) { + auto itr = inputStateNodes.find(state->get_name()); + if (itr != inputStateNodes.end()) { + itr->second->assignState(state); + } + } } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index bdf3205d2baaaf..5d5d5b335a36f2 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -14,6 +14,7 @@ #include "edge.h" #include "graph_context.h" #include "memory_control.hpp" +#include "memory_state.h" #include "node.h" #include "nodes/input.h" #include "openvino/core/node_vector.hpp" @@ -87,28 +88,42 @@ class Graph { return _name; } - std::map& GetInputNodesMap() { - return inputNodesMap; + NodePtr getInputNodeByIndex(std::size_t index) { + auto input = inputNodesMap.find(index); + if (input == inputNodesMap.end()) + return nullptr; + return input->second; } - std::map& GetOutputNodesMap() { - return outputNodesMap; + NodePtr getOutputNodeByIndex(std::size_t index) { + auto output = outputNodesMap.find(index); + if (output == outputNodesMap.end()) + return nullptr; + return output->second; } - NodePtr getInputNodeByIndex(const std::size_t& index) { + NodeConstPtr getInputNodeByIndex(std::size_t index) const { auto input = inputNodesMap.find(index); if (input == inputNodesMap.end()) - OPENVINO_THROW("CPU execution graph doesn't contain input node with index: ", index); + return nullptr; return input->second; } - NodePtr getOutputNodeByIndex(const std::size_t& index) { + NodeConstPtr getOutputNodeByIndex(std::size_t index) const { auto output = outputNodesMap.find(index); if (output == outputNodesMap.end()) - OPENVINO_THROW("CPU execution graph doesn't contain output node with index: ", index); + return nullptr; return output->second; } + size_t inputsNumber() const { + return inputNodesMap.size(); + } + + size_t outputsNumber() const { + return outputNodesMap.size(); + } + dnnl::engine getEngine() const { return m_context->getEngine(); } @@ -117,6 +132,9 @@ class Graph { return m_context; } + std::vector memoryStates() const; + void assignStates(const std::vector& state); + void GetPerfData(std::vector& perfMap) const; void CreateEdge(const NodePtr& parent, const NodePtr& child, int parentPort = 0, int childPort = 0); @@ -202,8 +220,6 @@ class Graph { return graphHasDynamicInput; } - const std::unordered_map& getInternalStateNodes() const; - /** * Init graph using \p model, \p context, \p inputConfigs and \p outputConfigs */ @@ -218,7 +234,7 @@ class Graph { void Activate(const std::vector& externalInputMemory = {}, const std::vector& externalOutputMemory = {}); - const std::unordered_map& getOutputNodesMemBlocksMap() const { + const std::unordered_map& getOutputNodesMemBlocksMap() { return outputNodesMemBlocksMap; } diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index 3cfc34589623d2..44b9904bde202a 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -5,11 +5,9 @@ #include "infer_request.h" #include "async_infer_request.h" -#include "compiled_model.h" #include "dnnl_extension_utils.h" #include "itt.h" #include "memory_desc/cpu_memory_desc_utils.h" -#include "memory_state.h" #include "nodes/common/cpu_convert.h" #include "nodes/memory_state_base.h" #include "openvino/core/shape.hpp" @@ -24,9 +22,9 @@ using OvString = ov::element_type_traits::value_type; namespace ov { namespace intel_cpu { -SyncInferRequest::SyncInferRequest(std::shared_ptr compiled_model) - : ov::ISyncInferRequest(compiled_model), - m_compiled_model(compiled_model) { +SyncInferRequest::SyncInferRequest(CompiledModelHolder compiled_model) + : ov::ISyncInferRequest(compiled_model.compiled_model()), + m_compiled_model(std::move(compiled_model)) { const auto& inputs = get_inputs(); for (std::size_t input_index = 0; input_index < inputs.size(); input_index++) { m_input_ports_map[input_index] = inputs[input_index]; @@ -40,13 +38,8 @@ SyncInferRequest::SyncInferRequest(std::shared_ptr compiled } void SyncInferRequest::create_infer_request() { - auto id = (m_compiled_model->m_numRequests)++; - m_profiling_task = openvino::itt::handle("INTEL_CPU_INFER_" + m_compiled_model->m_name + "_" + std::to_string(id)); - - if (m_compiled_model->m_graphs.size() == 0) { - OPENVINO_THROW("No graph was found"); - } - m_graph = &(m_compiled_model->get_graph()._graph); + m_profiling_task = openvino::itt::handle("INTEL_CPU_INFER_" + m_compiled_model.name() + "_" + + std::to_string(m_compiled_model.id())); // Alocate memory for each tensor if static shape for (const auto& it : m_input_ports_map) { @@ -57,35 +50,16 @@ void SyncInferRequest::create_infer_request() { } // create states according to the list of the MemoryStateNodes - for (auto&& node : m_graph->getInternalStateNodes()) { - m_memory_states.emplace_back(node.second->makeState()); - } -} - -SyncInferRequest::~SyncInferRequest() { - --(m_compiled_model->m_numRequests); -} - -// state -> storage -void SyncInferRequest::assign_states() { - auto&& graph_internal_state_nodes = m_graph->getInternalStateNodes(); - for (const auto& state : m_memory_states) { - auto itr = graph_internal_state_nodes.find(state->get_name()); - if (itr != graph_internal_state_nodes.end()) { - itr->second->assignState(state); - } - } + m_memory_states = m_compiled_model.graph().memoryStates(); } -void SyncInferRequest::redefine_memory_for_input_nodes() { - const auto cpuInputNodes = m_graph->GetInputNodesMap(); +void SyncInferRequest::redefine_memory_for_input_nodes(Graph& graph) { for (const auto& input_port : m_input_ports_map) { - const auto inputNode = cpuInputNodes.find(input_port.first); - if (inputNode == cpuInputNodes.end()) - OPENVINO_THROW("CPU execution graph doesn't contain input node with index: ", input_port.first); - if (inputNode->second->isDynamicNode()) { + auto inputNode = graph.getInputNodeByIndex(input_port.first); + OPENVINO_ASSERT(inputNode, "CPU execution graph doesn't contain output node with index: ", input_port.first); + if (inputNode->isDynamicNode()) { auto tensor = get_tensor(input_port.second); - inputNode->second->redefineOutputMemory({tensor->get_shape()}); + inputNode->redefineOutputMemory({tensor->get_shape()}); } } } @@ -103,8 +77,8 @@ void SyncInferRequest::update_external_tensor_ptrs() { void SyncInferRequest::infer() { using namespace openvino::itt; OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, m_profiling_task); - auto graphLock = m_compiled_model->get_graph(); - m_graph = &(graphLock._graph); + auto graphLock = m_compiled_model.lock(); + auto&& graph = graphLock._graph; auto message = ov::threading::message_manager(); throw_if_canceled(); @@ -120,40 +94,41 @@ void SyncInferRequest::infer() { update_external_tensor_ptrs(); } - if (m_graph->hasDynamicInput()) { - redefine_memory_for_input_nodes(); + if (graph.hasDynamicInput()) { + redefine_memory_for_input_nodes(graph); } - change_default_ptr(); + change_default_ptr(graph); throw_if_canceled(); // state -> node if (!m_memory_states.empty()) { - assign_states(); + graph.assignStates(m_memory_states); } - push_input_data(); + push_input_data(graph); - m_graph->Infer(this); + graph.Infer(this); throw_if_canceled(); // update output control blocks, if any, in order to refresh internal buffers - if (m_graph->IsDynamic()) { + if (graph.IsDynamic()) { for (auto&& item : m_outputControlBlocks) { item.second.update(); } } - m_graph->PullOutputData(m_outputs); + graph.PullOutputData(m_outputs); } std::vector SyncInferRequest::get_profiling_info() const { - if (!m_graph || !m_graph->IsReady()) + auto&& graph = m_compiled_model.graph(); + if (!graph.IsReady()) OPENVINO_THROW("Graph is not ready!"); std::vector perfMap; - m_graph->GetPerfData(perfMap); + graph.GetPerfData(perfMap); return perfMap; } @@ -172,13 +147,10 @@ static inline void change_edge_ptr(const EdgePtr& edge, ov::SoPtr& } } -void SyncInferRequest::change_default_ptr() { - const auto& inputNodesMap = m_graph->GetInputNodesMap(); - const auto& outputNodesMap = m_graph->GetOutputNodesMap(); - +void SyncInferRequest::change_default_ptr(Graph& graph) { std::unordered_set inputPtrs; std::function& tensor)> changeInpPtr; - if (m_graph->IsDynamic()) { + if (graph.IsDynamic()) { changeInpPtr = [&inputPtrs](const EdgePtr& edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); inputPtrs.insert(tensor->data()); @@ -190,9 +162,8 @@ void SyncInferRequest::change_default_ptr() { } for (auto& it : m_input_external_ptr) { - auto input = inputNodesMap.find(it.first); - OPENVINO_ASSERT(inputNodesMap.end() != input, "Cannot find input tensor with index: ", it.first); - NodePtr inputNodePtr = input->second; + auto inputNodePtr = graph.getInputNodeByIndex(it.first); + OPENVINO_ASSERT(inputNodePtr, "Cannot find input tensor with index: ", it.first); if (inputNodePtr->getDstDataAtPort(0) == static_cast(it.second->data())) continue; auto& childEdges = inputNodePtr->getChildEdges(); @@ -238,9 +209,9 @@ void SyncInferRequest::change_default_ptr() { } for (auto& it : m_output_external_ptr) { - auto output = outputNodesMap.find(it.first); - OPENVINO_ASSERT(outputNodesMap.end() != output, "Cannot find output tensor with index: ", it.first); - auto parentEdge = output->second->getParentEdgeAt(0); + auto output = graph.getOutputNodeByIndex(it.first); + OPENVINO_ASSERT(output, "Cannot find output tensor with index: ", it.first); + auto parentEdge = output->getParentEdgeAt(0); void* const outputRawPtr = parentEdge->getMemory().getData(); if (outputRawPtr == static_cast(it.second->data())) continue; @@ -278,24 +249,21 @@ void SyncInferRequest::change_default_ptr() { change_edge_ptr(parentEdge, it.second); } - if (m_graph->IsDynamic()) { - const auto& outMemBlocksMap = m_graph->getOutputNodesMemBlocksMap(); + if (graph.IsDynamic()) { + const auto& outMemBlocksMap = graph.getOutputNodesMemBlocksMap(); for (auto&& item : outMemBlocksMap) { - const auto& name = item.first; + const auto index = item.first; // share intel_cpu::Tensor to Graph by injecting to corresponding ProxyMemoryBlock instance. auto outputMemBlock = item.second; - OPENVINO_ASSERT(outputMemBlock, "proxy mem block for output ", name, " is empty."); + OPENVINO_ASSERT(outputMemBlock, "proxy mem block for output ", index, " is empty."); - auto controlBlockItr = m_outputControlBlocks.find(name); + auto controlBlockItr = m_outputControlBlocks.find(index); if (controlBlockItr != m_outputControlBlocks.end()) { - auto output = outputNodesMap.find(name); - OPENVINO_ASSERT(outputNodesMap.end() != output, - "Node with name: ", - name, - " is absent in the outputNodesMap"); - auto parentEdge = output->second->getParentEdgeAt(0); + auto output = graph.getOutputNodeByIndex(index); + OPENVINO_ASSERT(output, "Output with index: ", index, " is absent in the outputNodesMap"); + auto parentEdge = output->getParentEdgeAt(0); // avoid cyclic memory use auto&& controlBlock = controlBlockItr->second; @@ -311,10 +279,10 @@ void SyncInferRequest::change_default_ptr() { ", actual ", controlBlock.currentMemBlock(), " graph ", - m_graph, - " inferrequest ", + &graph, + " infer request ", this); - DEBUG_LOG(name, ", tensor ", controlBlock.tensor()); + DEBUG_LOG(index, ", tensor ", controlBlock.tensor()); } else { outputMemBlock->reset(); // switch to the internal memory since memory sharing is no longer possible } @@ -413,7 +381,12 @@ void SyncInferRequest::set_tensor(const ov::Output& in_port, con " are different."); } - MemoryDescPtr actualDesc = m_graph->getInputNodeByIndex(input_index)->getBaseMemDescAtOutputPort(0); + auto&& graph = m_compiled_model.graph(); + + auto inputNode = graph.getInputNodeByIndex(input_index); + OPENVINO_ASSERT(inputNode, "CPU execution graph doesn't contain input node with index: ", input_index); + + MemoryDescPtr actualDesc = inputNode->getBaseMemDescAtOutputPort(0); if (!actualDesc->isDefined()) { // we must define desc for dynamic case // otherwise we got incorrect check on shape compatibility inside isCompatible @@ -460,7 +433,11 @@ void SyncInferRequest::set_tensor(const ov::Output& in_port, con " are different."); } - const auto& desc = m_graph->getOutputNodeByIndex(output_index)->getParentEdgeAt(0)->getMemory().getDesc(); + auto&& graph = m_compiled_model.graph(); + + auto outputNode = graph.getOutputNodeByIndex(output_index); + OPENVINO_ASSERT(outputNode, "CPU execution graph doesn't contain output node with index: ", output_index); + const auto& desc = outputNode->getParentEdgeAt(0)->getMemory().getDesc(); if (!isDynamic && mem_desc_ptr->isCompatible(desc)) { m_output_external_ptr[output_index] = tensor; } else if (m_output_external_ptr.find(output_index) != m_output_external_ptr.end()) { @@ -484,15 +461,15 @@ void SyncInferRequest::set_tensors_impl(const ov::Output port, void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyncInferRequest::FoundPort::Type& type) { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "init_tensor"); - if (!m_graph || !m_graph->IsReady()) - OPENVINO_THROW("Graph is not ready!"); + auto&& graph = m_compiled_model.graph(); + OPENVINO_ASSERT(graph.IsReady(), "Graph is not ready!"); ov::SoPtr tensor; if (type == ov::ISyncInferRequest::FoundPort::Type::INPUT) { - OPENVINO_ASSERT(m_graph->GetInputNodesMap().find(port_index) != m_graph->GetInputNodesMap().end(), + OPENVINO_ASSERT(graph.getInputNodeByIndex(port_index), "Tensor with index: ", port_index, - " exists in CPU plugin graph, but absents in model inputs"); + " absent in the plugin's graph inputs"); const auto& port = m_input_ports_map[port_index]; tensor = ov::ISyncInferRequest::get_tensor(port); @@ -513,8 +490,9 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn if (!isDynamic) { auto mem_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor); - if (mem_desc_ptr->isCompatible( - m_graph->getInputNodeByIndex(port_index)->getChildEdgeAt(0)->getMemory().getDesc())) { + auto inputNode = graph.getInputNodeByIndex(port_index); + OPENVINO_ASSERT(inputNode, "CPU execution graph doesn't contain input node with index: ", port_index); + if (mem_desc_ptr->isCompatible(inputNode->getChildEdgeAt(0)->getMemory().getDesc())) { m_input_external_ptr[port_index] = tensor; } } @@ -522,16 +500,12 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn } if (type == ov::ISyncInferRequest::FoundPort::Type::OUTPUT) { - const auto& outMap = m_graph->GetOutputNodesMap(); - auto output = outMap.find(port_index); - OPENVINO_ASSERT(output != outMap.end(), - "Tensor with index: ", - port_index, - " exists in CPU plugin graph, but absents in model outputs"); + auto output = graph.getOutputNodeByIndex(port_index); + OPENVINO_ASSERT(output, "Tensor with index: ", port_index, " absent in the plugin's graph outputs"); if (m_outputs.find(port_index) == m_outputs.end()) { const auto& port = m_output_ports_map[port_index]; const auto& port_shape = port.get_partial_shape(); - const auto& graph_shape = output->second->getInputShapeAtPort(0); + const auto& graph_shape = output->getInputShapeAtPort(0); // WA, due to the transformations and constant folding, shape inference of the resulting model may // have static shapes, while they are dynamic in the initial representation @@ -560,8 +534,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn tensor = std::make_shared(memory); } else { - const auto graph_prec = - output->second->getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); + const auto graph_prec = output->getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); OutputControlBlock control_block{model_prec, Shape{shape}}; DEBUG_LOG(port_index, @@ -585,7 +558,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn m_outputs[port_index] = tensor; if (!port_shape.is_dynamic() && !m_output_external_ptr.count(port_index)) { auto desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor); - if (desc->isCompatible(output->second->getParentEdgeAt(0)->getMemory().getDesc())) { + if (desc->isCompatible(output->getParentEdgeAt(0)->getMemory().getDesc())) { m_output_external_ptr[port_index] = tensor; } } @@ -603,10 +576,10 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn return; } -void SyncInferRequest::push_input_data() { +void SyncInferRequest::push_input_data(Graph& graph) { for (auto& input : m_input_ports_map) { auto tensor = get_tensor(input.second); - m_graph->PushInputData(input.first, tensor); + graph.PushInputData(input.first, tensor); } } diff --git a/src/plugins/intel_cpu/src/infer_request.h b/src/plugins/intel_cpu/src/infer_request.h index b66387ecc4d4d5..daae553dff2ea4 100644 --- a/src/plugins/intel_cpu/src/infer_request.h +++ b/src/plugins/intel_cpu/src/infer_request.h @@ -4,6 +4,7 @@ #pragma once +#include "compiled_model.h" #include "cpu_tensor.h" #include "graph.h" #include "memory_state.h" @@ -13,13 +14,11 @@ namespace ov { namespace intel_cpu { -class CompiledModel; class AsyncInferRequest; class SyncInferRequest : public ov::ISyncInferRequest { public: - SyncInferRequest(std::shared_ptr compiled_model); - virtual ~SyncInferRequest(); + SyncInferRequest(CompiledModelHolder compiled_model); void infer() override; @@ -97,11 +96,10 @@ class SyncInferRequest : public ov::ISyncInferRequest { void create_infer_request(); void init_tensor(const std::size_t& port_index, const ov::ISyncInferRequest::FoundPort::Type& type); - void push_input_data(); - void redefine_memory_for_input_nodes(); - void assign_states(); + void push_input_data(Graph& graph); + void redefine_memory_for_input_nodes(Graph& graph); void update_external_tensor_ptrs(); - void change_default_ptr(); + void change_default_ptr(Graph& graph); const ov::Output& get_internal_port(const ov::Output& port) const; @@ -110,14 +108,13 @@ class SyncInferRequest : public ov::ISyncInferRequest { private: std::unordered_map m_outputControlBlocks; - Graph* m_graph = nullptr; std::unordered_map> m_input_external_ptr; std::unordered_map> m_output_external_ptr; - std::shared_ptr m_compiled_model; openvino::itt::handle_t m_profiling_task; std::vector m_memory_states; AsyncInferRequest* m_asyncRequest = nullptr; + CompiledModelHolder m_compiled_model; std::unordered_map> m_input_ports_map; std::unordered_map> m_output_ports_map; diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index 616d3df6950e9a..0d8b33d90fbd9c 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -75,7 +75,7 @@ void Composite::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void Composite::createPrimitive() { - OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.inputsNumber(), "Number of node inputs must be equal the number of inner graph's inputs"); std::vector inputMemory; @@ -83,7 +83,7 @@ void Composite::createPrimitive() { inputMemory.emplace_back(getSrcMemoryAtPort(i)); } - OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.outputsNumber(), "Number of node outputs must be equal the number of inner graph's outputs"); std::vector outputMemory; diff --git a/src/plugins/intel_cpu/src/nodes/if.cpp b/src/plugins/intel_cpu/src/nodes/if.cpp index 8de1cf14920d74..88e2c84970d874 100644 --- a/src/plugins/intel_cpu/src/nodes/if.cpp +++ b/src/plugins/intel_cpu/src/nodes/if.cpp @@ -86,11 +86,9 @@ void If::getSupportedDescriptors() { subGraphThen.CreateGraph(thenBody, context); subGraphElse.CreateGraph(elseBody, context); - const auto& inMapThen = subGraphThen.GetInputNodesMap(); for (const auto& param : ifOp->get_then_body()->get_parameters()) { - auto inNode = inMapThen.find(ifOp->get_then_body()->get_parameter_index(param)); - if (inNode != inMapThen.end()) { - inputMemThen.push_back(getToMemories(inNode->second.get(), 0)); + if (auto inNode = subGraphThen.getInputNodeByIndex(ifOp->get_then_body()->get_parameter_index(param))) { + inputMemThen.push_back(getToMemories(inNode.get(), 0)); } else { OPENVINO_THROW("Then body of node If with name ", getName(), @@ -99,11 +97,9 @@ void If::getSupportedDescriptors() { } } - const auto& inMapElse = subGraphElse.GetInputNodesMap(); for (const auto& param : ifOp->get_else_body()->get_parameters()) { - auto inNode = inMapElse.find(ifOp->get_else_body()->get_parameter_index(param)); - if (inNode != inMapElse.end()) { - inputMemElse.push_back(getToMemories(inNode->second.get(), 0)); + if (auto inNode = subGraphElse.getInputNodeByIndex(ifOp->get_else_body()->get_parameter_index(param))) { + inputMemElse.push_back(getToMemories(inNode.get(), 0)); } else { OPENVINO_THROW("Else body of node If with name ", getName(), @@ -112,11 +108,9 @@ void If::getSupportedDescriptors() { } } - const auto& outMapThen = subGraphThen.GetOutputNodesMap(); for (const auto& out : ifOp->get_then_body()->get_results()) { - auto outNode = outMapThen.find(ifOp->get_then_body()->get_result_index(out)); - if (outNode != outMapThen.end()) { - auto outMem = outNode->second->getSrcMemoryAtPort(0); + if (auto outNode = subGraphThen.getOutputNodeByIndex(ifOp->get_then_body()->get_result_index(out))) { + auto outMem = outNode->getSrcMemoryAtPort(0); outputMemThen.push_back(outMem); } else { OPENVINO_THROW("Then body of node If with name ", @@ -126,11 +120,9 @@ void If::getSupportedDescriptors() { } } - const auto& outMapElse = subGraphElse.GetOutputNodesMap(); for (const auto& out : ifOp->get_else_body()->get_results()) { - auto outNode = outMapElse.find(ifOp->get_else_body()->get_result_index(out)); - if (outNode != outMapElse.end()) { - auto outMem = outNode->second->getSrcMemoryAtPort(0); + if (auto outNode = subGraphElse.getOutputNodeByIndex(ifOp->get_else_body()->get_result_index(out))) { + auto outMem = outNode->getSrcMemoryAtPort(0); outputMemElse.push_back(outMem); } else { OPENVINO_THROW("Else body of node If with name ", diff --git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp index 0dcb2e9ef2b9e5..c59a3a7fa37578 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.cpp +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -88,7 +88,7 @@ void LoRA::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagram for memory mapping / reuse void LoRA::createPrimitive() { - CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.inputsNumber(), "Number of node inputs must be equal the number of inner graph's inputs"); std::vector inputMemory; @@ -99,7 +99,7 @@ void LoRA::createPrimitive() { inputMemory.emplace_back(std::move(mem)); } - CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.outputsNumber(), "Number of node outputs must be equal the number of inner graph's outputs"); std::vector outputMemory{getDstMemoryAtPort(0)}; diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index 1571e8fffa2231..9c0c9664ce8a27 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -29,7 +29,7 @@ class MemoryStatesRegister { void registerInput(MemoryInputBase* node); void remove(MemoryNode* node); - const InputNodesMap& getMemoryStates() const { + const InputNodesMap& getMemoryStates() { return memory_inputs; } diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp index e2bd8ed6f25b5d..99281189b8196b 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp @@ -440,19 +440,15 @@ void TensorIterator::getSupportedDescriptors() { const std::shared_ptr body = tiOp->get_function(); sub_graph.CreateGraph(body, context); - const auto& inMap = sub_graph.GetInputNodesMap(); for (const auto& param : tiOp->get_function()->get_parameters()) { - auto inNode = inMap.find(tiOp->get_function()->get_parameter_index(param)); - if (inNode != inMap.end()) { - input_mems.push_back(getToMemories(inNode->second.get(), 0)); + if (auto inNode = sub_graph.getInputNodeByIndex(tiOp->get_function()->get_parameter_index(param))) { + input_mems.push_back(getToMemories(inNode.get(), 0)); } } - const auto& outMap = sub_graph.GetOutputNodesMap(); for (const auto& out : tiOp->get_function()->get_results()) { - auto outNode = outMap.find(tiOp->get_function()->get_result_index(out)); - if (outNode != outMap.end()) { - auto outMem = outNode->second->getSrcMemoryAtPort(0); + if (auto outNode = sub_graph.getOutputNodeByIndex(tiOp->get_function()->get_result_index(out))) { + auto outMem = outNode->getSrcMemoryAtPort(0); output_mem.push_back(outMem); } } diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/concurent_release_memory.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/concurent_release_memory.cpp new file mode 100644 index 00000000000000..4c8fb1945bb8fa --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/concurent_release_memory.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace ov { +namespace test { +// Openvino extension operation that sleeps for X us in its evaluate method +namespace { +enum class TestSteps { INIT, ENTER_EVALUATE, RUN_EVALUATE }; +} // namespace + +class SleepCustomOp : public ov::op::Op { +public: + OPENVINO_OP("SleepCustomOp"); + SleepCustomOp() = default; + SleepCustomOp(const ov::OutputVector& args, + size_t sleep, + std::shared_ptr mutex, + std::shared_ptr cv, + std::shared_ptr> test_step) + : Op(args), + m_sleep(sleep), + m_mutex(mutex), + m_cv(cv), + m_test_step(test_step) { + constructor_validate_and_infer_types(); + } + + void validate_and_infer_types() override { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + } + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override { + OPENVINO_ASSERT(new_args.size() == 1, "Incorrect number of new arguments"); + auto new_op = std::make_shared(new_args, m_sleep, m_mutex, m_cv, m_test_step); + return new_op; + } + + bool visit_attributes(ov::AttributeVisitor& visitor) override { + return true; + } + + void revalidate_and_infer_types() override { + validate_and_infer_types(); + } + + bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override { + // signal entering the evaluate method + { + std::lock_guard lock(*m_mutex); + m_test_step->store(TestSteps::ENTER_EVALUATE); + } + m_cv->notify_all(); + { + // this is required to start all the evaluate calls at the same time + std::unique_lock lock(*m_mutex); + m_cv->wait(lock, [&] { + return m_test_step->load() == TestSteps::RUN_EVALUATE; + }); + } + std::this_thread::sleep_for(std::chrono::microseconds(m_sleep)); + return true; + } + + bool evaluate(ov::TensorVector& output_values, + const ov::TensorVector& input_values, + const ov::EvaluationContext& evaluationContext) const override { + return evaluate(output_values, input_values); + } + + bool has_evaluate() const override { + return true; + } + +private: + size_t m_sleep; // sleep time in us + std::shared_ptr m_mutex; + std::shared_ptr m_cv; + std::shared_ptr> m_test_step; +}; + +class ReleaseMemoryMultiThreadTest : public ::testing::Test { +protected: + void SetUp() override { + param = std::make_shared(ov::element::f32, ov::Shape{1}); + + constexpr size_t sleep_time = 5; // us + mutex = std::make_shared(); + cv = std::make_shared(); + test_step = std::make_shared>(TestSteps::INIT); + + auto sleep = std::make_shared(ov::OutputVector{param}, sleep_time, mutex, cv, test_step); + ov::ResultVector results{std::make_shared(sleep)}; + ov::ParameterVector params{param}; + + auto model = std::make_shared(results, params, "testModel"); + + compiled_model = core.compile_model(model, ov::test::utils::DEVICE_CPU, {{"NUM_STREAMS", num_streams}}); + } + +protected: + const size_t num_streams = 1; // use only one async stream to simplify invocation order syncronization + ov::Core core; + ov::CompiledModel compiled_model; + std::shared_ptr param; + + std::shared_ptr mutex; + std::shared_ptr cv; + std::shared_ptr> test_step; +}; +} // namespace test +} // namespace ov + +using namespace ov::test; + +TEST_F(ReleaseMemoryMultiThreadTest, smoke_throwInferenceIsRunning) { + // Create and infer a few infer requests concurrently + std::vector inferRequests; + for (size_t i = 0; i < num_streams; i++) { + auto inferRequest = compiled_model.create_infer_request(); + inferRequest.set_tensor(param, ov::Tensor(ov::element::f32, ov::Shape{1})); + inferRequests.push_back(std::move(inferRequest)); + } + // infer the infer requests + for (auto& inferRequest : inferRequests) { + inferRequest.start_async(); + } + + //wait till the infer request enters evaluate + { + std::unique_lock lock(*mutex); + cv->wait(lock, [&] { + return test_step->load() == TestSteps::ENTER_EVALUATE; + }); + } + + // While the infer requests are waiting on the cv, call release_memory. + // We expect that the method will throw an exception when it is called while infer requests are running. + EXPECT_THROW(compiled_model.release_memory(), ov::Exception); + + // lets unlock cv + { + std::lock_guard lock(*mutex); + test_step->store(TestSteps::RUN_EVALUATE); + } + cv->notify_all(); + + for (auto& inferRequest : inferRequests) { + inferRequest.wait(); + } +} + +TEST_F(ReleaseMemoryMultiThreadTest, smoke_noThrowInferenceIsNotRunning) { + // Create and infer a few infer requests concurrently + std::vector inferRequests; + for (size_t i = 0; i < num_streams; i++) { + auto inferRequest = compiled_model.create_infer_request(); + inferRequest.set_tensor(param, ov::Tensor(ov::element::f32, ov::Shape{1})); + inferRequests.push_back(std::move(inferRequest)); + } + // infer the infer requests + for (auto& inferRequest : inferRequests) { + inferRequest.start_async(); + } + + //wait till the infer request enters evaluate + { + std::unique_lock lock(*mutex); + cv->wait(lock, [&] { + return test_step->load() == TestSteps::ENTER_EVALUATE; + }); + } + + // lets unlock cv + { + std::lock_guard lock(*mutex); + test_step->store(TestSteps::RUN_EVALUATE); + } + cv->notify_all(); + + for (auto& inferRequest : inferRequests) { + inferRequest.wait(); + } + + // Don't throw when the infer requests are finished + EXPECT_NO_THROW(compiled_model.release_memory()); +} \ No newline at end of file From 0f3c9724df438fb905d83fc6bf631140df8f00d2 Mon Sep 17 00:00:00 2001 From: Sungeun Kim Date: Tue, 17 Dec 2024 00:57:41 +0900 Subject: [PATCH 12/17] [GPU] fix issue: use debug variable without GPU_DEBUG_IF (#28077) Co-authored-by: Pavel Durandin --- .../plugin/transformations/dynamic_quantize_fully_connected.cpp | 2 +- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index 61dc40e2713800..7b365ab7164ba7 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -56,7 +56,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size config.scale_dt = element::f16; config.group_sizes = shape_group_size; - if (debug_config->dynamic_quantize_asym) { + GPU_DEBUG_IF(debug_config->dynamic_quantize_asym) { config.quantization_type = QuantizationType::Asymmetric; config.quantization_dt = element::u8; config.zp_dt = element::u8; // it supports u8 only now diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 50eecf51b945b7..a53b6ddac7332a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -991,7 +991,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } // AZP does not support 8bit weight - if (debug_config->dynamic_quantize_asym + GPU_DEBUG_IF(debug_config->dynamic_quantize_asym && (root->get_input_element_type(1) == ov::element::i8 || root->get_input_element_type(1) == ov::element::u8)) { GPU_DEBUG_TRACE << root->get_friendly_name() << " dyn_quan is turned off: asym quantization does not support 8bit weight" << std::endl; return true; From b5a3c6527526af63b56ec8e9e8f991188f47b2c7 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Mon, 16 Dec 2024 18:22:33 +0100 Subject: [PATCH 13/17] [GHA] Enable dependabot for docker images (#28079) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .github/dependabot.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 359ff683c9b22a..5eac7709e32703 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -141,3 +141,18 @@ updates: - "mryzhov" - "ilya-lavrenov" open-pull-requests-limit: 3 + + # Docker images + - package-ecosystem: docker + directory: "/" + schedule: + interval: "daily" + time: "09:00" + timezone: "Asia/Dubai" + allow: + - dependency-type: "direct" + assignees: + - "akashchi" + - "mryzhov" + - "ilya-lavrenov" + open-pull-requests-limit: 3 \ No newline at end of file From 44f7ddb554b94415ddbd0f3f889b55846242347d Mon Sep 17 00:00:00 2001 From: Egor Duplenskii Date: Mon, 16 Dec 2024 18:23:05 +0100 Subject: [PATCH 14/17] [CPU] Enable clang-format for aarch64 (#28071) --- .github/workflows/code_style.yml | 30 + .../aarch64/jit_conversion_emitters.cpp | 242 ++-- .../aarch64/jit_conversion_emitters.hpp | 53 +- .../plugin/aarch64/jit_eltwise_emitters.cpp | 1032 ++++++++++------- .../plugin/aarch64/jit_eltwise_emitters.hpp | 416 +++---- .../emitters/plugin/aarch64/jit_emitter.cpp | 152 +-- .../emitters/plugin/aarch64/jit_emitter.hpp | 71 +- .../aarch64/jit_load_store_emitters.cpp | 364 +++--- .../aarch64/jit_load_store_emitters.hpp | 60 +- .../snippets/aarch64/cpu_generator.cpp | 156 +-- .../snippets/aarch64/cpu_generator.hpp | 9 +- .../snippets/aarch64/jit_fill_emitter.cpp | 47 +- .../snippets/aarch64/jit_fill_emitter.hpp | 27 +- .../snippets/aarch64/jit_kernel_emitter.cpp | 86 +- .../snippets/aarch64/jit_kernel_emitter.hpp | 49 +- .../snippets/aarch64/jit_loop_emitters.cpp | 75 +- .../snippets/aarch64/jit_loop_emitters.hpp | 47 +- .../snippets/aarch64/jit_memory_emitters.cpp | 44 +- .../snippets/aarch64/jit_memory_emitters.hpp | 53 +- .../aarch64/jit_snippets_emitters.cpp | 58 +- .../aarch64/jit_snippets_emitters.hpp | 35 +- .../nodes/executors/aarch64/jit_eltwise.cpp | 59 +- .../nodes/executors/aarch64/jit_eltwise.hpp | 25 +- .../executors/acl/acl_common_executor.cpp | 33 +- .../executors/acl/acl_common_executor.hpp | 19 +- .../src/nodes/executors/acl/acl_convert.cpp | 63 +- .../src/nodes/executors/acl/acl_convert.hpp | 13 +- .../src/nodes/executors/acl/acl_deconv.cpp | 178 +-- .../src/nodes/executors/acl/acl_deconv.hpp | 18 +- .../src/nodes/executors/acl/acl_eltwise.cpp | 683 ++++++----- .../src/nodes/executors/acl/acl_eltwise.hpp | 9 +- .../executors/acl/acl_fullyconnected.cpp | 207 ++-- .../executors/acl/acl_fullyconnected.hpp | 23 +- .../nodes/executors/acl/acl_ie_scheduler.cpp | 24 +- .../nodes/executors/acl/acl_ie_scheduler.hpp | 13 +- .../nodes/executors/acl/acl_interpolate.cpp | 144 ++- .../nodes/executors/acl/acl_interpolate.hpp | 17 +- .../src/nodes/executors/acl/acl_mvn.cpp | 89 +- .../src/nodes/executors/acl/acl_mvn.hpp | 10 +- .../src/nodes/executors/acl/acl_pooling.cpp | 92 +- .../src/nodes/executors/acl/acl_pooling.hpp | 106 +- .../src/nodes/executors/acl/acl_reduce.cpp | 142 ++- .../src/nodes/executors/acl/acl_reduce.hpp | 51 +- .../src/nodes/executors/acl/acl_transpose.cpp | 23 +- .../src/nodes/executors/acl/acl_transpose.hpp | 28 +- .../src/nodes/executors/acl/acl_utils.cpp | 85 +- .../src/nodes/executors/acl/acl_utils.hpp | 192 +-- .../aarch64/jit_uni_eltwise_generic.cpp | 675 +++++------ .../aarch64/jit_uni_eltwise_generic.hpp | 31 +- .../src/nodes/kernels/acl/gemm_kernel.cpp | 173 ++- .../src/nodes/kernels/acl/gemm_kernel.hpp | 16 +- .../cpu_opset/arm/pass/convert_group_conv.cpp | 26 +- .../cpu_opset/arm/pass/convert_group_conv.hpp | 12 +- .../arm/pass/convert_group_conv1d.cpp | 28 +- .../arm/pass/convert_group_conv1d.hpp | 16 +- .../arm/pass/convert_reduce_multi_axis.cpp | 23 +- .../arm/pass/convert_reduce_multi_axis.hpp | 26 +- .../arm/pass/convert_reduce_no_keep_dims.cpp | 8 +- .../arm/pass/convert_reduce_no_keep_dims.hpp | 23 +- .../cpu_opset/arm/pass/mish_decomposition.cpp | 7 +- .../cpu_opset/arm/pass/mish_decomposition.hpp | 8 +- .../aarch64/pass/snippets_mark_skipped.cpp | 115 +- .../aarch64/pass/snippets_mark_skipped.hpp | 22 +- .../snippets/aarch64/shape_inference.cpp | 33 +- .../snippets/aarch64/shape_inference.hpp | 13 +- 65 files changed, 3693 insertions(+), 3014 deletions(-) diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index a70d2641cb57f3..d4da2a16d38923 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -38,6 +38,36 @@ jobs: level: warning fail_on_error: true + clang-format-aarch64: + runs-on: ubuntu-22.04 + if: ${{ github.repository_owner == 'openvinotoolkit' }} + permissions: + pull-requests: write + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: 'true' + + - name: Install clang-format-15 and cross-compilation dependencies + run: | + sudo apt update + sudo apt --assume-yes install binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu scons clang-format-15 + + # Run cmake with -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT in order to enable codestyle check for ITT collector + - name: CMake configure + run: cmake -DENABLE_CLANG_FORMAT=ON -DENABLE_TESTS=ON -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT -DCMAKE_TOOLCHAIN_FILE=cmake/arm64.toolchain.cmake -B build_arm64 + + - name: Create code style diff + run: cmake --build build_arm64 --target clang_format_fix_all -j8 + + - name: suggester / clang-format + if: startsWith(github.event_name, 'pull_request') + uses: reviewdog/action-suggester@db4abb16fbaabe386831e5addb7be1485d0d63d3 # v1.18.0 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + level: warning + fail_on_error: true + ShellCheck: runs-on: ubuntu-22.04 if: ${{ github.repository_owner == 'openvinotoolkit' }} diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp index 7383b8b8f9ddab..43417942e8bc53 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_conversion_emitters.hpp" + #include "emitters/utils.hpp" using namespace dnnl::impl::cpu::aarch64; @@ -25,27 +26,27 @@ namespace aarch64 { // does not distinguish ARMv8.2 with ARMv8.2-A, conversion between f16 and i16 will still use three // instructions f16 -> f32 -> i32 -> i16 (f16 <- f32 <- i32 <- i16). template -inline void jit_convert_emitter::cvt_f16_to_f32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f16_to_f32(const TReg& src, const TReg& dst) const { h->fcvtl(dst.s4, src.h4); } template -inline void jit_convert_emitter::cvt_f32_to_f16(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f32_to_f16(const TReg& src, const TReg& dst) const { h->fcvtn(dst.h4, src.s4); } template -inline void jit_convert_emitter::cvt_f32_to_i32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f32_to_i32(const TReg& src, const TReg& dst) const { h->fcvtzs(dst.s, src.s); } template -inline void jit_convert_emitter::cvt_i32_to_f32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_i32_to_f32(const TReg& src, const TReg& dst) const { h->scvtf(dst.s, src.s); } template -inline void jit_convert_emitter::cvt_i32_to_i16(const TReg &src, const TReg &dst, bool is_saturated) const { +inline void jit_convert_emitter::cvt_i32_to_i16(const TReg& src, const TReg& dst, bool is_saturated) const { if (is_saturated) { h->sqxtn(dst.h4, src.s4); } else { @@ -54,22 +55,25 @@ inline void jit_convert_emitter::cvt_i32_to_i16(const TReg &src, const TReg &dst } template -inline void jit_convert_emitter::cvt_i16_to_i32(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_i16_to_i32(const TReg& src, const TReg& dst) const { h->sxtl(dst.s4, src.h4); } template -inline void jit_convert_emitter::cvt_f16_to_i16(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_f16_to_i16(const TReg& src, const TReg& dst) const { h->fcvtzs(dst.h4, src.h4); } template -inline void jit_convert_emitter::cvt_i16_to_f16(const TReg &src, const TReg &dst) const { +inline void jit_convert_emitter::cvt_i16_to_f16(const TReg& src, const TReg& dst) const { h->scvtf(dst.h4, src.h4); } template -inline void jit_convert_emitter::cvt_i16_to_byte(const TReg &src, const TReg &dst, bool is_signed, bool is_saturated) const { +inline void jit_convert_emitter::cvt_i16_to_byte(const TReg& src, + const TReg& dst, + bool is_signed, + bool is_saturated) const { if (is_saturated) { if (is_signed) { h->sqxtn(dst.b8, src.h8); @@ -82,7 +86,7 @@ inline void jit_convert_emitter::cvt_i16_to_byte(const TReg &src, const TReg &ds } template -inline void jit_convert_emitter::cvt_byte_to_i16(const TReg &src, const TReg &dst, bool is_signed) const { +inline void jit_convert_emitter::cvt_byte_to_i16(const TReg& src, const TReg& dst, bool is_signed) const { if (is_signed) { h->sxtl(dst.h8, src.b8); } else { @@ -91,10 +95,13 @@ inline void jit_convert_emitter::cvt_byte_to_i16(const TReg &src, const TReg &ds } template -void jit_convert_emitter::jit_convert_process(const TReg &src, const TReg &dst, ov::element::Type input_type, ov::element::Type output_type, +void jit_convert_emitter::jit_convert_process(const TReg& src, + const TReg& dst, + ov::element::Type input_type, + ov::element::Type output_type, bool is_saturated) const { - if (input_type == output_type || (!is_saturated && - one_of(input_type, ov::element::i8, ov::element::u8) && one_of(output_type, ov::element::i8, ov::element::u8))) { + if (input_type == output_type || (!is_saturated && one_of(input_type, ov::element::i8, ov::element::u8) && + one_of(output_type, ov::element::i8, ov::element::u8))) { if (src.getIdx() != dst.getIdx()) { h->mov(dst.b16, src.b16); } @@ -102,119 +109,130 @@ void jit_convert_emitter::jit_convert_process(const TReg &src, const TReg &dst, } switch (output_type) { + case ov::element::f32: + switch (input_type) { + case ov::element::i32: + cvt_i32_to_f32(src, dst); + break; + case ov::element::f16: + cvt_f16_to_f32(src, dst); + break; + case ov::element::i8: + case ov::element::u8: + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_i32(dst, dst); + cvt_i32_to_f32(dst, dst); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + case ov::element::i32: + switch (input_type) { + case ov::element::f32: + cvt_f32_to_i32(src, dst); + break; + case ov::element::f16: + cvt_f16_to_f32(src, dst); + cvt_f32_to_i32(dst, dst); + break; + case ov::element::i8: + case ov::element::u8: + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_i32(dst, dst); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + case ov::element::f16: + switch (input_type) { + case ov::element::f32: + cvt_f32_to_f16(src, dst); + break; + case ov::element::i32: + cvt_i32_to_f32(src, dst); + cvt_f32_to_f16(dst, dst); + break; + case ov::element::i8: + case ov::element::u8: + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_i32(dst, dst); + cvt_i32_to_f32(dst, dst); + cvt_f32_to_f16(dst, dst); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + case ov::element::i8: + case ov::element::u8: + switch (input_type) { case ov::element::f32: - switch (input_type) { - case ov::element::i32: - cvt_i32_to_f32(src, dst); - break; - case ov::element::f16: - cvt_f16_to_f32(src, dst); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_i32(dst, dst); - cvt_i32_to_f32(dst, dst); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_f32_to_i32(src, dst); + cvt_i32_to_i16(dst, dst, is_saturated); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; case ov::element::i32: - switch (input_type) { - case ov::element::f32: - cvt_f32_to_i32(src, dst); - break; - case ov::element::f16: - cvt_f16_to_f32(src, dst); - cvt_f32_to_i32(dst, dst); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_i32(dst, dst); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_i32_to_i16(src, dst, is_saturated); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; case ov::element::f16: - switch (input_type) { - case ov::element::f32: - cvt_f32_to_f16(src, dst); - break; - case ov::element::i32: - cvt_i32_to_f32(src, dst); - cvt_f32_to_f16(dst, dst); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_i32(dst, dst); - cvt_i32_to_f32(dst, dst); - cvt_f32_to_f16(dst, dst); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_f16_to_f32(src, dst); + cvt_f32_to_i32(dst, dst); + cvt_i32_to_i16(dst, dst, is_saturated); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; case ov::element::i8: case ov::element::u8: - switch (input_type) { - case ov::element::f32: - cvt_f32_to_i32(src, dst); - cvt_i32_to_i16(dst, dst, is_saturated); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - case ov::element::i32: - cvt_i32_to_i16(src, dst, is_saturated); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - case ov::element::f16: - cvt_f16_to_f32(src, dst); - cvt_f32_to_i32(dst, dst); - cvt_i32_to_i16(dst, dst, is_saturated); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - case ov::element::i8: - case ov::element::u8: - cvt_byte_to_i16(src, dst, input_type.is_signed()); - cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); - } + cvt_byte_to_i16(src, dst, input_type.is_signed()); + cvt_i16_to_byte(dst, dst, output_type.is_signed(), is_saturated); break; default: - OV_CPU_JIT_EMITTER_THROW("Unsupported output type: ", output_type.get_type_name()); + OV_CPU_JIT_EMITTER_THROW("Unsupported input type: ", input_type.get_type_name()); + } + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported output type: ", output_type.get_type_name()); } } -jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_convert_emitter::jit_convert_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { input_type = node->get_input_element_type(0); output_type = node->get_output_element_type(0); } void jit_convert_emitter::validate_types() const { - OV_CPU_JIT_EMITTER_ASSERT(one_of(input_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported input type: ", input_type.get_type_name()); - OV_CPU_JIT_EMITTER_ASSERT(one_of(output_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported output type: ", output_type.get_type_name()); + OV_CPU_JIT_EMITTER_ASSERT( + one_of(input_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported input type: ", + input_type.get_type_name()); + OV_CPU_JIT_EMITTER_ASSERT( + one_of(output_type, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported output type: ", + output_type.get_type_name()); } -size_t jit_convert_emitter::get_inputs_count() const { return 1; } +size_t jit_convert_emitter::get_inputs_count() const { + return 1; +} void jit_convert_emitter::emit_data() const { jit_emitter::emit_data(); } -jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { -} +jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) {} -void jit_convert_truncation_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_truncation_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { validate_types(); if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); @@ -224,19 +242,22 @@ void jit_convert_truncation_emitter::emit_impl(const std::vector &in_idx } template -void jit_convert_truncation_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_truncation_emitter::emit_isa(const std::vector& in_idxs, + const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); jit_convert_process(src, dst, input_type, output_type, false); } -jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { -} +jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) {} -void jit_convert_saturation_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_saturation_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { validate_types(); if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); @@ -246,13 +267,14 @@ void jit_convert_saturation_emitter::emit_impl(const std::vector &in_idx } template -void jit_convert_saturation_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_convert_saturation_emitter::emit_isa(const std::vector& in_idxs, + const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); jit_convert_process(src, dst, input_type, output_type, true); } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp index af0310f736e5c9..bc9bb1e5005672 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_conversion_emitters.hpp @@ -12,8 +12,10 @@ namespace aarch64 { class jit_convert_emitter : public jit_emitter { public: - jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_count() const override; @@ -21,7 +23,10 @@ class jit_convert_emitter : public jit_emitter { void emit_data() const override; void validate_types() const; template - void jit_convert_process(const TReg &src, const TReg &dst, ov::element::Type input_type, ov::element::Type output_type, + void jit_convert_process(const TReg& src, + const TReg& dst, + ov::element::Type input_type, + ov::element::Type output_type, bool is_saturated) const; ov::element::Type input_type; @@ -29,25 +34,25 @@ class jit_convert_emitter : public jit_emitter { private: template - inline void cvt_f16_to_f32(const TReg &src, const TReg &dst) const; + inline void cvt_f16_to_f32(const TReg& src, const TReg& dst) const; template - inline void cvt_f32_to_f16(const TReg &src, const TReg &dst) const; + inline void cvt_f32_to_f16(const TReg& src, const TReg& dst) const; template - inline void cvt_f32_to_i32(const TReg &src, const TReg &dst) const; + inline void cvt_f32_to_i32(const TReg& src, const TReg& dst) const; template - inline void cvt_i32_to_f32(const TReg &src, const TReg &dst) const; + inline void cvt_i32_to_f32(const TReg& src, const TReg& dst) const; template - inline void cvt_i32_to_i16(const TReg &src, const TReg &dst, bool is_saturated) const; + inline void cvt_i32_to_i16(const TReg& src, const TReg& dst, bool is_saturated) const; template - inline void cvt_i16_to_i32(const TReg &src, const TReg &dst) const; + inline void cvt_i16_to_i32(const TReg& src, const TReg& dst) const; template - inline void cvt_f16_to_i16(const TReg &src, const TReg &dst) const; + inline void cvt_f16_to_i16(const TReg& src, const TReg& dst) const; template - inline void cvt_i16_to_f16(const TReg &src, const TReg &dst) const; + inline void cvt_i16_to_f16(const TReg& src, const TReg& dst) const; template - inline void cvt_i16_to_byte(const TReg &src, const TReg &dst, bool is_signed, bool is_saturated) const; + inline void cvt_i16_to_byte(const TReg& src, const TReg& dst, bool is_signed, bool is_saturated) const; template - inline void cvt_byte_to_i16(const TReg &src, const TReg &dst, bool is_signed) const; + inline void cvt_byte_to_i16(const TReg& src, const TReg& dst, bool is_signed) const; }; // This emitter is covered by specification of "Convert" operation. The implementation uses a "warp-around" conversion. @@ -56,13 +61,15 @@ class jit_convert_emitter : public jit_emitter { // 129 -> -127 class jit_convert_truncation_emitter : public jit_convert_emitter { public: - jit_convert_truncation_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_truncation_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; }; // This emitter is covered by the common dnnl behavior. The implementation uses a "saturation" conversion. @@ -71,15 +78,17 @@ class jit_convert_truncation_emitter : public jit_convert_emitter { // 129 -> 127 class jit_convert_saturation_emitter : public jit_convert_emitter { public: - jit_convert_saturation_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_saturation_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 05a0e0a2cf6a0e..534470c746f2fe 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -3,11 +3,12 @@ // #include "jit_eltwise_emitters.hpp" -#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include + #include "common/utils.hpp" #include "emitters/utils.hpp" +#include "transformations/cpu_opset/common/op/swish_cpu.hpp" namespace ov { namespace intel_cpu { @@ -21,34 +22,35 @@ namespace { ov::element::Type get_arithmetic_binary_exec_precision(const std::shared_ptr& n) { std::vector input_precisions; for (const auto& input : n->inputs()) { - input_precisions.push_back( - input.get_source_output().get_element_type()); + input_precisions.push_back(input.get_source_output().get_element_type()); } - assert(std::all_of( - input_precisions.begin(), - input_precisions.end(), - [&input_precisions](const ov::element::Type& precision) {return precision == input_precisions[0]; })); + assert(std::all_of(input_precisions.begin(), + input_precisions.end(), + [&input_precisions](const ov::element::Type& precision) { + return precision == input_precisions[0]; + })); return input_precisions[0]; } -} // namespace +} // namespace /// ABS /// jit_abs_emitter::jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_abs_emitter::jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_abs_emitter::get_inputs_count() const { return 1; } +size_t jit_abs_emitter::get_inputs_count() const { + return 1; +} -void jit_abs_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_abs_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -57,7 +59,7 @@ void jit_abs_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_abs_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_abs_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -75,17 +77,18 @@ std::set> jit_abs_emitter::get_supported_precisions(c jit_add_emitter::jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_add_emitter::jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_add_emitter::get_inputs_count() const { return 2; } +size_t jit_add_emitter::get_inputs_count() const { + return 2; +} -void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -94,7 +97,7 @@ void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -113,7 +116,7 @@ std::set> jit_add_emitter::get_supported_precisions(c jit_clamp_emitter::jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { const auto clamp = std::dynamic_pointer_cast(node); if (clamp == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v0::Clamp"); @@ -129,24 +132,31 @@ jit_clamp_emitter::jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* ho const float min, const float max, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), - min(min), - max(max) { + : jit_emitter(host, host_isa, exec_prc), + min(min), + max(max) { prepare_table(); } -size_t jit_clamp_emitter::get_inputs_count() const { return 1; } +size_t jit_clamp_emitter::get_inputs_count() const { + return 1; +} -size_t jit_clamp_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_clamp_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_clamp_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_clamp_emitter::get_aux_gprs_count() const { + return 1; +} void jit_clamp_emitter::register_table_entries() { push_arg_entry_of("min", dnnl::impl::float2int(min), true); push_arg_entry_of("max", dnnl::impl::float2int(max), true); } -void jit_clamp_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_clamp_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -155,7 +165,8 @@ void jit_clamp_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_clamp_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_clamp_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -169,24 +180,28 @@ void jit_clamp_emitter::emit_isa(const std::vector &in_vec_idxs, const s h->fmin(dst.s, dst.s, aux.s); } -std::set> jit_clamp_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_clamp_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// DIVIDE /// -jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} +jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) {} +jit_divide_emitter::jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_divide_emitter::get_inputs_count() const { return 2; } +size_t jit_divide_emitter::get_inputs_count() const { + return 2; +} -void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -195,7 +210,8 @@ void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -206,35 +222,44 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const h->uni_fdiv(dst.s, src0.s, src1.s); } -std::set> jit_divide_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_divide_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// EQUAL /// -jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_equal_emitter::jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_equal_emitter::get_inputs_count() const { return 2; } +size_t jit_equal_emitter::get_inputs_count() const { + return 2; +} -size_t jit_equal_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_equal_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_equal_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_equal_emitter::get_aux_gprs_count() const { + return 1; +} -std::set> jit_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -243,7 +268,8 @@ void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -266,7 +292,7 @@ void jit_equal_emitter::register_table_entries() { jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { const auto elu = std::dynamic_pointer_cast(node); if (elu == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v0::Clamp"); @@ -280,12 +306,16 @@ jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float alpha, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc), alpha(alpha) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc), + alpha(alpha) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_elu_emitter::get_inputs_count() const { return 1; } +size_t jit_elu_emitter::get_inputs_count() const { + return 1; +} size_t jit_elu_emitter::get_aux_vecs_count() const { return std::max(exp_emitter->get_aux_vecs_count() + 1ull, 2ull); @@ -295,7 +325,7 @@ size_t jit_elu_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_elu_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_elu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -304,7 +334,7 @@ void jit_elu_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_elu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_elu_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -315,11 +345,7 @@ void jit_elu_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->mov(vmm_aux1.b16, vmm_src.b16); // compute exponent - exp_emitter->emit_code( - { vmm_src.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_src.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); // alpha * (exp(x) - 1) const TReg vmm_aux0(aux_vec_idxs[0]); @@ -351,23 +377,30 @@ std::set> jit_elu_emitter::get_supported_precisions(c jit_exp_emitter::jit_exp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_exp_emitter::jit_exp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_exp_emitter::get_inputs_count() const { return 1; } +size_t jit_exp_emitter::get_inputs_count() const { + return 1; +} -size_t jit_exp_emitter::get_aux_vecs_count() const { return 4; } +size_t jit_exp_emitter::get_aux_vecs_count() const { + return 4; +} -size_t jit_exp_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_exp_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -376,7 +409,7 @@ void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (exec_prc_ != ov::element::f32) { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); } @@ -484,17 +517,19 @@ std::set> jit_exp_emitter::get_supported_precisions(c jit_floor_emitter::jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_floor_emitter::jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_emitter::get_inputs_count() const { return 1; } +size_t jit_floor_emitter::get_inputs_count() const { + return 1; +} -void jit_floor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -503,7 +538,8 @@ void jit_floor_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -512,27 +548,32 @@ void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const s h->frintm(dst.s, src.s); } -std::set> jit_floor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// FLOOR_MOD /// -jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc): jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_mod_emitter::get_inputs_count() const { return 2; } +size_t jit_floor_mod_emitter::get_inputs_count() const { + return 2; +} -size_t jit_floor_mod_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_floor_mod_emitter::get_aux_vecs_count() const { + return 1; +} -void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -541,7 +582,8 @@ void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, co } template -void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_mod_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -557,29 +599,32 @@ void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, con h->fsub(r.s, dividend.s, aux.s); } -std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_mod_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// CEILING /// -//Initialization of the emitter, taking node as input +// Initialization of the emitter, taking node as input jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -//Initialization of emitter, without taking node as input +// Initialization of emitter, without taking node as input jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -//This will tell the JIT compiler that how many inputs the ceiling operation requires (here 1) -size_t jit_ceiling_emitter::get_inputs_count() const { return 1; } +// This will tell the JIT compiler that how many inputs the ceiling operation requires (here 1) +size_t jit_ceiling_emitter::get_inputs_count() const { + return 1; +} -//Main implementation method that emits the JIT code -void jit_ceiling_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +// Main implementation method that emits the JIT code +void jit_ceiling_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -590,7 +635,8 @@ void jit_ceiling_emitter::emit_impl(const std::vector &in_vec_idxs, cons // Template method that generates actual instruction sequence for ceiling operation // The h->frintp() method rounds up the floating value to the nearest integer. template -void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_ceiling_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -601,7 +647,8 @@ void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const // Template method that generates actual instruction sequence for ceiling operation // Currently only supports 32-bit floating point (f32) -std::set> jit_ceiling_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_ceiling_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -609,19 +656,22 @@ std::set> jit_ceiling_emitter::get_supported_precisio jit_gelu_erf_emitter::jit_gelu_erf_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, node); } jit_gelu_erf_emitter::jit_gelu_erf_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_gelu_erf_emitter::get_inputs_count() const { return 1; } +size_t jit_gelu_erf_emitter::get_inputs_count() const { + return 1; +} size_t jit_gelu_erf_emitter::get_aux_vecs_count() const { return std::max(exp_emitter->get_aux_vecs_count() + 3, 7); @@ -631,7 +681,8 @@ size_t jit_gelu_erf_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_gelu_erf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_erf_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -640,7 +691,8 @@ void jit_gelu_erf_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_gelu_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_erf_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -673,11 +725,7 @@ void jit_gelu_erf_emitter::emit_isa(const std::vector &in_vec_idxs, cons h->fmul(vmm_aux.s, vmm_aux0.s, vmm_aux0.s); h->ld1r(vmm_aux2.s, table_val2("sign_mask")); h->orr(vmm_aux.b16, vmm_aux.b16, vmm_aux2.b16); - exp_emitter->emit_code( - { vmm_aux.getIdx() }, - { vmm_aux_dst.getIdx() }, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_aux.getIdx()}, {vmm_aux_dst.getIdx()}, aux_vec_idxs, aux_gpr_idxs); h->ld1r(vmm_aux2.s, table_val2("sign_mask")); // vmm_aux_dst = -exp(-x*x) h->orr(vmm_aux_dst.b16, vmm_aux_dst.b16, vmm_aux2.b16); @@ -723,11 +771,11 @@ void jit_gelu_erf_emitter::register_table_entries() { push_arg_entry_of("gelu_erf_one_over_sqrt_two", 0x3f3504f3, true); push_arg_entry_of("gelu_erf_one_over_sqrt_pi", 0x3f106eba, true); - push_arg_entry_of("erf_pol1", 0x3e827906, true); // p1 = 0.254829592f - push_arg_entry_of("erf_pol2", 0xbe91a98e, true); // p2 = -0.284496736f - push_arg_entry_of("erf_pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f - push_arg_entry_of("erf_pol4", 0xbfba00e3, true); // p4 = -1.453152027f - push_arg_entry_of("erf_pol5", 0x3f87dc22, true); // p5 = 1.061405429f + push_arg_entry_of("erf_pol1", 0x3e827906, true); // p1 = 0.254829592f + push_arg_entry_of("erf_pol2", 0xbe91a98e, true); // p2 = -0.284496736f + push_arg_entry_of("erf_pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f + push_arg_entry_of("erf_pol4", 0xbfba00e3, true); // p4 = -1.453152027f + push_arg_entry_of("erf_pol5", 0x3f87dc22, true); // p5 = 1.061405429f } void jit_gelu_erf_emitter::emit_data() const { @@ -735,7 +783,8 @@ void jit_gelu_erf_emitter::emit_data() const { exp_emitter->emit_data(); } -std::set> jit_gelu_erf_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_gelu_erf_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -743,19 +792,22 @@ std::set> jit_gelu_erf_emitter::get_supported_precisi jit_gelu_tanh_emitter::jit_gelu_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); tanh_emitter = std::make_unique(h, host_isa, node); } jit_gelu_tanh_emitter::jit_gelu_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); tanh_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_gelu_tanh_emitter::get_inputs_count() const { return 1; } +size_t jit_gelu_tanh_emitter::get_inputs_count() const { + return 1; +} size_t jit_gelu_tanh_emitter::get_aux_vecs_count() const { return std::max(tanh_emitter->get_aux_vecs_count() + 2, 3); @@ -765,7 +817,8 @@ size_t jit_gelu_tanh_emitter::get_aux_gprs_count() const { return tanh_emitter->get_aux_gprs_count() + 1; } -void jit_gelu_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_tanh_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -774,7 +827,8 @@ void jit_gelu_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, co } template -void jit_gelu_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_gelu_tanh_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -794,11 +848,7 @@ void jit_gelu_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, con h->ld1r(vmm_aux1.s, table_val2("gelu_tanh_sqrt_two_over_pi")); h->fmul(vmm_aux0.s, vmm_aux1.s, vmm_aux2.s); - tanh_emitter->emit_code( - { vmm_aux0.getIdx() }, - { vmm_aux2.getIdx() }, - aux_vec_idxs, - aux_gpr_idxs); + tanh_emitter->emit_code({vmm_aux0.getIdx()}, {vmm_aux2.getIdx()}, aux_vec_idxs, aux_gpr_idxs); // compute 0.5 * x * (1 + tanh(G(x))) h->ld1r(vmm_aux1.s, table_val2("one")); @@ -821,7 +871,8 @@ void jit_gelu_tanh_emitter::emit_data() const { tanh_emitter->emit_data(); } -std::set> jit_gelu_tanh_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_gelu_tanh_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -829,7 +880,7 @@ std::set> jit_gelu_tanh_emitter::get_supported_precis jit_greater_emitter::jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -840,13 +891,20 @@ jit_greater_emitter::jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator prepare_table(); } -size_t jit_greater_emitter::get_inputs_count() const { return 2; } +size_t jit_greater_emitter::get_inputs_count() const { + return 2; +} -size_t jit_greater_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_greater_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_greater_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_greater_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -855,7 +913,8 @@ void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -873,7 +932,8 @@ void jit_greater_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -881,7 +941,7 @@ std::set> jit_greater_emitter::get_supported_precisio jit_greater_equal_emitter::jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -892,13 +952,20 @@ jit_greater_equal_emitter::jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::j prepare_table(); } -size_t jit_greater_equal_emitter::get_inputs_count() const { return 2; } +size_t jit_greater_equal_emitter::get_inputs_count() const { + return 2; +} -size_t jit_greater_equal_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_greater_equal_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_greater_equal_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_greater_equal_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_greater_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -907,7 +974,8 @@ void jit_greater_equal_emitter::emit_impl(const std::vector &in_vec_idxs } template -void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -925,31 +993,40 @@ void jit_greater_equal_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_greater_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// HARD_SWISH /// jit_hswish_emitter::jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_hswish_emitter::jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_hswish_emitter::get_inputs_count() const { return 1; } +size_t jit_hswish_emitter::get_inputs_count() const { + return 1; +} -size_t jit_hswish_emitter::get_aux_vecs_count() const { return 2; } +size_t jit_hswish_emitter::get_aux_vecs_count() const { + return 2; +} -size_t jit_hswish_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_hswish_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_hswish_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_hswish_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -958,7 +1035,8 @@ void jit_hswish_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_hswish_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_hswish_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -983,18 +1061,19 @@ void jit_hswish_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); push_arg_entry_of("three", 0x40400000, true); push_arg_entry_of("six", 0x40c00000, true); - push_arg_entry_of("one_sixth", dnnl::impl::float2int(1.f/6.f), true); + push_arg_entry_of("one_sixth", dnnl::impl::float2int(1.f / 6.f), true); } -std::set> jit_hswish_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_hswish_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// IS_FINITE /// jit_is_finite_emitter::jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { auto isNaN = ov::as_type_ptr(node); if (isNaN == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v10::IsNaN"); @@ -1004,23 +1083,31 @@ jit_is_finite_emitter::jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_gener } jit_is_finite_emitter::jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_is_finite_emitter::get_inputs_count() const { return 1; } +size_t jit_is_finite_emitter::get_inputs_count() const { + return 1; +} -size_t jit_is_finite_emitter::get_aux_vecs_count() const { return 2; } +size_t jit_is_finite_emitter::get_aux_vecs_count() const { + return 2; +} -size_t jit_is_finite_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_is_finite_emitter::get_aux_gprs_count() const { + return 1; +} -std::set> jit_is_finite_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_is_finite_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1029,7 +1116,8 @@ void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1039,7 +1127,8 @@ void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, con TReg aux0 = TReg(aux_vec_idxs[0]); TReg aux1 = TReg(aux_vec_idxs[1]); - // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always false. + // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always + // false. h->fcmeq(aux0.s, src.s, src.s); h->not_(aux0.b16, aux0.b16); @@ -1068,7 +1157,6 @@ jit_is_inf_emitter::jit_is_inf_emitter(dnnl::impl::cpu::aarch64::jit_generator* dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { - auto isInf = ov::as_type_ptr(node); if (isInf == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v10::IsInf"); @@ -1163,9 +1251,9 @@ void jit_is_inf_emitter::register_table_entries() { /// IS_NAN /// jit_is_nan_emitter::jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { auto isNaN = ov::as_type_ptr(node); if (isNaN == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to ov::op::v10::IsNaN"); @@ -1175,23 +1263,31 @@ jit_is_nan_emitter::jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* } jit_is_nan_emitter::jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_is_nan_emitter::get_inputs_count() const { return 1; } +size_t jit_is_nan_emitter::get_inputs_count() const { + return 1; +} -size_t jit_is_nan_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_is_nan_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_is_nan_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_is_nan_emitter::get_aux_gprs_count() const { + return 1; +} -std::set> jit_is_nan_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_is_nan_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1200,7 +1296,8 @@ void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1209,7 +1306,8 @@ void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const TReg dst = TReg(out_vec_idxs[0]); TReg aux = TReg(aux_vec_idxs[0]); - // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always false. + // According to the IEEE standard, NaN values have the odd property that comparisons involving them are always + // false. h->fcmeq(dst.s, src.s, src.s); h->ld1r(aux.s, table_val2("zero")); h->fcmeq(dst.s, dst.s, aux.s); @@ -1228,7 +1326,7 @@ void jit_is_nan_emitter::register_table_entries() { jit_less_equal_emitter::jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -1239,13 +1337,20 @@ jit_less_equal_emitter::jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_gen prepare_table(); } -size_t jit_less_equal_emitter::get_inputs_count() const { return 2; } +size_t jit_less_equal_emitter::get_inputs_count() const { + return 2; +} -size_t jit_less_equal_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_less_equal_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_less_equal_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_less_equal_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1254,7 +1359,8 @@ void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, c } template -void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1273,7 +1379,8 @@ void jit_less_equal_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_less_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_less_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1281,7 +1388,7 @@ std::set> jit_less_equal_emitter::get_supported_preci jit_logical_and_emitter::jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -1292,13 +1399,20 @@ jit_logical_and_emitter::jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_g prepare_table(); } -size_t jit_logical_and_emitter::get_inputs_count() const { return 2; } +size_t jit_logical_and_emitter::get_inputs_count() const { + return 2; +} -size_t jit_logical_and_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_logical_and_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_logical_and_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_logical_and_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1307,7 +1421,8 @@ void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, } template -void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_and_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1325,32 +1440,40 @@ void jit_logical_and_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_logical_and_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_and_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// LOGICAL_OR /// jit_logical_or_emitter::jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_logical_or_emitter::jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_logical_or_emitter::get_inputs_count() const { return 2; } +size_t jit_logical_or_emitter::get_inputs_count() const { + return 2; +} -size_t jit_logical_or_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_logical_or_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_logical_or_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_logical_or_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_logical_or_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1359,7 +1482,8 @@ void jit_logical_or_emitter::emit_impl(const std::vector &in_vec_idxs, c } template -void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_or_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1377,24 +1501,25 @@ void jit_logical_or_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_logical_or_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_or_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// LOGICAL_NOT /// jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { - prepare_table(); - } + prepare_table(); +} jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { - prepare_table(); - } + prepare_table(); +} size_t jit_logical_not_emitter::get_inputs_count() const { return 1; @@ -1409,7 +1534,7 @@ size_t jit_logical_not_emitter::get_aux_gprs_count() const { } void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, - const std::vector& out_vec_idxs) const { + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1419,7 +1544,7 @@ void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, template void jit_logical_not_emitter::emit_isa(const std::vector& in_vec_idxs, - const std::vector& out_vec_idxs) const { + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1447,7 +1572,7 @@ std::set> jit_logical_not_emitter::get_supported_prec jit_logical_xor_emitter::jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } @@ -1458,13 +1583,20 @@ jit_logical_xor_emitter::jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_g prepare_table(); } -size_t jit_logical_xor_emitter::get_inputs_count() const { return 2; } +size_t jit_logical_xor_emitter::get_inputs_count() const { + return 2; +} -size_t jit_logical_xor_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_logical_xor_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_logical_xor_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_logical_xor_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1473,7 +1605,8 @@ void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, } template -void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_xor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1491,7 +1624,8 @@ void jit_logical_xor_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_logical_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_xor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1499,17 +1633,19 @@ std::set> jit_logical_xor_emitter::get_supported_prec jit_maximum_emitter::jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_maximum_emitter::jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_maximum_emitter::get_inputs_count() const { return 2; } +size_t jit_maximum_emitter::get_inputs_count() const { + return 2; +} -void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1518,7 +1654,8 @@ void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1529,7 +1666,8 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const h->fmaxnm(dst.s, src1.s, src2.s); } -std::set> jit_maximum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_maximum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1537,17 +1675,19 @@ std::set> jit_maximum_emitter::get_supported_precisio jit_minimum_emitter::jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_minimum_emitter::jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_minimum_emitter::get_inputs_count() const { return 2; } +size_t jit_minimum_emitter::get_inputs_count() const { + return 2; +} -void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1556,7 +1696,8 @@ void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1567,7 +1708,8 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const h->fminnm(dst.s, src1.s, src2.s); } -std::set> jit_minimum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_minimum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -1575,19 +1717,22 @@ std::set> jit_minimum_emitter::get_supported_precisio jit_mish_emitter::jit_mish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, node); } jit_mish_emitter::jit_mish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_mish_emitter::get_inputs_count() const { return 1; } +size_t jit_mish_emitter::get_inputs_count() const { + return 1; +} size_t jit_mish_emitter::get_aux_vecs_count() const { return std::max(exp_emitter->get_aux_vecs_count() + 1, 2); @@ -1597,7 +1742,8 @@ size_t jit_mish_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_mish_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mish_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1606,7 +1752,7 @@ void jit_mish_emitter::emit_impl(const std::vector &in_vec_idxs, const s } template -void jit_mish_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mish_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); // An equation other than mish(x) = x*tanh(srelu(x)) was used @@ -1628,11 +1774,7 @@ void jit_mish_emitter::emit_isa(const std::vector &in_vec_idxs, const st h->ld1r(vmm_aux0.s, table_val2("fwd_mish_max_x_for_equation_f")); h->fminnm(vmm_aux2.s, vmm_src.s, vmm_aux0.s); - exp_emitter->emit_code( - { vmm_aux2.getIdx() }, - { vmm_aux2.getIdx() }, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_aux2.getIdx()}, {vmm_aux2.getIdx()}, aux_vec_idxs, aux_gpr_idxs); // (e^x+1)^2 h->fmov(vmm_aux0.s, 1.f); @@ -1665,22 +1807,25 @@ std::set> jit_mish_emitter::get_supported_precisions( } /// MOD /// -jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc): jit_emitter(host, host_isa, exec_prc) { -} +jit_mod_emitter::jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mod_emitter::get_inputs_count() const { return 2; } +size_t jit_mod_emitter::get_inputs_count() const { + return 2; +} -size_t jit_mod_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_mod_emitter::get_aux_vecs_count() const { + return 1; +} -void jit_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1689,7 +1834,7 @@ void jit_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mod_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1713,20 +1858,23 @@ std::set> jit_mod_emitter::get_supported_precisions(c jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mul_add_emitter::get_inputs_count() const { return 3; } +size_t jit_mul_add_emitter::get_inputs_count() const { + return 3; +} -size_t jit_mul_add_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_mul_add_emitter::get_aux_vecs_count() const { + return 1; +} -void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1735,7 +1883,8 @@ void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1765,24 +1914,28 @@ void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const h->fmla(dst.s, mul0.s, mul1.s); } -std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_mul_add_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } /// MULTIPLY /// -jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) {} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_multiply_emitter::get_inputs_count() const { return 2; } +size_t jit_multiply_emitter::get_inputs_count() const { + return 2; +} -void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1791,7 +1944,8 @@ void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1802,16 +1956,17 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons h->uni_fmul(dst.s, src0.s, src1.s); } -std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_multiply_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } /// POWER /// -jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, node, exec_prc) { + : jit_emitter(host, host_isa, node, exec_prc) { auto powerStaticNode = ov::as_type_ptr(node); if (powerStaticNode == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to snippets::op::PowerStatic"); @@ -1824,24 +1979,30 @@ jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit prepare_table(); } -jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_power_static_emitter::jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float power, const float scale, const float shift, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), - power(power), - scale(scale), - shift(shift) { + : jit_emitter(host, host_isa, exec_prc), + power(power), + scale(scale), + shift(shift) { prepare_table(); } -size_t jit_power_static_emitter::get_inputs_count() const { return 1; } +size_t jit_power_static_emitter::get_inputs_count() const { + return 1; +} -size_t jit_power_static_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_power_static_emitter::get_aux_vecs_count() const { + return 1; +} -size_t jit_power_static_emitter::get_aux_gprs_count() const { return 2; } +size_t jit_power_static_emitter::get_aux_gprs_count() const { + return 2; +} void jit_power_static_emitter::register_table_entries() { push_arg_entry_of("power", dnnl::impl::float2int(power), true); @@ -1849,11 +2010,13 @@ void jit_power_static_emitter::register_table_entries() { push_arg_entry_of("shift", dnnl::impl::float2int(shift), true); } -std::set> jit_power_static_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_static_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1862,7 +2025,8 @@ void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_power_static_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1945,26 +2109,30 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, /// PRELU /// jit_prelu_emitter::jit_prelu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_prelu_emitter::jit_prelu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_prelu_emitter::get_inputs_count() const { return 2; } +size_t jit_prelu_emitter::get_inputs_count() const { + return 2; +} -size_t jit_prelu_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_prelu_emitter::get_aux_vecs_count() const { + return 1; +} -std::set> jit_prelu_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_prelu_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -1973,7 +2141,8 @@ void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_prelu_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -1992,24 +2161,27 @@ void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const s jit_relu_emitter::jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_relu_emitter::jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_relu_emitter::get_inputs_count() const { return 1; } +size_t jit_relu_emitter::get_inputs_count() const { + return 1; +} -size_t jit_relu_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_relu_emitter::get_aux_vecs_count() const { + return 1; +} std::set> jit_relu_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2018,7 +2190,7 @@ void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, const s } template -void jit_relu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_relu_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2032,27 +2204,29 @@ void jit_relu_emitter::emit_isa(const std::vector &in_vec_idxs, const st } /// ROUND_HALF_AWAY_FROM_ZERO /// -jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter - (dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} - -jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter - (dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { +jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter( + dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} + +jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter( + dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_round_half_away_from_zero_emitter::get_inputs_count() const { + return 1; } -size_t jit_round_half_away_from_zero_emitter::get_inputs_count() const { return 1; } - -std::set> jit_round_half_away_from_zero_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_round_half_away_from_zero_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_round_half_away_from_zero_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_round_half_away_from_zero_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2061,7 +2235,8 @@ void jit_round_half_away_from_zero_emitter::emit_impl(const std::vector& } template -void jit_round_half_away_from_zero_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_round_half_away_from_zero_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2073,27 +2248,27 @@ void jit_round_half_away_from_zero_emitter::emit_isa(const std::vector & } /// ROUND_HALF_TO_EVEN /// -jit_round_half_to_even_emitter::jit_round_half_to_even_emitter - (dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} +jit_round_half_to_even_emitter::jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} -jit_round_half_to_even_emitter::jit_round_half_to_even_emitter - (dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} +jit_round_half_to_even_emitter::jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_round_half_to_even_emitter::get_inputs_count() const { return 1; } +size_t jit_round_half_to_even_emitter::get_inputs_count() const { + return 1; +} -std::set> jit_round_half_to_even_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_round_half_to_even_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_round_half_to_even_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_round_half_to_even_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2102,7 +2277,8 @@ void jit_round_half_to_even_emitter::emit_impl(const std::vector& in_vec } template -void jit_round_half_to_even_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_round_half_to_even_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2114,26 +2290,30 @@ void jit_round_half_to_even_emitter::emit_isa(const std::vector &in_vec_ } /// SELECT /// -jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) { -} -jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { -} + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_select_emitter::get_inputs_count() const { return 3; } +size_t jit_select_emitter::get_inputs_count() const { + return 3; +} -size_t jit_select_emitter::get_aux_vecs_count() const { return 1; } +size_t jit_select_emitter::get_aux_vecs_count() const { + return 1; +} -std::set> jit_select_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_select_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } -void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2142,7 +2322,8 @@ void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_select_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2163,19 +2344,22 @@ void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const jit_sigmoid_emitter::jit_sigmoid_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, node); } jit_sigmoid_emitter::jit_sigmoid_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); exp_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_sigmoid_emitter::get_inputs_count() const { return 1; } +size_t jit_sigmoid_emitter::get_inputs_count() const { + return 1; +} size_t jit_sigmoid_emitter::get_aux_vecs_count() const { return exp_emitter->get_aux_vecs_count() + 2; @@ -2185,7 +2369,8 @@ size_t jit_sigmoid_emitter::get_aux_gprs_count() const { return exp_emitter->get_aux_gprs_count() + 1; } -void jit_sigmoid_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_sigmoid_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2194,7 +2379,8 @@ void jit_sigmoid_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_sigmoid_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_sigmoid_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (exec_prc_ != ov::element::f32) { OPENVINO_THROW("unsupported precision: " + exec_prc_.to_string()); } @@ -2217,11 +2403,7 @@ void jit_sigmoid_emitter::emit_isa(const std::vector &in_vec_idxs, const h->ld1r(vmm_aux0.s, table_val2("sign_mask")); h->orr(vmm_aux0.b16, vmm_src.b16, vmm_aux0.b16); - exp_emitter->emit_code( - { vmm_aux0.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + exp_emitter->emit_code({vmm_aux0.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); const TReg vmm_aux1(aux_vec_idxs[0]); const TReg vmm_aux2(aux_vec_idxs[1]); @@ -2251,7 +2433,8 @@ void jit_sigmoid_emitter::emit_data() const { exp_emitter->emit_data(); } -std::set> jit_sigmoid_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_sigmoid_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -2259,23 +2442,31 @@ std::set> jit_sigmoid_emitter::get_supported_precisio jit_soft_sign_emitter::jit_soft_sign_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); } jit_soft_sign_emitter::jit_soft_sign_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_soft_sign_emitter::get_inputs_count() const { return 1; } +size_t jit_soft_sign_emitter::get_inputs_count() const { + return 1; +} -size_t jit_soft_sign_emitter::get_aux_vecs_count() const { return 2; } +size_t jit_soft_sign_emitter::get_aux_vecs_count() const { + return 2; +} -size_t jit_soft_sign_emitter::get_aux_gprs_count() const { return 1; } +size_t jit_soft_sign_emitter::get_aux_gprs_count() const { + return 1; +} -void jit_soft_sign_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2284,7 +2475,8 @@ void jit_soft_sign_emitter::emit_impl(const std::vector &in_vec_idxs, co } template -void jit_soft_sign_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_soft_sign_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (exec_prc_ != ov::element::f32) { OPENVINO_THROW("unsupported precision: " + exec_prc_.to_string()); } @@ -2305,7 +2497,8 @@ void jit_soft_sign_emitter::register_table_entries() { push_arg_entry_of("one", 0x3f800000, true); } -std::set> jit_soft_sign_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_soft_sign_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -2314,15 +2507,15 @@ jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { - prepare_table(); - } + prepare_table(); +} jit_sqrt_emitter::jit_sqrt_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { - prepare_table(); - } + prepare_table(); +} size_t jit_sqrt_emitter::get_inputs_count() const { return 1; @@ -2338,8 +2531,7 @@ void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, - const std::vector& out_vec_idxs) const { +void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2349,8 +2541,7 @@ void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, h->fsqrt(dst.s, src.s); } -std::set> jit_sqrt_emitter::get_supported_precisions( - const std::shared_ptr& node) { +std::set> jit_sqrt_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } @@ -2358,17 +2549,19 @@ std::set> jit_sqrt_emitter::get_supported_precisions( jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { -} + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) {} jit_subtract_emitter::jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { -} + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_subtract_emitter::get_inputs_count() const { return 2; } +size_t jit_subtract_emitter::get_inputs_count() const { + return 2; +} -void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2377,7 +2570,8 @@ void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2388,7 +2582,8 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons h->uni_fsub(dst.s, src0.s, src1.s); } -std::set> jit_subtract_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_subtract_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } @@ -2396,7 +2591,7 @@ std::set> jit_subtract_emitter::get_supported_precisi jit_swish_emitter::jit_swish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { const auto swish = std::dynamic_pointer_cast(node); if (swish == nullptr) { OV_CPU_JIT_EMITTER_THROW("Can't cast to SwishNode"); @@ -2411,12 +2606,15 @@ jit_swish_emitter::jit_swish_emitter(dnnl::impl::cpu::aarch64::jit_generator* ho dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float beta, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), beta(beta) { + : jit_emitter(host, host_isa, exec_prc), + beta(beta) { prepare_table(); sigmoid_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_swish_emitter::get_inputs_count() const {return 1; } +size_t jit_swish_emitter::get_inputs_count() const { + return 1; +} size_t jit_swish_emitter::get_aux_vecs_count() const { return sigmoid_emitter->get_aux_vecs_count() + 2; @@ -2426,7 +2624,8 @@ size_t jit_swish_emitter::get_aux_gprs_count() const { return sigmoid_emitter->get_aux_gprs_count() + 1; } -void jit_swish_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_swish_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2435,7 +2634,8 @@ void jit_swish_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_swish_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_swish_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2451,11 +2651,7 @@ void jit_swish_emitter::emit_isa(const std::vector &in_vec_idxs, const s h->fmul(vmm_aux.s, vmm_aux.s, vmm_src.s); // sigmoid(x*beta) - sigmoid_emitter->emit_code( - { vmm_aux.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + sigmoid_emitter->emit_code({vmm_aux.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); // x*sigmoid(x*beta) h->fmul(vmm_dst.s, vmm_dst.s, vmm_orig_src.s); @@ -2470,28 +2666,31 @@ void jit_swish_emitter::emit_data() const { sigmoid_emitter->emit_data(); } -std::set> jit_swish_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_swish_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } /// TANH /// -jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node) - : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { prepare_table(); sigmoid_emitter = std::make_unique(h, host_isa, node); } -jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, +jit_tanh_emitter::jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc) { + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); sigmoid_emitter = std::make_unique(h, host_isa, exec_prc); } -size_t jit_tanh_emitter::get_inputs_count() const { return 1; } +size_t jit_tanh_emitter::get_inputs_count() const { + return 1; +} size_t jit_tanh_emitter::get_aux_vecs_count() const { return sigmoid_emitter->get_aux_vecs_count() + 1; @@ -2501,7 +2700,8 @@ size_t jit_tanh_emitter::get_aux_gprs_count() const { return sigmoid_emitter->get_aux_gprs_count() + 1; } -void jit_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_tanh_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_vec_idxs, out_vec_idxs); } else { @@ -2510,7 +2710,7 @@ void jit_tanh_emitter::emit_impl(const std::vector &in_vec_idxs, const s } template -void jit_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_tanh_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; @@ -2522,11 +2722,7 @@ void jit_tanh_emitter::emit_isa(const std::vector &in_vec_idxs, const st h->ld1r(aux.s, table_val2("two")); h->uni_fmul(aux.s, src.s, aux.s); - sigmoid_emitter->emit_code( - { aux.getIdx() }, - out_vec_idxs, - aux_vec_idxs, - aux_gpr_idxs); + sigmoid_emitter->emit_code({aux.getIdx()}, out_vec_idxs, aux_vec_idxs, aux_gpr_idxs); h->ld1r(aux.s, table_val2("two")); h->uni_fmul(dst.s, aux.s, dst.s); @@ -2548,6 +2744,6 @@ std::set> jit_tanh_emitter::get_supported_precisions( return {{element::f32}}; } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index be4e51cd0b759d..13567b6fbf7d64 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -12,55 +12,57 @@ namespace aarch64 { class jit_abs_emitter : public jit_emitter { public: - jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_abs_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_add_emitter : public jit_emitter { public: - jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_clamp_emitter : public jit_emitter { public: - jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const float min, const float max, const ov::element::Type exec_prc = ov::element::f32); - jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_clamp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -72,46 +74,48 @@ class jit_clamp_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: float min; float max; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_divide_emitter : public jit_emitter { public: - jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + jit_divide_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_equal_emitter : public jit_emitter { public: - jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -122,13 +126,13 @@ class jit_equal_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; @@ -151,13 +155,14 @@ class jit_exp_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_elu_emitter : public jit_emitter { @@ -181,45 +186,47 @@ class jit_elu_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; float alpha; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_floor_emitter : public jit_emitter { public: - jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_floor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_floor_mod_emitter : public jit_emitter { public: - jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -227,23 +234,24 @@ class jit_floor_mod_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_ceiling_emitter : public jit_emitter { public: // Constructor with explicit precision - jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); // Constructor from node - jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -256,13 +264,11 @@ class jit_ceiling_emitter : public jit_emitter { private: // Implementation of JIT code emission - void emit_impl(const std::vector &in_vec_idxs, - const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; // ISA-specific implementation template - void emit_isa(const std::vector &in_vec_idxs, - const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_gelu_erf_emitter : public jit_emitter { @@ -285,15 +291,16 @@ class jit_gelu_erf_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_tanh_emitter; @@ -318,24 +325,25 @@ class jit_gelu_tanh_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr tanh_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_greater_emitter : public jit_emitter { public: - jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -346,24 +354,24 @@ class jit_greater_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_greater_equal_emitter : public jit_emitter { public: - jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_greater_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -374,24 +382,24 @@ class jit_greater_equal_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_hswish_emitter : public jit_emitter { public: - jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_hswish_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -403,24 +411,25 @@ class jit_hswish_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_is_finite_emitter : public jit_emitter { public: - jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + jit_is_finite_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; @@ -428,24 +437,25 @@ class jit_is_finite_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_is_nan_emitter : public jit_emitter { public: - jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_is_nan_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -455,57 +465,60 @@ class jit_is_nan_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_maximum_emitter : public jit_emitter { public: - jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_maximum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_minimum_emitter : public jit_emitter { public: - jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_minimum_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_mish_emitter : public jit_emitter { @@ -528,15 +541,16 @@ class jit_mish_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_is_inf_emitter : public jit_emitter { @@ -574,11 +588,11 @@ class jit_is_inf_emitter : public jit_emitter { class jit_less_equal_emitter : public jit_emitter { public: - jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_less_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -589,24 +603,24 @@ class jit_less_equal_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_logical_and_emitter : public jit_emitter { public: - jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_and_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -617,26 +631,26 @@ class jit_logical_and_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_logical_or_emitter : public jit_emitter { public: - jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& n); + jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_count() const override; @@ -645,13 +659,13 @@ class jit_logical_or_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; @@ -659,12 +673,12 @@ class jit_logical_or_emitter : public jit_emitter { class jit_logical_not_emitter : public jit_emitter { public: jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; @@ -686,11 +700,11 @@ class jit_logical_not_emitter : public jit_emitter { class jit_logical_xor_emitter : public jit_emitter { public: - jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_logical_xor_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -701,24 +715,24 @@ class jit_logical_xor_emitter : public jit_emitter { size_t get_aux_gprs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_mod_emitter : public jit_emitter { public: - jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -726,13 +740,14 @@ class jit_mod_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_mul_add_emitter : public jit_emitter { @@ -749,49 +764,50 @@ class jit_mul_add_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_multiply_emitter : public jit_emitter { public: - jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_power_static_emitter : public jit_emitter { public: jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const float power, - const float scale, - const float shift, - const ov::element::Type exec_prc = ov::element::f32); + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const float power, + const float scale, + const float shift, + const ov::element::Type exec_prc = ov::element::f32); jit_power_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node, - const ov::element::Type exec_prc = ov::element::f32); + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_count() const override; @@ -801,16 +817,17 @@ class jit_power_static_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: float power; float scale; float shift; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_prelu_emitter : public jit_emitter { @@ -827,13 +844,14 @@ class jit_prelu_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_relu_emitter : public jit_emitter { @@ -850,13 +868,14 @@ class jit_relu_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_round_half_away_from_zero_emitter : public jit_emitter { @@ -871,13 +890,14 @@ class jit_round_half_away_from_zero_emitter : public jit_emitter { size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_round_half_to_even_emitter : public jit_emitter { @@ -892,22 +912,23 @@ class jit_round_half_to_even_emitter : public jit_emitter { size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_select_emitter : public jit_emitter { public: - jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const ov::element::Type exec_prc = ov::element::f32); - jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n); @@ -916,13 +937,13 @@ class jit_select_emitter : public jit_emitter { size_t get_aux_vecs_count() const override; static std::set> get_supported_precisions( - const std::shared_ptr& node = nullptr); + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_sigmoid_emitter : public jit_emitter { @@ -945,15 +966,16 @@ class jit_sigmoid_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr exp_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_soft_sign_emitter : public jit_emitter { @@ -974,13 +996,14 @@ class jit_soft_sign_emitter : public jit_emitter { void register_table_entries() override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_sqrt_emitter : public jit_emitter { @@ -1007,23 +1030,24 @@ class jit_sqrt_emitter : public jit_emitter { class jit_subtract_emitter : public jit_emitter { public: - jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const ov::element::Type exec_prc = ov::element::f32); + jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); - jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, - dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - const std::shared_ptr& node); + jit_subtract_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); size_t get_inputs_count() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_swish_emitter : public jit_emitter { @@ -1047,25 +1071,26 @@ class jit_swish_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr sigmoid_emitter; float beta; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_tanh_emitter : public jit_emitter { public: - jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + jit_tanh_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& node); @@ -1079,17 +1104,18 @@ class jit_tanh_emitter : public jit_emitter { void emit_data() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: std::unique_ptr sigmoid_emitter; - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp index bb7783b60a1b53..4c0b0f95f783c2 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.cpp @@ -3,9 +3,11 @@ // #include "jit_emitter.hpp" + #include -#include "utils/general_utils.h" + #include "emitters/utils.hpp" +#include "utils/general_utils.h" using namespace dnnl::impl::cpu; using namespace dnnl::impl; @@ -16,26 +18,37 @@ namespace aarch64 { const std::vector jit_emitter::store_gpr_regs = { // Parameter/result registers - 0, 1, 2, 3, 4, 5, 6, 7, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, // r8: Indirect result location register // r9...r15: Temporary registers - 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, // r19...r28: Callee-saved registers - 29, 30 -}; - -static const std::vector vec_regs = { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -void jit_emitter::emit_code(const std::vector &in_idxs, - const std::vector &out_idxs, - const std::vector &pool_vec_idxs, - const std::vector &pool_gpr_idxs) const { + 29, + 30}; + +static const std::vector vec_regs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +void jit_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); emit_impl(in_idxs, out_idxs); @@ -52,7 +65,7 @@ void jit_emitter::emit_data() const { // Run through the map and insert values stored there for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - const auto &te = (*it).second; // get map entry for a given key + const auto& te = (*it).second; // get map entry for a given key const auto len = te.bcast ? get_vec_length() : sizeof(table_entry_val_t); for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) h->dd(te.val); @@ -88,7 +101,7 @@ void jit_emitter::prepare_table() { // prepare_table. size_t off = 0; for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - auto &te = (*it).second; + auto& te = (*it).second; te.off = off; off += te.bcast ? get_vec_length() : sizeof(table_entry_val_t); } @@ -99,10 +112,10 @@ void jit_emitter::emitter_preamble(const std::vector& in_idxs, const std::vector& pool_aux_vec_idxs, const std::vector& pool_aux_gpr_idxs) const { using namespace Xbyak_aarch64::util; - const bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || - (in_out_type_ == emitter_in_out_map::vec_to_gpr); - const bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || - (in_out_type_ == emitter_in_out_map::gpr_to_vec); + const bool is_vec_input = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); + const bool is_vec_output = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); // vector registers for (auto idx : pool_aux_vec_idxs) { @@ -110,20 +123,27 @@ void jit_emitter::emitter_preamble(const std::vector& in_idxs, } for (size_t idx = 0; idx < get_max_vecs_count(); idx++) { - if (aux_vec_idxs.size() >= get_aux_vecs_count()) break; + if (aux_vec_idxs.size() >= get_aux_vecs_count()) + break; if (is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) + continue; } if (is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) + continue; } - if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) continue; - if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) + continue; + if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) + continue; - if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue; - if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue; + if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) + continue; + if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) + continue; aux_vec_idxs.push_back(idx); preserved_vec_idxs.push_back(idx); @@ -138,23 +158,27 @@ void jit_emitter::emitter_preamble(const std::vector& in_idxs, const uint32_t end_gpr_idx = Xbyak_aarch64::Operand::X30; for (size_t gpr_idx = 0; gpr_idx <= end_gpr_idx; ++gpr_idx) { - size_t _idx = end_gpr_idx - gpr_idx; // we allocate from the end + size_t _idx = end_gpr_idx - gpr_idx; // we allocate from the end - if (aux_gpr_idxs.size() >= get_aux_gprs_count()) break; - if ((_idx == Xbyak_aarch64::Operand::X18) || - (_idx == Xbyak_aarch64::Operand::X23) || - (_idx == Xbyak_aarch64::Operand::X24) || - (_idx == Xbyak_aarch64::Operand::X28)) continue; + if (aux_gpr_idxs.size() >= get_aux_gprs_count()) + break; + if ((_idx == Xbyak_aarch64::Operand::X18) || (_idx == Xbyak_aarch64::Operand::X23) || + (_idx == Xbyak_aarch64::Operand::X24) || (_idx == Xbyak_aarch64::Operand::X28)) + continue; if (!is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) + continue; } if (!is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) + continue; } - if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue; - if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue; + if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) + continue; + if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) + continue; aux_gpr_idxs.push_back(_idx); preserved_gpr_idxs.push_back(_idx); @@ -189,23 +213,21 @@ void jit_emitter::store_context(const std::unordered_set& ignore_registe store_context(store_gpr_regs, vec_regs, ignore_registers); } -void jit_emitter::store_context( - const std::vector& gpr_regs, - const std::vector& vec_regs, - const std::unordered_set& ignore_vec_regs) const { +void jit_emitter::store_context(const std::vector& gpr_regs, + const std::vector& vec_regs, + const std::unordered_set& ignore_vec_regs) const { // 1. General-purpose Registers // 1.1. store pair registers const auto store_gpr_regs_size = gpr_regs.size(); const auto last = store_gpr_regs_size % 2; for (size_t i = 0; i < (store_gpr_regs_size - last); i += 2) { - h->stp(Xbyak_aarch64::XReg(gpr_regs[i]), - Xbyak_aarch64::XReg(gpr_regs[i + 1]), - pre_ptr(h->sp, -get_gpr_length() * 2)); + h->stp(Xbyak_aarch64::XReg(gpr_regs[i]), + Xbyak_aarch64::XReg(gpr_regs[i + 1]), + pre_ptr(h->sp, -get_gpr_length() * 2)); } // 1.2. store the remaining register if (last != 0) { - h->str(Xbyak_aarch64::XReg(gpr_regs[store_gpr_regs_size - 1]), - pre_ptr(h->sp, -get_gpr_length())); + h->str(Xbyak_aarch64::XReg(gpr_regs[store_gpr_regs_size - 1]), pre_ptr(h->sp, -get_gpr_length())); } // 2. SIMD and Floating-Point registers @@ -221,17 +243,14 @@ void jit_emitter::store_context( prev_reg_idx = static_cast(reg_idx); continue; } - h->stp(Xbyak_aarch64::QReg(prev_reg_idx), - Xbyak_aarch64::QReg(reg_idx), - pre_ptr(h->sp, -get_vec_length() * 2)); + h->stp(Xbyak_aarch64::QReg(prev_reg_idx), Xbyak_aarch64::QReg(reg_idx), pre_ptr(h->sp, -get_vec_length() * 2)); prev_reg_idx = -1; } // 2.1. store the remaining register if (prev_reg_idx != -1) { if (ignore_vec_regs.find(prev_reg_idx) == ignore_vec_regs.end()) { - h->str(Xbyak_aarch64::QReg(prev_reg_idx), - pre_ptr(h->sp, -get_vec_length())); + h->str(Xbyak_aarch64::QReg(prev_reg_idx), pre_ptr(h->sp, -get_vec_length())); } else { ignore_registers_count++; } @@ -245,10 +264,9 @@ void jit_emitter::restore_context(const std::unordered_set& ignore_vec_r restore_context(store_gpr_regs, vec_regs, ignore_vec_regs); } -void jit_emitter::restore_context( - const std::vector& gpr_regs, - const std::vector& vec_regs, - const std::unordered_set& ignore_vec_regs) const { +void jit_emitter::restore_context(const std::vector& gpr_regs, + const std::vector& vec_regs, + const std::unordered_set& ignore_vec_regs) const { // 1. SIMD and Floating-Point registers // 1.1. restore the remaining register auto v_last = (vec_regs.size() - ignore_vec_regs.size()) % 2; @@ -260,8 +278,7 @@ void jit_emitter::restore_context( continue; } - h->ldr(Xbyak_aarch64::QReg(reg_idx), - post_ptr(h->sp, get_vec_length())); + h->ldr(Xbyak_aarch64::QReg(reg_idx), post_ptr(h->sp, get_vec_length())); break; } } @@ -278,9 +295,7 @@ void jit_emitter::restore_context( prev_reg_idx = static_cast(reg_idx); continue; } - h->ldp(Xbyak_aarch64::QReg(reg_idx), - Xbyak_aarch64::QReg(prev_reg_idx), - post_ptr(h->sp, get_vec_length() * 2)); + h->ldp(Xbyak_aarch64::QReg(reg_idx), Xbyak_aarch64::QReg(prev_reg_idx), post_ptr(h->sp, get_vec_length() * 2)); prev_reg_idx = -1; } @@ -292,8 +307,7 @@ void jit_emitter::restore_context( const auto save_gpr_regs_size = gpr_regs.size(); const auto last = save_gpr_regs_size % 2; if (last != 0) { - h->ldr(Xbyak_aarch64::XReg(gpr_regs[save_gpr_regs_size - 1]), - post_ptr(h->sp, get_gpr_length())); + h->ldr(Xbyak_aarch64::XReg(gpr_regs[save_gpr_regs_size - 1]), post_ptr(h->sp, get_gpr_length())); } // 2.2. restore pair registers @@ -304,6 +318,6 @@ void jit_emitter::restore_context( } } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp index ba9b8c2d9cbdf1..9ce8203afe7783 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_emitter.hpp @@ -4,14 +4,13 @@ #pragma once +#include #include #include -#include -#include "snippets/snippets_isa.hpp" -#include "snippets/generator.hpp" #include "node.h" - +#include "snippets/generator.hpp" +#include "snippets/snippets_isa.hpp" namespace ov { namespace intel_cpu { @@ -29,25 +28,32 @@ class jit_emitter : public ov::snippets::Emitter { jit_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32, - emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : - Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), - in_out_type_(in_out_type), p_table(0), l_table (new Xbyak_aarch64::Label()) { - } + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : Emitter(), + h(host), + host_isa_(host_isa), + exec_prc_(exec_prc), + in_out_type_(in_out_type), + p_table(0), + l_table(new Xbyak_aarch64::Label()) {} jit_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32, - emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : - Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), - in_out_type_(in_out_type), p_table(0), l_table (new Xbyak_aarch64::Label()) { - } - - void emit_code( - const std::vector &in_idxs, - const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, - const std::vector &pool_gpr_idxs = {}) const override; + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : Emitter(), + h(host), + host_isa_(host_isa), + exec_prc_(exec_prc), + in_out_type_(in_out_type), + p_table(0), + l_table(new Xbyak_aarch64::Label()) {} + + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; void emit_data() const override; @@ -60,7 +66,8 @@ class jit_emitter : public ov::snippets::Emitter { * Precisions are ordered, the first bigger bitness precision with the same type will be selected. * Empty collection means the emitter supports any input precisions. */ - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); protected: size_t get_max_vecs_count() const; @@ -78,12 +85,14 @@ class jit_emitter : public ov::snippets::Emitter { virtual void prepare_table(); virtual void register_table_entries() {} - void load_table_addr() const { h->adr(p_table, *l_table.get()); } + void load_table_addr() const { + h->adr(p_table, *l_table.get()); + } // we accept only 32bit hexadecimal table values to avoid any rounding using table_entry_val_t = uint32_t; - using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table - using table_entry_bcast_t = bool; // true => bcast value + using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table + using table_entry_bcast_t = bool; // true => bcast value struct table_entry_t { table_entry_val_t val; @@ -98,7 +107,7 @@ class jit_emitter : public ov::snippets::Emitter { mutable Xbyak_aarch64::XReg p_table; mutable std::shared_ptr l_table; - virtual void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const = 0; + virtual void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const = 0; virtual void emitter_preamble(const std::vector& in_idxs, const std::vector& out_idxs, @@ -128,14 +137,14 @@ class jit_emitter : public ov::snippets::Emitter { } void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) { - mapped_table_entry_t te {0, val, broadcast}; + mapped_table_entry_t te{0, val, broadcast}; entry_map_.insert(std::make_pair(key, te)); } - void push_entries_of(const table_t &t) { + void push_entries_of(const table_t& t) { for (auto it = t.begin(); it != t.end(); it++) { auto key = (*it).first; - auto te = (*it).second; // copy values from table + auto te = (*it).second; // copy values from table push_arg_entry_of(key, te.val, te.bcast); } } @@ -150,9 +159,9 @@ class jit_emitter : public ov::snippets::Emitter { size_t table_off(const std::string& key, const size_t key_off_val_shift = 0) const { // assumption: all table entries sharing the same key also // share their broadcast property - const auto it = entry_map_.find(key); // search an entry for a key + const auto it = entry_map_.find(key); // search an entry for a key assert(it != entry_map_.end()); - const auto &te = (*it).second; + const auto& te = (*it).second; const auto scale = te.bcast ? get_vec_length() : sizeof(table_entry_val_t); return te.off + key_off_val_shift * scale; } @@ -176,6 +185,6 @@ class jit_emitter : public ov::snippets::Emitter { const std::unordered_set& ignore_vec_regs = {}) const; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp index ca18bc5d4b575d..3ca77bdac53baf 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_load_store_emitters.hpp" + #include "cpu/aarch64/cpu_isa_traits.hpp" #include "emitters/utils.hpp" @@ -15,14 +16,23 @@ namespace aarch64 { using jit_generator = dnnl::impl::cpu::aarch64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::aarch64::cpu_isa_t; -jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, int byte_offset, - ov::element::Type exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), load_num_(load_num), byte_offset_(byte_offset), prc_(src_prc) { +jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + int byte_offset, + ov::element::Type exec_prc, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + load_num_(load_num), + byte_offset_(byte_offset), + prc_(src_prc) { OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, "Unsupported precision pair."); } -void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); } else { @@ -31,7 +41,7 @@ void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std:: } template -void jit_load_emitter::load_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::load_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); @@ -39,31 +49,31 @@ void jit_load_emitter::load_qbyte(const std::vector &in_idxs, const std: DReg dst_d = DReg(out_idxs[0]); switch (load_num_) { - case 0: - break; - case 1: - h->ldr(dst_s, ptr(src, byte_offset_)); - break; - case 2: - h->ldr(dst_d, ptr(src, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->ldr(dst_d, ptr(src, byte_offset_)); - h->add_imm(prc, src, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); - h->ld1(dst.s[2], ptr(prc)); - break; - } - case 4: - h->uni_ldr(dst, src, byte_offset_); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); + case 0: + break; + case 1: + h->ldr(dst_s, ptr(src, byte_offset_)); + break; + case 2: + h->ldr(dst_d, ptr(src, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->ldr(dst_d, ptr(src, byte_offset_)); + h->add_imm(prc, src, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); + h->ld1(dst.s[2], ptr(prc)); + break; + } + case 4: + h->uni_ldr(dst, src, byte_offset_); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); } } template -void jit_load_emitter::load_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::load_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); @@ -72,31 +82,31 @@ void jit_load_emitter::load_dbyte(const std::vector &in_idxs, const std: DReg dst_d = DReg(out_idxs[0]); switch (load_num_) { - case 0: - break; - case 1: - h->ldr(dst_h, ptr(src, byte_offset_)); - break; - case 2: - h->ldr(dst_s, ptr(src, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->ldr(dst_s, ptr(src, byte_offset_)); - h->add_imm(prc, src, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); - h->ld1(dst.h[2], ptr(prc)); - break; - } - case 4: - h->ldr(dst_d, ptr(src, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); + case 0: + break; + case 1: + h->ldr(dst_h, ptr(src, byte_offset_)); + break; + case 2: + h->ldr(dst_s, ptr(src, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->ldr(dst_s, ptr(src, byte_offset_)); + h->add_imm(prc, src, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); + h->ld1(dst.h[2], ptr(prc)); + break; + } + case 4: + h->ldr(dst_d, ptr(src, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); } } template -void jit_load_emitter::load_byte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::load_byte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in_idxs[0]); TReg dst = TReg(out_idxs[0]); @@ -105,49 +115,50 @@ void jit_load_emitter::load_byte(const std::vector &in_idxs, const std:: SReg dst_s = SReg(out_idxs[0]); switch (load_num_) { - case 0: - break; - case 1: - h->ldr(dst_b, ptr(src, byte_offset_)); - break; - case 2: - h->ldr(dst_h, ptr(src, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->ldr(dst_h, ptr(src, byte_offset_)); - h->add_imm(prc, src, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); - h->ld1(dst.b[2], ptr(prc)); - break; - } - case 4: - h->ldr(dst_s, ptr(src, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); + case 0: + break; + case 1: + h->ldr(dst_b, ptr(src, byte_offset_)); + break; + case 2: + h->ldr(dst_h, ptr(src, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->ldr(dst_h, ptr(src, byte_offset_)); + h->add_imm(prc, src, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); + h->ld1(dst.b[2], ptr(prc)); + break; + } + case 4: + h->ldr(dst_s, ptr(src, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to load."); } } template -void jit_load_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { - OV_CPU_JIT_EMITTER_ASSERT(one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported precision."); +void jit_load_emitter::emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT( + one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported precision."); OV_CPU_JIT_EMITTER_ASSERT(load_num_ <= 4, "Unexpected number of elements to load."); switch (prc_) { - case ov::element::f32: - case ov::element::i32: - load_qbyte(in_idxs, out_idxs); - break; - case ov::element::f16: - load_dbyte(in_idxs, out_idxs); - break; - case ov::element::i8: - case ov::element::u8: - load_byte(in_idxs, out_idxs); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); + case ov::element::f32: + case ov::element::i32: + load_qbyte(in_idxs, out_idxs); + break; + case ov::element::f16: + load_dbyte(in_idxs, out_idxs); + break; + case ov::element::i8: + case ov::element::u8: + load_byte(in_idxs, out_idxs); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); } } @@ -158,14 +169,24 @@ size_t jit_load_emitter::get_aux_gprs_count() const { return 0; } -jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, int byte_offset, - arithmetic_mode mode, ov::element::Type exec_prc, emitter_in_out_map in_out_type) - : jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), store_num_(store_num), byte_offset_(byte_offset), prc_(dst_prc) { +jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, + int byte_offset, + arithmetic_mode mode, + ov::element::Type exec_prc, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + store_num_(store_num), + byte_offset_(byte_offset), + prc_(dst_prc) { OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, "Unsupported precision pair."); } -void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in_idxs, out_idxs); } else { @@ -174,7 +195,7 @@ void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std: } template -void jit_store_emitter::store_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::store_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); SReg src_s = SReg(in_idxs[0]); @@ -183,31 +204,31 @@ void jit_store_emitter::store_qbyte(const std::vector &in_idxs, const st XReg dst = XReg(out_idxs[0]); switch (store_num_) { - case 0: - break; - case 1: - h->str(src_s, ptr(dst, byte_offset_)); - break; - case 2: - h->str(src_d, ptr(dst, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->str(src_d, ptr(dst, byte_offset_)); - h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); - h->st1(src.s[2], ptr(prc)); - break; - } - case 4: - h->str(src_q, ptr(dst, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); + case 0: + break; + case 1: + h->str(src_s, ptr(dst, byte_offset_)); + break; + case 2: + h->str(src_d, ptr(dst, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->str(src_d, ptr(dst, byte_offset_)); + h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(float), h->X_DEFAULT_ADDR); + h->st1(src.s[2], ptr(prc)); + break; + } + case 4: + h->str(src_q, ptr(dst, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); } } template -void jit_store_emitter::store_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::store_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); HReg src_h = HReg(in_idxs[0]); @@ -216,31 +237,31 @@ void jit_store_emitter::store_dbyte(const std::vector &in_idxs, const st XReg dst = XReg(out_idxs[0]); switch (store_num_) { - case 0: - break; - case 1: - h->str(src_h, ptr(dst, byte_offset_)); - break; - case 2: - h->str(src_s, ptr(dst, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->str(src_s, ptr(dst, byte_offset_)); - h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); - h->st1(src.h[2], ptr(prc)); - break; - } - case 4: - h->str(src_d, ptr(dst, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); + case 0: + break; + case 1: + h->str(src_h, ptr(dst, byte_offset_)); + break; + case 2: + h->str(src_s, ptr(dst, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->str(src_s, ptr(dst, byte_offset_)); + h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(uint16_t), h->X_DEFAULT_ADDR); + h->st1(src.h[2], ptr(prc)); + break; + } + case 4: + h->str(src_d, ptr(dst, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); } } template -void jit_store_emitter::store_byte(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::store_byte(const std::vector& in_idxs, const std::vector& out_idxs) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in_idxs[0]); BReg src_b = BReg(in_idxs[0]); @@ -249,49 +270,50 @@ void jit_store_emitter::store_byte(const std::vector &in_idxs, const std XReg dst = XReg(out_idxs[0]); switch (store_num_) { - case 0: - break; - case 1: - h->str(src_b, ptr(dst, byte_offset_)); - break; - case 2: - h->str(src_h, ptr(dst, byte_offset_)); - break; - case 3: { - XReg prc = XReg(aux_gpr_idxs[0]); - h->str(src_h, ptr(dst, byte_offset_)); - h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); - h->st1(src.b[2], ptr(prc)); - break; - } - case 4: - h->str(src_s, ptr(dst, byte_offset_)); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); + case 0: + break; + case 1: + h->str(src_b, ptr(dst, byte_offset_)); + break; + case 2: + h->str(src_h, ptr(dst, byte_offset_)); + break; + case 3: { + XReg prc = XReg(aux_gpr_idxs[0]); + h->str(src_h, ptr(dst, byte_offset_)); + h->add_imm(prc, dst, byte_offset_ + 2 * sizeof(int8_t), h->X_DEFAULT_ADDR); + h->st1(src.b[2], ptr(prc)); + break; + } + case 4: + h->str(src_s, ptr(dst, byte_offset_)); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unexpected number of elements to store."); } } template -void jit_store_emitter::emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const { - OV_CPU_JIT_EMITTER_ASSERT(one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), - "Unsupported precision."); +void jit_store_emitter::emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT( + one_of(prc_, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8), + "Unsupported precision."); OV_CPU_JIT_EMITTER_ASSERT(store_num_ <= 4, "Unexpected number of elements to store."); switch (prc_) { - case ov::element::f32: - case ov::element::i32: - store_qbyte(in_idxs, out_idxs); - break; - case ov::element::f16: - store_dbyte(in_idxs, out_idxs); - break; - case ov::element::i8: - case ov::element::u8: - store_byte(in_idxs, out_idxs); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); + case ov::element::f32: + case ov::element::i32: + store_qbyte(in_idxs, out_idxs); + break; + case ov::element::f16: + store_dbyte(in_idxs, out_idxs); + break; + case ov::element::i8: + case ov::element::u8: + store_byte(in_idxs, out_idxs); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision: ", prc_.get_type_name()); } } @@ -302,6 +324,6 @@ size_t jit_store_emitter::get_aux_gprs_count() const { return 0; } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp index 887522ed1055e1..8c0983189f3083 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_load_store_emitters.hpp @@ -4,38 +4,41 @@ #pragma once -#include "jit_emitter.hpp" #include "cpu/aarch64/jit_generator.hpp" +#include "jit_emitter.hpp" namespace ov { namespace intel_cpu { namespace aarch64 { // Arithmetic modes for data type conversion in store_emitter -enum class arithmetic_mode { - saturation, - truncation -}; +enum class arithmetic_mode { saturation, truncation }; class jit_load_emitter : public jit_emitter { public: - jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, int byte_offset, + jit_load_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + int byte_offset, ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::gpr_to_vec); - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; - size_t get_inputs_count() const override { return 1; }; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; + size_t get_inputs_count() const override { + return 1; + }; private: template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void load_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void load_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void load_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void load_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void load_byte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void load_byte(const std::vector& in_idxs, const std::vector& out_idxs) const; size_t get_aux_gprs_count() const override; std::string name_; @@ -46,23 +49,30 @@ class jit_load_emitter : public jit_emitter { class jit_store_emitter : public jit_emitter { public: - jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, int byte_offset_, - arithmetic_mode mode = arithmetic_mode::saturation, ov::element::Type exec_prc = ov::element::f32, + jit_store_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, + int byte_offset_, + arithmetic_mode mode = arithmetic_mode::saturation, + ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; - size_t get_inputs_count() const override { return 1; } + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; + size_t get_inputs_count() const override { + return 1; + } private: template - void emit_isa(const std::vector &in_idxs, const std::vector &out_idxs) const; + void emit_isa(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void store_qbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void store_qbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void store_dbyte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void store_dbyte(const std::vector& in_idxs, const std::vector& out_idxs) const; template - void store_byte(const std::vector &in_idxs, const std::vector &out_idxs) const; + void store_byte(const std::vector& in_idxs, const std::vector& out_idxs) const; size_t get_aux_gprs_count() const override; std::string name_; @@ -71,6 +81,6 @@ class jit_store_emitter : public jit_emitter { ov::element::Type prc_; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp index a56c2316183643..95698f8ac78bb0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp @@ -2,74 +2,76 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/snippets_isa.hpp" #include "cpu_generator.hpp" -#include "jit_snippets_emitters.hpp" -#include "emitters/utils.hpp" -#include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "emitters/plugin/aarch64/jit_eltwise_emitters.hpp" + #include "emitters/plugin/aarch64/jit_conversion_emitters.hpp" +#include "emitters/plugin/aarch64/jit_eltwise_emitters.hpp" +#include "emitters/snippets/aarch64/jit_fill_emitter.hpp" #include "emitters/snippets/aarch64/jit_kernel_emitter.hpp" #include "emitters/snippets/aarch64/jit_loop_emitters.hpp" #include "emitters/snippets/aarch64/jit_memory_emitters.hpp" -#include "emitters/snippets/aarch64/jit_fill_emitter.hpp" - -#include "transformations/snippets/common/op/fused_mul_add.hpp" -#include "transformations/cpu_opset/common/op/swish_cpu.hpp" - +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/utils.hpp" +#include "jit_snippets_emitters.hpp" #include "openvino/opsets/opset13.hpp" +#include "snippets/snippets_isa.hpp" +#include "transformations/cpu_opset/common/op/swish_cpu.hpp" +#include "transformations/snippets/common/op/fused_mul_add.hpp" namespace ov { -#define CREATE_SNIPPETS_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +#define CREATE_SNIPPETS_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_CPU_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr->get_node()); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +#define CREATE_CPU_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr->get_node()); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_GELU_V7_EMITTER(e_type_erf, e_type_tanh) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - const auto& n = expr->get_node(); \ - const auto& gelu = std::dynamic_pointer_cast(n); \ - if (gelu == nullptr) { \ - OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ - } \ - const auto approximationMode = gelu->get_approximation_mode(); \ - if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ - return std::make_shared(h.get(), isa, n); \ - } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ - return std::make_shared(h.get(), isa, n); \ - } else { \ - OPENVINO_THROW("Unsupported Gelu approximation mode"); \ - } \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - const auto& gelu = std::dynamic_pointer_cast(n); \ - if (gelu == nullptr) { \ - OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ - } \ - const auto approximationMode = gelu->get_approximation_mode(); \ - if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ - return e_type_erf::get_supported_precisions(n); \ - } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ - return e_type_tanh::get_supported_precisions(n); \ - } else { \ - OPENVINO_THROW("Unsupported Gelu approximation mode"); \ - } \ - } \ -} +#define CREATE_GELU_V7_EMITTER(e_type_erf, e_type_tanh) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + const auto& n = expr->get_node(); \ + const auto& gelu = std::dynamic_pointer_cast(n); \ + if (gelu == nullptr) { \ + OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ + } \ + const auto approximationMode = gelu->get_approximation_mode(); \ + if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ + return std::make_shared(h.get(), isa, n); \ + } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ + return std::make_shared(h.get(), isa, n); \ + } else { \ + OPENVINO_THROW("Unsupported Gelu approximation mode"); \ + } \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + const auto& gelu = std::dynamic_pointer_cast(n); \ + if (gelu == nullptr) { \ + OPENVINO_THROW("Can't cast to ov::op::v7::Gelu"); \ + } \ + const auto approximationMode = gelu->get_approximation_mode(); \ + if (approximationMode == ov::op::GeluApproximationMode::ERF) { \ + return e_type_erf::get_supported_precisions(n); \ + } else if (approximationMode == ov::op::GeluApproximationMode::TANH) { \ + return e_type_tanh::get_supported_precisions(n); \ + } else { \ + OPENVINO_THROW("Unsupported Gelu approximation mode"); \ + } \ + } \ + } class jit_snippet : public dnnl::impl::cpu::aarch64::jit_generator { public: @@ -85,7 +87,8 @@ class jit_snippet : public dnnl::impl::cpu::aarch64::jit_generator { namespace intel_cpu { namespace aarch64 { -CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) : h_compiled(std::move(h)) { +CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) + : h_compiled(std::move(h)) { OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); } @@ -102,15 +105,19 @@ bool CompiledSnippetCPU::empty() const { } CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) - : TargetMachine(std::make_shared()), h(new jit_snippet()), isa(host_isa) { + : TargetMachine(std::make_shared()), + h(new jit_snippet()), + isa(host_isa) { // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[snippets::op::BroadcastMove::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_broadcast_move_emitter); - jitters[snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_CPU_EMITTER(jit_convert_truncation_emitter); - jitters[snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_CPU_EMITTER(jit_convert_saturation_emitter); + jitters[snippets::op::ConvertTruncation::get_type_info_static()] = + CREATE_CPU_EMITTER(jit_convert_truncation_emitter); + jitters[snippets::op::ConvertSaturation::get_type_info_static()] = + CREATE_CPU_EMITTER(jit_convert_saturation_emitter); // memory access jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_load_memory_emitter); @@ -136,7 +143,8 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) jitters[ov::op::v0::Exp::get_type_info_static()] = CREATE_CPU_EMITTER(jit_exp_emitter); jitters[ov::op::v0::Floor::get_type_info_static()] = CREATE_CPU_EMITTER(jit_floor_emitter); jitters[ov::op::v0::Gelu::get_type_info_static()] = CREATE_CPU_EMITTER(jit_gelu_erf_emitter); - jitters[ov::op::v7::Gelu::get_type_info_static()] = CREATE_GELU_V7_EMITTER(jit_gelu_erf_emitter, jit_gelu_tanh_emitter); + jitters[ov::op::v7::Gelu::get_type_info_static()] = + CREATE_GELU_V7_EMITTER(jit_gelu_erf_emitter, jit_gelu_tanh_emitter); jitters[ov::op::v4::HSwish::get_type_info_static()] = CREATE_CPU_EMITTER(jit_hswish_emitter); jitters[ov::op::v4::Mish::get_type_info_static()] = CREATE_CPU_EMITTER(jit_mish_emitter); jitters[ov::op::v0::Relu::get_type_info_static()] = CREATE_CPU_EMITTER(jit_relu_emitter); @@ -168,7 +176,8 @@ bool CPUTargetMachine::is_supported() const { snippets::CompiledSnippetPtr CPUTargetMachine::get_snippet() { OPENVINO_ASSERT(h->create_kernel() == dnnl::impl::status::success, "Failed to create jit_kernel in get_snippet()"); - const auto& result = std::make_shared(std::unique_ptr(h.release())); + const auto& result = + std::make_shared(std::unique_ptr(h.release())); // Note that we reset all the generated code, since it was copied into CompiledSnippetCPU h.reset(new jit_snippet()); return result; @@ -176,8 +185,10 @@ snippets::CompiledSnippetPtr CPUTargetMachine::get_snippet() { size_t CPUTargetMachine::get_lanes() const { switch (isa) { - case dnnl::impl::cpu::aarch64::asimd : return dnnl::impl::cpu::aarch64::cpu_isa_traits::vlen / sizeof(float); - default : OPENVINO_THROW("unknown isa ", isa); + case dnnl::impl::cpu::aarch64::asimd: + return dnnl::impl::cpu::aarch64::cpu_isa_traits::vlen / sizeof(float); + default: + OPENVINO_THROW("unknown isa ", isa); } } @@ -190,18 +201,19 @@ dnnl::impl::cpu::aarch64::cpu_isa_t CPUTargetMachine::get_isa() const { return isa; } -CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_) : Generator(std::make_shared(isa_)) {} +CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_) + : Generator(std::make_shared(isa_)) {} std::shared_ptr CPUGenerator::clone() const { const auto& cpu_target_machine = std::dynamic_pointer_cast(target); - OPENVINO_ASSERT(cpu_target_machine, "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); + OPENVINO_ASSERT(cpu_target_machine, + "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); return std::make_shared(cpu_target_machine->get_isa()); } ov::snippets::RegType CPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { const auto op = out.get_node_shared_ptr(); - if (std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + if (std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)) return ov::snippets::RegType::vec; else return ov::snippets::RegType::undefined; @@ -211,6 +223,6 @@ bool CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& out) const override; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp index 2b6056e92644a3..053cebe747e529 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.cpp @@ -3,6 +3,7 @@ // #include "jit_fill_emitter.hpp" + #include "cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h" #include "emitters/utils.hpp" @@ -21,7 +22,8 @@ jit_fill_emitter::jit_fill_emitter(jit_generator* h, cpu_isa_t isa, const Expres const auto fill = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(fill != nullptr, "Expects Fill expression"); OV_CPU_JIT_EMITTER_ASSERT(fill->get_element_type().size() == 4, - "Supports only 4 Byte element types but gets ", fill->get_element_type()); + "Supports only 4 Byte element types but gets ", + fill->get_element_type()); offset = fill->get_offset(); fill_value = fill->get_fill_value(); @@ -38,8 +40,7 @@ size_t jit_fill_emitter::get_aux_gprs_count() const { return 1; } -void jit_fill_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_fill_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -48,7 +49,7 @@ void jit_fill_emitter::emit_impl(const std::vector& in, } template -void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_fill_emitter::emit_isa(const std::vector& in, const std::vector& out) const { if (is_full_reg()) fill_full(out); else @@ -56,7 +57,7 @@ void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector } template -void jit_fill_emitter::fill_full(const std::vector &out) const { +void jit_fill_emitter::fill_full(const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg dst = TReg(out[0]); @@ -71,28 +72,28 @@ void jit_fill_emitter::fill_full(const std::vector &out) const { } template -void jit_fill_emitter::fill_tail(const std::vector &in, const std::vector &out) const { +void jit_fill_emitter::fill_tail(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg dst = TReg(out[0]); switch (offset) { - case 1: - h->ld1(dst.s[1], table_val2("value", sizeof(float))); - h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); - break; - case 2: - h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); - break; - case 3: - h->ld1(dst.s[3], table_val2("value", 3 * sizeof(float))); - break; - case 4: - break; - default: - OV_CPU_JIT_EMITTER_THROW("Fill emitter has unexpected offset ", offset); + case 1: + h->ld1(dst.s[1], table_val2("value", sizeof(float))); + h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); + break; + case 2: + h->ld1(dst.d[1], table_val2("value", 2 * sizeof(float))); + break; + case 3: + h->ld1(dst.s[3], table_val2("value", 3 * sizeof(float))); + break; + case 4: + break; + default: + OV_CPU_JIT_EMITTER_THROW("Fill emitter has unexpected offset ", offset); } } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp index 0ce0ac62d03979..7c827e4920d5eb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_fill_emitter.hpp @@ -16,29 +16,34 @@ class jit_fill_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } protected: size_t get_aux_gprs_count() const override; private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; template - void fill_full(const std::vector &out) const; + void fill_full(const std::vector& out) const; template - void fill_tail(const std::vector &in, const std::vector &out) const; + void fill_tail(const std::vector& in, const std::vector& out) const; - bool is_full_reg() const { return offset == 0; } - bool is_optimized() const { return is_full_reg() && fill_value == uint32_t(0x0); } + bool is_full_reg() const { + return offset == 0; + } + bool is_optimized() const { + return is_full_reg() && fill_value == uint32_t(0x0); + } size_t offset = 0; uint32_t fill_value = 0x0; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp index 8f7a54dc9ebdb3..32ed1a844b6724 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp @@ -19,18 +19,25 @@ using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; inline static std::vector transform_idxs_to_regs(const std::vector& idxs) { std::vector regs(idxs.size(), XReg(0)); - std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return XReg(idx);}); + std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx) { + return XReg(idx); + }); return regs; } inline static std::vector transform_snippets_regs_to_idxs(const std::vector& regs) { std::vector idxs(regs.size()); - std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { return reg.idx; }); + std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { + return reg.idx; + }); return idxs; } -jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), reg_runtime_params_idx(Operand::X0) { +jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_emitter(h, isa), + reg_runtime_params_idx(Operand::X0) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "Invoked with invalid op argument"); OV_CPU_JIT_EMITTER_ASSERT(!kernel->region->empty(), "Invoked with empty body"); @@ -113,35 +120,50 @@ void jit_kernel_emitter::init_reg_pools(const std::set& gpr_blacklist, c gp_regs_pool[i] = vec_regs_pool[i] = 31 - i; auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements - pool.erase(std::remove_if(pool.begin(), pool.end(), - [&](size_t x) {return to_remove.count(x) != 0;}), pool.end()); + pool.erase(std::remove_if(pool.begin(), + pool.end(), + [&](size_t x) { + return to_remove.count(x) != 0; + }), + pool.end()); }; - std::set gprs_blacklist_extended{Operand::X18, Operand::X23, Operand::X24, Operand::X28, Operand::X29, Operand::SP}; + std::set gprs_blacklist_extended{Operand::X18, + Operand::X23, + Operand::X24, + Operand::X28, + Operand::X29, + Operand::SP}; gprs_blacklist_extended.insert(gpr_blacklist.begin(), gpr_blacklist.end()); // Reserve reg_indexes_idx and reg_runtime_params_idx, since they'll be used to pass runtime call args to kernel remove_regs_from_pool(gp_regs_pool, gprs_blacklist_extended); remove_regs_from_pool(vec_regs_pool, vec_blacklist); } -void jit_kernel_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_kernel_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } -void jit_kernel_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_kernel_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty() && out.empty(), ": Expects 0 registers on input and output"); const auto num_params = num_inputs + num_outputs + num_unique_buffers; // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount OV_CPU_JIT_EMITTER_ASSERT(data_ptr_regs_idx.size() == num_params, - "Number of inputs and outputs is inconsistent with the number of allocated registers ", num_params, - " data_ptr_regs_idx.size() = ", data_ptr_regs_idx.size()); + "Number of inputs and outputs is inconsistent with the number of allocated registers ", + num_params, + " data_ptr_regs_idx.size() = ", + data_ptr_regs_idx.size()); } void jit_kernel_emitter::init_body_regs(const std::set& kernel_regs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) { // Initialize pools of gp and vec registers - // Reserve kernel regs (reg_indexes_idx and, if there is, reg_runtime_params_idx), since they'll be used to pass runtime call args to kernel + // Reserve kernel regs (reg_indexes_idx and, if there is, reg_runtime_params_idx), since they'll be used to pass + // runtime call args to kernel init_reg_pools(kernel_regs, {}); mapping_info gpr_map_pool({}, gp_regs_pool); @@ -175,9 +197,11 @@ void jit_kernel_emitter::emit_impl(const std::vector& in, const std::vec h->postamble(); } -jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_kernel_emitter(h, isa, expr), reg_indexes_idx(Operand::X1) { + : jit_kernel_emitter(h, isa, expr), + reg_indexes_idx(Operand::X1) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "Expectes KernelStatic expression"); jcp = *reinterpret_cast(kernel->compile_params); @@ -219,24 +243,29 @@ void jit_kernel_static_emitter::init_data_pointers(const std::vector& data // NOTE: Snippets Buffer Scratchpad has the common data pointer for all Buffers (even with different ID). // The accessing memory is covered by correct offsets in each Buffer and the corresponding MemoryAccess ops for (size_t i = 0; i < num_unique_buffers; i++) { - h->ldr(data_ptr_regs[num_params + i], ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); + h->ldr(data_ptr_regs[num_params + i], + ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); } for (size_t i = 0; i < num_params; i++) { if (i < num_inputs) - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); else - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); init_ptr_with_offset(data_ptr_regs[i], data_offsets[i]); } } -jit_kernel_dynamic_emitter::jit_kernel_dynamic_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_kernel_dynamic_emitter::jit_kernel_dynamic_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) : jit_kernel_emitter(h, isa, expr) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "Expectes KernelDynamic expression"); - // - Reserve reg_runtime_params_idx, since it wll be used to pass runtime call args to all dynamic emitters that needs runtime args + // - Reserve reg_runtime_params_idx, since it wll be used to pass runtime call args to all dynamic emitters that + // needs runtime args // - We cannot assign this register to the body emitters since runtime params MUST be valid during whole execution // for all dynamic emitters init_body_regs({reg_runtime_params_idx}); @@ -247,16 +276,19 @@ void jit_kernel_dynamic_emitter::init_data_pointers(const std::vector& dat const auto num_params = num_inputs + num_outputs; for (size_t i = 0; i < num_unique_buffers; ++i) { - h->ldr(data_ptr_regs[num_params + i], ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); + h->ldr(data_ptr_regs[num_params + i], + ptr(reg_runtime_params, static_cast(GET_OFF(buffer_scratchpad_ptr)))); } for (size_t i = 0; i < num_params; i++) { if (i < num_inputs) - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(src_ptrs) + i * sizeof(void*)))); else - h->ldr(data_ptr_regs[i], ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); + h->ldr(data_ptr_regs[i], + ptr(reg_runtime_params, static_cast(GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)))); } } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp index 63bac54e5c1f26..0ede91f100f110 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.hpp @@ -5,8 +5,8 @@ #pragma once #include "emitters/plugin/aarch64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" #include "emitters/snippets/jit_container_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" namespace ov { namespace intel_cpu { @@ -15,8 +15,9 @@ namespace aarch64 { /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register /// mapping and creates a pools of available gpr and vec registers. Kernel usually contains (at least one) -/// jit_loop_begin_emitter and jit_loop_end_emitter pair. In general the enclosed emitters should be organized in the following way: -/// jit_kernel_emitter { /* entry point, maps registers, creates pools of available registers */ +/// jit_loop_begin_emitter and jit_loop_end_emitter pair. In general the enclosed emitters should be organized in the +/// following way: jit_kernel_emitter { /* entry point, maps registers, creates pools of available +/// registers */ /// 1.S jit_loop_begin_emitter /* Scalar Loop over the outer dimension [START] */ /// 2.S jit_loop_begin_emitter /* inner vector loop [START] */ /// ... /* All the necessary Load/Strore/elementwise emitters */ @@ -31,21 +32,29 @@ namespace aarch64 { class jit_kernel_emitter : public jit_emitter, public jit_container_emitter { public: - jit_kernel_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - - size_t get_inputs_count() const override {return 0;} - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + jit_kernel_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_count() const override { + return 0; + } + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; - void init_body_regs(const std::set& kernel_regs, const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); + void init_body_regs(const std::set& kernel_regs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}); /** - * @brief populates physical registers pools for x86 (both vec and gp). + * @brief populates physical registers pools for x86 (both vec and gp). * Skips stack-related gprs and extra gprs passed as arguments. * @arg gpr_blacklist - set of gp registers that should not be added to register pool * @arg vec_blacklist - set of vec registers should not be added to register pool - */ + */ void init_reg_pools(const std::set& gpr_blacklist, const std::set& vec_blacklist); virtual void init_data_pointers(const std::vector& data_ptr_regs) const = 0; @@ -69,14 +78,15 @@ class jit_kernel_emitter : public jit_emitter, public jit_container_emitter { std::shared_ptr body; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter *emitter); + friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter* emitter); #endif }; class jit_kernel_static_emitter : public jit_kernel_emitter { public: jit_kernel_static_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; @@ -86,23 +96,24 @@ class jit_kernel_static_emitter : public jit_kernel_emitter { std::vector> data_offsets; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter *emitter); + friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* emitter); #endif }; class jit_kernel_dynamic_emitter : public jit_kernel_emitter { public: jit_kernel_dynamic_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter *emitter); + friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter* emitter); #endif }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp index 2b5b41fb912606..0666505a6d31ab 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.cpp @@ -3,8 +3,9 @@ // #include "jit_loop_emitters.hpp" -#include "jit_kernel_emitter.hpp" + #include "emitters/utils.hpp" +#include "jit_kernel_emitter.hpp" using namespace Xbyak_aarch64; @@ -18,9 +19,11 @@ using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; /* ================== jit_loop_begin_emitter ====================== */ -jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{new Xbyak_aarch64::Label()} { + : jit_emitter(h, isa), + loop_begin_label{new Xbyak_aarch64::Label()} { const auto loop_begin = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_begin, "expects LoopBegin expression"); const auto loop_end = loop_begin->get_loop_end(); @@ -31,15 +34,17 @@ jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_gen in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_loop_begin_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_loop_begin_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty(), "Invalid inputs size: expected 0 got " + std::to_string(in.size())); // Note: the only expected output is work amount register (communicated to jit_loop_end_emitter) OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Invalid outputs size: expected 1 got " + std::to_string(out.size())); OV_CPU_JIT_EMITTER_ASSERT(loop_begin_label != nullptr, "has not inited label!"); } -void jit_loop_begin_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_begin_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } @@ -56,9 +61,11 @@ void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std: /* ================== jit_loop_end_emitter ====================== */ -jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, +jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{nullptr} { + : jit_emitter(h, isa), + loop_begin_label{nullptr} { in_out_type_ = emitter_in_out_map::gpr_to_gpr; const auto loop_end = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_end != nullptr, "expected LoopEnd expr"); @@ -79,27 +86,49 @@ jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generat loop_begin_label = loop_begin_emitter->get_begin_label(); } -ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr) { +ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr( + const ov::snippets::lowered::ExpressionPtr& expr) { const auto begin_expr = expr->get_input_port_connectors().back()->get_source().get_expr(); OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(begin_expr->get_node()), "LoopEnd expression must have th last port connector to LoopBegin"); return begin_expr; } -void jit_loop_end_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { -const auto io_size = num_inputs + num_outputs; +void jit_loop_end_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { + const auto io_size = num_inputs + num_outputs; OV_CPU_JIT_EMITTER_ASSERT(out.size() == 0, "Invalid number of out arguments: expected ", 0, " got ", out.size()); - OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, "Invalid number of in arguments: expected ", io_size + 1, " got ", in.size()); - OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, "Invalid is_incremented size: expected ", io_size, " got ", is_incremented.size()); - OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, "Invalid ptr_increments size: expected ", io_size, " got ", ptr_increments.size()); + OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, + "Invalid number of in arguments: expected ", + io_size + 1, + " got ", + in.size()); + OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, + "Invalid is_incremented size: expected ", + io_size, + " got ", + is_incremented.size()); + OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, + "Invalid ptr_increments size: expected ", + io_size, + " got ", + ptr_increments.size()); OV_CPU_JIT_EMITTER_ASSERT(finalization_offsets.size() == io_size, - "Invalid finalization_offsets size: expected: ", io_size, " got ", finalization_offsets.size()); - OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, "Invalid data_sizes size: expected: ", io_size, " got ", data_sizes.size()); + "Invalid finalization_offsets size: expected: ", + io_size, + " got ", + finalization_offsets.size()); + OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, + "Invalid data_sizes size: expected: ", + io_size, + " got ", + data_sizes.size()); OV_CPU_JIT_EMITTER_ASSERT(loop_begin_label != nullptr, "has not inited begin label!"); } -void jit_loop_end_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_end_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } @@ -118,7 +147,7 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v if (ptr_increments[idx] > 0) { h->add_imm(data_reg, data_reg, ptr_increments[idx] * wa_increment * data_sizes[idx], h->X_TMP_0); } else if (ptr_increments[idx] < 0) { - h->sub_imm(data_reg, data_reg, - ptr_increments[idx] * wa_increment * data_sizes[idx], h->X_TMP_0); + h->sub_imm(data_reg, data_reg, -ptr_increments[idx] * wa_increment * data_sizes[idx], h->X_TMP_0); } } h->sub_imm(reg_work_amount, reg_work_amount, wa_increment, h->X_TMP_0); @@ -133,13 +162,13 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v if (finalization_offsets[idx] > 0) { h->add_imm(data_reg, data_reg, finalization_offsets[idx] * data_sizes[idx], h->X_TMP_0); } else if (finalization_offsets[idx] < 0) { - h->sub_imm(data_reg, data_reg, - finalization_offsets[idx] * data_sizes[idx], h->X_TMP_0); + h->sub_imm(data_reg, data_reg, -finalization_offsets[idx] * data_sizes[idx], h->X_TMP_0); } } } /* ============================================================== */ -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp index 6ec87835821df2..c89928353646cd 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_loop_emitters.hpp @@ -5,7 +5,6 @@ #pragma once #include "emitters/plugin/aarch64/jit_emitter.hpp" - #include "snippets/op/loop.hpp" namespace ov { @@ -14,20 +13,27 @@ namespace aarch64 { /* ================== jit_loop_begin_emitter ====================== */ -class jit_loop_begin_emitter: public jit_emitter { +class jit_loop_begin_emitter : public jit_emitter { public: - jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, + jit_loop_begin_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override { return 0; } + size_t get_inputs_count() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; - std::shared_ptr get_begin_label() { return loop_begin_label; } + std::shared_ptr get_begin_label() { + return loop_begin_label; + } protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; std::shared_ptr loop_begin_label; @@ -40,18 +46,23 @@ class jit_loop_begin_emitter: public jit_emitter { /* ================== jit_loop_end_emitter ====================== */ -class jit_loop_end_emitter: public jit_emitter { +class jit_loop_end_emitter : public jit_emitter { public: - jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + jit_loop_end_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override { return 0; } + size_t get_inputs_count() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; static ov::snippets::lowered::ExpressionPtr get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr); @@ -70,6 +81,6 @@ class jit_loop_end_emitter: public jit_emitter { /* ============================================================== */ -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp index d19843f395d2f3..9989f3431fb2a8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_memory_emitters.hpp" + #include "emitters/utils.hpp" using namespace Xbyak_aarch64; @@ -15,15 +16,18 @@ using jit_generator = dnnl::impl::cpu::aarch64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::aarch64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { const auto n = expr->get_node(); src_prc = n->get_input_element_type(0); dst_prc = n->get_output_element_type(0); } -jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { - bool is_supported_precision = one_of(src_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && - src_prc == dst_prc; +jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr) { + bool is_supported_precision = + one_of(src_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && + src_prc == dst_prc; OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); const auto load = std::dynamic_pointer_cast(expr->get_node()); @@ -34,8 +38,7 @@ jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count, byte_offset)); } -void jit_load_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -44,7 +47,7 @@ void jit_load_memory_emitter::emit_impl(const std::vector& in, } template -void jit_load_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_load_memory_emitter::emit_isa(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(load_emitter != nullptr, "Load CPU emitter isn't initialized!"); load_emitter->emit_code(in, out, aux_vec_idxs, aux_gpr_idxs); @@ -56,7 +59,8 @@ void jit_load_memory_emitter::emit_data() const { jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { - OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, "Only support equal input and output types but gets ", + OV_CPU_JIT_EMITTER_ASSERT(src_prc == dst_prc, + "Only support equal input and output types but gets ", src_prc.get_type_name(), " and ", dst_prc.get_type_name()); @@ -68,8 +72,7 @@ jit_load_broadcast_emitter::jit_load_broadcast_emitter(jit_generator* h, cpu_isa in_out_type_ = emitter_in_out_map::gpr_to_vec; } -void jit_load_broadcast_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -78,7 +81,7 @@ void jit_load_broadcast_emitter::emit_impl(const std::vector& in, } template -void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_load_broadcast_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; XReg src = XReg(in[0]); TReg dst = TReg(out[0]); @@ -86,9 +89,11 @@ void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const s h->uni_ld1rw(dst.s, src, byte_offset); } -jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr) { - bool is_supported_precision = one_of(dst_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && - src_prc == dst_prc; +jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr) { + bool is_supported_precision = + one_of(dst_prc, ov::element::f32, ov::element::i32, ov::element::f16, ov::element::i8, ov::element::u8) && + src_prc == dst_prc; OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); const auto store = ov::as_type_ptr(expr->get_node()); @@ -99,8 +104,7 @@ jit_store_memory_emitter::jit_store_memory_emitter(jit_generator* h, cpu_isa_t i store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count, byte_offset)); } -void jit_store_memory_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_store_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -109,7 +113,7 @@ void jit_store_memory_emitter::emit_impl(const std::vector& in, } template -void jit_store_memory_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_store_memory_emitter::emit_isa(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(store_emitter != nullptr, "Store CPU emitter isn't initialized!"); store_emitter->emit_code(in, out, aux_vec_idxs, aux_gpr_idxs); @@ -119,6 +123,6 @@ void jit_store_memory_emitter::emit_data() const { store_emitter->emit_data(); } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp index ba0b4e4acfedb4..edb85751f9086d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp @@ -11,11 +11,11 @@ namespace ov { namespace intel_cpu { namespace aarch64 { -class jit_memory_emitter : public jit_emitter { +class jit_memory_emitter : public jit_emitter { public: jit_memory_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); protected: ov::element::Type src_prc; @@ -28,17 +28,18 @@ class jit_memory_emitter : public jit_emitter { class jit_load_memory_emitter : public jit_memory_emitter { public: jit_load_memory_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; void emit_data() const override; private: @@ -48,39 +49,41 @@ class jit_load_memory_emitter : public jit_memory_emitter { class jit_load_broadcast_emitter : public jit_memory_emitter { public: jit_load_broadcast_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; }; -class jit_store_memory_emitter : public jit_memory_emitter { +class jit_store_memory_emitter : public jit_memory_emitter { public: jit_store_memory_emitter(dnnl::impl::cpu::aarch64::jit_generator* h, - dnnl::impl::cpu::aarch64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + dnnl::impl::cpu::aarch64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; void emit_data() const override; private: std::unique_ptr store_emitter = nullptr; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp index 6529312ae1095a..69fcc7a92fd259 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_snippets_emitters.hpp" + #include "cpu/aarch64/jit_generator.hpp" #include "cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h" #include "emitters/utils.hpp" @@ -17,7 +18,8 @@ using jit_generator = dnnl::impl::cpu::aarch64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::aarch64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_nop_emitter::jit_nop_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : aarch64::jit_emitter(h, isa) { +jit_nop_emitter::jit_nop_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : aarch64::jit_emitter(h, isa) { in_out_type_ = emitter_in_out_map::gpr_to_gpr; } @@ -29,14 +31,12 @@ jit_broadcast_move_emitter::jit_broadcast_move_emitter(jit_generator* h, cpu_isa n->get_input_element_type(0), " and ", n->get_output_element_type(0)); - OV_CPU_JIT_EMITTER_ASSERT(n->get_input_element_type(0) == ov::element::f32, - "Only supports FP32 precision."); + OV_CPU_JIT_EMITTER_ASSERT(n->get_input_element_type(0) == ov::element::f32, "Only supports FP32 precision."); byte_size = n->get_input_element_type(0).size(); } -void jit_broadcast_move_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_broadcast_move_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -45,42 +45,42 @@ void jit_broadcast_move_emitter::emit_impl(const std::vector& in, } template -void jit_broadcast_move_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_broadcast_move_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg src = TReg(in[0]); TReg dst = TReg(out[0]); switch (byte_size) { - case 4: - h->dup(dst.s, src.s[0]); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported data size ", byte_size); + case 4: + h->dup(dst.s, src.s[0]); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported data size ", byte_size); } } -jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { const auto n = expr->get_node(); const auto& precision = n->get_output_element_type(0); switch (precision) { - case element::i32: { - value = ov::as_type_ptr(n)->cast_vector()[0]; - break; - } - case element::f32: { - value = dnnl::impl::float2int(ov::as_type_ptr(n)->cast_vector()[0]); - break; - } - default: { - OV_CPU_JIT_EMITTER_THROW("Doesn't support precision ", precision); - } + case element::i32: { + value = ov::as_type_ptr(n)->cast_vector()[0]; + break; + } + case element::f32: { + value = dnnl::impl::float2int(ov::as_type_ptr(n)->cast_vector()[0]); + break; + } + default: { + OV_CPU_JIT_EMITTER_THROW("Doesn't support precision ", precision); + } } push_arg_entry_of("scalar", value, true); prepare_table(); } -void jit_scalar_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_scalar_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { emit_isa(in, out); } else { @@ -89,7 +89,7 @@ void jit_scalar_emitter::emit_impl(const std::vector& in, } template -void jit_scalar_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_scalar_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; TReg dst = TReg(out[0]); AdrImm src = table_val("scalar"); @@ -97,6 +97,6 @@ void jit_scalar_emitter::emit_isa(const std::vector &in, const std::vect h->uni_ld1rw(dst.s, src.getXn(), src.getImm()); } -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp index 0f05024ed12168..13f9aa70fb2c8e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_snippets_emitters.hpp @@ -16,11 +16,12 @@ class jit_nop_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 0;} + size_t get_inputs_count() const override { + return 0; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override {} + void emit_impl(const std::vector& in, const std::vector& out) const override {} }; class jit_broadcast_move_emitter : public jit_emitter { @@ -29,14 +30,15 @@ class jit_broadcast_move_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 1;} + size_t get_inputs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; private: size_t byte_size = 0lu; @@ -48,22 +50,25 @@ class jit_scalar_emitter : public jit_emitter { dnnl::impl::cpu::aarch64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_count() const override {return 0;} + size_t get_inputs_count() const override { + return 0; + } protected: - size_t get_aux_gprs_count() const override {return 1;} + size_t get_aux_gprs_count() const override { + return 1; + } private: - void emit_impl(const std::vector& in, - const std::vector& out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; private: int32_t value; }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 912fe23fcd1fcf..86d090a858fd7b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -3,6 +3,7 @@ // #include "jit_eltwise.hpp" + #include namespace ov { @@ -10,13 +11,12 @@ namespace intel_cpu { namespace executors { namespace aarch64 { -bool JitEltwiseExecutor::isSupported( - const Algorithm& algorithm, - const std::vector& input_precisions, - const std::vector& output_precisions, - const float alpha, - const float beta, - const float gamma) { +bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm, + const std::vector& input_precisions, + const std::vector& output_precisions, + const float alpha, + const float beta, + const float gamma) { const auto is_supported = one_of(algorithm, Algorithm::EltwiseAbs, Algorithm::EltwiseAdd, @@ -67,10 +67,9 @@ bool JitEltwiseExecutor::isSupported( return false; } - const auto check_precisions = []( - const std::vector& input_precisions, - const std::vector& output_precisions, - const std::set& supported_precisions) { + const auto check_precisions = [](const std::vector& input_precisions, + const std::vector& output_precisions, + const std::set& supported_precisions) { if (std::any_of(input_precisions.begin(), input_precisions.end(), [&supported_precisions](const ov::element::Type& precision) { @@ -92,15 +91,13 @@ bool JitEltwiseExecutor::isSupported( const std::set supported_precisions = // Divide and Floor (issue #138629) operations are supported for fp32 and fp16 only. - ((algorithm == Algorithm::EltwiseDivide) || (algorithm == Algorithm::EltwiseFloor)) ? - std::set { ov::element::f16, ov::element::f32 } : - std::set { - ov::element::f16, - ov::element::f32, - ov::element::i32, - ov::element::i8, - ov::element::u8 - }; + ((algorithm == Algorithm::EltwiseDivide) || (algorithm == Algorithm::EltwiseFloor)) + ? std::set{ov::element::f16, ov::element::f32} + : std::set{ov::element::f16, + ov::element::f32, + ov::element::i32, + ov::element::i8, + ov::element::u8}; if (!check_precisions(input_precisions, output_precisions, supported_precisions)) { return false; @@ -111,20 +108,20 @@ bool JitEltwiseExecutor::isSupported( JitEltwiseExecutor::JitEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {} -bool JitEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs, - const std::vector &postOps) { +bool JitEltwiseExecutor::init(const EltwiseAttrs& eltwiseAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) { return true; } -void JitEltwiseExecutor::exec(const std::vector &src, - const std::vector &dst, - const void *post_ops_data_) { +void JitEltwiseExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { exec_func(); } -} // namespace aarch64 -} // namespace executors -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace executors +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp index 5244a80b542fe5..adaaea6a738c7a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_types.h" -#include "nodes/executors/eltwise.hpp" #include "node.h" +#include "nodes/executors/eltwise.hpp" namespace ov { namespace intel_cpu { @@ -17,13 +17,12 @@ class JitEltwiseExecutor : public EltwiseExecutor { public: explicit JitEltwiseExecutor(const ExecutorContext::CPtr context); - static bool isSupported( - const Algorithm& algorithm, - const std::vector& input_precisions, - const std::vector& output_precisions, - const float alpha, - const float beta, - const float gamma); + static bool isSupported(const Algorithm& algorithm, + const std::vector& input_precisions, + const std::vector& output_precisions, + const float alpha, + const float beta, + const float gamma); bool init(const EltwiseAttrs& eltwiseAttrs, const std::vector& srcDescs, @@ -32,7 +31,7 @@ class JitEltwiseExecutor : public EltwiseExecutor { void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return impl_desc_type::asimd; @@ -42,7 +41,7 @@ class JitEltwiseExecutor : public EltwiseExecutor { std::function exec_func; }; -} // namespace aarch64 -} // namespace executors -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace executors +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp index 646cf47c1bcf6c..4804c7b4efe252 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp @@ -3,6 +3,7 @@ // #include "acl_common_executor.hpp" + #include "acl_utils.hpp" #include "nodes/executors/memory_arguments.hpp" #include "utils/debug_capabilities.h" @@ -14,12 +15,12 @@ static const std::unordered_map argConvert = { {ARG_SRC_0, ACL_SRC_0}, {ARG_SRC_1, ACL_SRC_1}, {ARG_SRC_2, ACL_SRC_2}, - {ARG_BIAS, ACL_BIAS}, - {ARG_WEI, ACL_WEI}, - {ARG_DST, ACL_DST}, + {ARG_BIAS, ACL_BIAS}, + {ARG_WEI, ACL_WEI}, + {ARG_DST, ACL_DST}, }; -using ACLTypes = std::array; +using ACLTypes = std::array; using ACLLayouts = std::array; static void initACLTensorParams(const MemoryPtr& memoryPtr, @@ -39,14 +40,11 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr, } static std::shared_ptr initTensorInfo(const arm_compute::TensorShape& tensorShape, - const arm_compute::DataType& dataType, - const arm_compute::DataLayout& dataLayout) { + const arm_compute::DataType& dataType, + const arm_compute::DataLayout& dataLayout) { std::shared_ptr aclMemoryInfo = nullptr; if (dataType != arm_compute::DataType::UNKNOWN) { - aclMemoryInfo = std::make_shared( - tensorShape, 1, - dataType, - dataLayout); + aclMemoryInfo = std::make_shared(tensorShape, 1, dataType, dataLayout); } return aclMemoryInfo; } @@ -66,14 +64,15 @@ ACLCommonExecutor::ACLCommonExecutor() { } } -bool ACLCommonExecutor::update(const MemoryArgs &memory) { +bool ACLCommonExecutor::update(const MemoryArgs& memory) { // Initialize ACL tensors params - ACLShapes aclMemoryShapes; - ACLTypes aclDataType{}; + ACLShapes aclMemoryShapes; + ACLTypes aclDataType{}; ACLLayouts aclDataLayout{}; for (auto& cpu_mem_ptr : memory) { const ACLArgs index = argConvert.at(cpu_mem_ptr.first); - initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs, + initACLTensorParams(cpu_mem_ptr.second, + aclTensorAttrs, aclMemoryShapes[index], aclDataType[index], aclDataLayout[index]); @@ -110,7 +109,7 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) { return true; } -void ACLCommonExecutor::execute(const MemoryArgs &memory) { +void ACLCommonExecutor::execute(const MemoryArgs& memory) { // TODO: Move import_memory() to update() function - CVS-145871 for (auto& cpu_mem_ptr : memory) { const ACLArgs index = argConvert.at(cpu_mem_ptr.first); @@ -129,5 +128,5 @@ ACLCommonExecutor::~ACLCommonExecutor() { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp index 1a5a00c7a85f7a..94c5dbe219aae8 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp @@ -4,27 +4,19 @@ #pragma once +#include "arm_compute/runtime/NEON/NEFunctions.h" #include "cpu_memory.h" #include "nodes/executors/executor.hpp" -#include "arm_compute/runtime/NEON/NEFunctions.h" namespace ov { namespace intel_cpu { -enum ACLArgs { - ACL_SRC_0, - ACL_SRC_1, - ACL_SRC_2, - ACL_BIAS, - ACL_WEI, - ACL_DST, - COUNT_OF_ARGS -}; +enum ACLArgs { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_BIAS, ACL_WEI, ACL_DST, COUNT_OF_ARGS }; using ACLFunction = std::unique_ptr; -using ACLShapes = std::array; -using ACLInfos = std::array, ACLArgs::COUNT_OF_ARGS>; -using ACLTensors = std::array, ACLArgs::COUNT_OF_ARGS>; +using ACLShapes = std::array; +using ACLInfos = std::array, ACLArgs::COUNT_OF_ARGS>; +using ACLTensors = std::array, ACLArgs::COUNT_OF_ARGS>; struct ACLTensorAttrs { bool hasLayoutTypeNHWC = false; @@ -50,6 +42,7 @@ class ACLCommonExecutor : public Executor { protected: ACLTensorAttrs aclTensorAttrs; + private: ACLTensors aclMemoryTensors; ACLInfos aclMemoryInfos; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp index 1bc0585930387f..ed12b0b76a2c1e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp @@ -3,6 +3,7 @@ // #include "acl_convert.hpp" + #include "acl_utils.hpp" namespace ov { @@ -10,7 +11,6 @@ namespace intel_cpu { using namespace arm_compute; - bool ACLConvertExecutor::init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, @@ -51,10 +51,14 @@ bool ACLConvertExecutor::init(const ConvertParams& convertParams, if (isCopyOp) { acl_copy = std::make_unique(); - configureThreadSafe([&] { acl_copy->configure(&srcTensor, &dstTensor); }); + configureThreadSafe([&] { + acl_copy->configure(&srcTensor, &dstTensor); + }); } else { acl_cast = std::make_unique(); - configureThreadSafe([&] { acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE); }); + configureThreadSafe([&] { + acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE); + }); } return true; } @@ -91,45 +95,34 @@ bool ACLConvertExecutorBuilder::isSupported(const ConvertParams& convertParams, DEBUG_LOG("NECopy does not support source precision: ", convertParams.srcPrc.to_string()); return false; } - if ((convertParams.srcPrc == ov::element::i8 && !one_of(convertParams.dstPrc, - ov::element::i16, - ov::element::i32, - ov::element::f16, - ov::element::f32)) || + if ((convertParams.srcPrc == ov::element::i8 && + !one_of(convertParams.dstPrc, ov::element::i16, ov::element::i32, ov::element::f16, ov::element::f32)) || (convertParams.srcPrc == ov::element::u8 && !one_of(convertParams.dstPrc, - ov::element::u16, - ov::element::i16, - ov::element::i32, - ov::element::f16, - ov::element::f32)) || - (convertParams.srcPrc == ov::element::u16 && !one_of(convertParams.dstPrc, - ov::element::u8, - ov::element::u32)) || - (convertParams.srcPrc == ov::element::i16 && !one_of(convertParams.dstPrc, - ov::element::i8, - ov::element::u8, - ov::element::i32)) || - (convertParams.srcPrc == ov::element::f16 && !one_of(convertParams.dstPrc, - ov::element::i8, - ov::element::f32, + ov::element::u16, + ov::element::i16, ov::element::i32, - ov::element::u8)) || - (convertParams.srcPrc == ov::element::i32 && !one_of(convertParams.dstPrc, - ov::element::i8, - ov::element::f16, - ov::element::f32, - ov::element::u8)) || - (convertParams.srcPrc == ov::element::f32 && !one_of(convertParams.dstPrc, - ov::element::bf16, ov::element::f16, - ov::element::i32))) { + ov::element::f32)) || + (convertParams.srcPrc == ov::element::u16 && + !one_of(convertParams.dstPrc, ov::element::u8, ov::element::u32)) || + (convertParams.srcPrc == ov::element::i16 && + !one_of(convertParams.dstPrc, ov::element::i8, ov::element::u8, ov::element::i32)) || + (convertParams.srcPrc == ov::element::f16 && + !one_of(convertParams.dstPrc, ov::element::i8, ov::element::f32, ov::element::i32, ov::element::u8)) || + (convertParams.srcPrc == ov::element::i32 && + !one_of(convertParams.dstPrc, ov::element::i8, ov::element::f16, ov::element::f32, ov::element::u8)) || + (convertParams.srcPrc == ov::element::f32 && + !one_of(convertParams.dstPrc, ov::element::bf16, ov::element::f16, ov::element::i32))) { DEBUG_LOG("NECopy does not support passed combination of source and destination precisions. ", - "source precision: ", convertParams.srcPrc.to_string(), " destination precsion: ", convertParams.dstPrc.to_string()); + "source precision: ", + convertParams.srcPrc.to_string(), + " destination precsion: ", + convertParams.dstPrc.to_string()); return false; } } return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp index b81e34004f9f31..431f8ce6887cbe 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.hpp @@ -4,9 +4,9 @@ #pragma once +#include "arm_compute/runtime/NEON/NEFunctions.h" #include "nodes/executors/convert.hpp" #include "utils/debug_capabilities.h" -#include "arm_compute/runtime/NEON/NEFunctions.h" namespace ov { namespace intel_cpu { @@ -17,9 +17,12 @@ class ACLConvertExecutor : public ConvertExecutor { bool init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return impl_desc_type::acl; }; + impl_desc_type implType() const override { + return impl_desc_type::acl; + }; + protected: ConvertParams aclConvertParams; bool isCopyOp; @@ -38,5 +41,5 @@ class ACLConvertExecutorBuilder : public ConvertExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp index fa40b3a27322c2..cd5e935b41a2d5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp @@ -3,6 +3,7 @@ // #include "acl_deconv.hpp" + #include "openvino/core/parallel.hpp" namespace ov { @@ -13,7 +14,9 @@ using namespace arm_compute; ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs) { - auto func_mod = [](long a) -> unsigned int { return a < 0 ? 0 : a; }; + auto func_mod = [](long a) -> unsigned int { + return a < 0 ? 0 : a; + }; auto pad_l = deconvAttrs.paddingL.size() > 1 ? deconvAttrs.paddingL.at(1) : deconvAttrs.paddingL.at(0); auto pad_r = deconvAttrs.paddingR.size() > 1 ? deconvAttrs.paddingR.at(1) : deconvAttrs.paddingR.at(0); auto pad_t = deconvAttrs.paddingL.at(0); @@ -21,18 +24,30 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, unsigned int stride_x = (deconvAttrs.stride.size() > 1) ? deconvAttrs.stride.at(1) : deconvAttrs.stride.at(0); unsigned int stride_y = deconvAttrs.stride.at(0); - auto deconv_info = PadStrideInfo(stride_x, stride_y, func_mod(pad_l), func_mod(pad_r), func_mod(pad_t), func_mod(pad_b), DimensionRoundingType::FLOOR); - - auto srcDims = srcDescs[0]->getShape().getDims(); - auto weiDims = srcDescs[1]->getShape().getDims(); - auto dstDims = dstDescs[0]->getShape().getDims(); + auto deconv_info = PadStrideInfo(stride_x, + stride_y, + func_mod(pad_l), + func_mod(pad_r), + func_mod(pad_t), + func_mod(pad_b), + DimensionRoundingType::FLOOR); + + auto srcDims = srcDescs[0]->getShape().getDims(); + auto weiDims = srcDescs[1]->getShape().getDims(); + auto dstDims = dstDescs[0]->getShape().getDims(); // ACL can't work with custom output shape, this we make WA for that problem if (pad_l < 0 || pad_r < 0 || pad_t < 0 || pad_b < 0) { auto out_dims = deconvolution_output_dimensions(srcDims[3], srcDims[2], weiDims[3], weiDims[2], deconv_info); stride_x += (out_dims.first - dstDims[3] - 2 * (pad_l + pad_r)) / (srcDims[3] - 1); stride_y += (out_dims.second - dstDims[2] - 2 * (pad_t + pad_b)) / (srcDims[2] - 1); - deconv_info = PadStrideInfo(stride_x, stride_y, func_mod(pad_l), func_mod(pad_r), func_mod(pad_t), func_mod(pad_b), DimensionRoundingType::FLOOR); + deconv_info = PadStrideInfo(stride_x, + stride_y, + func_mod(pad_l), + func_mod(pad_r), + func_mod(pad_t), + func_mod(pad_b), + DimensionRoundingType::FLOOR); } std::swap(weiDims[0], weiDims[1]); @@ -59,16 +74,18 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, weiLayout = arm_compute::DataLayout::NHWC; } - TensorInfo srcTensorInfo = TensorInfo(srcVecDims, 1, - precisionToAclDataType(srcDescs[0]->getPrecision()), srcLayout); - TensorInfo weiTensorInfo = TensorInfo(weiVecDims, 1, - precisionToAclDataType(srcDescs[1]->getPrecision()), weiLayout); - TensorInfo dstTensorInfo = TensorInfo(dstVecDims, 1, - precisionToAclDataType(dstDescs[0]->getPrecision()), dstLayout); + TensorInfo srcTensorInfo = + TensorInfo(srcVecDims, 1, precisionToAclDataType(srcDescs[0]->getPrecision()), srcLayout); + TensorInfo weiTensorInfo = + TensorInfo(weiVecDims, 1, precisionToAclDataType(srcDescs[1]->getPrecision()), weiLayout); + TensorInfo dstTensorInfo = + TensorInfo(dstVecDims, 1, precisionToAclDataType(dstDescs[0]->getPrecision()), dstLayout); TensorInfo biasTensorInfo; if (deconvAttrs.withBiasesParam) { - biasTensorInfo = TensorInfo(biasVecDims, 1, - precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2])); + biasTensorInfo = TensorInfo(biasVecDims, + 1, + precisionToAclDataType(srcDescs[2]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[2])); } return ACLDeconvTensorInfo{srcTensorInfo, weiTensorInfo, biasTensorInfo, dstTensorInfo, deconv_info}; @@ -77,9 +94,9 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, AclDeconvExecutor::AclDeconvExecutor(const ExecutorContext::CPtr context) : DeconvExecutor(context) {} bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { this->deconvAttrs = deconvAttrs; ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs); TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; @@ -99,12 +116,17 @@ bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs, deconv = std::make_unique(); configureThreadSafe([&] { - deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info, deconvAttrs.aclFastMath); + deconv->configure(&srcTensor, + &weiTensor, + deconvAttrs.withBiasesParam ? &biasTensor : nullptr, + &dstTensor, + deconv_info, + deconvAttrs.aclFastMath); }); return true; } -template +template static void transpose_weights(const MemoryCPtr& srcMemPtr, MemoryPtr& newSrcMemPtr, bool isNCHW) { const auto src_data = srcMemPtr->getDataAs(); const auto new_src_data = newSrcMemPtr->getDataAs(); @@ -118,29 +140,17 @@ static void transpose_weights(const MemoryCPtr& srcMemPtr, MemoryPtr& newSrcMemP if (isNCHW) { parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) { for (int dim3 = 0; dim3 < DIM3; ++dim3) { - const int src_off = dim0 * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; - const int dst_off = dim1 * DIM0 * DIM2 * DIM3 + - dim0 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; + const int src_off = dim0 * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; + const int dst_off = dim1 * DIM0 * DIM2 * DIM3 + dim0 * DIM2 * DIM3 + dim2 * DIM3 + dim3; new_src_data[dst_off] = src_data[src_off]; } }); - // 0231 -> 1230 + // 0231 -> 1230 } else { parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) { for (int dim3 = 0; dim3 < DIM3; ++dim3) { - const int src_off = dim0 * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; - const int dst_off = dim1 * DIM2 * DIM3 * DIM0 + - dim2 * DIM3 * DIM0 + - dim3 * DIM0 + - dim0; + const int src_off = dim0 * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; + const int dst_off = dim1 * DIM2 * DIM3 * DIM0 + dim2 * DIM3 * DIM0 + dim3 * DIM0 + dim0; new_src_data[dst_off] = src_data[src_off]; } }); @@ -176,7 +186,9 @@ static MemoryPtr prepareWeightMemory(const std::vector& src, const E return create(); } -void AclDeconvExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void AclDeconvExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { // TODO: Remove transpose from exec auto newWei = prepareWeightMemory(src, context); @@ -194,16 +206,19 @@ void AclDeconvExecutor::exec(const std::vector& src, const std::vect biasTensor.allocator()->free(); } -bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs) { +bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) { if ((srcDescs[0]->getShape().getDims().size() != 3 && srcDescs[0]->getShape().getDims().size() != 4) || dstDescs[0]->getShape().getDims().size() != srcDescs[0]->getShape().getDims().size() || srcDescs[1]->getShape().getDims().size() != 4) { DEBUG_LOG("AclDeconvExecutor does not support dimension:", - " src[0]=", srcDescs[0]->getShape().getDims().size(), - " src[1]=", srcDescs[1]->getShape().getDims().size(), - " dst[0]=", dstDescs[0]->getShape().getDims().size()); + " src[0]=", + srcDescs[0]->getShape().getDims().size(), + " src[1]=", + srcDescs[1]->getShape().getDims().size(), + " dst[0]=", + dstDescs[0]->getShape().getDims().size()); return false; } @@ -211,62 +226,71 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, srcDescs[0]->getPrecision() == srcDescs[1]->getPrecision() && srcDescs[1]->getPrecision() == dstDescs[0]->getPrecision())) { DEBUG_LOG("AclDeconvExecutor does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " src[1]=", srcDescs[1]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " src[1]=", + srcDescs[1]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } if (deconvAttrs.withBiasesParam && srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) { - DEBUG_LOG("AclDeconvExecutor does not support precisions:", - " src[2]=", srcDescs[2]->getPrecision()); + DEBUG_LOG("AclDeconvExecutor does not support precisions:", " src[2]=", srcDescs[2]->getPrecision()); return false; } - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - srcDescs[1]->hasLayoutType(LayoutType::ncsp) && + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && srcDescs[1]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && // Check weights as ncsp because we remove reorder and will transform ncsp -> nspc in exec() function - srcDescs[1]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + srcDescs[1]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { DEBUG_LOG("AclDeconvExecutor does not support layouts:", - " src[0]=", srcDescs[0]->serializeFormat(), - " src[1]=", srcDescs[1]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); + " src[0]=", + srcDescs[0]->serializeFormat(), + " src[1]=", + srcDescs[1]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); return false; } - if (deconvAttrs.withBiasesParam && - !(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) && + if (deconvAttrs.withBiasesParam && !(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) && !(srcDescs[2]->hasLayoutType(LayoutType::nspc))) { DEBUG_LOG("AclDeconvExecutor does not support layouts:", - " src[0]=", srcDescs[0]->serializeFormat(), - " src[1]=", srcDescs[1]->serializeFormat(), - " src[2]=", srcDescs[2]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); + " src[0]=", + srcDescs[0]->serializeFormat(), + " src[1]=", + srcDescs[1]->serializeFormat(), + " src[2]=", + srcDescs[2]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); return false; } ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs); - auto srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; - auto weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo; + auto srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; + auto weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo; auto biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo; - auto dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo; - auto deconv_info = aclDeconvTensorInfo.deconv_info; + auto dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo; + auto deconv_info = aclDeconvTensorInfo.deconv_info; - unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0); + unsigned int dilation_x = + (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0); unsigned int dilation_y = deconvAttrs.dilation.at(0); - if (!one_of(dilation_x, static_cast(0), static_cast(1)) || - !one_of(dilation_y, static_cast(0), static_cast(1))) return false; + if (!one_of(dilation_x, static_cast(0), static_cast(1)) || + !one_of(dilation_y, static_cast(0), static_cast(1))) + return false; try { - arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, - &weiTensorInfo, - deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, - &dstTensorInfo, - deconv_info, - deconvAttrs.aclFastMath); + arm_compute::Status status = + arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, + &weiTensorInfo, + deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, + &dstTensorInfo, + deconv_info, + deconvAttrs.aclFastMath); if (!status) { DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description()); return false; @@ -280,5 +304,5 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp index ad743690f45c52..e27551ad48d44d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp @@ -4,11 +4,11 @@ #pragma once -#include "nodes/executors/deconv.hpp" -#include "arm_compute/runtime/NEON/NEFunctions.h" -#include "utils/debug_capabilities.h" #include "acl_utils.hpp" +#include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/deconv.hpp" #include "src/cpu/CpuTypes.h" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { @@ -22,8 +22,8 @@ struct ACLDeconvTensorInfo { }; ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs); + const std::vector& srcDescs, + const std::vector& dstDescs); class AclDeconvExecutor : public DeconvExecutor { public: @@ -31,10 +31,10 @@ class AclDeconvExecutor : public DeconvExecutor { bool init(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -68,5 +68,5 @@ class AclDeconvExecutorBuilder : public DeconvExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index 942bacd91349ff..26d387c7659dc5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -3,6 +3,7 @@ // #include "acl_eltwise.hpp" + #include "acl_utils.hpp" #include "utils/debug_capabilities.h" @@ -34,40 +35,45 @@ inline void log_unsupported_prec(const std::vector& srcDescs, for (size_t i = 0; i < srcDescs.size(); i++) { srcPrec += srcDescs[i]->getPrecision().to_string() + " "; } - DEBUG_LOG(algToString(eltwiseAlgorithm), ": provided combination of src precisions: [", srcPrec, - "] and dst precision: ", dstDescs[0]->getPrecision().to_string(), " is not supported"); + DEBUG_LOG(algToString(eltwiseAlgorithm), + ": provided combination of src precisions: [", + srcPrec, + "] and dst precision: ", + dstDescs[0]->getPrecision().to_string(), + " is not supported"); } bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) { - if (one_of(algorithm, Algorithm::EltwiseSqrt, - Algorithm::EltwiseDivide, - Algorithm::EltwiseRelu, + if (one_of(algorithm, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseDivide, + Algorithm::EltwiseRelu, #ifdef OPENVINO_ARCH_ARM64 - Algorithm::EltwiseGeluErf, + Algorithm::EltwiseGeluErf, #endif - Algorithm::EltwiseElu, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseSoftRelu, - Algorithm::EltwiseClamp, - Algorithm::EltwiseSwish, - Algorithm::EltwisePrelu, - Algorithm::EltwiseHswish, - Algorithm::EltwiseAbs, - Algorithm::EltwiseExp, - Algorithm::EltwiseLog, - Algorithm::EltwiseMaximum, - Algorithm::EltwiseMinimum, - Algorithm::EltwiseSquaredDifference, - Algorithm::EltwiseAdd, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseEqual, - Algorithm::EltwiseNotEqual, - Algorithm::EltwiseGreater, - Algorithm::EltwiseGreaterEqual, - Algorithm::EltwiseLess, - Algorithm::EltwiseLessEqual)) { + Algorithm::EltwiseElu, + Algorithm::EltwiseTanh, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseSoftRelu, + Algorithm::EltwiseClamp, + Algorithm::EltwiseSwish, + Algorithm::EltwisePrelu, + Algorithm::EltwiseHswish, + Algorithm::EltwiseAbs, + Algorithm::EltwiseExp, + Algorithm::EltwiseLog, + Algorithm::EltwiseMaximum, + Algorithm::EltwiseMinimum, + Algorithm::EltwiseSquaredDifference, + Algorithm::EltwiseAdd, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseEqual, + Algorithm::EltwiseNotEqual, + Algorithm::EltwiseGreater, + Algorithm::EltwiseGreaterEqual, + Algorithm::EltwiseLess, + Algorithm::EltwiseLessEqual)) { return true; } return false; @@ -76,107 +82,111 @@ bool AclEltwiseExecutor::isEltwiseAlgorithmSupported(Algorithm algorithm) { bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const { - auto checkPrecision = [&srcDescs, &dstDescs](std::vector srcVecPrc, ov::element::Type dstPrc) -> bool { + auto checkPrecision = [&srcDescs, &dstDescs](std::vector srcVecPrc, + ov::element::Type dstPrc) -> bool { for (size_t i = 0; i < srcDescs.size(); i++) { - if (srcDescs[i]->getPrecision() != srcVecPrc[i]) return false; + if (srcDescs[i]->getPrecision() != srcVecPrc[i]) + return false; + } + if (dstDescs[0]->getPrecision() != dstPrc) { + return false; } - if (dstDescs[0]->getPrecision() != dstPrc) { return false; } return true; }; switch (eltwiseAttrs.algorithm) { - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseDivide: - case Algorithm::EltwiseRelu: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseDivide: + case Algorithm::EltwiseRelu: #ifdef OPENVINO_ARCH_ARM64 - case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseGeluErf: #endif - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwisePrelu: - case Algorithm::EltwiseHswish: - if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseAbs: - case Algorithm::EltwiseExp: - case Algorithm::EltwiseLog: - if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseMaximum: - case Algorithm::EltwiseMinimum: - case Algorithm::EltwiseSquaredDifference: - if (!(checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseAdd: - case Algorithm::EltwiseSubtract: - if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || - checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - case Algorithm::EltwiseMultiply: - if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || - checkPrecision({ov::element::u8, ov::element::u8}, ov::element::i16) || - checkPrecision({ov::element::u8, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::i16, ov::element::u8}, ov::element::i16) || - checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - // ACL supports only U8 precision on output for comparison operations - case Algorithm::EltwiseEqual: - case Algorithm::EltwiseNotEqual: - case Algorithm::EltwiseGreater: - case Algorithm::EltwiseGreaterEqual: - case Algorithm::EltwiseLess: - case Algorithm::EltwiseLessEqual: - if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || - checkPrecision({ov::element::i16, ov::element::i16}, ov::element::u8) || - checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) || - checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) || - checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) { - log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); - return false; - } - break; - default: - DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported"); + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwisePrelu: + case Algorithm::EltwiseHswish: + if (!(checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + case Algorithm::EltwiseAbs: + case Algorithm::EltwiseExp: + case Algorithm::EltwiseLog: + if (!(checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + case Algorithm::EltwiseMaximum: + case Algorithm::EltwiseMinimum: + case Algorithm::EltwiseSquaredDifference: + if (!(checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + case Algorithm::EltwiseAdd: + case Algorithm::EltwiseSubtract: + if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || + checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::i32, ov::element::i32}, ov::element::i32) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); return false; + } + break; + case Algorithm::EltwiseMultiply: + if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || + checkPrecision({ov::element::u8, ov::element::u8}, ov::element::i16) || + checkPrecision({ov::element::u8, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::i16, ov::element::u8}, ov::element::i16) || + checkPrecision({ov::element::i16, ov::element::i16}, ov::element::i16) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::f16) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::f32))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + // ACL supports only U8 precision on output for comparison operations + case Algorithm::EltwiseEqual: + case Algorithm::EltwiseNotEqual: + case Algorithm::EltwiseGreater: + case Algorithm::EltwiseGreaterEqual: + case Algorithm::EltwiseLess: + case Algorithm::EltwiseLessEqual: + if (!(checkPrecision({ov::element::u8, ov::element::u8}, ov::element::u8) || + checkPrecision({ov::element::i16, ov::element::i16}, ov::element::u8) || + checkPrecision({ov::element::i32, ov::element::i32}, ov::element::u8) || + checkPrecision({ov::element::f16, ov::element::f16}, ov::element::u8) || + checkPrecision({ov::element::f32, ov::element::f32}, ov::element::u8))) { + log_unsupported_prec(srcDescs, dstDescs, eltwiseAttrs.algorithm); + return false; + } + break; + default: + DEBUG_LOG("Eltwise algorithm ", algToString(eltwiseAttrs.algorithm), " is not supported"); + return false; } - for (const auto & srcDesc : srcDescs) { + for (const auto& srcDesc : srcDescs) { if (getAclDataLayoutByMemoryDesc(srcDesc) == arm_compute::DataLayout::UNKNOWN) { DEBUG_LOG("src descriptor layout is unsupported by ACL: ", srcDesc->serializeFormat()); return false; } } - for (const auto & dstDesc : dstDescs) { + for (const auto& dstDesc : dstDescs) { if (getAclDataLayoutByMemoryDesc(dstDesc) == arm_compute::DataLayout::UNKNOWN) { DEBUG_LOG("dst descriptor layout is unsupported by ACL: ", dstDesc->serializeFormat()); return false; @@ -188,10 +198,13 @@ bool AclEltwiseExecutorBuilder::isSupported(const EltwiseAttrs& eltwiseAttrs, AclEltwiseExecutor::AclEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {} -bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vector &srcDescs, - const std::vector &dstDescs, - const std::vector &postOps) { - if (!postOps.empty()) { return false; } +bool AclEltwiseExecutor::init(const EltwiseAttrs& eltwiseAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) { + if (!postOps.empty()) { + return false; + } aclEltwiseAttrs = eltwiseAttrs; std::vector srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size()); @@ -209,15 +222,19 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto for (size_t i = 0; i < srcDescs.size(); i++) { srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]); - if (srcDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; } + if (srcDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { + return false; + } } for (size_t i = 0; i < dstDescs.size(); i++) { dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]); - if (dstDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { return false; } + if (dstDataLayout[i] == arm_compute::DataLayout::UNKNOWN) { + return false; + } } - if (srcDescs.size() == 2 && - srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) && + if (srcDescs.size() == 2 && srcDescs[0]->hasLayoutType(LayoutType::nspc) && + srcDescs[1]->hasLayoutType(LayoutType::nspc) && srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) { if (srcVecDims[0].num_dimensions() < 5) { srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW; @@ -228,210 +245,248 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto } for (size_t i = 0; i < srcVecDims.size(); i++) { - srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1, - precisionToAclDataType(srcDescs[i]->getPrecision()), - srcDataLayout[i]); + srcTensorsInfo[i] = + TensorInfo(srcVecDims[i], 1, precisionToAclDataType(srcDescs[i]->getPrecision()), srcDataLayout[i]); srcTensors[i].allocator()->init(srcTensorsInfo[i]); } for (size_t i = 0; i < dstVecDims.size(); i++) { - dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1, - precisionToAclDataType(dstDescs[i]->getPrecision()), - dstDataLayout[i]); + dstTensorsInfo[i] = + TensorInfo(dstVecDims[i], 1, precisionToAclDataType(dstDescs[i]->getPrecision()), dstDataLayout[i]); dstTensors[i].allocator()->init(dstTensorsInfo[i]); } std::function(void)> exec_func; switch (aclEltwiseAttrs.algorithm) { - case Algorithm::EltwiseAdd: - if (!NEArithmeticAddition::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); - return acl_op; - }; - break; - case Algorithm::EltwiseMultiply: - if (!NEPixelWiseMultiplication::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], - 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - return acl_op; - }; - break; - case Algorithm::EltwiseSubtract: - if (!NEArithmeticSubtraction::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ConvertPolicy::SATURATE)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); - return acl_op; - }; - break; - case Algorithm::EltwiseDivide: - if (!NEElementwiseDivision::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseMaximum: - if (!NEElementwiseMax::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseMinimum: - if (!NEElementwiseMin::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseSquaredDifference: - if (!NEElementwiseSquaredDiff::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Equal)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Equal); - return acl_op; - }; - break; - case Algorithm::EltwiseNotEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::NotEqual)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::NotEqual); - return acl_op; - }; - break; - case Algorithm::EltwiseGreater: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Greater)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Greater); - return acl_op; - }; - break; - case Algorithm::EltwiseGreaterEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::GreaterEqual)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::GreaterEqual); - return acl_op; - }; - break; - case Algorithm::EltwiseLess: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::Less)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Less); - return acl_op; - }; - break; - case Algorithm::EltwiseLessEqual: - if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0], ComparisonOperation::LessEqual)) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::LessEqual); - return acl_op; - }; - break; - case Algorithm::EltwiseAbs: - if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseExp: - if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwisePrelu: - if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); - return acl_op; - }; - break; - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm, - aclEltwiseAttrs.alpha, - aclEltwiseAttrs.beta, - aclEltwiseAttrs.gamma))) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm, - aclEltwiseAttrs.alpha, - aclEltwiseAttrs.beta, - aclEltwiseAttrs.gamma)); - return acl_op; - }; - break; - case Algorithm::EltwiseLog: - if (!NELogLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) - return false; - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensors[0], &dstTensors[0]); - return acl_op; - }; - break; - default: - OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", - static_cast(aclEltwiseAttrs.algorithm)); + case Algorithm::EltwiseAdd: + if (!NEArithmeticAddition::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ConvertPolicy::SATURATE)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); + return acl_op; + }; + break; + case Algorithm::EltwiseMultiply: + if (!NEPixelWiseMultiplication::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + 1.0f, + ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], + &srcTensors[1], + &dstTensors[0], + 1.0f, + ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + return acl_op; + }; + break; + case Algorithm::EltwiseSubtract: + if (!NEArithmeticSubtraction::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ConvertPolicy::SATURATE)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ConvertPolicy::SATURATE); + return acl_op; + }; + break; + case Algorithm::EltwiseDivide: + if (!NEElementwiseDivision::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseMaximum: + if (!NEElementwiseMax::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseMinimum: + if (!NEElementwiseMin::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseSquaredDifference: + if (!NEElementwiseSquaredDiff::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::Equal)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Equal); + return acl_op; + }; + break; + case Algorithm::EltwiseNotEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::NotEqual)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::NotEqual); + return acl_op; + }; + break; + case Algorithm::EltwiseGreater: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::Greater)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Greater); + return acl_op; + }; + break; + case Algorithm::EltwiseGreaterEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::GreaterEqual)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::GreaterEqual); + return acl_op; + }; + break; + case Algorithm::EltwiseLess: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::Less)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::Less); + return acl_op; + }; + break; + case Algorithm::EltwiseLessEqual: + if (!NEElementwiseComparison::validate(&srcTensorsInfo[0], + &srcTensorsInfo[1], + &dstTensorsInfo[0], + ComparisonOperation::LessEqual)) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0], ComparisonOperation::LessEqual); + return acl_op; + }; + break; + case Algorithm::EltwiseAbs: + if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseExp: + if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwisePrelu: + if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &srcTensors[1], &dstTensors[0]); + return acl_op; + }; + break; + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + if (!NEActivationLayer::validate(&srcTensorsInfo[0], + &dstTensorsInfo[0], + getActivationLayerInfo(aclEltwiseAttrs.algorithm, + aclEltwiseAttrs.alpha, + aclEltwiseAttrs.beta, + aclEltwiseAttrs.gamma))) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], + &dstTensors[0], + getActivationLayerInfo(aclEltwiseAttrs.algorithm, + aclEltwiseAttrs.alpha, + aclEltwiseAttrs.beta, + aclEltwiseAttrs.gamma)); + return acl_op; + }; + break; + case Algorithm::EltwiseLog: + if (!NELogLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0])) + return false; + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensors[0], &dstTensors[0]); + return acl_op; + }; + break; + default: + OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", + static_cast(aclEltwiseAttrs.algorithm)); } - configureThreadSafe([&] { ifunc = exec_func(); }); + configureThreadSafe([&] { + ifunc = exec_func(); + }); return true; } -void AclEltwiseExecutor::exec(const std::vector &src, const std::vector &dst, - const void *post_ops_data_) { +void AclEltwiseExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { for (size_t i = 0; i < src.size(); i++) { srcTensors[i].allocator()->import_memory(src[i]->getData()); } @@ -448,5 +503,5 @@ void AclEltwiseExecutor::exec(const std::vector &src, const std::vec dstTensors[i].allocator()->free(); } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp index 6daf9e606c461b..1aae396f25a0fe 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp @@ -5,8 +5,8 @@ #pragma once #include "../eltwise.hpp" -#include "arm_compute/runtime/NEON/NEFunctions.h" #include "acl_utils.hpp" +#include "arm_compute/runtime/NEON/NEFunctions.h" namespace ov { namespace intel_cpu { @@ -23,11 +23,12 @@ class AclEltwiseExecutor : public EltwiseExecutor { void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; } + private: EltwiseAttrs aclEltwiseAttrs{}; impl_desc_type implType = impl_desc_type::acl; @@ -46,5 +47,5 @@ class AclEltwiseExecutorBuilder : public EltwiseExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index 9660178e1af4a4..74bdb97cdf2a8c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -2,23 +2,24 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "acl_fullyconnected.hpp" + #include +#include -#include "ov_optional.hpp" -#include "acl_fullyconnected.hpp" #include "acl_utils.hpp" -#include "nodes/executors/executor.hpp" -#include "nodes/executors/memory_arguments.hpp" -#include "utils/debug_capabilities.h" -#include "utils/cpu_utils.hpp" -#include "nodes/executors/debug_messages.hpp" -#include "nodes/executors/implementation_utils.hpp" -#include "nodes/convert.h" +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/common/cpu_convert.h" #include "nodes/common/cpu_memcpy.h" #include "nodes/common/reorder_prim.h" -#include "memory_desc/cpu_memory_desc_utils.h" +#include "nodes/convert.h" +#include "nodes/executors/debug_messages.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/implementation_utils.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "ov_optional.hpp" +#include "utils/cpu_utils.hpp" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { @@ -57,13 +58,16 @@ static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDim static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, const DnnlMemoryDescPtr dstDesc) { const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; + const auto reorderedWeiDesc = + dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } -static ov::optional convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) { +static ov::optional convertWeightPrecision(MemoryPtr input, + MemoryPtr output, + ov::element::Type weightPrecision) { MemoryArgs memoryArgs; memoryArgs[ARG_SRC] = input; memoryArgs[ARG_DST] = output; @@ -74,16 +78,18 @@ static ov::optional convertWeightPrecision(MemoryPtr input, MemoryPtr return ov::optional(memoryArgs.at(ARG_DST)); } - if (!node::Convert::isSupportedDesc(input->getDesc()) || - !node::Convert::isSupportedDesc(output->getDesc())) { + if (!node::Convert::isSupportedDesc(input->getDesc()) || !node::Convert::isSupportedDesc(output->getDesc())) { return {}; } - auto data = static_cast(input->getData()); + auto data = static_cast(input->getData()); std::vector tmpBuff; tmpBuff.resize(output->getSize()); - cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()), - weightPrecision, input->getSize() / input->getDesc().getPrecision().size()); + cpu_convert(data, + tmpBuff.data(), + DnnlExtensionUtils::DataTypeToElementType(input->getDataType()), + weightPrecision, + input->getSize() / input->getDesc().getPrecision().size()); return ov::optional(std::make_shared(output->getPrimitive().get_engine(), output->getDesc().cloneWithNewPrecision(weightPrecision), @@ -96,12 +102,14 @@ static ov::optional reorderDataFallback(MemoryPtr input, MemoryPtr ou } const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType()); auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc); - dnnl::reorder reorderWithoutConvert = getReorderPrim(context->getRuntimeCache(), - output->getPrimitive().get_engine(), - input->getPrimitive().get_desc(), - MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc()); - - if (reorderWithoutConvert && parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) { + dnnl::reorder reorderWithoutConvert = + getReorderPrim(context->getRuntimeCache(), + output->getPrimitive().get_engine(), + input->getPrimitive().get_desc(), + MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc()); + + if (reorderWithoutConvert && + parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) { auto convertOutput = convertWeightPrecision(input, output, inPrc); if (!convertOutput) { return {}; @@ -110,7 +118,9 @@ static ov::optional reorderDataFallback(MemoryPtr input, MemoryPtr ou if (reorderWithoutConvert) { dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order); - reorderWithoutConvert.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); + reorderWithoutConvert.execute( + loc_stream, + {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); return ov::optional(output); } } @@ -155,14 +165,15 @@ static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc, // if precision conversion does not work then do direct reference reorder if (directReorder) { dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order); - directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); + directReorder.execute(loc_stream, + {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}}); } else { OPENVINO_THROW("Could not make onednn reorder."); } return output; } -static MemoryPtr reorderWeights(const MemoryArgs &memory, +static MemoryPtr reorderWeights(const MemoryArgs& memory, const ExecutorContext::CPtr context, ACLFCAttrs& aclfcAttrs, DnnlMemoryDescPtr dnnlSrcDesc, @@ -192,16 +203,16 @@ static MemoryPtr reorderWeights(const MemoryArgs &memory, return create(); } -static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, +static MemoryPtr prepareWeightMemory(const MemoryArgs& memory, const ExecutorContext::CPtr context, - const FCAttrs &attrs, + const FCAttrs& attrs, ACLFCAttrs& aclfcAttrs, - const PostOps &postOps, + const PostOps& postOps, arm_compute::WeightFormat& expectedWeightFormat, arm_compute::TensorInfo& weiTensorInfo) { MemoryArgs memoryArgs; - memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); - memoryArgs[ARG_WEI] = memory.at(ARG_WEI); + memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); + memoryArgs[ARG_WEI] = memory.at(ARG_WEI); auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr(); @@ -219,21 +230,25 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, const auto& inShape = memory.at(ARG_SRC_0)->getShape(); const auto& wShape = originalWeightsDesc->getShape(); const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); - const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); - memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), - memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); - memoryArgs[ARG_DST] = std::make_shared(context->getEngine(), - memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); + const auto& outDymmyDims = + makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); + memoryArgs[ARG_SRC_0] = + std::make_shared(context->getEngine(), + memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims)); + memoryArgs[ARG_DST] = + std::make_shared(context->getEngine(), + memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims)); } else { memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); - memoryArgs[ARG_DST] = memory.at(ARG_DST); + memoryArgs[ARG_DST] = memory.at(ARG_DST); } // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); bool isNeededReorder = aclWeightsRepack->update(memoryArgs); - expectedWeightFormat = isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; + expectedWeightFormat = + isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); if (isNeededReorder) { @@ -241,8 +256,13 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, dnnl::impl::dim_t inner_dim = 1; std::vector remaining_dims = {}; auto weights_md_ = dnnlDstDesc->getDnnlDesc().get(); - dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat, - inner_dim, o_dim, remaining_dims, {}); + dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, + *weights_md_, + expectedWeightFormat, + inner_dim, + o_dim, + remaining_dims, + {}); if (aclfcAttrs.weightsNonTransposed) { dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc); } @@ -256,7 +276,7 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc); } -static bool checkPostOps(const PostOps &postOps) { +static bool checkPostOps(const PostOps& postOps) { if (postOps.empty()) { return true; } @@ -271,12 +291,12 @@ static bool checkPostOps(const PostOps &postOps) { return false; } -static void initFCAttrs(const FCAttrs &attrs, +static void initFCAttrs(const FCAttrs& attrs, ACLTensorAttrs& aclTensorAttrs, ACLFCAttrs& aclfcAttrs, - const MemoryArgs &memory, + const MemoryArgs& memory, arm_compute::FullyConnectedLayerInfo& fullyConnectedLayerInfo, - const PostOps &postOps) { + const PostOps& postOps) { aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc); fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision(); @@ -285,9 +305,10 @@ static void initFCAttrs(const FCAttrs &attrs, if (!postOps.empty() && checkPostOps(postOps)) { auto activation = std::dynamic_pointer_cast(postOps[0]); - fullyConnectedLayerInfo.activation_info = getActivationLayerInfo( - convertToEltwiseAlgorithm(activation->type()), - activation->alpha(), activation->beta(), activation->gamma()); + fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()), + activation->alpha(), + activation->beta(), + activation->gamma()); } if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) { @@ -295,21 +316,22 @@ static void initFCAttrs(const FCAttrs &attrs, } } -ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory, +ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory, const ExecutorContext::CPtr context) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); - packedWeights = prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); + packedWeights = + prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo); } -bool ACLFullyConnectedExecutor::supports(const FCConfig &config) { +bool ACLFullyConnectedExecutor::supports(const FCConfig& config) { VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS); VERIFY(one_of(weiType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_WEI_PRECISIONS); - VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS); - VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS); - VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); - VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK); + VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS); + VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS); + VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK); + VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK); return true; } @@ -329,35 +351,34 @@ void ACLFullyConnectedExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) updateFCTensorsShapes(aclMemoryShapes); } -arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfos & aclMemoryInfos) { +arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { if (aclfcAttrs.isConvertedWeights) { aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); } int ic_total = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); return arm_compute::NEFullyConnectedLayer::validate( - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - &weiTensorInfo, - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED ? - arm_compute::WeightsInfo() : - arm_compute::WeightsInfo(false, 1, 1, ic_total, false, expectedWeightFormat)); + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + &weiTensorInfo, + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED + ? arm_compute::WeightsInfo() + : arm_compute::WeightsInfo(false, 1, 1, ic_total, false, expectedWeightFormat)); } -ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclMemoryTensors) { +ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors& aclMemoryTensors) { auto neFC = std::make_unique(); aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->init(weiTensorInfo); int icTotal = aclMemoryTensors[ACLArgs::ACL_WEI]->info()->dimension(0); - neFC->configure( - aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), - aclMemoryTensors[ACLArgs::ACL_WEI].get(), - aclMemoryTensors[ACLArgs::ACL_BIAS].get(), - aclMemoryTensors[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED ? - arm_compute::WeightsInfo() : - arm_compute::WeightsInfo(false, 1, 1, icTotal, false, expectedWeightFormat)); + neFC->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), + aclMemoryTensors[ACLArgs::ACL_WEI].get(), + aclMemoryTensors[ACLArgs::ACL_BIAS].get(), + aclMemoryTensors[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + expectedWeightFormat == arm_compute::WeightFormat::UNSPECIFIED + ? arm_compute::WeightsInfo() + : arm_compute::WeightsInfo(false, 1, 1, icTotal, false, expectedWeightFormat)); // TODO: get rid of those flags and decide whether to import memory or not just based on input type if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) { aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false; @@ -366,13 +387,13 @@ ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLTensors & aclM return neFC; } -arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { +arm_compute::Status acl_fc_executor::ACLWeightsConverter::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { return arm_compute::NECast::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), aclMemoryInfos[ACLArgs::ACL_DST].get(), arm_compute::ConvertPolicy::SATURATE); } -ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors &aclMemoryTensors) { +ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTensors& aclMemoryTensors) { auto neCast = std::make_unique(); neCast->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(), aclMemoryTensors[ACLArgs::ACL_DST].get(), @@ -380,34 +401,34 @@ ACLFunction acl_fc_executor::ACLWeightsConverter::configureFunction(const ACLTen return neCast; } -acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs &attrs, - const PostOps &postOps, - const MemoryArgs &memory) { +acl_fc_executor::ACLWeightFormatGenerator::ACLWeightFormatGenerator(const FCAttrs& attrs, + const PostOps& postOps, + const MemoryArgs& memory) { initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, fullyConnectedLayerInfo, postOps); } -void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes &aclMemoryShapes) { +void acl_fc_executor::ACLWeightFormatGenerator::updateTensorsShapes(ACLShapes& aclMemoryShapes) { updateFCTensorsShapes(aclMemoryShapes); } -arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos &aclMemoryInfos) { +arm_compute::Status acl_fc_executor::ACLWeightFormatGenerator::validateTensorsInfo(const ACLInfos& aclMemoryInfos) { if (aclfcAttrs.isConvertedWeights) { aclMemoryInfos[ACLArgs::ACL_WEI]->set_data_type(aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_type()); } int icTotal = aclMemoryInfos[ACLArgs::ACL_SRC_0]->dimension(0); return arm_compute::NEFullyConnectedLayer::has_opt_impl( - expectedWeightFormat, - aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), - aclMemoryInfos[ACLArgs::ACL_WEI].get(), - aclMemoryInfos[ACLArgs::ACL_BIAS].get(), - aclMemoryInfos[ACLArgs::ACL_DST].get(), - fullyConnectedLayerInfo, - arm_compute::WeightsInfo(false, 1, 1, icTotal, false, arm_compute::WeightFormat::ANY)); + expectedWeightFormat, + aclMemoryInfos[ACLArgs::ACL_SRC_0].get(), + aclMemoryInfos[ACLArgs::ACL_WEI].get(), + aclMemoryInfos[ACLArgs::ACL_BIAS].get(), + aclMemoryInfos[ACLArgs::ACL_DST].get(), + fullyConnectedLayerInfo, + arm_compute::WeightsInfo(false, 1, 1, icTotal, false, arm_compute::WeightFormat::ANY)); } -ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors &aclMemoryTensors) { +ACLFunction acl_fc_executor::ACLWeightFormatGenerator::configureFunction(const ACLTensors& aclMemoryTensors) { return std::make_unique(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp index fcbcb1475efa15..afeb4a5ce45c95 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.hpp @@ -23,21 +23,20 @@ class ACLWeightsConverter : public ACLCommonExecutor { public: ACLWeightsConverter() = default; void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {} - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; }; class ACLWeightFormatGenerator : public ACLCommonExecutor { public: - ACLWeightFormatGenerator(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory); + ACLWeightFormatGenerator(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory); void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; arm_compute::WeightFormat getOptImplWeightFormat() { return expectedWeightFormat; } + private: arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; ACLFCAttrs aclfcAttrs; @@ -49,17 +48,17 @@ class ACLWeightFormatGenerator : public ACLCommonExecutor { class ACLFullyConnectedExecutor : public ACLCommonExecutor { public: ACLFullyConnectedExecutor(const FCAttrs& attrs, - const PostOps& postOps, - const MemoryArgs& memory, - const ExecutorContext::CPtr context); + const PostOps& postOps, + const MemoryArgs& memory, + const ExecutorContext::CPtr context); static bool supports(const FCConfig& config); void updateTensorsShapes(ACLShapes& aclMemoryShapes) override; - arm_compute::Status validateTensorsInfo(const ACLInfos & aclMemoryInfos) override; + arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override; - ACLFunction configureFunction(const ACLTensors & aclMemoryTensors) override; + ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override; private: arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp index b6fa129974107f..2af982024b5637 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp @@ -22,23 +22,23 @@ unsigned int ACLScheduler::num_threads() const { void ACLScheduler::set_num_threads(unsigned int num_threads) {} -void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { - const Window & max_window = window; +void ACLScheduler::schedule_custom(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) { + const Window& max_window = window; const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); #if OV_THREAD == OV_THREAD_OMP - //In OpenMP case parallel_get_num_threads() method returns 1 here because it's called outside parallel section - //This is the reason why this method isn't used to initialize _num_threads + // In OpenMP case parallel_get_num_threads() method returns 1 here because it's called outside parallel section + // This is the reason why this method isn't used to initialize _num_threads const auto _num_threads = num_iterations; #else const auto _num_threads = std::min(num_iterations, static_cast(parallel_get_num_threads())); #endif - std::function main_run; + std::function main_run; if (tensors.empty()) { - main_run = [&](const Window &window, const ThreadInfo &info) { + main_run = [&](const Window& window, const ThreadInfo& info) { kernel->run(window, info); }; } else { - main_run = [&](const Window &window, const ThreadInfo &info) { + main_run = [&](const Window& window, const ThreadInfo& info) { kernel->run_op(tensors, window, info); }; } @@ -59,20 +59,20 @@ void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const } } -void ACLScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { +void ACLScheduler::schedule(ICPPKernel* kernel, const Hints& hints) { ITensorPack tensors; schedule_custom(kernel, hints, kernel->window(), tensors); } -void ACLScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { +void ACLScheduler::schedule_op(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) { schedule_custom(kernel, hints, window, tensors); } -void ACLScheduler::run_workloads(std::vector &workloads) { +void ACLScheduler::run_workloads(std::vector& workloads) { ov::parallel_for(workloads.size(), [&](int wid) { workloads[wid]({wid, static_cast(parallel_get_num_threads()), &cpu_info()}); }); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp index 1148f4ad5edd69..c94f0aa3abce3a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp @@ -4,9 +4,10 @@ #pragma once -#include #include #include +#include + #include "support/Mutex.h" namespace ov { @@ -20,12 +21,14 @@ class ACLScheduler final : public IScheduler { ~ACLScheduler() override = default; std::uint32_t num_threads() const override; void set_num_threads(unsigned int num_threads) override; - void schedule(ICPPKernel *kernel, const Hints &hints) override; - void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override; + void schedule(ICPPKernel* kernel, const Hints& hints) override; + void schedule_op(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) override; + protected: - void run_workloads(std::vector &workloads) override; + void run_workloads(std::vector& workloads) override; + private: - void schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors); + void schedule_custom(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors); }; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp index 33bd49e2f04d9b..077759193d1c30 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp @@ -3,13 +3,14 @@ // #include "acl_interpolate.hpp" + #include "acl_utils.hpp" #include "utils/debug_capabilities.h" -bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpolateAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { aclInterpolateAttrs = interpolateAttrs; InterpolateExecutor::init(aclInterpolateAttrs, srcDescs, dstDescs, attr); acl_coord = arm_compute::SamplingPolicy::TOP_LEFT; @@ -17,22 +18,23 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo static const size_t index_h = 2; static const size_t index_w = 3; - if ((aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::pytorch_half_pixel && out_shape[index_h] > 1 && out_shape[index_w] > 1) || + if ((aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::pytorch_half_pixel && + out_shape[index_h] > 1 && out_shape[index_w] > 1) || aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::half_pixel) { acl_coord = arm_compute::SamplingPolicy::CENTER; } switch (aclInterpolateAttrs.mode) { - case InterpolateMode::linear: - case InterpolateMode::linear_onnx: - acl_policy = arm_compute::InterpolationPolicy::BILINEAR; - break; - case InterpolateMode::nearest: - acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR; - break; - default: - DEBUG_LOG("Unsupported interpolate mode: ", static_cast(aclInterpolateAttrs.mode)); - return false; + case InterpolateMode::linear: + case InterpolateMode::linear_onnx: + acl_policy = arm_compute::InterpolationPolicy::BILINEAR; + break; + case InterpolateMode::nearest: + acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR; + break; + default: + DEBUG_LOG("Unsupported interpolate mode: ", static_cast(aclInterpolateAttrs.mode)); + return false; } auto srcDims = shapeCast(srcDescs[0]->getShape().getDims()); @@ -42,22 +44,25 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo changeLayoutToNH_C({&srcDims, &dstDims}); } - auto srcTensorInfo = arm_compute::TensorInfo(srcDims, 1, + auto srcTensorInfo = arm_compute::TensorInfo(srcDims, + 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - auto dstTensorInfo = arm_compute::TensorInfo(dstDims, 1, + auto dstTensorInfo = arm_compute::TensorInfo(dstDims, + 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); - arm_compute::Status status = arm_compute::NEScale::validate(&srcTensorInfo, - &dstTensorInfo, - arm_compute::ScaleKernelInfo(acl_policy, - arm_compute::BorderMode::REPLICATE, - arm_compute::PixelValue(), - acl_coord, - false, - aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners, - getAclDataLayoutByMemoryDesc(srcDescs[0]))); + arm_compute::Status status = arm_compute::NEScale::validate( + &srcTensorInfo, + &dstTensorInfo, + arm_compute::ScaleKernelInfo(acl_policy, + arm_compute::BorderMode::REPLICATE, + arm_compute::PixelValue(), + acl_coord, + false, + aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners, + getAclDataLayoutByMemoryDesc(srcDescs[0]))); if (!status) { DEBUG_LOG("NEScale validation failed: ", status.error_description()); return false; @@ -68,21 +73,25 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo acl_scale = std::make_unique(); configureThreadSafe([&] { - acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy, - arm_compute::BorderMode::REPLICATE, - arm_compute::PixelValue(), - acl_coord, - false, - aclInterpolateAttrs.coordTransMode == - InterpolateCoordTransMode::align_corners, - getAclDataLayoutByMemoryDesc(srcDescs[0]))); + acl_scale->configure( + &srcTensor, + &dstTensor, + arm_compute::ScaleKernelInfo(acl_policy, + arm_compute::BorderMode::REPLICATE, + arm_compute::PixelValue(), + acl_coord, + false, + aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners, + getAclDataLayoutByMemoryDesc(srcDescs[0]))); }); return true; } -void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { auto in_ptr_ = padPreprocess(src, dst); - srcTensor.allocator()->import_memory(const_cast(reinterpret_cast(in_ptr_))); + srcTensor.allocator()->import_memory(const_cast(reinterpret_cast(in_ptr_))); dstTensor.allocator()->import_memory(dst[0]->getData()); acl_scale->run(); @@ -92,8 +101,9 @@ void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& } bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( - const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, const std::vector &srcDescs, - const std::vector &dstDescs) { + const ov::intel_cpu::InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) { OPENVINO_ASSERT(srcDescs[0]->getShape().getDims().size() == 4); auto& inp_shape = srcDescs[0]->getShape().getDims(); @@ -116,7 +126,8 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( if (coord_mode == InterpolateCoordTransMode::half_pixel && (nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::round_prefer_ceil)) { - DEBUG_LOG("InterpolateCoordTransMode half_pixel is not supported for InterpolateNearestMode simple and round_prefer_ceil"); + DEBUG_LOG("InterpolateCoordTransMode half_pixel is not supported for InterpolateNearestMode simple and " + "round_prefer_ceil"); return false; } @@ -129,15 +140,17 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( if (is_upsample) { bool int_factor = scale_h == static_cast(scale_h) && scale_w == static_cast(scale_w); if (int_factor && coord_mode != InterpolateCoordTransMode::asymmetric && - (nearest_mode == InterpolateNearestMode::round_prefer_ceil - || nearest_mode == InterpolateNearestMode::round_prefer_floor)) { - DEBUG_LOG("upsample && int_factor && !asymmetric && (round_prefer_ceil || round_prefer_floor) case is supported"); + (nearest_mode == InterpolateNearestMode::round_prefer_ceil || + nearest_mode == InterpolateNearestMode::round_prefer_floor)) { + DEBUG_LOG( + "upsample && int_factor && !asymmetric && (round_prefer_ceil || round_prefer_floor) case is supported"); return true; } } else if (scale_h < 1 && scale_w < 1) { float down_scale_h = static_cast(inp_shape[index_h]) / out_shape[index_h]; float down_scale_w = static_cast(inp_shape[index_w]) / out_shape[index_w]; - bool int_factor = down_scale_h == static_cast(down_scale_h) && down_scale_w == static_cast(down_scale_w); + bool int_factor = + down_scale_h == static_cast(down_scale_h) && down_scale_w == static_cast(down_scale_w); if (int_factor && coord_mode != InterpolateCoordTransMode::align_corners && nearest_mode == InterpolateNearestMode::simple) { @@ -146,29 +159,45 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( } if (int_factor && nearest_mode == InterpolateNearestMode::round_prefer_ceil && - ((out_shape[index_h] > 1 && out_shape[index_w] > 1) || coord_mode != InterpolateCoordTransMode::half_pixel)) { - DEBUG_LOG("!upsample && int_factor && round_prefer_ceil && (out_shape > 1 || half_pixel) case is supported"); + ((out_shape[index_h] > 1 && out_shape[index_w] > 1) || + coord_mode != InterpolateCoordTransMode::half_pixel)) { + DEBUG_LOG( + "!upsample && int_factor && round_prefer_ceil && (out_shape > 1 || half_pixel) case is supported"); return true; } } - DEBUG_LOG("ACL Interpolate executor does not support such configuration: coord_mode=", static_cast(coord_mode), - " nearest_mode=", static_cast(nearest_mode), " upsample=", is_upsample, " scale_h=", scale_h, " scale_w=", scale_w); + DEBUG_LOG("ACL Interpolate executor does not support such configuration: coord_mode=", + static_cast(coord_mode), + " nearest_mode=", + static_cast(nearest_mode), + " upsample=", + is_upsample, + " scale_h=", + scale_h, + " scale_w=", + scale_w); return false; } -bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, - const std::vector &srcDescs, - const std::vector &dstDescs) const { +bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const { if (srcDescs[0]->getShape().getDims().size() != 4u) { DEBUG_LOG("ACL Interpolate does not support src shape rank: ", srcDescs[0]->getShape().getDims().size()); return false; } auto& pads_begin = interpolateAttrs.padBegin; - auto& pads_end = interpolateAttrs.padEnd; - - if (!std::all_of(pads_begin.begin(), pads_begin.end(), [](int i){return i == 0;}) || - !std::all_of(pads_end.begin(), pads_end.end(), [](int i){return i == 0;})) { + auto& pads_end = interpolateAttrs.padEnd; + + if (!std::all_of(pads_begin.begin(), + pads_begin.end(), + [](int i) { + return i == 0; + }) || + !std::all_of(pads_end.begin(), pads_end.end(), [](int i) { + return i == 0; + })) { DEBUG_LOG("ACL Interpolate does not support padding"); return false; } @@ -180,15 +209,16 @@ bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_c return false; } - if (interpolateAttrs.mode == InterpolateMode::cubic || - interpolateAttrs.mode == InterpolateMode::bilinear_pillow || + if (interpolateAttrs.mode == InterpolateMode::cubic || interpolateAttrs.mode == InterpolateMode::bilinear_pillow || interpolateAttrs.mode == InterpolateMode::bicubic_pillow) { DEBUG_LOG("ACL Interpolate does not support cubic, bilinear_pillow, bicubic_pillow modes"); return false; } if (interpolateAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales && - one_of(interpolateAttrs.coordTransMode, InterpolateCoordTransMode::half_pixel, InterpolateCoordTransMode::asymmetric) && + one_of(interpolateAttrs.coordTransMode, + InterpolateCoordTransMode::half_pixel, + InterpolateCoordTransMode::asymmetric) && one_of(interpolateAttrs.mode, InterpolateMode::linear, InterpolateMode::linear_onnx)) { DEBUG_LOG("ACL Interpolate does not support scales mode with linear/linear_onnx and half_pixel/asymmetric"); return false; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp index 17cdfec5928544..c141fa132a31ff 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp @@ -4,9 +4,9 @@ #pragma once -#include "nodes/executors/interpolate.hpp" #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Tensor.h" +#include "nodes/executors/interpolate.hpp" namespace ov { namespace intel_cpu { @@ -18,9 +18,11 @@ class ACLInterpolateExecutor : public InterpolateExecutor { bool init(const InterpolateAttrs& interpolateAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; - void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) override; + void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -44,10 +46,11 @@ class ACLInterpolateExecutorBuilder : public InterpolateExecutorBuilder { InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override { return std::make_shared(context); } + private: static bool isSupportedConfiguration(const InterpolateAttrs& interpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs); + const std::vector& srcDescs, + const std::vector& dstDescs); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp index 6fde4bb0db5604..290cd3c9dbcce9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp @@ -14,7 +14,7 @@ AclMVNExecutor::AclMVNExecutor(const ExecutorContext::CPtr context) : MVNExecuto bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const dnnl::primitive_attr& attr) { auto srcDims = srcDescs[0]->getShape().getStaticDims(); auto dstDims = dstDescs[0]->getShape().getStaticDims(); @@ -46,9 +46,14 @@ bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs, } } - TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); - + TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), + 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), + 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); if (!arm_compute::NEMeanStdDevNormalizationLayer::validate(&srcTensorInfo, &dstTensorInfo, mvnAttrs.epsValue_)) return false; @@ -57,12 +62,16 @@ bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs, dstTensor.allocator()->init(dstTensorInfo); mvn = std::make_unique(); - configureThreadSafe([&] { mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); }); + configureThreadSafe([&] { + mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); + }); return true; } -void AclMVNExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void AclMVNExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); @@ -75,41 +84,41 @@ void AclMVNExecutor::exec(const std::vector& src, const std::vector< bool AclMVNExecutorBuilder::isSupported(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const { - if ((srcDescs[0]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f16) || - srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision()) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); - return false; - } - - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support layout:", - " src: ", srcDescs[0]->serializeFormat(), - " dst: ", dstDescs[0]->serializeFormat()); - return false; - } + if ((srcDescs[0]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f16) || + srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision()) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support precisions:", + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); + return false; + } - if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support OUTSIDE_SQRT mode"); - return false; - } - if (!mvnAttrs.normalizeVariance_) { - DEBUG_LOG("NEMeanStdDevNormalizationLayer supports normalize_variance=true only"); - return false; - } - if (!mvnAttrs.initAcrossChannels_ && - srcDescs[0]->hasLayoutType(LayoutType::nspc)) { - DEBUG_LOG("initAcrossChannels = false is not supported by ACL for NHWC layout"); - return false; - } + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support layout:", + " src: ", + srcDescs[0]->serializeFormat(), + " dst: ", + dstDescs[0]->serializeFormat()); + return false; + } - return true; + if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support OUTSIDE_SQRT mode"); + return false; } + if (!mvnAttrs.normalizeVariance_) { + DEBUG_LOG("NEMeanStdDevNormalizationLayer supports normalize_variance=true only"); + return false; + } + if (!mvnAttrs.initAcrossChannels_ && srcDescs[0]->hasLayoutType(LayoutType::nspc)) { + DEBUG_LOG("initAcrossChannels = false is not supported by ACL for NHWC layout"); + return false; + } + + return true; +} -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp index 7ba445253f8d02..02521551509366 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp @@ -5,8 +5,8 @@ #pragma once #include "acl_utils.hpp" -#include "nodes/executors/mvn.hpp" #include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/mvn.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -19,10 +19,10 @@ class AclMVNExecutor : public MVNExecutor { bool init(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -47,5 +47,5 @@ class AclMVNExecutorBuilder : public MVNExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp index a49a5cea4ef26e..2e4aed30d7b33e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp @@ -3,6 +3,7 @@ // #include "acl_pooling.hpp" + #include "acl_utils.hpp" namespace ov { @@ -22,10 +23,12 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo, PoolingLayerInfo* pool_info, Pooling3dLayerInfo* pool3d_info, bool ignoreOutShapeErrors) { - unsigned int pad_left = (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[1] : poolingAttrs.data_pad_begin[0]; - unsigned int pad_right = (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[1] : poolingAttrs.data_pad_end[0]; - unsigned int pad_top = (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[0] : 0; - unsigned int pad_bottom = (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[0] : 0; + unsigned int pad_left = + (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[1] : poolingAttrs.data_pad_begin[0]; + unsigned int pad_right = + (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[1] : poolingAttrs.data_pad_end[0]; + unsigned int pad_top = (poolingAttrs.data_pad_begin.size() >= 2u) ? poolingAttrs.data_pad_begin[0] : 0; + unsigned int pad_bottom = (poolingAttrs.data_pad_end.size() >= 2u) ? poolingAttrs.data_pad_end[0] : 0; unsigned int kernel_w = (poolingAttrs.kernel.size() >= 2u) ? poolingAttrs.kernel[1] : poolingAttrs.kernel[0]; unsigned int kernel_h = (poolingAttrs.kernel.size() >= 2u) ? poolingAttrs.kernel[0] : 1; unsigned int stride_x = (poolingAttrs.stride.size() >= 2u) ? poolingAttrs.stride[1] : poolingAttrs.stride[0]; @@ -47,45 +50,48 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo, // The combination of parameters: NCHW + CEIL gives an accuracy problem in AvgPool. // One workaround is to disable the ACL executor for these parameters. // Then OneDNN will run this case in ACL backend as reorder -> NHWC -> reorder - if (pool_type == PoolingType::AVG && - dataLayout == arm_compute::DataLayout::NCHW && + if (pool_type == PoolingType::AVG && dataLayout == arm_compute::DataLayout::NCHW && poolingAttrs.rounding == op::RoundingType::CEIL) { DEBUG_LOG("NCHW + CEIL gives an accuracy problem in ACL AvgPool. ACL executor will not be created."); return false; } - DimensionRoundingType round = (poolingAttrs.rounding == op::RoundingType::CEIL) ? - DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR; + DimensionRoundingType round = + (poolingAttrs.rounding == op::RoundingType::CEIL) ? DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR; if (srcDimsSize == 5) { if (dstDescsSize > 1) { DEBUG_LOG("NEPooling3dLayer does not support indices"); return false; } else { - unsigned int kernel_d = poolingAttrs.kernel[2]; - unsigned int stride_z = poolingAttrs.stride[2]; + unsigned int kernel_d = poolingAttrs.kernel[2]; + unsigned int stride_z = poolingAttrs.stride[2]; unsigned int pad_front = poolingAttrs.data_pad_begin[2]; - unsigned int pad_back = poolingAttrs.data_pad_end[2]; - pool3d_info->pool_type = pool_type; + unsigned int pad_back = poolingAttrs.data_pad_end[2]; + pool3d_info->pool_type = pool_type; pool3d_info->exclude_padding = exclude_padding; - pool3d_info->pool_size = arm_compute::Size3D(kernel_w, kernel_h, kernel_d); - pool3d_info->stride = arm_compute::Size3D(stride_x, stride_y, stride_z); - pool3d_info->padding = arm_compute::Padding3D(pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back); - pool3d_info->round_type = round; - arm_compute::Status s = arm_compute::NEPooling3dLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool3d_info); + pool3d_info->pool_size = arm_compute::Size3D(kernel_w, kernel_h, kernel_d); + pool3d_info->stride = arm_compute::Size3D(stride_x, stride_y, stride_z); + pool3d_info->padding = + arm_compute::Padding3D(pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back); + pool3d_info->round_type = round; + arm_compute::Status s = + arm_compute::NEPooling3dLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool3d_info); if (!s) { DEBUG_LOG("NEPooling3dLayer validation failed: ", s.error_description()); return false; } } } else { - pool_info->data_layout = dataLayout; - pool_info->pool_size = arm_compute::Size2D(kernel_w, kernel_h); - pool_info->pad_stride_info = arm_compute::PadStrideInfo(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, round); - pool_info->pool_type = pool_type; - pool_info->exclude_padding = exclude_padding; + pool_info->data_layout = dataLayout; + pool_info->pool_size = arm_compute::Size2D(kernel_w, kernel_h); + pool_info->pad_stride_info = + arm_compute::PadStrideInfo(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, round); + pool_info->pool_type = pool_type; + pool_info->exclude_padding = exclude_padding; if (dstDescsSize > 1) { TensorInfo indTensorInfo = TensorInfo(shapeCast(*indDims), 1, arm_compute::DataType::U32, dataLayout); - arm_compute::Status s = arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo); + arm_compute::Status s = + arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo); if (!s) { DEBUG_LOG("NEPoolingLayer validation with indices failed: ", s.error_description()); if (ignoreOutShapeErrors && @@ -112,9 +118,9 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo, } bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto srcDims = srcDescs[0]->getShape().getStaticDims(); auto dstDims = dstDescs[0]->getShape().getStaticDims(); @@ -124,10 +130,14 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, changeLayoutToNH_C({&srcShape, &dstShape}); } - TensorInfo srcTensorInfo = TensorInfo(srcShape, 1, - precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(dstShape, 1, - precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); + TensorInfo srcTensorInfo = TensorInfo(srcShape, + 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo dstTensorInfo = TensorInfo(dstShape, + 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); srcTensor.allocator()->init(srcTensorInfo); dstTensor.allocator()->init(dstTensorInfo); @@ -166,7 +176,9 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, nullptr)) return false; auto indDims = dstDescs[1]->getShape().getStaticDims(); - TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims), 1, precisionToAclDataType(dstDescs[1]->getPrecision()), + TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims), + 1, + precisionToAclDataType(dstDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[1])); indTensor.allocator()->init(indTensorInfo); exec_func = [this, pool_info]() -> std::unique_ptr { @@ -192,21 +204,27 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs, }; } } - configureThreadSafe([&] { ifunc = exec_func(); }); + configureThreadSafe([&] { + ifunc = exec_func(); + }); return true; } -void AclPoolingExecutor::exec(const std::vector& src, const std::vector& dst, std::unordered_map postOpsArgs) { +void AclPoolingExecutor::exec(const std::vector& src, + const std::vector& dst, + std::unordered_map postOpsArgs) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); - if (dst.size() > 1u) indTensor.allocator()->import_memory(dst[1]->getData()); + if (dst.size() > 1u) + indTensor.allocator()->import_memory(dst[1]->getData()); ifunc->run(); srcTensor.allocator()->free(); dstTensor.allocator()->free(); - if (dst.size() > 1u) indTensor.allocator()->free(); + if (dst.size() > 1u) + indTensor.allocator()->free(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp index 9f6b1bb0fcc668..75b3d28eecf4aa 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp @@ -4,8 +4,8 @@ #pragma once -#include "nodes/executors/pooling.hpp" #include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/pooling.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -18,7 +18,7 @@ class AclPoolingExecutor : public PoolingExecutor { bool init(const PoolingAttrs& poolingAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, std::unordered_map postOpsArgs) override; @@ -54,70 +54,72 @@ class AclPoolingExecutorBuilder : public PoolingExecutorBuilder { bool isSupported(const PoolingAttrs& poolingAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const override { - if ((srcDescs[0]->getPrecision() != ov::element::f32 && - dstDescs[0]->getPrecision() != ov::element::f32) && - (srcDescs[0]->getPrecision() != ov::element::f16 && - dstDescs[0]->getPrecision() != ov::element::f16)) { + if ((srcDescs[0]->getPrecision() != ov::element::f32 && dstDescs[0]->getPrecision() != ov::element::f32) && + (srcDescs[0]->getPrecision() != ov::element::f16 && dstDescs[0]->getPrecision() != ov::element::f16)) { DEBUG_LOG("AclPoolingExecutor does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } if (srcDescs.size() == 2u && - (srcDescs[1]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f32 && + (srcDescs[1]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f32 && dstDescs[0]->getPrecision() != ov::element::f32) && - (srcDescs[1]->getPrecision() != ov::element::f16 && - srcDescs[0]->getPrecision() != ov::element::f16 && + (srcDescs[1]->getPrecision() != ov::element::f16 && srcDescs[0]->getPrecision() != ov::element::f16 && dstDescs[0]->getPrecision() != ov::element::f16)) { DEBUG_LOG("AclPoolingExecutor does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " src[1]=", srcDescs[1]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " src[1]=", + srcDescs[1]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } - if (dstDescs.size() == 2u && - dstDescs[1]->getPrecision() != ov::element::u32) { + if (dstDescs.size() == 2u && dstDescs[1]->getPrecision() != ov::element::u32) { DEBUG_LOG("AclPoolingExecutor supports U32 as indices precisions only. ", - "Passed indices precision: ", dstDescs[1]->getPrecision()); - return false; - } + "Passed indices precision: ", + dstDescs[1]->getPrecision()); + return false; + } if (srcDescs[0]->getShape().getRank() < 5) { - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("NEPoolingLayer does not support layouts:", - " src=", srcDescs[0]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); - return false; - } + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("NEPoolingLayer does not support layouts:", + " src=", + srcDescs[0]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); + return false; + } if (srcDescs.size() == 2u && - !(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - srcDescs[1]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - srcDescs[1]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("NEPoolingLayer does not support layouts:", - " src[0]=", srcDescs[0]->serializeFormat(), - " src[1]=", srcDescs[1]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); - return false; - } + !(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && srcDescs[1]->hasLayoutType(LayoutType::ncsp) && + dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) && + dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("NEPoolingLayer does not support layouts:", + " src[0]=", + srcDescs[0]->serializeFormat(), + " src[1]=", + srcDescs[1]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); + return false; + } } else { - if (!(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { - DEBUG_LOG("Pooling3dLayer does not support layouts:", - " src=", srcDescs[0]->serializeFormat(), - " dst=", dstDescs[0]->serializeFormat()); - return false; - } + if (!(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("Pooling3dLayer does not support layouts:", + " src=", + srcDescs[0]->serializeFormat(), + " dst=", + dstDescs[0]->serializeFormat()); + return false; + } } return true; @@ -128,5 +130,5 @@ class AclPoolingExecutorBuilder : public PoolingExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp index e99747121cb623..5973027a0376cb 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp @@ -11,28 +11,31 @@ using namespace arm_compute; static arm_compute::ReductionOperation getAclReductionOperationByAlgorithm(Algorithm algorithm) { switch (algorithm) { - case Algorithm::ReduceMax: return arm_compute::ReductionOperation::MAX; - case Algorithm::ReduceMin: return arm_compute::ReductionOperation::MIN; - case Algorithm::ReduceSum: return arm_compute::ReductionOperation::SUM; - case Algorithm::ReduceProd: return arm_compute::ReductionOperation::PROD; - default: OPENVINO_THROW("Unsupported reduction operation: ", static_cast(algorithm)); + case Algorithm::ReduceMax: + return arm_compute::ReductionOperation::MAX; + case Algorithm::ReduceMin: + return arm_compute::ReductionOperation::MIN; + case Algorithm::ReduceSum: + return arm_compute::ReductionOperation::SUM; + case Algorithm::ReduceProd: + return arm_compute::ReductionOperation::PROD; + default: + OPENVINO_THROW("Unsupported reduction operation: ", static_cast(algorithm)); } } AclReduceExecutor::AclReduceExecutor(const ExecutorContext::CPtr context) : ReduceExecutor(context) {} bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { - if (reduceAttrs.operation != Algorithm::ReduceMax && - reduceAttrs.operation != Algorithm::ReduceMin && - reduceAttrs.operation != Algorithm::ReduceSum && - reduceAttrs.operation != Algorithm::ReduceProd && + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { + if (reduceAttrs.operation != Algorithm::ReduceMax && reduceAttrs.operation != Algorithm::ReduceMin && + reduceAttrs.operation != Algorithm::ReduceSum && reduceAttrs.operation != Algorithm::ReduceProd && reduceAttrs.operation != Algorithm::ReduceMean) { - DEBUG_LOG("Unknown reduce algorithm passed into AclReduceExecutor: ", static_cast(reduceAttrs.operation)); - return false; - } + DEBUG_LOG("Unknown reduce algorithm passed into AclReduceExecutor: ", static_cast(reduceAttrs.operation)); + return false; + } this->reduceAttrs = reduceAttrs; @@ -46,10 +49,14 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, changeLayoutToNH_C({&srcShape, &dstShape}); } - TensorInfo srcTensorInfo = TensorInfo(srcShape, 1, - precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo dstTensorInfo = TensorInfo(dstShape, 1, - precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); + TensorInfo srcTensorInfo = TensorInfo(srcShape, + 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo dstTensorInfo = TensorInfo(dstShape, + 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); srcTensor.allocator()->init(srcTensorInfo); dstTensor.allocator()->init(dstTensorInfo); @@ -57,54 +64,69 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs, std::function(void)> exec_func; std::vector castedAxes; for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - int axis = axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); - if (hasSrcNspcLayout && axis == -1) return false; + int axis = + axisCast(reduceAttrs.axes[i], srcDims.size(), hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); + if (hasSrcNspcLayout && axis == -1) + return false; castedAxes.push_back(axis); } switch (reduceAttrs.operation) { - case Algorithm::ReduceMean: { - for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - auto pos = axisCast(i, reduceAttrs.axes.size()); - axesMean.set(pos, castedAxes[i]); - } - Status reduceMeanStatus = NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo); - if (!reduceMeanStatus) { - DEBUG_LOG("NEReduceMean validation failed: ", reduceMeanStatus.error_description()); - return false; - } - exec_func = [this]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor); - return acl_op; - }; - break; + case Algorithm::ReduceMean: { + for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { + auto pos = axisCast(i, reduceAttrs.axes.size()); + axesMean.set(pos, castedAxes[i]); } - case Algorithm::ReduceMax: - case Algorithm::ReduceMin: - case Algorithm::ReduceSum: - case Algorithm::ReduceProd: { - Status reductionOperationStatus = NEReductionOperation::validate(&srcTensorInfo, &dstTensorInfo, castedAxes[0], - getAclReductionOperationByAlgorithm(reduceAttrs.operation), reduceAttrs.keepDims); - if (!reductionOperationStatus) { - DEBUG_LOG("NEReductionOperation validation with indices failed: ", reductionOperationStatus.error_description()); - return false; - } - exec_func = [this, castedAxes]() -> std::unique_ptr { - auto acl_op = std::make_unique(); - acl_op->configure(&srcTensor, &dstTensor, castedAxes[0], - getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), this->reduceAttrs.keepDims); - return acl_op; - }; - break; + Status reduceMeanStatus = + NEReduceMean::validate(&srcTensorInfo, axesMean, reduceAttrs.keepDims, &dstTensorInfo); + if (!reduceMeanStatus) { + DEBUG_LOG("NEReduceMean validation failed: ", reduceMeanStatus.error_description()); + return false; } - default: - OPENVINO_THROW("Unsupported operation type for ACL Reduce executor: ", static_cast(reduceAttrs.operation)); + exec_func = [this]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensor, axesMean, this->reduceAttrs.keepDims, &dstTensor); + return acl_op; + }; + break; + } + case Algorithm::ReduceMax: + case Algorithm::ReduceMin: + case Algorithm::ReduceSum: + case Algorithm::ReduceProd: { + Status reductionOperationStatus = + NEReductionOperation::validate(&srcTensorInfo, + &dstTensorInfo, + castedAxes[0], + getAclReductionOperationByAlgorithm(reduceAttrs.operation), + reduceAttrs.keepDims); + if (!reductionOperationStatus) { + DEBUG_LOG("NEReductionOperation validation with indices failed: ", + reductionOperationStatus.error_description()); + return false; + } + exec_func = [this, castedAxes]() -> std::unique_ptr { + auto acl_op = std::make_unique(); + acl_op->configure(&srcTensor, + &dstTensor, + castedAxes[0], + getAclReductionOperationByAlgorithm(this->reduceAttrs.operation), + this->reduceAttrs.keepDims); + return acl_op; + }; + break; + } + default: + OPENVINO_THROW("Unsupported operation type for ACL Reduce executor: ", static_cast(reduceAttrs.operation)); } - configureThreadSafe([&] { ifunc = exec_func(); }); + configureThreadSafe([&] { + ifunc = exec_func(); + }); return true; } -void AclReduceExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { +void AclReduceExecutor::exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); @@ -114,5 +136,5 @@ void AclReduceExecutor::exec(const std::vector& src, const std::vect dstTensor.allocator()->free(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp index 69bf6062918963..a121868bf80ba3 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.hpp @@ -20,10 +20,10 @@ class AclReduceExecutor : public ReduceExecutor { bool init(const ReduceAttrs& reduceAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) override; + const void* post_ops_data_) override; impl_desc_type getImplType() const override { return implType; @@ -46,33 +46,38 @@ class AclReduceExecutorBuilder : public ReduceExecutorBuilder { const std::vector& dstDescs) const override { if (reduceAttrs.operation == Algorithm::ReduceMean) { if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() || - (srcDescs[0]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f16)) { + (srcDescs[0]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f16)) { DEBUG_LOG("NEReduceMean does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } } else { if (srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision() || - (srcDescs[0]->getPrecision() != ov::element::f32 && - srcDescs[0]->getPrecision() != ov::element::f16 && - srcDescs[0]->getPrecision() != ov::element::i32)) { + (srcDescs[0]->getPrecision() != ov::element::f32 && srcDescs[0]->getPrecision() != ov::element::f16 && + srcDescs[0]->getPrecision() != ov::element::i32)) { DEBUG_LOG("NEReductionOperation does not support precisions:", - " src[0]=", srcDescs[0]->getPrecision(), - " dst[0]=", dstDescs[0]->getPrecision()); + " src[0]=", + srcDescs[0]->getPrecision(), + " dst[0]=", + dstDescs[0]->getPrecision()); return false; } } if (srcDescs[0]->getShape().getRank() >= arm_compute::MAX_DIMS) { - DEBUG_LOG("ACL supports ", arm_compute::MAX_DIMS, - " dimensions maximum. src[0] shape rank is ", srcDescs[0]->getShape().getRank()); + DEBUG_LOG("ACL supports ", + arm_compute::MAX_DIMS, + " dimensions maximum. src[0] shape rank is ", + srcDescs[0]->getShape().getRank()); return false; } auto srcShapeRank = srcDescs[0]->getShape().getRank(); bool hasSrcNspcLayout = srcDescs[0]->hasLayoutType(LayoutType::nspc); for (size_t i = 0; i < reduceAttrs.axes.size(); ++i) { - int axis = axisCast(reduceAttrs.axes[i], srcShapeRank, hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); + int axis = + axisCast(reduceAttrs.axes[i], srcShapeRank, hasSrcNspcLayout ? NHWC_TO_NCHW : NO_LAYOUT_CONVERSION); if (axis == -1) { DEBUG_LOG("Layout conversion to NHWC has failed"); return false; @@ -82,14 +87,12 @@ class AclReduceExecutorBuilder : public ReduceExecutorBuilder { return false; } } - if ((reduceAttrs.operation == Algorithm::ReduceSum || - reduceAttrs.operation == Algorithm::ReduceMax || - reduceAttrs.operation == Algorithm::ReduceMin || - reduceAttrs.operation == Algorithm::ReduceProd) && - reduceAttrs.axes.size() != 1) { - DEBUG_LOG("ACL supports single axes reduce only. Number of axes: ", reduceAttrs.axes.size()); - return false; - } + if ((reduceAttrs.operation == Algorithm::ReduceSum || reduceAttrs.operation == Algorithm::ReduceMax || + reduceAttrs.operation == Algorithm::ReduceMin || reduceAttrs.operation == Algorithm::ReduceProd) && + reduceAttrs.axes.size() != 1) { + DEBUG_LOG("ACL supports single axes reduce only. Number of axes: ", reduceAttrs.axes.size()); + return false; + } return true; } @@ -99,5 +102,5 @@ class AclReduceExecutorBuilder : public ReduceExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp index 801e50831b3bb1..dd16b333cb6b32 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp @@ -3,12 +3,13 @@ // #include "acl_transpose.hpp" + #include "acl_utils.hpp" -bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto inputOrder = transposeParams.permuteParams.order; if (inputOrder.empty()) { inputOrder.resize(srcDescs[0]->getShape().getRank()); @@ -24,7 +25,7 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar }; auto srcDims = changeLayoutToNhwc(srcDescs[0]->getShape().getStaticDims()); auto dstDims = changeLayoutToNhwc(dstDescs[0]->getShape().getStaticDims()); - for (int i = inputOrder.size() - 1; i >= 0 ; --i) { + for (int i = inputOrder.size() - 1; i >= 0; --i) { auto it = find(srcDims.rbegin(), srcDims.rend(), dstDims[i]); int index = it - srcDims.rbegin(); vec.push_back(index); @@ -46,10 +47,12 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar if (srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc)) { changeLayoutToNH_C({&srcDims, &dstDims}); } - auto srcTensorInfo = arm_compute::TensorInfo(srcDims, 1, + auto srcTensorInfo = arm_compute::TensorInfo(srcDims, + 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - auto dstTensorInfo = arm_compute::TensorInfo(dstDims, 1, + auto dstTensorInfo = arm_compute::TensorInfo(dstDims, + 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); arm_compute::Status status = arm_compute::NEPermute::validate(&srcTensorInfo, &dstTensorInfo, order); @@ -61,11 +64,13 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar dstTensor.allocator()->init(dstTensorInfo); acl_permute = std::make_unique(); - configureThreadSafe([&] { acl_permute->configure(&srcTensor, &dstTensor, order); }); + configureThreadSafe([&] { + acl_permute->configure(&srcTensor, &dstTensor, order); + }); return true; } -void ov::intel_cpu::ACLTransposeExecutor::exec(const std::vector &src, const std::vector &dst) { +void ov::intel_cpu::ACLTransposeExecutor::exec(const std::vector& src, const std::vector& dst) { srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp index 02a190597531ea..c6765aa1ff25f0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.hpp @@ -4,10 +4,9 @@ #pragma once -#include "nodes/executors/transpose.hpp" - -#include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/Tensor.h" +#include "nodes/executors/transpose.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -20,9 +19,12 @@ class ACLTransposeExecutor : public TransposeExecutor { bool init(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return impl_desc_type::acl; } + impl_desc_type implType() const override { + return impl_desc_type::acl; + } + private: arm_compute::Tensor srcTensor, dstTensor; std::unique_ptr acl_permute; @@ -33,13 +35,13 @@ class ACLTransposeExecutorBuilder : public TransposeExecutorBuilder { bool isSupported(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs) const override { - if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && - !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && - dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && dstDescs[0]->hasLayoutType(LayoutType::nspc))) { DEBUG_LOG("NEPermute does not support layout:", - " src: ", srcDescs[0]->serializeFormat(), - " dst: ", dstDescs[0]->serializeFormat()); + " src: ", + srcDescs[0]->serializeFormat(), + " dst: ", + dstDescs[0]->serializeFormat()); return false; } if (srcDescs[0]->getShape().getRank() > 4) { @@ -59,5 +61,5 @@ class ACLTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp index df57d29f4a44ec..6c3799da70bfda 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp @@ -3,6 +3,7 @@ // #include "acl_utils.hpp" + #include "support/Mutex.h" #include "utils/debug_capabilities.h" @@ -18,55 +19,55 @@ void configureThreadSafe(const std::function& config) { arm_compute::ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm, float alpha = 0.0, - float beta = 0.0, + float beta = 0.0, float gamma = 0.0) { switch (algorithm) { - case Algorithm::EltwiseRelu: - if (alpha == 0) { - return arm_compute::ActivationLayerInfo::ActivationFunction::RELU; - } else { - return {arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha}; - } - case Algorithm::EltwiseGeluErf: - return arm_compute::ActivationLayerInfo::ActivationFunction::GELU; - case Algorithm::EltwiseElu: - return {arm_compute::ActivationLayerInfo::ActivationFunction::ELU, alpha}; - case Algorithm::EltwiseTanh: - return {arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}; - case Algorithm::EltwiseSigmoid: - return arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC; - case Algorithm::EltwiseSqrt: - return arm_compute::ActivationLayerInfo::ActivationFunction::SQRT; - case Algorithm::EltwiseSoftRelu: - return arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU; - case Algorithm::EltwiseClamp: - return {arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, beta, alpha}; - case Algorithm::EltwiseSwish: - return {arm_compute::ActivationLayerInfo::ActivationFunction::SWISH, alpha}; - case Algorithm::EltwiseHswish: - return arm_compute::ActivationLayerInfo::ActivationFunction::HARD_SWISH; - default: - OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", static_cast(algorithm)); + case Algorithm::EltwiseRelu: + if (alpha == 0) { + return arm_compute::ActivationLayerInfo::ActivationFunction::RELU; + } else { + return {arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha}; + } + case Algorithm::EltwiseGeluErf: + return arm_compute::ActivationLayerInfo::ActivationFunction::GELU; + case Algorithm::EltwiseElu: + return {arm_compute::ActivationLayerInfo::ActivationFunction::ELU, alpha}; + case Algorithm::EltwiseTanh: + return {arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}; + case Algorithm::EltwiseSigmoid: + return arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC; + case Algorithm::EltwiseSqrt: + return arm_compute::ActivationLayerInfo::ActivationFunction::SQRT; + case Algorithm::EltwiseSoftRelu: + return arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU; + case Algorithm::EltwiseClamp: + return {arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, beta, alpha}; + case Algorithm::EltwiseSwish: + return {arm_compute::ActivationLayerInfo::ActivationFunction::SWISH, alpha}; + case Algorithm::EltwiseHswish: + return arm_compute::ActivationLayerInfo::ActivationFunction::HARD_SWISH; + default: + OPENVINO_THROW("Unsupported operation type for ACL Eltwise executor: ", static_cast(algorithm)); } } bool checkActivationLayerInfo(Algorithm algorithm) { switch (algorithm) { - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - return true; - default: - return false; + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + return true; + default: + return false; } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp index a3d151192e601b..1d30736353b878 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp @@ -3,19 +3,19 @@ // #pragma once -#include "memory_desc/cpu_memory_desc.h" #include "arm_compute/core/Types.h" #include "cpu_types.h" +#include "memory_desc/cpu_memory_desc.h" namespace ov { namespace intel_cpu { /** -* @brief ACL supports arm_compute::MAX_DIMS maximum. The method squashes the last -* dimensions in order to comply with this limitation -* @param dims vector of dimensions to squash -* @return vector of dimensions that complies to ACL -*/ + * @brief ACL supports arm_compute::MAX_DIMS maximum. The method squashes the last + * dimensions in order to comply with this limitation + * @param dims vector of dimensions to squash + * @return vector of dimensions that complies to ACL + */ inline VectorDims collapse_dims_to_max_rank(VectorDims dims, size_t max_num_shape = arm_compute::MAX_DIMS) { VectorDims result_dims(max_num_shape - 1); if (dims.size() >= max_num_shape) { @@ -32,17 +32,23 @@ inline VectorDims collapse_dims_to_max_rank(VectorDims dims, size_t max_num_shap } /** -* @brief ACL handles NH_C specifically, it thinks it is NC_W, so we need to change layout manually: -* e.g. NCHW (0, 1, 2, 3) -> NHWC (0, 2, 3, 1) -* @param _listDims list of dimensions to convert -* @return none -*/ - -inline void changeLayoutToNH_C(const std::vector &_listDims) { - auto mover = [](arm_compute::TensorShape &_shape) { - if (_shape.num_dimensions() > 4) { std::swap(_shape[2], _shape[3]); } - if (_shape.num_dimensions() > 3) { std::swap(_shape[1], _shape[2]); } - if (_shape.num_dimensions() > 2) { std::swap(_shape[0], _shape[1]); } + * @brief ACL handles NH_C specifically, it thinks it is NC_W, so we need to change layout manually: + * e.g. NCHW (0, 1, 2, 3) -> NHWC (0, 2, 3, 1) + * @param _listDims list of dimensions to convert + * @return none + */ + +inline void changeLayoutToNH_C(const std::vector& _listDims) { + auto mover = [](arm_compute::TensorShape& _shape) { + if (_shape.num_dimensions() > 4) { + std::swap(_shape[2], _shape[3]); + } + if (_shape.num_dimensions() > 3) { + std::swap(_shape[1], _shape[2]); + } + if (_shape.num_dimensions() > 2) { + std::swap(_shape[0], _shape[1]); + } }; for (auto& dims : _listDims) { @@ -51,10 +57,10 @@ inline void changeLayoutToNH_C(const std::vector &_li } /** -* @brief Return ComputeLibrary TensorShape with reverted layout schema used in ACL -* @param dims vector of dimensions to convert -* @return ComputeLibrary TensorShape object -*/ + * @brief Return ComputeLibrary TensorShape with reverted layout schema used in ACL + * @param dims vector of dimensions to convert + * @return ComputeLibrary TensorShape object + */ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) { arm_compute::TensorShape tensorShape; for (std::size_t i = 0; i < dims.size(); ++i) { @@ -67,20 +73,18 @@ inline arm_compute::TensorShape shapeCast(const VectorDims& dims) { return tensorShape; } -enum ACLAxisCastMode { - NO_LAYOUT_CONVERSION, - NHWC_TO_NCHW, - NCHW_TO_NHWC -}; +enum ACLAxisCastMode { NO_LAYOUT_CONVERSION, NHWC_TO_NCHW, NCHW_TO_NHWC }; /** -* @brief Return reverted axis used in ACL. If axis cast mode is -* @param axis axis that needs to be converted -* @param shapeSize size of the shape, which axis needs to be converted -* @param axisCastMode specifies whether layout conversion is required or not -* @return reverted axis -*/ -inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) { + * @brief Return reverted axis used in ACL. If axis cast mode is + * @param axis axis that needs to be converted + * @param shapeSize size of the shape, which axis needs to be converted + * @param axisCastMode specifies whether layout conversion is required or not + * @return reverted axis + */ +inline int axisCast(const std::size_t axis, + const std::size_t shapeSize, + ACLAxisCastMode axisCastMode = NO_LAYOUT_CONVERSION) { // CWHN (reverted NHWC) (0, 1, 2, 3) into WHCN (reverted NCHW) (1, 2, 0, 3) static const std::array nhwcToNchw = {1, 2, 0, 3}; // WHCN (reverted NCHW) (0, 1, 2, 3) into CWHN (reverted NHWC) (2, 0, 1, 3) @@ -92,80 +96,100 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis size_t revertedAxis = shapeSize - axis - 1; switch (axisCastMode) { - case NO_LAYOUT_CONVERSION: - return revertedAxis; - case NHWC_TO_NCHW: - if (shapeSize == 4) return nhwcToNchw[revertedAxis]; - if (shapeSize == 5) return ndhwcToNcdhw[revertedAxis]; - case NCHW_TO_NHWC: - if (shapeSize == 4) return nchwToNhwc[revertedAxis]; - if (shapeSize == 5) return ncdhwToNdhwc[revertedAxis]; - default: - return -1; + case NO_LAYOUT_CONVERSION: + return revertedAxis; + case NHWC_TO_NCHW: + if (shapeSize == 4) + return nhwcToNchw[revertedAxis]; + if (shapeSize == 5) + return ndhwcToNcdhw[revertedAxis]; + case NCHW_TO_NHWC: + if (shapeSize == 4) + return nchwToNhwc[revertedAxis]; + if (shapeSize == 5) + return ncdhwToNdhwc[revertedAxis]; + default: + return -1; } } /** -* @brief Return ComputeLibrary DataType that corresponds to the given precision -* @param precision precision to be converted -* @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType -*/ + * @brief Return ComputeLibrary DataType that corresponds to the given precision + * @param precision precision to be converted + * @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType + */ inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) { switch (precision) { - case ov::element::i8: return arm_compute::DataType::S8; - case ov::element::u8: return arm_compute::DataType::U8; - case ov::element::i16: return arm_compute::DataType::S16; - case ov::element::u16: return arm_compute::DataType::U16; - case ov::element::i32: return arm_compute::DataType::S32; - case ov::element::u32: return arm_compute::DataType::U32; - case ov::element::f16: return arm_compute::DataType::F16; - case ov::element::f32: return arm_compute::DataType::F32; - case ov::element::f64: return arm_compute::DataType::F64; - case ov::element::i64: return arm_compute::DataType::S64; - case ov::element::bf16: return arm_compute::DataType::BFLOAT16; - default: return arm_compute::DataType::UNKNOWN; + case ov::element::i8: + return arm_compute::DataType::S8; + case ov::element::u8: + return arm_compute::DataType::U8; + case ov::element::i16: + return arm_compute::DataType::S16; + case ov::element::u16: + return arm_compute::DataType::U16; + case ov::element::i32: + return arm_compute::DataType::S32; + case ov::element::u32: + return arm_compute::DataType::U32; + case ov::element::f16: + return arm_compute::DataType::F16; + case ov::element::f32: + return arm_compute::DataType::F32; + case ov::element::f64: + return arm_compute::DataType::F64; + case ov::element::i64: + return arm_compute::DataType::S64; + case ov::element::bf16: + return arm_compute::DataType::BFLOAT16; + default: + return arm_compute::DataType::UNKNOWN; } } /** -* @brief Return ComputeLibrary DataLayout that corresponds to MemoryDecs layout -* @param desc MemoryDecs from which layout is retrieved -* @param treatAs4D the flag that treats MemoryDecs as 4D shape -* @return ComputeLibrary DataLayout or UNKNOWN if MemoryDecs layout is not mapped to DataLayout -*/ + * @brief Return ComputeLibrary DataLayout that corresponds to MemoryDecs layout + * @param desc MemoryDecs from which layout is retrieved + * @param treatAs4D the flag that treats MemoryDecs as 4D shape + * @return ComputeLibrary DataLayout or UNKNOWN if MemoryDecs layout is not mapped to DataLayout + */ inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) { if (desc->hasLayoutType(LayoutType::ncsp)) { - if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NCHW; - if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW; + if (desc->getShape().getRank() <= 4) + return arm_compute::DataLayout::NCHW; + if (desc->getShape().getRank() == 5) + return arm_compute::DataLayout::NCDHW; } else if (desc->hasLayoutType(LayoutType::nspc)) { - if (desc->getShape().getRank() <= 4) return arm_compute::DataLayout::NHWC; - if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC; + if (desc->getShape().getRank() <= 4) + return arm_compute::DataLayout::NHWC; + if (desc->getShape().getRank() == 5) + return arm_compute::DataLayout::NDHWC; } return arm_compute::DataLayout::UNKNOWN; } /** -* @brief run thread-safe configure for ComputeLibrary configuration function. -* Arm Compute Library 23.08 does not officially support thread-safe configure() calls. -* For example, calling configure for Eltwise operations from multiple streams leads to a data race and seg fault. -* @param config ComputeLibrary configuration function -*/ + * @brief run thread-safe configure for ComputeLibrary configuration function. + * Arm Compute Library 23.08 does not officially support thread-safe configure() calls. + * For example, calling configure for Eltwise operations from multiple streams leads to a data race and seg fault. + * @param config ComputeLibrary configuration function + */ void configureThreadSafe(const std::function& config); /** -* @brief get ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. -* @param algorithm activation function of openvino representation -* @param alpha alpha coefficient for algorithm -* @param beta beta coefficient for algorithm -* @param gamma gamma coefficient for algorithm -*/ + * @brief get ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. + * @param algorithm activation function of openvino representation + * @param alpha alpha coefficient for algorithm + * @param beta beta coefficient for algorithm + * @param gamma gamma coefficient for algorithm + */ arm_compute::ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm, float alpha, float beta, float gamma); /** -* @brief check ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. -* @param algorithm activation function of openvino representation -*/ + * @brief check ARM Compute Library ActivationLayerInfo for Eltwise or PostOps. + * @param algorithm activation function of openvino representation + */ bool checkActivationLayerInfo(Algorithm algorithm); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index b3fe7018d23677..4b4b07df572b4a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -12,9 +12,8 @@ using namespace Xbyak_aarch64; using namespace dnnl::impl::cpu; using namespace dnnl::impl::cpu::aarch64; -void jit_uni_eltwise_kernel::operator()( - const node::jit_eltwise_call_args_ptrs* const_args, - const jit_eltwise_call_args_indexes* indexes) { +void jit_uni_eltwise_kernel::operator()(const node::jit_eltwise_call_args_ptrs* const_args, + const jit_eltwise_call_args_indexes* indexes) { assert(ker_); ker_(const_args, indexes); } @@ -23,12 +22,12 @@ template jit_uni_eltwise_generic::jit_uni_eltwise_generic(const jit_eltwise_params& jep, const std::vector& eltwise_data, const std::vector& ops_list, - const dnnl::post_ops& post_ops) : - jit_uni_eltwise_kernel(jep), - jit_generator(), - eltwise_data_(eltwise_data), - ops_list_(ops_list), - post_ops_(post_ops) {} + const dnnl::post_ops& post_ops) + : jit_uni_eltwise_kernel(jep), + jit_generator(), + eltwise_data_(eltwise_data), + ops_list_(ops_list), + post_ops_(post_ops) {} template void jit_uni_eltwise_generic::generate() { @@ -41,7 +40,7 @@ void jit_uni_eltwise_generic::generate() { post_op_emitters.push_back(create_eltwise_emitter(eltwise_data_[i], exec_prc)); } - const auto &jep = jep_; + const auto& jep = jep_; XReg param2 = abi_param2; const int offset_count = jep.input_size - 1; @@ -49,10 +48,15 @@ void jit_uni_eltwise_generic::generate() { // ptrs initializing if (jep.use_runtime_ptrs) { for (size_t i = 0; i < jep.inputs_number; i++) { - ldr(start_to_offsets, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_offsets) + i * sizeof(size_t)))); - ldr(get_src_reg(i), ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr[0]) + i * sizeof(size_t)))); - XReg offset_reg = get_aux_gpr(0); // X_TMP_0; - XReg index_reg = get_aux_gpr(1); // X_TMP_1; + ldr(start_to_offsets, + ptr(reg_const_params, + static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_offsets) + + i * sizeof(size_t)))); + ldr(get_src_reg(i), + ptr(reg_const_params, + static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr[0]) + i * sizeof(size_t)))); + XReg offset_reg = get_aux_gpr(0); // X_TMP_0; + XReg index_reg = get_aux_gpr(1); // X_TMP_1; for (int j = 0; j < offset_count; j++) { ldr(offset_reg, ptr(start_to_offsets, static_cast(j * sizeof(size_t)))); ldr(index_reg, ptr(reg_indexes, static_cast(j * sizeof(size_t)))); @@ -60,10 +64,11 @@ void jit_uni_eltwise_generic::generate() { } } - ldr(start_to_offsets, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_offsets)))); + ldr(start_to_offsets, + ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_offsets)))); ldr(reg_dst, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_ptr)))); - XReg offset_reg = get_aux_gpr(0); // X_TMP_0; - XReg index_reg = get_aux_gpr(1); // X_TMP_1; + XReg offset_reg = get_aux_gpr(0); // X_TMP_0; + XReg index_reg = get_aux_gpr(1); // X_TMP_1; for (int j = 0; j < offset_count; j++) { ldr(offset_reg, ptr(start_to_offsets, static_cast(j * sizeof(size_t)))); ldr(index_reg, ptr(reg_indexes, static_cast(j * sizeof(size_t)))); @@ -72,7 +77,8 @@ void jit_uni_eltwise_generic::generate() { mov(reg_oc_off, 0); - ldr(reg_work_amount, ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, work_amount)))); + ldr(reg_work_amount, + ptr(reg_const_params, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, work_amount)))); } else { auto init_ptrs_with_offsets = [this, offset_count, param2](XReg pointer, const std::vector& offsets) { for (int j = 0; j < offset_count; j++) { @@ -88,7 +94,9 @@ void jit_uni_eltwise_generic::generate() { }; for (size_t i = 0; i < jep.inputs_number; i++) { - ldr(get_src_reg(i), ptr(param1, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); + ldr(get_src_reg(i), + ptr(param1, + static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]); } @@ -149,7 +157,12 @@ void jit_uni_eltwise_generic::generate() { for (size_t j = 0; j < min_src_size / vec_step; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) { - load_vector(get_vmm_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, false, j * vec_step * jep.src_prc[i].size()); + load_vector(get_vmm_reg(i), + get_src_reg(i), + jep.src_prc[i], + exec_prc, + false, + j * vec_step * jep.src_prc[i].size()); } } @@ -164,7 +177,11 @@ void jit_uni_eltwise_generic::generate() { for (size_t j = tail_start; j < min_src_size; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) { - load_scalar(get_scl_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, j * jep.src_prc[i].size()); + load_scalar(get_scl_reg(i), + get_src_reg(i), + jep.src_prc[i], + exec_prc, + j * jep.src_prc[i].size()); } } @@ -276,7 +293,7 @@ namespace utils { template void load_vector(const T1& data_lane, const T2& data_lanes, - const Xbyak_aarch64::XReg &ptr_reg, + const Xbyak_aarch64::XReg& ptr_reg, const int64_t offset, const bool broadcast, jit_generator* h) { @@ -296,7 +313,7 @@ void load_vector(const T1& data_lane, } } } -} // namespace utils +} // namespace utils template void jit_uni_eltwise_generic::load_vector(const TReg& data, @@ -306,62 +323,63 @@ void jit_uni_eltwise_generic::load_vector(const TReg& data, const bool broadcast, const int32_t ptr_offset) { switch (src_prc) { - case ov::element::f16: { - utils::load_vector(data.h, data.h4, ptr_reg, ptr_offset, broadcast, this); - break; - } - case ov::element::f32: - case ov::element::i32: { - if (broadcast) { - jit_generator::uni_ld1rw(data.s, ptr_reg, ptr_offset); - } else { - jit_generator::uni_ldr(data, ptr_reg, ptr_offset); - } - break; - } - case ov::element::i8: { - utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); - sshll(data.h8, data.b8, 0); - sshll(data.s4, data.h4, 0); - break; - } - case ov::element::u8: { - utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); - ushll(data.h8, data.b8, 0); - ushll(data.s4, data.h4, 0); - break; - } - default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + case ov::element::f16: { + utils::load_vector(data.h, data.h4, ptr_reg, ptr_offset, broadcast, this); + break; + } + case ov::element::f32: + case ov::element::i32: { + if (broadcast) { + jit_generator::uni_ld1rw(data.s, ptr_reg, ptr_offset); + } else { + jit_generator::uni_ldr(data, ptr_reg, ptr_offset); } + break; + } + case ov::element::i8: { + utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); + sshll(data.h8, data.b8, 0); + sshll(data.s4, data.h4, 0); + break; + } + case ov::element::u8: { + utils::load_vector(data.b, data.s, ptr_reg, ptr_offset, broadcast, this); + ushll(data.h8, data.b8, 0); + ushll(data.s4, data.h4, 0); + break; + } + default: { + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + } } if (dst_prc != src_prc) { switch (dst_prc) { - case ov::element::f32: - switch (src_prc) { - case ov::element::f16: { - fcvtl(data.s4, data.h4); - break; - } - case ov::element::i32: { - scvtf(data.s, data.s); - break; - } - case ov::element::i8: { - scvtf(data.s, data.s); - break; - } - case ov::element::u8: { - ucvtf(data.s, data.s); - break; - } - default: - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); - } + case ov::element::f32: + switch (src_prc) { + case ov::element::f16: { + fcvtl(data.s4, data.h4); + break; + } + case ov::element::i32: { + scvtf(data.s, data.s); break; + } + case ov::element::i8: { + scvtf(data.s, data.s); + break; + } + case ov::element::u8: { + ucvtf(data.s, data.s); + break; + } default: - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + + dst_prc.to_string()); + } + break; + default: + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); } } } @@ -373,61 +391,62 @@ void jit_uni_eltwise_generic::load_scalar(const SReg& data, const ov::element::Type& dst_prc, const int32_t ptr_offset) { switch (src_prc) { - case ov::element::f16: { - ldr(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::f32: - case ov::element::i32: { - ldr(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i8: { - ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + case ov::element::f16: { + ldr(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::f32: + case ov::element::i32: { + ldr(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i8: { + ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - // scalar is loaded, operates with vector - TReg vec(data.getIdx()); - sshll(vec.h8, vec.b8, 0); - sshll(vec.s4, vec.h4, 0); - break; - } - case ov::element::u8: { - ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + // scalar is loaded, operates with vector + TReg vec(data.getIdx()); + sshll(vec.h8, vec.b8, 0); + sshll(vec.s4, vec.h4, 0); + break; + } + case ov::element::u8: { + ldr(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - // scalar is loaded, operates with vector - TReg vec(data.getIdx()); - ushll(vec.h8, vec.b8, 0); - ushll(vec.s4, vec.h4, 0); - break; - } - default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); - } + // scalar is loaded, operates with vector + TReg vec(data.getIdx()); + ushll(vec.h8, vec.b8, 0); + ushll(vec.s4, vec.h4, 0); + break; + } + default: { + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + } } if (dst_prc != src_prc) { switch (dst_prc) { - case ov::element::f32: - switch (src_prc) { - case ov::element::f16: { - fcvt(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::HReg(data.getIdx())); - break; - } - case ov::element::i32: - case ov::element::i8: { - scvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); - break; - } - case ov::element::u8: { - ucvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); - break; - } - default: - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); - } + case ov::element::f32: + switch (src_prc) { + case ov::element::f16: { + fcvt(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::HReg(data.getIdx())); + break; + } + case ov::element::i32: + case ov::element::i8: { + scvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); + break; + } + case ov::element::u8: { + ucvtf(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::SReg(data.getIdx())); break; + } default: - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + + dst_prc.to_string()); + } + break; + default: + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); } } } @@ -440,58 +459,59 @@ void jit_uni_eltwise_generic::store_vector(const XReg& ptr, const int32_t ptr_offset) { if (src_prc != dst_prc) { switch (src_prc) { - case ov::element::f32: { - switch (dst_prc) { - case ov::element::f16: { - fcvtn(data.h4, data.s4); - break; - } - case ov::element::i32: { - fcvtns(data.s, data.s); - break; - } - case ov::element::i8: { - fcvtms(data.s, data.s); - xtn(data.h4, data.s4); - xtn(data.b8, data.h8); - break; - } - case ov::element::u8: { - fcvtmu(data.s, data.s); - xtn(data.h4, data.s4); - xtn(data.b8, data.h8); - break; - } - default: { - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); - } - } + case ov::element::f32: { + switch (dst_prc) { + case ov::element::f16: { + fcvtn(data.h4, data.s4); + break; + } + case ov::element::i32: { + fcvtns(data.s, data.s); + break; + } + case ov::element::i8: { + fcvtms(data.s, data.s); + xtn(data.h4, data.s4); + xtn(data.b8, data.h8); + break; + } + case ov::element::u8: { + fcvtmu(data.s, data.s); + xtn(data.h4, data.s4); + xtn(data.b8, data.h8); break; } default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + + src_prc.to_string()); + } } - } - } - - switch (dst_prc) { - case ov::element::f16: { - str(Xbyak_aarch64::DReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::f32: - case ov::element::i32: { - str(Xbyak_aarch64::QReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i8: - case ov::element::u8: { - str(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); break; } default: { - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_ptr is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); } + } + } + + switch (dst_prc) { + case ov::element::f16: { + str(Xbyak_aarch64::DReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::f32: + case ov::element::i32: { + str(Xbyak_aarch64::QReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i8: + case ov::element::u8: { + str(Xbyak_aarch64::SReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + default: { + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_ptr is " + src_prc.to_string()); + } } } @@ -503,99 +523,94 @@ void jit_uni_eltwise_generic::store_scalar(const XReg& ptr, const int32_t ptr_offset) { if (src_prc != dst_prc) { switch (src_prc) { - case ov::element::f32: { - switch (dst_prc) { - case ov::element::f16: { - fcvt(Xbyak_aarch64::HReg(data.getIdx()), data); - break; - } - case ov::element::i32: { - fcvtns(data, data); - break; - } - case ov::element::i8: { - TReg vec_data(data.getIdx()); - fcvtms(vec_data.s, vec_data.s); - xtn(vec_data.h4, vec_data.s4); - xtn(vec_data.b8, vec_data.h8); - break; - } - case ov::element::u8: { - TReg vec_data(data.getIdx()); - fcvtmu(vec_data.s, vec_data.s); - xtn(vec_data.h4, vec_data.s4); - xtn(vec_data.b8, vec_data.h8); - break; - } - default: { - OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); - } - } + case ov::element::f32: { + switch (dst_prc) { + case ov::element::f16: { + fcvt(Xbyak_aarch64::HReg(data.getIdx()), data); + break; + } + case ov::element::i32: { + fcvtns(data, data); + break; + } + case ov::element::i8: { + TReg vec_data(data.getIdx()); + fcvtms(vec_data.s, vec_data.s); + xtn(vec_data.h4, vec_data.s4); + xtn(vec_data.b8, vec_data.h8); + break; + } + case ov::element::u8: { + TReg vec_data(data.getIdx()); + fcvtmu(vec_data.s, vec_data.s); + xtn(vec_data.h4, vec_data.s4); + xtn(vec_data.b8, vec_data.h8); break; } default: { - OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); + OPENVINO_THROW("dst_prc " + dst_prc.to_string() + " is not supported, src_prc is " + + src_prc.to_string()); + } } - } - } - - switch (dst_prc) { - case ov::element::f16: { - str(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i32: - case ov::element::f32: { - str(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); - break; - } - case ov::element::i8: - case ov::element::u8: { - str(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); break; } default: { - OPENVINO_THROW("dst_prc " + src_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + OPENVINO_THROW("src_prc " + src_prc.to_string() + " is not supported, dst_prc is " + dst_prc.to_string()); } + } + } + + switch (dst_prc) { + case ov::element::f16: { + str(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i32: + case ov::element::f32: { + str(data, Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + case ov::element::i8: + case ov::element::u8: { + str(Xbyak_aarch64::BReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, ptr_offset)); + break; + } + default: { + OPENVINO_THROW("dst_prc " + src_prc.to_string() + " is not supported, src_prc is " + src_prc.to_string()); + } } } struct EltwiseEmitterContext { std::shared_ptr emitter; - dnnl::impl::cpu::aarch64::jit_generator *host; + dnnl::impl::cpu::aarch64::jit_generator* host; dnnl::impl::cpu::aarch64::cpu_isa_t host_isa; const EltwiseData& opData; ov::element::Type exec_prc; }; -template +template struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { - ctx.emitter = std::make_shared(ctx.host, - ctx.host_isa, - ctx.opData.alpha, - ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { - ctx.emitter = std::make_shared(ctx.host, - ctx.host_isa, - ctx.opData.alpha, - ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, @@ -606,7 +621,7 @@ struct EltwiseEmitter { } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, @@ -618,7 +633,7 @@ struct EltwiseEmitter { } }; -template<> +template <> struct EltwiseEmitter { void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, @@ -630,57 +645,56 @@ struct EltwiseEmitter { }; template -std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitter(const EltwiseData& data, const ov::element::Type& exec_prec) { - EltwiseEmitterContext ctx = { - nullptr, - this, - isa, - data, - exec_prec - }; - - OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo, - OV_CASE(Algorithm::EltwiseAbs, ov::intel_cpu::aarch64::jit_abs_emitter), - OV_CASE(Algorithm::EltwiseAdd, ov::intel_cpu::aarch64::jit_add_emitter), - OV_CASE(Algorithm::EltwiseClamp, ov::intel_cpu::aarch64::jit_clamp_emitter), - OV_CASE(Algorithm::EltwiseDivide, ov::intel_cpu::aarch64::jit_divide_emitter), - OV_CASE(Algorithm::EltwiseElu, ov::intel_cpu::aarch64::jit_elu_emitter), - OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), - OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), - OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), - OV_CASE(Algorithm::EltwiseFloorMod, ov::intel_cpu::aarch64::jit_floor_mod_emitter), - OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, ov::intel_cpu::aarch64::jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, ov::intel_cpu::aarch64::jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, ov::intel_cpu::aarch64::jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalOr, ov::intel_cpu::aarch64::jit_logical_or_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, ov::intel_cpu::aarch64::jit_logical_not_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, ov::intel_cpu::aarch64::jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, ov::intel_cpu::aarch64::jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseMaximum, ov::intel_cpu::aarch64::jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, ov::intel_cpu::aarch64::jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseMish, ov::intel_cpu::aarch64::jit_mish_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, ov::intel_cpu::aarch64::jit_gelu_erf_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, ov::intel_cpu::aarch64::jit_gelu_tanh_emitter), - OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter), - OV_CASE(Algorithm::EltwisePrelu, ov::intel_cpu::aarch64::jit_prelu_emitter), - OV_CASE(Algorithm::EltwiseRelu, ov::intel_cpu::aarch64::jit_relu_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, ov::intel_cpu::aarch64::jit_round_half_away_from_zero_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfToEven, ov::intel_cpu::aarch64::jit_round_half_to_even_emitter), - OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), - OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), - OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); +std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitter(const EltwiseData& data, + const ov::element::Type& exec_prec) { + EltwiseEmitterContext ctx = {nullptr, this, isa, data, exec_prec}; + + OV_SWITCH( + intel_cpu, + EltwiseEmitter, + ctx, + data.algo, + OV_CASE(Algorithm::EltwiseAbs, ov::intel_cpu::aarch64::jit_abs_emitter), + OV_CASE(Algorithm::EltwiseAdd, ov::intel_cpu::aarch64::jit_add_emitter), + OV_CASE(Algorithm::EltwiseClamp, ov::intel_cpu::aarch64::jit_clamp_emitter), + OV_CASE(Algorithm::EltwiseDivide, ov::intel_cpu::aarch64::jit_divide_emitter), + OV_CASE(Algorithm::EltwiseElu, ov::intel_cpu::aarch64::jit_elu_emitter), + OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), + OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), + OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, ov::intel_cpu::aarch64::jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, ov::intel_cpu::aarch64::jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, ov::intel_cpu::aarch64::jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, ov::intel_cpu::aarch64::jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, ov::intel_cpu::aarch64::jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, ov::intel_cpu::aarch64::jit_logical_not_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, ov::intel_cpu::aarch64::jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, ov::intel_cpu::aarch64::jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseMaximum, ov::intel_cpu::aarch64::jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, ov::intel_cpu::aarch64::jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseMish, ov::intel_cpu::aarch64::jit_mish_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, ov::intel_cpu::aarch64::jit_gelu_erf_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, ov::intel_cpu::aarch64::jit_gelu_tanh_emitter), + OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter), + OV_CASE(Algorithm::EltwisePrelu, ov::intel_cpu::aarch64::jit_prelu_emitter), + OV_CASE(Algorithm::EltwiseRelu, ov::intel_cpu::aarch64::jit_relu_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, ov::intel_cpu::aarch64::jit_round_half_away_from_zero_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, ov::intel_cpu::aarch64::jit_round_half_to_even_emitter), + OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, ov::intel_cpu::aarch64::jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSubtract, ov::intel_cpu::aarch64::jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseSwish, ov::intel_cpu::aarch64::jit_swish_emitter), + OV_CASE(Algorithm::EltwiseTanh, ov::intel_cpu::aarch64::jit_tanh_emitter)); if (!ctx.emitter) OPENVINO_THROW("Unsupported operation type '" + algToString(data.algo) + "' for Eltwise emitter"); @@ -746,16 +760,16 @@ void jit_uni_eltwise_generic::apply_post_ops() { namespace { -template +template struct SupportedPrecisions { - void operator()(std::set> &precisions) { + void operator()(std::set>& precisions) { precisions = T::get_supported_precisions(); } }; static void set_intersection(const std::set>& precisions1, - const std::set>& precisions2, - std::set>& intersection) { + const std::set>& precisions2, + std::set>& intersection) { std::map intersection_types; for (auto it1 = precisions1.begin(); it1 != precisions1.end(); ++it1) { @@ -773,7 +787,7 @@ static void set_intersection(const std::set>& precisi intersection.insert(std::vector(it->second, it->first)); } } -} // namespace +} // namespace ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_number, const ov::element::Type (&src_prc)[MAX_ELTWISE_INPUTS], @@ -792,16 +806,14 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu supported_precision_intersection = prcs_intersect; } - static const element::Type exec_precisions_priority[] = { - element::f16, - element::f32 - }; + static const element::Type exec_precisions_priority[] = {element::f16, element::f32}; for (const auto prc : exec_precisions_priority) { - if (std::any_of( - supported_precision_intersection.begin(), - supported_precision_intersection.end(), - [&prc](const std::vector& precisions) { return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); })) { + if (std::any_of(supported_precision_intersection.begin(), + supported_precision_intersection.end(), + [&prc](const std::vector& precisions) { + return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); + })) { exec_prc = prc; break; } @@ -824,48 +836,51 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { std::set> precisions; - OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo, - OV_CASE(Algorithm::EltwiseRelu, jit_relu_emitter), - OV_CASE(Algorithm::EltwiseAbs, jit_abs_emitter), - OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), - OV_CASE(Algorithm::EltwiseClamp, jit_clamp_emitter), - OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), - OV_CASE(Algorithm::EltwiseElu, jit_elu_emitter), - OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), - OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), - OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), - OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), - OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), - OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseHswish, jit_hswish_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseMish, jit_mish_emitter), - OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), - OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_round_half_away_from_zero_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_round_half_to_even_emitter), - OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), - OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), - OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); + OV_SWITCH(intel_cpu, + SupportedPrecisions, + precisions, + algo, + OV_CASE(Algorithm::EltwiseRelu, jit_relu_emitter), + OV_CASE(Algorithm::EltwiseAbs, jit_abs_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), + OV_CASE(Algorithm::EltwiseClamp, jit_clamp_emitter), + OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), + OV_CASE(Algorithm::EltwiseElu, jit_elu_emitter), + OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), + OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), + OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseHswish, jit_hswish_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseMish, jit_mish_emitter), + OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_round_half_away_from_zero_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_round_half_to_even_emitter), + OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_sqrt_emitter), + OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseSwish, jit_swish_emitter), + OV_CASE(Algorithm::EltwiseTanh, jit_tanh_emitter)); if (precisions.empty()) OPENVINO_THROW("Unsupported operation type for Eltwise emitter"); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp index 8f18a9815b4fe4..1bf64d096e4a84 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp @@ -4,10 +4,11 @@ #pragma once -#include - #include + #include +#include + #include "nodes/executors/eltwise.hpp" // TODO: handle x64 headers more accurate and remove undef later @@ -24,12 +25,11 @@ #include -#include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" - -#include "emitters/plugin/aarch64/jit_emitter.hpp" #include "emitters/plugin/aarch64/jit_eltwise_emitters.hpp" +#include "emitters/plugin/aarch64/jit_emitter.hpp" #include "nodes/kernels/jit_eltwise_call_args_ptrs.hpp" +#include "utils/cpu_utils.hpp" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { @@ -154,7 +154,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { OPENVINO_THROW("source vector ptr register " + std::to_string(idx) + " is not supported"); } - static const std::vector src_gprs = { 19, 20, 21, 22, 25, 26, 27 }; + static const std::vector src_gprs = {19, 20, 21, 22, 25, 26, 27}; return XReg(src_gprs[idx]); } @@ -192,8 +192,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { // 24 | src // 25-31 | [not used] - - TReg vmm_dst {9}; + TReg vmm_dst{9}; inline TReg get_vmm_reg(const uint32_t idx) { if (idx > MAX_ELTWISE_INPUTS) { @@ -230,10 +229,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { const int32_t ptr_offset = 0); void store_vector(const XReg& ptr, - const TReg& data, - const ov::element::Type& src_prc, - const ov::element::Type& dst_prc, - const int32_t ptr_offset = 0); + const TReg& data, + const ov::element::Type& src_prc, + const ov::element::Type& dst_prc, + const int32_t ptr_offset = 0); void store_scalar(const XReg& ptr, const SReg& data, @@ -264,6 +263,6 @@ class eltwise_precision_helper { static std::set> get_supported_precisions(const Algorithm& algo); }; -} // namespace aarch64 -} // namespace intel_cpu -} // namespace ov +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp index 0ae6a4ad2c45ff..28e17854f46b08 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.cpp @@ -6,102 +6,95 @@ namespace ov { namespace intel_cpu { - GemmKernel::GemmKernel(size_t M, - size_t N, - size_t K, - bool b_transposed, - ov::element::Type inType) +GemmKernel::GemmKernel(size_t M, size_t N, size_t K, bool b_transposed, ov::element::Type inType) : M(M), N(N), K(K), b_transposed(b_transposed) { - if (!one_of(inType, ov::element::f32, ov::element::f16, ov::element::bf16)) - THROW_ERROR("brgemm kernel only supports bf16, f16 and f32"); - - if (inType == ov::element::f32) { - format = arm_compute::Format::F32; - } else if (inType == ov::element::f16) { - format = arm_compute::Format::F16; - } else if (inType == ov::element::bf16) { - format = arm_compute::Format::BFLOAT16; - } - + if (!one_of(inType, ov::element::f32, ov::element::f16, ov::element::bf16)) + THROW_ERROR("brgemm kernel only supports bf16, f16 and f32"); + + if (inType == ov::element::f32) { + format = arm_compute::Format::F32; + } else if (inType == ov::element::f16) { + format = arm_compute::Format::F16; + } else if (inType == ov::element::bf16) { + format = arm_compute::Format::BFLOAT16; + } - aclGemmKernel = std::make_unique(); + aclGemmKernel = std::make_unique(); +} + +arm_compute::Status GemmKernel::executeGemm(void* a, + void* b, + arm_compute::TensorInfo& dstInfo, + arm_compute::Tensor& dstTensor, + arm_compute::Strides aStrides, + arm_compute::Strides bStrides, + void* c, + float alpha, + float beta, + arm_compute::Strides* outStrides, + void* out) { + aInfo.init(shapeCast({M, N}), + format, + aStrides, + size_t(0), + (size_t)(M * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); + + arm_compute::TensorShape bShape; + if (b_transposed) + bShape = shapeCast({K, N}); + else + bShape = shapeCast({N, K}); + + bInfo.init(bShape, + format, + bStrides, + size_t(0), + (size_t)(K * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); + + aTensor.allocator()->init(aInfo); + bTensor.allocator()->init(bInfo); + + if (c != nullptr) { + cInfo.init(shapeCast({M, K}), format); + cTensor.allocator()->init(cInfo); } - arm_compute::Status GemmKernel::executeGemm(void *a, - void *b, - arm_compute::TensorInfo& dstInfo, - arm_compute::Tensor& dstTensor, - arm_compute::Strides aStrides, - arm_compute::Strides bStrides, - void *c, - float alpha, - float beta, - arm_compute::Strides* outStrides, - void* out) { - aInfo.init( - shapeCast({M, N}), + if (outStrides != nullptr) + dstInfo.init( + shapeCast({M, K}), format, - aStrides, + *outStrides, size_t(0), - (size_t)(M * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); - - arm_compute::TensorShape bShape; - if (b_transposed) - bShape = shapeCast({K, N}); - else - bShape = shapeCast({N, K}); - - bInfo.init( - bShape, - format, - bStrides, - size_t(0), - (size_t)(K * N * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); - - aTensor.allocator()->init(aInfo); - bTensor.allocator()->init(bInfo); - - if (c != nullptr) { - cInfo.init(shapeCast({M, K}), format); - cTensor.allocator()->init(cInfo); - } - - if (outStrides != nullptr) - dstInfo.init( - shapeCast({M, K}), - format, - *outStrides, - size_t(0), - (size_t)(M * K * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); - else - dstInfo.init(shapeCast({M, K}), format); - - dstTensor.allocator()->init(dstInfo); - - aTensor.allocator()->import_memory(reinterpret_cast(a)); - bTensor.allocator()->import_memory(reinterpret_cast(b)); - cTensor.allocator()->import_memory(reinterpret_cast(c)); - - if (out == nullptr) - dstTensor.allocator()->allocate(); - else - dstTensor.allocator()->import_memory(out); - - if (b_transposed) - aclGemmInfo.set_pretranspose_B(true); - - auto status = aclGemmKernel->validate(&aInfo, &bInfo, &cInfo, &dstInfo, 1.0, 0.0, aclGemmInfo); - - if (c == nullptr) - aclGemmKernel->configure(&aTensor, &bTensor, nullptr, &dstTensor, alpha, beta, aclGemmInfo); - else - aclGemmKernel->configure(&aTensor, &bTensor, &cTensor, &dstTensor, alpha, beta, aclGemmInfo); - aclGemmKernel->run(); - - return status; - } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file + (size_t)(M * K * arm_compute::element_size_from_data_type(arm_compute::data_type_from_format(format)))); + else + dstInfo.init(shapeCast({M, K}), format); + + dstTensor.allocator()->init(dstInfo); + + aTensor.allocator()->import_memory(reinterpret_cast(a)); + bTensor.allocator()->import_memory(reinterpret_cast(b)); + cTensor.allocator()->import_memory(reinterpret_cast(c)); + + if (out == nullptr) + dstTensor.allocator()->allocate(); + else + dstTensor.allocator()->import_memory(out); + + if (b_transposed) + aclGemmInfo.set_pretranspose_B(true); + + auto status = aclGemmKernel->validate(&aInfo, &bInfo, &cInfo, &dstInfo, 1.0, 0.0, aclGemmInfo); + + if (c == nullptr) + aclGemmKernel->configure(&aTensor, &bTensor, nullptr, &dstTensor, alpha, beta, aclGemmInfo); + else + aclGemmKernel->configure(&aTensor, &bTensor, &cTensor, &dstTensor, alpha, beta, aclGemmInfo); + aclGemmKernel->run(); + + return status; +} +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp index 620f42f239cbbb..06a26743e0b2a4 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/acl/gemm_kernel.hpp @@ -4,21 +4,17 @@ #pragma once #include #include -#include "nodes/executors/acl/acl_utils.hpp" -#include "utils/general_utils.h" -#include "arm_compute/runtime/NEON/NEFunctions.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/NEFunctions.h" +#include "nodes/executors/acl/acl_utils.hpp" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { class GemmKernel { public: - GemmKernel(size_t M, - size_t N, - size_t K, - bool b_transposed = false, - ov::element::Type inType = ov::element::f32); + GemmKernel(size_t M, size_t N, size_t K, bool b_transposed = false, ov::element::Type inType = ov::element::f32); arm_compute::Status executeGemm(void* a, void* b, @@ -48,5 +44,5 @@ class GemmKernel { arm_compute::GEMMInfo aclGemmInfo; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.cpp index 746b556314ce3a..f63981078616d5 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.cpp @@ -1,20 +1,19 @@ // Copyright (C) 2020-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 - #include "convert_group_conv.hpp" #include - -#include "openvino/opsets/opset1.hpp" #include + #include "openvino/core/rt_info.hpp" +#include "openvino/opsets/opset1.hpp" ov::intel_cpu::ConvertGroupConvolution::ConvertGroupConvolution() { auto gconv = ov::pass::pattern::wrap_type(); ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher& m) { - enum Inputs {Data, Weights}; + enum Inputs { Data, Weights }; auto gconv = std::dynamic_pointer_cast(m.get_match_root()); if (!gconv) { return false; @@ -26,31 +25,32 @@ ov::intel_cpu::ConvertGroupConvolution::ConvertGroupConvolution() { // Weights layout GOIYX int64_t groups = gconv->get_input_shape(Inputs::Weights)[0]; - if (data_shape[channel_axis].is_dynamic() || - output_shape[channel_axis].is_dynamic()) { + if (data_shape[channel_axis].is_dynamic() || output_shape[channel_axis].is_dynamic()) { return false; } if (groups == data_shape[channel_axis].get_length() && - groups == output_shape[channel_axis].get_length()) { // depthwise case + groups == output_shape[channel_axis].get_length()) { // depthwise case return false; } ov::NodeVector replace_nodes; - auto split_weights = std::make_shared(gconv->input_value(Inputs::Weights), - ov::opset8::Constant::create(ov::element::i64, ov::Shape{}, {0}), - groups); + auto split_weights = std::make_shared( + gconv->input_value(Inputs::Weights), + ov::opset8::Constant::create(ov::element::i64, ov::Shape{}, {0}), + groups); replace_nodes.push_back(split_weights); - auto axis = ov::opset8::Constant::create(ov::element::i64, ov::Shape{}, {channel_axis}); + auto axis = ov::opset8::Constant::create(ov::element::i64, ov::Shape{}, {channel_axis}); auto split = std::make_shared(gconv->input_value(Inputs::Data), axis, groups); replace_nodes.push_back(split); ov::NodeVector concat_inputs; for (int64_t g = 0; g < groups; g++) { auto out = split->output(g); - auto filter = std::make_shared(split_weights->output(g), - ov::opset8::Constant::create(ov::element::i64, ov::Shape{}, {0})); + auto filter = std::make_shared( + split_weights->output(g), + ov::opset8::Constant::create(ov::element::i64, ov::Shape{}, {0})); auto conv = std::make_shared(out, filter, gconv->get_strides(), diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.hpp index eadbf5ff9cd7e2..55c1ecde2aae10 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv.hpp @@ -3,8 +3,8 @@ #pragma once -#include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" /* * Description: @@ -13,7 +13,7 @@ * equals to number of groups. * * Before: - * + * * +--------------+ +---------------+ * | Input tensor | | Kernel tensor | * +-----------+--+ +-+-------------+ @@ -25,9 +25,9 @@ * +------v------+ * | Result | * +-------------+ - * + * * After: - * + * * +--------------+ +--------------+ +---------------+ +--------------+ * | Input tensor | | Constant (1) | | Kernel tensor | | Constant (0) | * +-----------+--+ +-+------------+ +-----------+---+ +-+------------+ @@ -53,13 +53,13 @@ * +----------v------------+ * | Result | * +-----------------------+ - * + * */ namespace ov { namespace intel_cpu { -class ConvertGroupConvolution: public ov::pass::MatcherPass { +class ConvertGroupConvolution : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ConvertGroupConvolution", "0"); ConvertGroupConvolution(); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.cpp index 72228d89cb6ab2..94771fa57ed2cd 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.cpp @@ -1,14 +1,13 @@ // Copyright (C) 2020-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 - #include "convert_group_conv1d.hpp" #include - -#include "openvino/opsets/opset1.hpp" #include + #include "openvino/core/rt_info.hpp" +#include "openvino/opsets/opset1.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" template @@ -26,17 +25,20 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertConv1DBase::convert_conv1d_to_co return false; } - auto input = conv->input_value(0); + auto input = conv->input_value(0); auto weights = conv->input_value(1); auto weights2d_shape = weights.get_shape(); weights2d_shape.push_back(1); - auto w_shape = std::make_shared(ov::element::i64, ov::Shape{weights2d_shape.size()}, weights2d_shape); + auto w_shape = std::make_shared(ov::element::i64, + ov::Shape{weights2d_shape.size()}, + weights2d_shape); auto getUnsqueeze = [&](const ov::Output& node) { auto rank = node.get_partial_shape().rank().get_length(); - return std::make_shared(node, - ov::opset1::Constant::create(ov::element::i64, ov::Shape{1}, {rank})); + return std::make_shared( + node, + ov::opset1::Constant::create(ov::element::i64, ov::Shape{1}, {rank})); }; auto input2d = getUnsqueeze(input); @@ -63,16 +65,16 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertConv1DBase::convert_conv1d_to_co ov::intel_cpu::ConvertConv1D::ConvertConv1D() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::any_input()}), - "ConvertConvolutionToArm"); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::any_input()}), + "ConvertConvolutionToArm"); register_matcher(m, convert_conv1d_to_conv2d()); } ov::intel_cpu::ConvertGroupConv1D::ConvertGroupConv1D() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::any_input()}), - "ConvertGroupConvolutionToArm"); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::any_input()}), + "ConvertGroupConvolutionToArm"); register_matcher(m, convert_conv1d_to_conv2d()); } \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp index f1edd9d363df10..5674514eeb8e64 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp @@ -9,11 +9,11 @@ * Description: * ConvertConv1DBase detects 1D Convolution / GroupConvolution and replaces * it with the sequence Unsqueeze - 2D Convolution / GroupConvolution - Squeeze. - * Unsqueeze adds the additional dimension to Convolution inputs and Squeeze + * Unsqueeze adds the additional dimension to Convolution inputs and Squeeze * removes the additional dimension from the Convolution output. * * Before: - * + * * +--------------+ +---------------+ * | Input tensor | | Kernel tensor | * +-----------+--+ +-+-------------+ @@ -25,9 +25,9 @@ * +------v------+ * | Result | * +-------------+ - * + * * After: - * + * * +--------------+ +--------------+ +---------------+ +--------------+ * | Input tensor | | Constant (1) | | Kernel tensor | | Constant (1) | * +-----------+--+ +-+------------+ +-----------+---+ +-+------------+ @@ -47,25 +47,25 @@ * +------v------+ * | Result | * +-------------+ - * + * */ namespace ov { namespace intel_cpu { -class ConvertConv1DBase: public ov::pass::MatcherPass { +class ConvertConv1DBase : public ov::pass::MatcherPass { protected: OPENVINO_RTTI("ConvertConv1DBase", "0"); template ov::matcher_pass_callback convert_conv1d_to_conv2d(); }; -class ConvertConv1D: public ConvertConv1DBase { +class ConvertConv1D : public ConvertConv1DBase { public: OPENVINO_RTTI("ConvertConv1D", "0"); ConvertConv1D(); }; -class ConvertGroupConv1D: public ConvertConv1DBase { +class ConvertGroupConv1D : public ConvertConv1DBase { public: OPENVINO_RTTI("ConvertGroupConv1D", "0"); ConvertGroupConv1D(); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.cpp index ff5632cb0a5e8f..287aad96b14c08 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2020-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 - #include "convert_reduce_multi_axis.hpp" #include "openvino/core/rt_info.hpp" @@ -41,7 +40,7 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertReduceMultiAxisBase::convert_red std::shared_ptr node = input0.get_node_shared_ptr(); auto output = input0; bool keepDims = reduce->get_keep_dims(); - //axes should be sorted in descending order if keepDims is false to be keep axis within data shape + // axes should be sorted in descending order if keepDims is false to be keep axis within data shape if (!keepDims) { sort(axes.begin(), axes.end(), std::greater()); } @@ -61,28 +60,32 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertReduceMultiAxisBase::convert_red ov::intel_cpu::ConvertReduceProd::ConvertReduceProd() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::wrap_type()}), "ConvertReduceProd"); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}), + "ConvertReduceProd"); register_matcher(m, convert_reduce()); } ov::intel_cpu::ConvertReduceMin::ConvertReduceMin() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::wrap_type()}), "ConvertReduceMin"); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}), + "ConvertReduceMin"); register_matcher(m, convert_reduce()); } ov::intel_cpu::ConvertReduceMax::ConvertReduceMax() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::wrap_type()}), "ConvertReduceMax"); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}), + "ConvertReduceMax"); register_matcher(m, convert_reduce()); } ov::intel_cpu::ConvertReduceSum::ConvertReduceSum() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::wrap_type()}), "ConvertReduceSum"); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}), + "ConvertReduceSum"); register_matcher(m, convert_reduce()); } diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp index 38a1f8a9c0c601..8e5fd1e38b605a 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp @@ -3,8 +3,8 @@ #pragma once -#include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" /* * Description: @@ -13,7 +13,7 @@ * is replaced with a sequence of single-axe Reduce operations. * * Before: - * + * * +--------------+ +-------------------+ * | Data | | Axes tensor [A,B] | * +-----------+--+ +-+-----------------+ @@ -25,9 +25,9 @@ * +------v------+ * | Result | * +-------------+ - * + * * After: - * + * * +-------------+ +---------------+ * | Data | | Axes scalar A | * +---------+---+ +----+----------+ @@ -43,44 +43,44 @@ * +-------v---------+ * | Result | * +-----------------+ - * + * */ namespace ov { namespace intel_cpu { -class ConvertReduceMultiAxisBase: public ov::pass::MatcherPass { +class ConvertReduceMultiAxisBase : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ConvertReduceMultiAxisBase", "0"); template ov::matcher_pass_callback convert_reduce(); }; -class ConvertReduceProd: public ConvertReduceMultiAxisBase { +class ConvertReduceProd : public ConvertReduceMultiAxisBase { public: OPENVINO_RTTI("ConvertReduceProd", "0"); ConvertReduceProd(); }; -class ConvertReduceMin: public ConvertReduceMultiAxisBase { +class ConvertReduceMin : public ConvertReduceMultiAxisBase { public: OPENVINO_RTTI("ConvertReduceMin", "0"); ConvertReduceMin(); }; -class ConvertReduceMax: public ConvertReduceMultiAxisBase { +class ConvertReduceMax : public ConvertReduceMultiAxisBase { public: OPENVINO_RTTI("ConvertReduceMax", "0"); ConvertReduceMax(); }; -class ConvertReduceSum: public ConvertReduceMultiAxisBase { +class ConvertReduceSum : public ConvertReduceMultiAxisBase { public: OPENVINO_RTTI("ConvertReduceSum", "0"); ConvertReduceSum(); }; -class ConvertReduceMultiAxis: public ov::pass::GraphRewrite { +class ConvertReduceMultiAxis : public ov::pass::GraphRewrite { public: OPENVINO_RTTI("ConvertReduceMultiAxis", "0"); ConvertReduceMultiAxis() { @@ -91,5 +91,5 @@ class ConvertReduceMultiAxis: public ov::pass::GraphRewrite { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp index def7250dd5b938..50530159e0875e 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 - #include "convert_reduce_no_keep_dims.hpp" #include "openvino/core/rt_info.hpp" @@ -29,9 +28,10 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertReduceNoKeepDimsBase::convert_re template ov::intel_cpu::ConvertReduction::ConvertReduction() { auto m = std::make_shared( - ov::pass::pattern::wrap_type({ov::pass::pattern::any_input(), - ov::pass::pattern::wrap_type()}), "ConvertReduction"); - register_matcher(m, convert_reduce()); + ov::pass::pattern::wrap_type( + {ov::pass::pattern::any_input(), ov::pass::pattern::wrap_type()}), + "ConvertReduction"); + register_matcher(m, convert_reduce()); } template class ov::intel_cpu::ConvertReduction; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp index 2f3c7d19726513..ea4128ea265e42 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/convert_reduce_no_keep_dims.hpp @@ -3,10 +3,10 @@ #pragma once -#include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/pass/graph_rewrite.hpp" #include "openvino/op/util/arithmetic_reductions_keep_dims.hpp" #include "openvino/op/util/logical_reduction_keep_dims.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" /* * Description: @@ -15,7 +15,7 @@ * which removes undesired dimensions. * * Before: - * + * * +--------------+ +-----------------+ * | Data | | Axes tensor | * +-----------+--+ +-+---------------+ @@ -23,9 +23,9 @@ * +---------------------------+ * | Reduce (keepDims = false) | * +---------------------------+ - * + * * After: - * + * * +--------------+ +-----------------+ * | Data | | Axes tensor | * +-----------+--+ +-+------------+--+ @@ -37,13 +37,13 @@ * +--------v------v-+ * | Squeeze | * +-----------------+ - * + * */ namespace ov { namespace intel_cpu { -class ConvertReduceNoKeepDimsBase: public ov::pass::MatcherPass { +class ConvertReduceNoKeepDimsBase : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ConvertReduceNoKeepDims", "0"); template @@ -51,14 +51,13 @@ class ConvertReduceNoKeepDimsBase: public ov::pass::MatcherPass { }; template -class ConvertReduction: public ConvertReduceNoKeepDimsBase { +class ConvertReduction : public ConvertReduceNoKeepDimsBase { public: OPENVINO_RTTI("ConvertReduction", "0"); ConvertReduction(); }; - -class ConvertReduceNoKeepDims: public ov::pass::GraphRewrite { +class ConvertReduceNoKeepDims : public ov::pass::GraphRewrite { public: OPENVINO_RTTI("ConvertReduceNoKeepDims", "0"); ConvertReduceNoKeepDims() { @@ -67,5 +66,5 @@ class ConvertReduceNoKeepDims: public ov::pass::GraphRewrite { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp index 76bc2c4f7c1df8..338bb1820fa7f4 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 - #include "mish_decomposition.hpp" -#include "openvino/opsets/opset4.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/opsets/opset4.hpp" ov::intel_cpu::MishDecomposition::MishDecomposition() { auto mish = ov::pass::pattern::wrap_type(); @@ -17,7 +16,9 @@ ov::intel_cpu::MishDecomposition::MishDecomposition() { } auto exp = std::make_shared(mish->input_value(0)); - auto add = std::make_shared(exp, opset4::Constant::create(mish->get_output_element_type(0), ov::Shape{}, {1.0f})); + auto add = std::make_shared( + exp, + opset4::Constant::create(mish->get_output_element_type(0), ov::Shape{}, {1.0f})); auto log = std::make_shared(add); auto tanh = std::make_shared(log); auto mul = std::make_shared(mish->input_value(0), tanh); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp index 0b93cea24053f5..75b45dca468dc7 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp @@ -3,17 +3,17 @@ #pragma once -#include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" namespace ov { namespace intel_cpu { -class MishDecomposition: public ov::pass::MatcherPass { +class MishDecomposition : public ov::pass::MatcherPass { public: OPENVINO_RTTI("MishDecomposition", "0"); MishDecomposition(); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp index c38d088ef95e7b..25b10d55ca8165 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.cpp @@ -3,35 +3,32 @@ // #include "snippets_mark_skipped.hpp" -#include "snippets/pass/tokenization.hpp" +#include "itt.hpp" #include "snippets/op/subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "snippets/utils/utils.hpp" - -#include "transformations/utils/utils.hpp" #include "transformations/utils.hpp" -#include "utils/general_utils.h" +#include "transformations/utils/utils.hpp" #include "utils/cpu_utils.hpp" - -#include "itt.hpp" - +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { namespace { static const int DEFAULT_AXIS = 1; -NodeFusingType GetNodeFusingType(const std::shared_ptr &node) { - auto &rt = node->get_rt_info(); +NodeFusingType GetNodeFusingType(const std::shared_ptr& node) { + auto& rt = node->get_rt_info(); const auto rinfo = rt.find("MayBeFusedInPlugin"); if (rinfo == rt.end()) return NodeFusingType::NotSet; return rinfo->second.as(); } -void SetNodeFusingType(const std::shared_ptr &node, NodeFusingType nodeType) { - auto &rt = node->get_rt_info(); +void SetNodeFusingType(const std::shared_ptr& node, NodeFusingType nodeType) { + auto& rt = node->get_rt_info(); rt["MayBeFusedInPlugin"] = nodeType; } -std::vector getContinuableChains(const std::shared_ptr &node) { +std::vector getContinuableChains(const std::shared_ptr& node) { std::vector result; for (const auto& input : node->inputs()) { const auto parent = input.get_source_output().get_node_shared_ptr(); @@ -42,12 +39,12 @@ std::vector getContinuableChains(const std::shared_ptr &node) { +int getNumNonConstInputs(const std::shared_ptr& node) { int num_non_const_inputs = 0; - for (const auto &parent_out : node->input_values()) { + for (const auto& parent_out : node->input_values()) { const auto parent = parent_out.get_node_shared_ptr(); if (ov::is_type(parent)) { - for (const auto &grandparent_out : parent->input_values()) { + for (const auto& grandparent_out : parent->input_values()) { const auto grandparent = grandparent_out.get_node_shared_ptr(); if (!ov::is_type(grandparent)) num_non_const_inputs++; @@ -65,40 +62,35 @@ bool isFullyConnected(const std::shared_ptr& node) { const auto out_weights = node->input_value(1); const auto rank_a = out_activations.get_partial_shape().rank(); const auto rank_w = out_weights.get_partial_shape().rank(); - return out_weights.get_partial_shape().is_static() && - rank_a.is_static() && rank_w.is_static() && - rank_a.get_length() != 1 && rank_w.get_length() != 1 && - rank_w.get_length() <= 3 && + return out_weights.get_partial_shape().is_static() && rank_a.is_static() && rank_w.is_static() && + rank_a.get_length() != 1 && rank_w.get_length() != 1 && rank_w.get_length() <= 3 && ov::op::util::is_on_constant_path(out_weights); } -bool SupportsFusingWithConvolution_Simple(const std::shared_ptr &node) { +bool SupportsFusingWithConvolution_Simple(const std::shared_ptr& node) { // Note: some other operations support this fusing (SoftPlus, Sqrt). // Skip them here, when they are supported by Snippets ARM. Ticket: 141170. - return ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node); + return ov::is_type(node) || ov::is_type(node) || + ov::is_type(node) || ov::is_type(node) || + ov::is_type(node) || ov::is_type(node); } // Convolution is a special case, since it supports peculiar fusings -bool isSuitableConvolutionParent(const std::shared_ptr &node) { - const bool is_suitable_node = ov::is_type(node) || - ov::is_type(node); +bool isSuitableConvolutionParent(const std::shared_ptr& node) { + const bool is_suitable_node = + ov::is_type(node) || ov::is_type(node); // has a single output, connected to a single child const auto out = node->outputs(); const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); return is_suitable_node && has_only_child; } -bool isSuitableBinaryConvolutionParent(const std::shared_ptr &node) { +bool isSuitableBinaryConvolutionParent(const std::shared_ptr& node) { const bool is_suitable_node = ov::is_type(node); // has a single output, connected to a single child const auto out = node->outputs(); const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); return is_suitable_node && has_only_child; } -bool isSuitableMiscParent(const std::shared_ptr &node) { +bool isSuitableMiscParent(const std::shared_ptr& node) { const bool is_suitable_node = ov::is_type(node) || ov::is_type(node) || ov::is_type(node); @@ -108,35 +100,34 @@ bool isSuitableMiscParent(const std::shared_ptr &node) { return is_suitable_node && has_only_child; } // Matmul is a special case, since it supports simple + bias fusings -bool isSuitableMatMulParent(const std::shared_ptr &node) { +bool isSuitableMatMulParent(const std::shared_ptr& node) { const bool is_suitable_node = ov::is_type(node); // has a single output, connected to a single child const auto out = node->outputs(); const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); return is_suitable_node && has_only_child; } -bool isSuitablePoolChild(const std::shared_ptr &node) { +bool isSuitablePoolChild(const std::shared_ptr& node) { const bool is_suitable_node = ov::is_type(node); // has a single output, connected to a single child const auto out = node->outputs(); const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); return is_suitable_node && has_only_child; } -bool isSuitableChildForFusingSimple(const std::shared_ptr &node) { +bool isSuitableChildForFusingSimple(const std::shared_ptr& node) { // Note: Fusing child is allowed to have several users, but that must be the end of the chain return SupportsFusingWithConvolution_Simple(node) && getNumNonConstInputs(node) == 1; } -bool isSuitableChildForFusingBias(const std::shared_ptr &node, int fusingAxis) { +bool isSuitableChildForFusingBias(const std::shared_ptr& node, int fusingAxis) { if (!ov::is_type(node)) return false; - auto is_suitable_parent = [](const std::shared_ptr &node) { - return (ov::is_type(node) || - ov::is_type(node) || - ov::is_type(node)); + auto is_suitable_parent = [](const std::shared_ptr& node) { + return (ov::is_type(node) || ov::is_type(node) || + ov::is_type(node)); }; - for (const auto &in : node->inputs()) { + for (const auto& in : node->inputs()) { const auto& parent_out = in.get_source_output(); const auto& parent = parent_out.get_node_shared_ptr(); const auto& parent_pshape = parent_out.get_partial_shape(); @@ -151,7 +142,8 @@ bool isSuitableChildForFusingBias(const std::shared_ptr &node, int f if (bias_pshape.is_dynamic()) break; const auto bias_shape_norm = getNormalizedDimsBySize(bias_pshape.get_shape(), parent_pshape.size()); - if (fusingAxis >= static_cast(bias_shape_norm.size()) || fusingAxis >= static_cast(parent_pshape.size()) || + if (fusingAxis >= static_cast(bias_shape_norm.size()) || + fusingAxis >= static_cast(parent_pshape.size()) || bias_shape_norm.size() != parent_pshape.size() || bias_shape_norm.size() < 2) break; if (parent_pshape[fusingAxis].is_dynamic()) @@ -165,21 +157,23 @@ bool isSuitableChildForFusingBias(const std::shared_ptr &node, int f } // Continue fusing chain of the passed type if the node has one child // Otherwise mark node as FusedTerminator (Fused, but fusing chain is interrupted) -void PropagateIfHasOnlyChild(const std::shared_ptr &node, NodeFusingType nodeType) { +void PropagateIfHasOnlyChild(const std::shared_ptr& node, NodeFusingType nodeType) { const auto out = node->outputs(); const bool has_only_child = out.size() == 1 && out[0].get_target_inputs().size() == 1; SetNodeFusingType(node, has_only_child ? nodeType : NodeFusingType::FusedTerminator); } -// todo: Skipping MultiSubGraphOp such as TensorIterator, Loop and If. Snippets might tokenize their bodies in the future. -// Note that the function is recurrent, since there might be multi-level MultiSubGraphOp, if(){if(){}}else{} for example. -void MarkSubgraphOpAsSkipped(const std::shared_ptr &node) { +// todo: Skipping MultiSubGraphOp such as TensorIterator, Loop and If. Snippets might tokenize their bodies in the +// future. +// Note that the function is recurrent, since there might be multi-level MultiSubGraphOp, if(){if(){}}else{} for +// example. +void MarkSubgraphOpAsSkipped(const std::shared_ptr& node) { if (ov::is_type(node)) { std::vector> models{}; // Covers TensorIterator and Loop if (auto s = ov::as_type_ptr(node)) { models.push_back(s->get_function()); - // Add new multi-body subgraph op here - } else if (auto if_op = ov::as_type_ptr(node)) { + // Add new multi-body subgraph op here + } else if (auto if_op = ov::as_type_ptr(node)) { models.push_back(if_op->get_then_body()); models.push_back(if_op->get_else_body()); } @@ -204,8 +198,8 @@ bool isSuitableConvert(const std::shared_ptr& node) { return true; }; auto isSuitableChild = [](const std::shared_ptr& node) { - for (const auto &out : node->outputs()) { - const auto &child = out.get_node_shared_ptr(); + for (const auto& out : node->outputs()) { + const auto& child = out.get_node_shared_ptr(); if (!ov::is_type(child)) return false; } @@ -215,16 +209,15 @@ bool isSuitableConvert(const std::shared_ptr& node) { } auto is_skipped_op(const std::shared_ptr& op) -> bool { - return ov::is_type(op) || - ov::is_type(op) || + return ov::is_type(op) || ov::is_type(op) || ov::is_type(op); } -} // namespace +} // namespace -bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { +bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr& m) { RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped); int channelAxis = DEFAULT_AXIS; - for (auto &node : m->get_ordered_ops()) { + for (auto& node : m->get_ordered_ops()) { if (is_skipped_op(node)) continue; if (isSuitableConvolutionParent(node)) { @@ -255,14 +248,18 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { if (isSuitableChildForFusingBias(node, channelAxis)) { PropagateIfHasOnlyChild(node, fusingChainType); } else if (isSuitableChildForFusingSimple(node)) { -#if defined (OV_CPU_WITH_ACL) - if (one_of(fusingChainType, NodeFusingType::FusedWithConvolution, NodeFusingType::FusedWithBinaryConvolution)) { +#if defined(OV_CPU_WITH_ACL) + if (one_of(fusingChainType, + NodeFusingType::FusedWithConvolution, + NodeFusingType::FusedWithBinaryConvolution)) { PropagateIfHasOnlyChild(node, NodeFusingType::FusedTerminator); continue; } #endif PropagateIfHasOnlyChild(node, fusingChainType); - } else if (one_of(fusingChainType, NodeFusingType::FusedWithConvolution, NodeFusingType::FusedWithBinaryConvolution)) { + } else if (one_of(fusingChainType, + NodeFusingType::FusedWithConvolution, + NodeFusingType::FusedWithBinaryConvolution)) { if (isSuitablePoolChild(node)) { PropagateIfHasOnlyChild(node, fusingChainType); } @@ -279,5 +276,5 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.hpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.hpp index 7fdc7244d21de2..2b17039d198bf3 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/snippets_mark_skipped.hpp @@ -11,30 +11,34 @@ namespace intel_cpu { /** * @interface SnippetsMarkSkipped - * @brief Mark operations that should be ignored by snippets on tokenization stage. A typical example is eltwise operations - * that will be fused into convolutions on plugin side. + * @brief Mark operations that should be ignored by snippets on tokenization stage. A typical example is eltwise + * operations that will be fused into convolutions on plugin side. */ class SnippetsMarkSkipped : public ov::pass::ModelPass { public: OPENVINO_RTTI("SnippetsMarkSkipped", "0"); SnippetsMarkSkipped() : ModelPass() {} - bool run_on_model(const std::shared_ptr &) override; + bool run_on_model(const std::shared_ptr&) override; }; /* NotSet - not part of a fusing chain FusedTerminator - the node is fused, but the chain can't be continued FusedWithConvolution, FusedWithMisc - fusing chains with different continuation rules -IgnoredAfterInputs - node must be skipped, since can't be handled properly at this time. Also a continuable fusing chain. -Order of SnippetsNodeType is important!: +IgnoredAfterInputs - node must be skipped, since can't be handled properly at this time. Also a continuable fusing +chain. Order of SnippetsNodeType is important!: * SnippetsNodeType >= FusedTerminator is a Fused chain * SnippetsNodeType > FusedTerminator is a Fused chain that may be continued */ enum class NodeFusingType : int64_t { NotSet, FusedTerminator, - FusedWithConvolution, FusedWithBinaryConvolution, - FusedWithMatMul, FusedWithFC, FusedWithMisc}; + FusedWithConvolution, + FusedWithBinaryConvolution, + FusedWithMatMul, + FusedWithFC, + FusedWithMisc +}; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.cpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.cpp index 967afe946e0793..a3c9a1c184d550 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.cpp @@ -3,9 +3,10 @@ // #include "shape_inference.hpp" + #include "snippets/shape_inference/shape_infer_instances.hpp" -#include "transformations/snippets/common/op/fused_mul_add.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" +#include "transformations/snippets/common/op/fused_mul_add.hpp" namespace ov { namespace snippets { @@ -19,19 +20,31 @@ ShapeInferPtr CPUShapeInferSnippetsFactory::get_specific_op_shape_infer(const ov return {}; } -#define SHAPE_INFER_PREDEFINED(OP, InferType) \ - { OP::get_type_info_static(), [](const std::shared_ptr& n) { return std::make_shared();} } -#define SHAPE_INFER_OP_SPECIFIC(OP) \ - { OP::get_type_info_static(), [](const std::shared_ptr& n) { return std::make_shared(n);} } -#define SHAPE_INFER_OP_SPECIFIC_EXTERNAL(OP, InferType) \ - { OP::get_type_info_static(), [](const std::shared_ptr& n) { return std::make_shared(n);} } +#define SHAPE_INFER_PREDEFINED(OP, InferType) \ + { \ + OP::get_type_info_static(), [](const std::shared_ptr& n) { \ + return std::make_shared(); \ + } \ + } +#define SHAPE_INFER_OP_SPECIFIC(OP) \ + { \ + OP::get_type_info_static(), [](const std::shared_ptr& n) { \ + return std::make_shared(n); \ + } \ + } +#define SHAPE_INFER_OP_SPECIFIC_EXTERNAL(OP, InferType) \ + { \ + OP::get_type_info_static(), [](const std::shared_ptr& n) { \ + return std::make_shared(n); \ + } \ + } -const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::specific_ops_registry { +const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::specific_ops_registry{ SHAPE_INFER_PREDEFINED(ov::intel_cpu::FusedMulAdd, NumpyBroadcastShapeInfer), SHAPE_INFER_PREDEFINED(ov::intel_cpu::SwishNode, PassThroughShapeInfer), }; #undef SHAPE_INFER_OP_SPECIFIC #undef SHAPE_INFER_PREDEFINED -} // namespace snippets -} // namespace ov +} // namespace snippets +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.hpp b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.hpp index 3e3a417bd9164b..1028674a081102 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/aarch64/shape_inference.hpp @@ -18,11 +18,12 @@ class CPUShapeInferSnippetsFactory : public IShapeInferSnippetsFactory { protected: /** - * @brief get shape infer instances for operations from backend-specific opset - * @return register ShapeInferPtr - */ - ShapeInferPtr get_specific_op_shape_infer(const ov::DiscreteTypeInfo& key, const std::shared_ptr& op) const override; + * @brief get shape infer instances for operations from backend-specific opset + * @return register ShapeInferPtr + */ + ShapeInferPtr get_specific_op_shape_infer(const ov::DiscreteTypeInfo& key, + const std::shared_ptr& op) const override; }; -} // namespace snippets -} // namespace ov +} // namespace snippets +} // namespace ov From b543d0b0a703b86acbc92189dd22ede1c9a59dee Mon Sep 17 00:00:00 2001 From: Nishant Prabhu <168545601+NishantPrabhuFujitsu@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:34:25 +0530 Subject: [PATCH 15/17] [ARM] [SDPA] SVE implementation of MHASingleToken for FP32 (#27273) ### Details: - Adds SVE FP32 implementations for functions called during execution of `MHASingleToken` for SVE-128, SVE-256 and SVE-512 platforms. - SVE implementations are compiled only if runtime support for SVE is detected on the hardware, otherwise it falls back to Neon. - Adds a new implementation for exponential function `exp_ps_` using fewer FMA operations. Executes ~18% faster and has better output precision. **Note:** I am aware of the Neon FP16 implementation of SDPA added recently. To accommodate for this, the current SVE changes will be used only if the hardware does not have ARM FP16 support. I will follow up with SVE FP16 implementations soon. ### [SVE] Benchmarking results Below are the benchmarking results of execution time of each ported function. Measurements were performed by running each function individually on dummy inputs (128 fp32 elements) for 1,000,000 iterations and computing average time (in micro-seconds). ![image](https://github.com/user-attachments/assets/3f82238f-af7e-4b68-b4b1-259cf389e41a) Execution time of `MHASingleToken` as a whole was also measured for two LLMs, the results of which are shown below. For LlaMA-3-8B, the SVE-128 and SVE-512 systems at my disposal did not have enough memory, so only SVE-256 results are shown. While there is an improvement overall, these results could be contaminated with run-to-run variation due to the small execution time of the kernel. **Benchmarking details:** Prompt length of 108 tokens was used; total time for generating 50 tokens was measured and average execution time was computed. ![image](https://github.com/user-attachments/assets/893c1c46-085f-46af-ab5a-2c1481c75f68) ### New exponential implementation It is based on the discussion in [these slides](https://www.slideshare.net/slideshow/hpc-phys20201203/239717194#23) (this is based on a past talk in Fujitsu hence the document is in Japanese, sorry!). The algorithm followed is slightly different from the current implementation, in that it uses `fexpa` instruction available on ARM and requires only 3 Taylor expansion terms (2 FMA operations) to be precise until the 8th decimal place. Our benchmarking results showed this implementation to be 44%-58% faster than the existing Neon implementation. It is ~18% faster than the SVE implementation of the current algorithm in Neon. ![image](https://github.com/user-attachments/assets/117df21d-3977-499c-8ab8-8f4346286113) In this PR, the new implementation is called by default. The SVE port of the existing Neon implementation has also been retained, if needed. --- .../compile_flags/os_flags.cmake | 88 +++++++++ .../cross_compiled_disp_gen.cmake | 1 + .../cross_compile/cross_compiled_func.cmake | 7 +- cmake/developer_package/features.cmake | 2 + .../dev_api/openvino/runtime/system_conf.hpp | 7 + src/inference/src/system_conf.cpp | 19 ++ src/plugins/intel_cpu/CMakeLists.txt | 22 ++- .../src/nodes/kernels/scaled_attn/common.hpp | 80 ++++++++ .../kernels/scaled_attn/mha_single_token.cpp | 174 +++++++++++++++++- .../kernels/scaled_attn/softmax_kernel.hpp | 46 ++++- 10 files changed, 437 insertions(+), 9 deletions(-) diff --git a/cmake/developer_package/compile_flags/os_flags.cmake b/cmake/developer_package/compile_flags/os_flags.cmake index fdfd7211c8e815..660fd6160893ae 100644 --- a/cmake/developer_package/compile_flags/os_flags.cmake +++ b/cmake/developer_package/compile_flags/os_flags.cmake @@ -4,6 +4,7 @@ include(ProcessorCount) include(CheckCXXCompilerFlag) +include(CheckCXXSourceCompiles) # # ov_disable_deprecated_warnings() @@ -91,6 +92,50 @@ macro(ov_dev_package_no_errors) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ov_c_cxx_dev_no_errors}") endmacro() +# +# ov_check_compiler_supports_sve(flags) +# +# Checks whether CXX compiler for passed language supports SVE code compilation +# +macro(ov_check_compiler_supports_sve flags) + # Code to compile + set(SVE_CODE " + #include + int main() { + svfloat64_t a; + a = svdup_n_f64(0); + return 0; + }") + + # Save the current state of required flags + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + + # Set the flags necessary for compiling the test code with SVE support + set(CMAKE_REQUIRED_FLAGS "${CMAKE_CXX_FLAGS_INIT} ${flags}") + + # Check if the source code compiles with the given flags for C++ + CHECK_CXX_SOURCE_COMPILES("${SVE_CODE}" CXX_HAS_SVE) + + # If the compilation test is successful, set appropriate variables indicating support + if(CXX_HAS_SVE) + set(CXX_SVE_FOUND TRUE CACHE BOOL "SVE available on host") + set(CXX_SVE_FOUND TRUE CACHE BOOL "CXX SVE support") + set(CXX_SVE_FLAGS "${flags}" CACHE STRING "CXX SVE flags") + endif() + + # Restore the original state of required flags + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + # If the compilation test fails, indicate that the support is not found + if(NOT CXX_SVE_FOUND) + set(CXX_SVE_FOUND FALSE CACHE BOOL "CXX SVE support") + set(CXX_SVE_FLAGS "" CACHE STRING "CXX SVE flags") + endif() + + # Mark the variables as advanced to hide them in the default CMake GUI + mark_as_advanced(CXX_SVE_FOUND CXX_SVE_FLAGS) +endmacro() + # # ov_sse42_optimization_flags() # @@ -208,6 +253,49 @@ macro(ov_arm_neon_fp16_optimization_flags flags) endif() endmacro() +# +# ov_arm_sve_optimization_flags() +# +macro(ov_arm_sve_optimization_flags flags) + # Check for compiler SVE support + ov_check_compiler_supports_sve("-march=armv8-a+sve") + + if(OV_COMPILER_IS_INTEL_LLVM) + message(WARNING "Unsupported CXX compiler ${CMAKE_CXX_COMPILER_ID}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # nothing should be required here + elseif(ANDROID) + if(ANDROID_ABI STREQUAL "arm64-v8a") + set(${flags} -Wno-unused-command-line-argument) + if(CXX_SVE_FOUND) + list(APPEND ${flags} -march=armv8-a+sve) + else() + message(WARNING "SVE is not supported on this Android ABI: ${ANDROID_ABI}") + endif() + else() + message(WARNING "SVE is not supported on this Android ABI: ${ANDROID_ABI}") + endif() + else() + if(AARCH64) + set(${flags} -O2) + + # Add flag for SVE if supported + if(CXX_SVE_FOUND) + list(APPEND ${flags} -march=armv8-a+sve) + endif() + if(NOT CMAKE_CL_64) + list(APPEND ${flags} -ftree-vectorize) + endif() + + set(${flags} ${${flags}}) + elseif(ARM) + message(WARNING "SVE is not supported on 32-bit ARM architectures.") + else() + message(WARNING "SVE is not supported by architecture ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() +endmacro() + # # ov_disable_all_warnings() # diff --git a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake index c33d64635eb10b..fd534f3e600bfe 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake @@ -18,6 +18,7 @@ set(_CPU_CHECK_ANY "true") set(_CPU_CHECK_SSE42 "with_cpu_x86_sse42()") set(_CPU_CHECK_AVX "with_cpu_x86_avx()") set(_CPU_CHECK_NEON_FP16 "with_cpu_neon_fp16()") +set(_CPU_CHECK_SVE "with_cpu_sve()") set(_CPU_CHECK_AVX2 "with_cpu_x86_avx2()") set(_CPU_CHECK_AVX512F "with_cpu_x86_avx512f()") diff --git a/cmake/developer_package/cross_compile/cross_compiled_func.cmake b/cmake/developer_package/cross_compile/cross_compiled_func.cmake index 1e92fe3bfdaf8c..962aa5d373a4db 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_func.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_func.cmake @@ -3,7 +3,7 @@ # ## list of available instruction sets -set(_ARCH_LIST ANY SSE42 AVX AVX2 AVX512F NEON_FP16) +set(_ARCH_LIST ANY SSE42 AVX AVX2 AVX512F NEON_FP16 SVE) set(_ACCEPTED_ARCHS_ANY "^(ANY)$") set(_ACCEPTED_ARCHS_SSE42 "^(ANY|SSE42)$") @@ -11,6 +11,7 @@ set(_ACCEPTED_ARCHS_AVX "^(ANY|SSE42|AVX)$") set(_ACCEPTED_ARCHS_AVX2 "^(ANY|SSE42|AVX|AVX2)$") set(_ACCEPTED_ARCHS_AVX512F "^(ANY|SSE42|AVX|AVX2|AVX512F)$") set(_ACCEPTED_ARCHS_NEON_FP16 "^(ANY|NEON_FP16)$") +set(_ACCEPTED_ARCHS_SVE "^(ANY|SVE)$") ## Arch specific definitions set(_DEFINE_ANY "") @@ -19,12 +20,14 @@ set(_DEFINE_AVX "HAVE_AVX" ${_DEFINE_SSE42}) set(_DEFINE_AVX2 "HAVE_AVX2" ${_DEFINE_AVX}) set(_DEFINE_AVX512F "HAVE_AVX512F" ${_DEFINE_AVX2}) set(_DEFINE_NEON_FP16 "HAVE_NEON_FP16" ${_DEFINE_ANY}) +set(_DEFINE_SVE "HAVE_SVE" ${_DEFINE_SVE}) ## Arch specific compile options ov_avx512_optimization_flags(_FLAGS_AVX512F) ov_avx2_optimization_flags (_FLAGS_AVX2) ov_sse42_optimization_flags (_FLAGS_SSE42) ov_arm_neon_fp16_optimization_flags(_FLAGS_NEON_FP16) +ov_arm_sve_optimization_flags(_FLAGS_SVE) set(_FLAGS_AVX "") ## TBD is not defined for OV project yet set(_FLAGS_ANY "") ## @@ -185,6 +188,8 @@ endfunction() function(_currently_requested_top_arch VAR) if(ENABLE_NEON_FP16) set(RES NEON_FP16) + elseif(ENABLE_SVE) + set(RES SVE) elseif(ENABLE_AVX512F) set(RES AVX512F) elseif(ENABLE_AVX2) diff --git a/cmake/developer_package/features.cmake b/cmake/developer_package/features.cmake index 8d1f3696c6759c..ae5313cea8a8b4 100644 --- a/cmake/developer_package/features.cmake +++ b/cmake/developer_package/features.cmake @@ -51,6 +51,8 @@ ov_dependent_option (ENABLE_AVX512F "Enable AVX512 optimizations" ON "X86_64 OR ov_dependent_option(ENABLE_NEON_FP16 "Enable ARM FP16 optimizations" ON "AARCH64" OFF) +ov_dependent_option(ENABLE_SVE "Enable SVE optimizations" ON "AARCH64" OFF) + # Type of build, we add this as an explicit option to default it to ON get_property(BUILD_SHARED_LIBS_DEFAULT GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS) ov_option (BUILD_SHARED_LIBS "Build as a shared library" ${BUILD_SHARED_LIBS_DEFAULT}) diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp b/src/inference/dev_api/openvino/runtime/system_conf.hpp index 59d56dfdd49d73..bebc2014ab8028 100644 --- a/src/inference/dev_api/openvino/runtime/system_conf.hpp +++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp @@ -83,6 +83,13 @@ OPENVINO_RUNTIME_API bool with_cpu_x86_sse42(); */ OPENVINO_RUNTIME_API bool with_cpu_neon_fp16(); +/** + * @brief Checks whether CPU supports ARM SVE capability + * @ingroup ov_dev_api_system_conf + * @return `True` if ARM SVE instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_sve(); + /** * @brief Checks whether CPU supports AVX capability * @ingroup ov_dev_api_system_conf diff --git a/src/inference/src/system_conf.cpp b/src/inference/src/system_conf.cpp index 27c671d07ad5c9..3227b1a3034903 100644 --- a/src/inference/src/system_conf.cpp +++ b/src/inference/src/system_conf.cpp @@ -22,6 +22,7 @@ # include # define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9) # define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10) +# define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 24) #elif defined(__APPLE__) && defined(__aarch64__) # include # include @@ -114,6 +115,10 @@ bool with_cpu_neon_fp16() { return false; } +bool with_cpu_sve() { + return false; +} + #else // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 bool with_cpu_x86_sse42() { @@ -173,6 +178,20 @@ bool with_cpu_neon_fp16() { return false; # endif } +bool with_cpu_sve() { +# if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + !defined(__arm__) && defined(__aarch64__) + const uint32_t hwcaps = getauxval(AT_HWCAP); + return hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE; +# elif !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + !defined(__aarch64__) && defined(__arm__) + return false; +# elif defined(__aarch64__) && defined(__APPLE__) + return false; +# else + return false; +# endif +} #endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 bool check_open_mp_env_vars(bool include_omp_num_threads) { diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index eb56a3fb39503e..aa6ce49a051e00 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -278,6 +278,24 @@ target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $) +# ARCH lists for softmax.cpp and mha_single_token.cpp +# Based on result of above calls, decide whether to add SVE +set(SOFTMAX_ARCH_LIST AVX512F AVX2) +set(MHA_SINGLE_TOKEN_ARCH_LIST AVX512F AVX2) + +if(ENABLE_NEON_FP16) + list(APPEND SOFTMAX_ARCH_LIST NEON_FP16) + list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST NEON_FP16) +endif() + +if(ENABLE_SVE) + list(APPEND SOFTMAX_ARCH_LIST SVE) + list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST SVE) +endif() + +list(APPEND SOFTMAX_ARCH_LIST ANY) +list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST ANY) + # Cross compiled function # TODO: The same for proposal, proposalONNX, topk cross_compiled_file(${TARGET_NAME} @@ -288,14 +306,14 @@ cross_compiled_file(${TARGET_NAME} NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 NEON_FP16 ANY + ARCH ${SOFTMAX_ARCH_LIST} src/nodes/kernels/scaled_attn/softmax.cpp API src/nodes/kernels/scaled_attn/softmax.hpp NAME attn_softmax NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 NEON_FP16 ANY + ARCH ${MHA_SINGLE_TOKEN_ARCH_LIST} src/nodes/kernels/scaled_attn/mha_single_token.cpp API src/nodes/kernels/scaled_attn/mha_single_token.hpp NAME mha_single_token diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index 4e14cf5894b04d..63cbbb4464ee92 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -17,6 +17,9 @@ #endif #if defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) +# include "arm_sve.h" +# endif # include "arm_neon.h" #endif @@ -35,6 +38,10 @@ static constexpr size_t vec_len_f32_avx2 = vec_len_avx2 / sizeof(float); static constexpr size_t vec_len_f32_neon = vec_len_neon / sizeof(float); static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); +#if defined(HAVE_SVE) +static constexpr size_t vec_len_f32_sve = svcntw(); +#endif + #ifdef HAVE_AVX512F inline __m512 cvt_bf16_to_fp32(const __m256i src) { __m512i y = _mm512_cvtepu16_epi32(src); @@ -250,6 +257,79 @@ inline void hmin(__m256& x) { #endif #ifdef OPENVINO_ARCH_ARM64 +# if defined(HAVE_SVE) +inline svfloat32_t exp_ps_sve(svbool_t& pg, svfloat32_t& src) { + // Constants + const auto log2_e = svdup_n_f32(1.4426950409f); + const auto ln2 = svdup_n_f32(0.6931473921f); + const auto half_ln2_sq = svdup_n_f32(0.2413862043f); + const auto not_mask17 = svdup_n_u32(~((1u << 17) - 1)); + const auto one = svdup_n_f32(1.0f); + + // Algorithm starts here + svfloat32_t t0 = svmul_f32_z(pg, src, log2_e); // y = x * log2(e) + svfloat32_t t1 = svrintm_f32_z(pg, t0); // rount to int (float) + svint32_t t2 = svcvt_s32_f32_z(pg, t1); // n + + t1 = svsub_f32_z(pg, t0, t1); // a = y - floor(y) + t1 = svadd_f32_z(pg, t1, one); // b = a + 1 + + svuint32_t t3 = svlsr_n_u32_z(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32) + svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v) + t4 = svscale_f32_z(pg, t4, t2); // fexpa(v) * 2^(n) + + // and_(t2.d, t1.d, not_mask17.d) + svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_z(pg, svreinterpret_u32_f32(t1), not_mask17)); + t5 = svsub_f32_z(pg, t1, t5); // z + t0 = svmla_f32_z(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z + t0 = svmla_f32_z(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z) + t0 = svmul_f32_z(pg, t0, t4); // Final result + + return t0; +} +inline svfloat32_t exp_ps_sve_legacy(svbool_t& pg, svfloat32_t& src) { + const auto c1 = svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); + const auto c2 = svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); + const auto c3 = svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); + const auto c4 = svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); + const auto c5 = svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); + + const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto one = svdup_n_f32(1.0f); // 1 + const auto two = svdup_n_f32(2.0f); // 2 + const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); + const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); + const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); + + const auto inf = svdup_n_f32(std::numeric_limits::infinity()); + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = svdup_n_f32(0.f); + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + + const auto z = svmla_f32_z(pg, shift, src, inv_ln2); + auto n = svsub_f32_z(pg, z, shift); + n = svsub_f32_z(pg, n, one); + const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + + const auto r_hi = svmla_f32_z(pg, src, n, neg_ln2_hi); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + const auto r2 = svmul_f32_z(pg, r, r); + + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); + + auto poly = svmla_f32_z(pg, scale, p12345, scale); + poly = svmul_f32_z(pg, poly, two); + + poly = svsel_f32(svcmplt_f32(pg, src, min_input), zero, poly); + poly = svsel_f32(svcmpgt_f32(pg, src, max_input), inf, poly); + + return poly; +} +# endif inline float32x4_t exp_ps_neon_f32(const float32x4_t& src) { const auto c1 = vreinterpretq_f32_u32(vdupq_n_u32(0x3f7ffff6)); const auto c2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3efffedb)); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index f2180b5314cc07..5a6f0d66f1f221 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -20,6 +21,9 @@ #include "softmax_kernel.hpp" #if defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) +# include +# endif # include #endif @@ -59,19 +63,35 @@ void cvt_copy(TA* dst, TB* src, size_t n) { mm256_uni_storeu_ps(dst + i, vb); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + auto _dst = reinterpret_cast(dst); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < n) { + if (n - i < vec_len_f32_sve) { + inc = n - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + svfloat32_t b1 = svld1_f32(pg, src + i); + svst1_f32(pg, _dst + i, b1); + i += inc; + } +# else if (std::is_same::value && std::is_same::value) { for (; i + vec_len_f32_neon <= n; i += vec_len_f32_neon) { float32x4_t vb1 = __vld1q_f32(src + i); __vst1q_f32(dst + i, vb1); } } -# if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +# if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) if (std::is_same::value && std::is_same::value) { for (; i + vec_len_f16_neon <= n; i += vec_len_f16_neon) { auto vb1 = vld1q_f16(reinterpret_cast(src + i)); vst1q_f16(reinterpret_cast(dst + i), vb1); } } +# endif # endif #endif for (; i < n; i++) { @@ -99,6 +119,27 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal mm256_uni_storeu_ps(out + i, v_out); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + auto _v = reinterpret_cast(v); + svfloat32_t attn_w_vec_fp32 = svdup_n_f32(weight); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < S) { + if (S - i < vec_len_f32_sve) { + inc = S - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + svfloat32_t v_value = svld1_f32(pg, _v + i); + svfloat32_t v_out = svld1_f32(pg, out + i); + + // svmla with merging to preserve inactive lane values when there's ... + // fewer than vec_len elements left + v_out = svmla_f32_m(pg, v_out, attn_w_vec_fp32, v_value); + svst1_f32(pg, out + i, v_out); + i += inc; + } +# else float32x4_t attn_w_vec_fp32 = vdupq_n_f32(weight); for (; i + vec_len_f32_neon <= S; i += vec_len_f32_neon) { float32x4_t v_value = __vld1q_f32(v + i); @@ -106,6 +147,7 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal v_out = vmlaq_f32(v_out, attn_w_vec_fp32, v_value); __vst1q_f32(out + i, v_out); } +# endif #endif for (; i < S; i++) { out[i] += weight * v[i]; @@ -356,7 +398,50 @@ static float sum_q_head(T* a, size_t n) { hsum(vsum0); sum = _mm256_cvtss_f32(vsum0); #elif defined(OPENVINO_ARCH_ARM64) - size_t vec_len_f32_neon = 4; +# if defined(HAVE_SVE) + svfloat32_t sum0 = svdup_n_f32(0.0f); + svfloat32_t sum1 = svdup_n_f32(0.0f); + svfloat32_t sum2 = svdup_n_f32(0.0f); + svfloat32_t sum3 = svdup_n_f32(0.0f); + svbool_t pg = svptrue_b32(); + + for (; i + 4 * vec_len_f32_sve <= n; i += 4 * vec_len_f32_sve) { + svfloat32_t a0 = svld1_f32(pg, a + i); + svfloat32_t a1 = svld1_f32(pg, a + i + vec_len_f32_sve); + svfloat32_t a2 = svld1_f32(pg, a + i + vec_len_f32_sve * 2); + svfloat32_t a3 = svld1_f32(pg, a + i + vec_len_f32_sve * 3); + + sum0 = svadd_f32_z(pg, a0, sum0); + sum1 = svadd_f32_z(pg, a1, sum1); + sum2 = svadd_f32_z(pg, a2, sum2); + sum3 = svadd_f32_z(pg, a3, sum3); + } + if (i + 2 * vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, a + i); + svfloat32_t a1 = svld1_f32(pg, a + i + vec_len_f32_sve); + + sum0 = svadd_f32_z(pg, a0, sum0); + sum1 = svadd_f32_z(pg, a1, sum1); + i += 2 * vec_len_f32_sve; + } + if (i + vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, a + i); + sum0 = svadd_f32_z(pg, a0, sum0); + i += vec_len_f32_sve; + } + // Process tail elements parallely as well (if any) + if (i != n) { + svbool_t pg_rem = svwhilelt_b32(0, static_cast(n - i)); + svfloat32_t a0 = svld1_f32(pg_rem, a + i); + sum0 = svadd_f32_m(pg_rem, sum0, a0); + i = n; + } + float32_t sum_0 = svaddv_f32(pg, sum0); + float32_t sum_1 = svaddv_f32(pg, sum1); + float32_t sum_2 = svaddv_f32(pg, sum2); + float32_t sum_3 = svaddv_f32(pg, sum3); + sum = static_cast(sum_0 + sum_1 + sum_2 + sum_3); +# else float32x4_t vsum0 = vdupq_n_f32(0.0f); float32x4_t vsum1 = vdupq_n_f32(0.0f); float32x4_t vsum2 = vdupq_n_f32(0.0f); @@ -396,8 +481,8 @@ static float sum_q_head(T* a, size_t n) { sum_low = vadd_f32(sum_low, sum_high); sum_low = vpadd_f32(sum_low, sum_low); sum = vget_lane_f32(sum_low, 0); +# endif #endif - for (; i < n; i++) { float tmp = a[i]; sum += tmp; @@ -496,6 +581,63 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* sum = _mm256_cvtss_f32(vsum0); #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + svbool_t pg = svptrue_b32(); + svfloat32_t sum0 = svdup_n_f32(0.0f); + svfloat32_t sum1 = svdup_n_f32(0.0f); + svfloat32_t sum2 = svdup_n_f32(0.0f); + svfloat32_t sum3 = svdup_n_f32(0.0f); + + auto _a = reinterpret_cast(a); + auto _b = reinterpret_cast(b); + + for (; i + 4 * vec_len_f32_sve <= n; i += 4 * vec_len_f32_sve) { + svfloat32_t a0 = svld1_f32(pg, _a + i); + svfloat32_t a1 = svld1_f32(pg, _a + i + vec_len_f32_sve); + svfloat32_t a2 = svld1_f32(pg, _a + i + vec_len_f32_sve * 2); + svfloat32_t a3 = svld1_f32(pg, _a + i + vec_len_f32_sve * 3); + + svfloat32_t b0 = svld1_f32(pg, _b + i); + svfloat32_t b1 = svld1_f32(pg, _b + i + vec_len_f32_sve); + svfloat32_t b2 = svld1_f32(pg, _b + i + vec_len_f32_sve * 2); + svfloat32_t b3 = svld1_f32(pg, _b + i + vec_len_f32_sve * 3); + + sum0 = svmla_f32_z(pg, sum0, a0, b0); + sum1 = svmla_f32_z(pg, sum1, a1, b1); + sum2 = svmla_f32_z(pg, sum2, a2, b2); + sum3 = svmla_f32_z(pg, sum3, a3, b3); + } + if (i + 2 * vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, _a + i); + svfloat32_t a1 = svld1_f32(pg, _a + i + vec_len_f32_sve); + + svfloat32_t b0 = svld1_f32(pg, _b + i); + svfloat32_t b1 = svld1_f32(pg, _b + i + vec_len_f32_sve); + + sum0 = svmla_f32_z(pg, sum0, a0, b0); + sum1 = svmla_f32_z(pg, sum1, a1, b1); + i += 2 * vec_len_f32_sve; + } + if (i + vec_len_f32_sve <= n) { + svfloat32_t a0 = svld1_f32(pg, _a + i); + svfloat32_t b0 = svld1_f32(pg, _b + i); + sum0 = svmla_f32_z(pg, sum0, a0, b0); + i += vec_len_f32_sve; + } + // Process the tail elements parallely as well (if any) + if (i != n) { + svbool_t pg_rem = svwhilelt_b32(0, static_cast(n - i)); + svfloat32_t a0 = svld1_f32(pg_rem, _a + i); + svfloat32_t b0 = svld1_f32(pg_rem, _b + i); + sum0 = svmla_f32_m(pg_rem, sum0, a0, b0); + i = n; + } + float32_t sum_0 = svaddv_f32(pg, sum0); + float32_t sum_1 = svaddv_f32(pg, sum1); + float32_t sum_2 = svaddv_f32(pg, sum2); + float32_t sum_3 = svaddv_f32(pg, sum3); + sum = static_cast(sum_0 + sum_1 + sum_2 + sum_3); +# else float32x4_t vsum0 = vdupq_n_f32(0.0f); float32x4_t vsum1 = vdupq_n_f32(0.0f); float32x4_t vsum2 = vdupq_n_f32(0.0f); @@ -542,8 +684,8 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* float32x2_t temp_sum = vadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0)); temp_sum = vpadd_f32(temp_sum, temp_sum); sum = vget_lane_f32(temp_sum, 0); +# endif #endif - for (; i < n; i++) { sum += a[i] * b[i]; } @@ -794,6 +936,28 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str mm256_uni_storeu_ps(dst + i, result_vec_fp32); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + auto _dst = reinterpret_cast(dst); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < S) { + if (S - i < vec_len_f32_sve) { + inc = S - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + auto* src = temp + i; + auto result_vec_fp32 = svdup_n_f32(0.0f); + + for (size_t m = 0; m < M; m++) { + auto o_vec_fp32 = svld1_f32(pg, src); + result_vec_fp32 = svadd_f32_m(pg, result_vec_fp32, o_vec_fp32); + src += temp_stride; + } + svst1_f32(pg, _dst + i, result_vec_fp32); + i += inc; + } +# else for (; i + vec_len_f32_neon <= S; i += vec_len_f32_neon) { auto* src = temp + i; auto result_vec_fp32 = vdupq_n_f32(0.0f); @@ -804,6 +968,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } __vst1q_f32(dst + i, result_vec_fp32); } +# endif #endif for (; i < S; i++) { auto* src = temp + i; @@ -1262,6 +1427,7 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, OPENVINO_THROW("Unsupported precision: ", query.get_precision()); } } + } // namespace XARCH } // namespace Cpu } // namespace Extensions diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index 48b92b53fa2727..35aab5b59c7d0e 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -12,6 +12,9 @@ #include "openvino/core/type/element_type.hpp" #if defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) +# include "arm_sve.h" +# endif # include "arm_neon.h" #endif @@ -657,6 +660,28 @@ inline void exp_reduce_sum(float* a, const float max, const size_t size, float& hsum(v_sum); sum = _mm256_cvtss_f32(v_sum); #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + svfloat32_t v_a; + svfloat32_t v_max = svdup_n_f32(max); + svfloat32_t v_sum = svdup_n_f32(0.0f); + size_t vec_len_f32_sve = svcntw(); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < size) { + if (size - i < vec_len_f32_sve) { + inc = size - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + v_a = svld1_f32(pg, a + i); + v_a = svsub_f32_z(pg, v_a, v_max); + v_a = exp_ps_sve(pg, v_a); + v_sum = svadd_f32_m(pg, v_sum, v_a); + svst1_f32(pg, a + i, v_a); + i += inc; + } + sum = svaddv_f32(svptrue_b32(), v_sum); +# else float32x4_t v_a; float32x4_t v_max = vdupq_n_f32(max); float32x4_t v_sum = vdupq_n_f32(0.0f); @@ -670,7 +695,7 @@ inline void exp_reduce_sum(float* a, const float max, const size_t size, float& i += vec_len_f32_neon; } sum = vaddvq_f32(v_sum); - +# endif #endif for (; i < size; i++) { a[i] = exp(a[i] - max); @@ -781,6 +806,22 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ i += (size - i); } #elif defined(OPENVINO_ARCH_ARM64) +# if defined(HAVE_SVE) + svfloat32_t v_scale = svdup_n_f32(val); + size_t inc = vec_len_f32_sve; + svbool_t pg = svptrue_b32(); + + while (i < size) { + if (size - i < vec_len_f32_sve) { + inc = size - i; + pg = svwhilelt_b32(0, static_cast(inc)); + } + svfloat32_t v_a = svld1_f32(pg, a + i); + v_a = svmul_f32_z(pg, v_a, v_scale); + svst1_f32(pg, a_dst + i, v_a); + i += inc; + } +# else float32x4_t v_scale = vdupq_n_f32(val); while (i + vec_len_f32_neon <= size) { float32x4_t v_a = vld1q_f32(a + i); @@ -788,6 +829,7 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ vst1q_f32(a_dst + i, v_a); i += vec_len_f32_neon; } +# endif #endif for (; i < size; i++) { a_dst[i] = a[i] * val; @@ -972,7 +1014,7 @@ inline void attn_softmax_kernel(float* a, // divide sum float scalar = 1.0f / sum; if (dst_precision == ov::element::f32) { - multiply_scalar(a, static_cast(a_dst), scalar, len); + multiply_scalar(a, reinterpret_cast(a_dst), scalar, len); // apply causual mask to final result instead of attn_score if (total_size > len) memset(static_cast(a_dst) + len, 0, sizeof(float) * (total_size - len)); From 8f0094dabda2dfe02c8414fd13f7d268c06ce6c7 Mon Sep 17 00:00:00 2001 From: Chenhu Wang Date: Tue, 17 Dec 2024 13:12:21 +0800 Subject: [PATCH 16/17] [CPU] sns f16_mha_on_avx512_core_amx_f16_target (#27514) ### Details: - *support f16 precision mha on GNR* ### Tickets: - *CVS-122494, CVS-122495* --- src/common/snippets/src/op/brgemm.cpp | 3 +- .../x64/jit_brgemm_copy_b_emitter.hpp | 3 +- .../snippets/x64/jit_brgemm_emitter.cpp | 3 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 15 +- .../snippets/x64/op/brgemm_copy_b.cpp | 2 +- .../snippets/x64/op/brgemm_utils.cpp | 16 ++- .../snippets/x64/op/brgemm_utils.hpp | 4 +- .../x64/pass/brgemm_to_brgemm_cpu.hpp | 2 +- .../snippets/x64/pass/enforce_precision.cpp | 11 +- .../transformation_pipeline.cpp | 28 ++-- .../custom/subgraph_tests/src/x64/mha.cpp | 22 ++- .../skip_tests_config.cpp | 5 + .../shared_tests_instances/snippets/mha.cpp | 135 ++++++++++++++---- .../shared_tests_instances/snippets/utils.hpp | 11 ++ .../plugin/shared/src/snippets/mha.cpp | 2 + 15 files changed, 201 insertions(+), 61 deletions(-) diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp index 72fc692fff5d70..7190074c8ae30b 100644 --- a/src/common/snippets/src/op/brgemm.cpp +++ b/src/common/snippets/src/op/brgemm.cpp @@ -87,7 +87,8 @@ ov::element::Type Brgemm::get_output_type(const ov::element::Type& in_type0, con const bool is_f32 = utils::everyone_is(element::f32, in_type0, in_type1); const bool is_int8 = utils::one_of(in_type0, element::i8, element::u8) && in_type1 == element::i8; const bool is_bf16 = utils::everyone_is(element::bf16, in_type0, in_type1); - if (is_f32 || is_bf16) { + const bool is_f16 = utils::everyone_is(element::f16, in_type0, in_type1); + if (is_f32 || is_bf16 || is_f16) { return element::f32; } else if (is_int8) { return element::i32; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp index 96a80153bba4b6..d937e646b603da 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp @@ -17,13 +17,12 @@ class jit_brgemm_copy_b_emitter : public jit_emitter { const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache); - size_t get_inputs_num() const override { return 1; } static std::set> get_supported_precisions( const std::shared_ptr& node = nullptr) { - return {{element::i8}, {element::bf16}, {element::f32}}; + return {{element::i8}, {element::bf16}, {element::f16}, {element::f32}}; } private: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index 172a1cc0b98284..8d343cec908732 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -79,7 +79,8 @@ std::set> jit_brgemm_emitter::get_supported_precision } else if (brgemm->get_type() == BRGEMM_TYPE::WITH_AMX) { return {{element::i8, element::i8, element::u8}, {element::u8, element::i8, element::u8}, - {element::bf16, element::bf16, element::u8}}; + {element::bf16, element::bf16, element::u8}, + {element::f16, element::f16, element::u8}}; } OV_CPU_JIT_EMITTER_THROW("got BrgemmCPU node with unsupported type"); } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 94e01cd89a39fa..2b0c7b55fb043d 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -458,11 +458,12 @@ void Subgraph::initSupportedPrimitiveDescriptors() { config.inConfs.resize(inputShapes.size()); for (size_t i = 0; i < inputShapes.size(); i++) { const auto originalInputPrecision = getOriginalInputPrecisionAtPort(i); - const auto precision = ((originalInputPrecision == ov::element::f32) && - context->getConfig().inferencePrecision == ov::element::bf16 && - subgraph_attrs->snippet->has_domain_sensitive_ops()) - ? static_cast(ov::element::bf16) - : originalInputPrecision; + const auto precision = + ((originalInputPrecision == ov::element::f32) && + one_of(context->getConfig().inferencePrecision, ov::element::bf16, ov::element::f16) && + subgraph_attrs->snippet->has_domain_sensitive_ops()) + ? context->getConfig().inferencePrecision + : originalInputPrecision; if (supportedPrecisions.count(precision) == 0) OPENVINO_THROW("Subgraph node with name `", getName(), "` doesn't support ", precision, " precision."); @@ -653,7 +654,7 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { ov::snippets::pass::Canonicalization, ov::snippets::pass::AnalyzeBroadcastableInputs, broadcastable_inputs); - if (context->getConfig().inferencePrecision == ov::element::bf16 && + if (one_of(context->getConfig().inferencePrecision, ov::element::bf16, ov::element::f16) && subgraph_attrs->snippet->has_domain_sensitive_ops()) { // enforce BF16 precisions to supported operations // MatMul has to be decomposed to Brgemm operations before enforcement @@ -663,7 +664,7 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { ov::snippets::pass::MatMulToBrgemm, pass::EnforcePrecision, element::f32, - element::bf16); + context->getConfig().inferencePrecision); } SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before, ov::snippets::pass::PropagatePrecision, diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp index a513299a516f5f..7e52905145869f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp @@ -91,7 +91,7 @@ void BrgemmCopyB::validate_and_infer_types() { } void BrgemmCopyB::validate_element_type(const ov::element::Type& element_type) { - OPENVINO_ASSERT(one_of(element_type, element::f32, element::bf16, element::i8), + OPENVINO_ASSERT(one_of(element_type, element::f32, element::bf16, element::f16, element::i8), "BrgemmCopyB doesn't support element type" + element_type.get_type_name()); } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index e1802d2914127a..386941fd94bb98 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -35,7 +35,12 @@ cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx) { // Note: AMX might be not used even if it's supported by the hardware, check the BrgemmToBrgemmCPU pass for details if (is_with_amx) { - SUPPORT_ONE(avx512_core_amx, "Unsupported hardware configuration: amx is supported only on avx512 platforms") + if (dt_in0 == ov::element::f16) + SUPPORT_ONE(avx512_core_amx_fp16, + "Unsupported hardware configuration: amx is supported only on avx512 platforms") + else + SUPPORT_ONE(avx512_core_amx, + "Unsupported hardware configuration: amx is supported only on avx512 platforms") } else if (dt_in0 == ov::element::bf16) { SUPPORT_ONE(avx512_core_bf16, "Unsupported hardware configuration: bf16 is supported only on avx512 platforms") } else if (one_of(dt_in0, ov::element::u8, ov::element::i8)) { @@ -59,12 +64,15 @@ BRGEMM_TYPE get_brgemm_type(const ov::element::Type& element_type_a, bool transp return transpose_b ? BRGEMM_TYPE::REPACKING_ONLY : BRGEMM_TYPE::STAND_ALONE; OPENVINO_ASSERT(element_type_a != element::bf16 || mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16), - "BF16 precision is not supported on this hardware"); + "BrgemmCPU BF16 precision is not supported on non avx512_core_bf16 system"); + OPENVINO_ASSERT(element_type_a != element::f16 || mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16), + "BrgemmCPU FP16 precision is not supported on non avx512_core_amx_fp16 system"); if (one_of(element_type_a, element::u8, element::i8, element::bf16) && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) return BRGEMM_TYPE::WITH_AMX; - + if (element_type_a == ov::element::f16 && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16)) + return BRGEMM_TYPE::WITH_AMX; // Note: this condition reproduces logic from the OneDNN Brgemm implementation. This is needed to align with the // backend requirements. More details in onednn/src/cpu/x64/brgemm/brgemm_utils.cpp if (element_type_a == ov::element::i8) @@ -96,6 +104,8 @@ size_t compute_inner_n_block(const ov::element::Type& precision) { return 64; case element::bf16: return 32; + case element::f16: + return 32; case element::f32: return 16; default: diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index 46428828e7139c..b5f470c1c695ba 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -15,8 +15,8 @@ namespace intel_cpu { namespace brgemm_utils { enum class BRGEMM_TYPE { - STAND_ALONE, // No extra requirements, used for f32|f32 - WITH_AMX, // i8|i8 or bf16|bf16 on AMX system - needs BrgemmCopyB and scratchpad + STAND_ALONE, // No extra requirements, used for f32|f32 + WITH_AMX, // i8|i8 or bf16|bf16 on AMX system or fp16|fp16 on AMX_FP16 system - needs BrgemmCopyB and scratchpad WITH_COMPENSATIONS, // i8|i8 (non-AMX system) - needs BrgemmCopyB for data repacking and compensations REPACKING_ONLY, // u8|i8, or bf16|bf16 (non-AMX system), or brgemm with transpose_b=true - needs BrgemmCopyB on // second input for data repacking diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp index 2cbf2d7e087919..9475171b24f65d 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp @@ -25,7 +25,7 @@ namespace pass { * \ Buffer (with repacked data) Buffer (with compensations) * \ | / * BrgemmCPU - * - f32|f32 with transpose_b, u8|i8, i8|i8 or bf16|bf16 on AMX system: + * - f32|f32 with transpose_b, u8|i8, i8|i8 or bf16|bf16 on AMX system or fp16|fp16 on AMX_FP16 system: * \ BrgemmCopyB * \ Buffer (with repacked data) Buffer (with new memory) * \ | / diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/enforce_precision.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/enforce_precision.cpp index 92b5be2692f3b2..6b7d5d31a5b12f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/enforce_precision.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/enforce_precision.cpp @@ -121,9 +121,12 @@ bool EnforcePrecision::run_on_model(const std::shared_ptr& f) { std::set> EnforcePrecision::get_supported_precisions_default( const std::shared_ptr& op) noexcept { - if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) && - ov::is_type(op)) { - return {{element::bf16, element::bf16}}; + std::set> types; + if (ov::is_type(op)) { + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16)) + types.insert({element::f16, element::f16}); + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) + types.insert({element::bf16, element::bf16}); } - return {}; + return types; } diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 909f6b7531d421..4013c1c3cd84f9 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -866,7 +866,7 @@ void Transformations::PostLpt() { postLPTPassManager, [](const std::shared_ptr& node) -> bool { if (!ov::is_type(node) && - node->get_output_element_type(0) != node->get_input_element_type(0)) + node->get_output_element_type(0).size() > node->get_input_element_type(0).size()) return true; if (node->get_input_size() >= 2) { return node->get_input_element_type(1) == ov::element::i8 || @@ -986,7 +986,7 @@ void Transformations::MainSnippets(void) { // MatMul and Result. However there may be Convert [f32->bf16] before Result since: // - bf16 Brgemm has f32 output; // - CPU Node Subgraph requires bf16 on output when inference precision is bf16. - // To avoid sitations when Transpose is not alone node between MatMul and Result, + // To avoid situations when Transpose is not alone node between MatMul and Result, // Plugin disables Transpose tokenization on output bool mha_token_enable_transpose_on_output = one_of(config.inferencePrecision, element::f32, element::undefined); size_t concurrency = config.streamExecutorConfig.get_threads_per_stream(); @@ -1023,6 +1023,7 @@ void Transformations::MainSnippets(void) { ov::pass::Manager snippetsManager("CPU:Snippets"); snippetsManager.set_per_pass_validation(false); + // if callback needed for better perf, enable SnippetsMarkSkipped, and disable TokenizeFCSnippets. if (!ignoreCallback) { #if defined(OPENVINO_ARCH_ARM64) CPU_REGISTER_PASS_ARM(snippetsManager, SnippetsMarkSkipped); @@ -1033,9 +1034,7 @@ void Transformations::MainSnippets(void) { } CPU_REGISTER_PASS_COMMON(snippetsManager, snippets::pass::SnippetsTokenization, tokenization_config); - // - MHA has BRGEMM that is supported only on AVX512 platforms - // - CPU Plugin Subgraph supports only f32, bf16 (and quantized) BRGEMM - // [122494] Need to add support of f16 + // - CPU Plugin Subgraph supports f32, bf16, quantized and fp16(on avx_512_core_amx_fp16 target) BRGEMM const bool isMHASupported = #if defined(OPENVINO_ARCH_ARM64) false; @@ -1043,7 +1042,9 @@ void Transformations::MainSnippets(void) { (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && one_of(config.inferencePrecision, ov::element::f32, element::undefined)) || (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) && - one_of(config.inferencePrecision, ov::element::bf16, ov::element::f32, element::undefined)); + one_of(config.inferencePrecision, ov::element::bf16, ov::element::f32, element::undefined)) || + (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16) && + one_of(config.inferencePrecision, ov::element::f16)); #endif if (!isMHASupported) { CPU_DISABLE_PASS_COMMON(snippetsManager, snippets::pass::TokenizeMHASnippets); @@ -1059,13 +1060,13 @@ void Transformations::MainSnippets(void) { const auto in_type1 = matmul->get_input_element_type(1); const auto is_fp32 = (in_type0 == ov::element::f32 && in_type1 == ov::element::f32 && one_of(config.inferencePrecision, element::f32, element::undefined)); - const auto is_fp16 = (in_type0 == ov::element::f16 || in_type1 == ov::element::f16); + const auto is_fp16 = + (in_type0 == ov::element::f16 || in_type1 == ov::element::f16) || + (in_type0 == element::f32 && in_type1 == ov::element::f32 && config.inferencePrecision == ov::element::f16); const auto is_bf16 = (in_type0 == ov::element::bf16 && in_type1 == ov::element::bf16) || ((in_type0 == element::f32 && in_type1 == ov::element::f32 && config.inferencePrecision == ov::element::bf16)); const auto is_int8 = in_type0 == ov::element::i8; - if (is_fp16) - return false; if (is_fp32) return true; // Only FP32 dynamic MHA is supported @@ -1076,13 +1077,14 @@ void Transformations::MainSnippets(void) { // brgemm_copy_b kernel if (matmul->get_transpose_a() || matmul->get_transpose_b()) return false; - // [150842] The execution of Brgemm INT8/BF16 on AMX platforms depends on the value of "K % VNNIFactor". + // [150842] The execution of Brgemm INT8/BF16/FP16 on AMX platforms depends on the value of "K % VNNIFactor". // For more details, please teake a look at the ticket 150842 if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { const auto& b_shape = matmul->get_input_partial_shape(1); const auto K = matmul->get_transpose_b() ? *b_shape.rbegin() : *++b_shape.rbegin(); - if (is_bf16) - return K.is_static() && (K.get_length() % 2 == 0); + const size_t brgemm_vnni_factor_for_real16 = 2; // 4/2(size in term of byte for bf16/fp16) + if (is_bf16 || is_fp16) + return K.is_static() && (K.get_length() % brgemm_vnni_factor_for_real16 == 0); if (is_int8) return K.is_static(); } @@ -1091,6 +1093,8 @@ void Transformations::MainSnippets(void) { dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni); if (is_bf16) return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16); + if (is_fp16) + return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16); return true; }; auto is_unsupported_parallel_work_amount = [&](const std::shared_ptr& n, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp index 8517612a348f68..a94f52be91df02 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp @@ -189,7 +189,7 @@ class MHATest : public testing::WithParamInterface, virtual public Sub for (size_t i = 0; i < funcInputs.size(); ++i) { const auto& funcInput = funcInputs[i]; ov::Tensor tensor; - if (funcInput.get_element_type() == ov::element::bf16) { + if (funcInput.get_element_type() == ov::element::bf16 || funcInput.get_element_type() == ov::element::f16) { ov::test::utils::InputGenerateData in_data; in_data.start_from = -1; in_data.range = 2; @@ -232,6 +232,9 @@ class MHATest : public testing::WithParamInterface, virtual public Sub configuration.insert({ov::hint::inference_precision(ov::element::bf16)}); } + if (inputPrecisions[0] == ElementType::f16) + configuration.insert({ov::hint::inference_precision(ov::element::f16)}); + // Snippets MHA tokenization has limitations to avoid performance degradations. These limitations depend on // target machine. Just for testing, we disable these limitations to allow Snippets to tokenize pattern on all // machines for validation. @@ -253,6 +256,9 @@ TEST_P(MHATest, CompareWithRefs) { if (inputPrecisions[0] == ElementType::bf16 && !ov::with_cpu_x86_bfloat16()) GTEST_SKIP(); + if (inputPrecisions[0] == ElementType::f16 && !ov::with_cpu_x86_avx512_core_amx_fp16()) + GTEST_SKIP(); + if (!ov::with_cpu_x86_avx512_core()) GTEST_SKIP(); @@ -308,6 +314,20 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P( + smoke_MHA_FP16, + MHATest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inputShapes)), + ::testing::Values( + std::vector{ElementType::f16, ElementType::f16, ElementType::f16, ElementType::f16}), + ::testing::ValuesIn(matMulIn0Precisions), + ::testing::ValuesIn(patternTypes), + ::testing::Values(ExpectedNodes{{"Subgraph", 1}, + {"Transpose", 1}}), // Plugin disables tokenization of Transpose on output + ::testing::Values(ov::test::utils::DEVICE_CPU)), + MHATest::getTestCaseName); + } // namespace static std::shared_ptr initMHAQuantSubgraph0(std::vector& inputDynamicShapes, diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 089a03b4d6bba7..e9b38fedc0b4e5 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -565,6 +565,11 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_MHA.*EnforceBF16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*bf16.*)"); } + // MHA FP16 precision is only supported on amx_fp16 platform + if (!ov::with_cpu_x86_avx512_core_amx_fp16()) { + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*FP16.*)"); + } + #ifdef SNIPPETS_LIBXSMM_TPP // GN in TPP requires exposing tmp Buffer results outside the loop (ticket: 151234) retVector.emplace_back(R"(.*smoke_Snippets_GroupNormalization.*)"); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 63f5176684ccc1..df0b69f99ef06d 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -12,32 +12,41 @@ namespace snippets { namespace { -std::vector> transposedShape_4D(bool with_dynamic = true) { - auto shapes = SNIPPETS_TESTS_STATIC_SHAPES( - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, - {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, - {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, - {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, - {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}); +std::vector> transposedShape_4D(bool with_static = true, bool with_dynamic = true) { + std::vector> shapes; + if (with_static) { + auto static_shapes = + SNIPPETS_TESTS_STATIC_SHAPES({{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, + {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, + {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}); + shapes.insert(shapes.end(), static_shapes.begin(), static_shapes.end()); + } if (with_dynamic) { - std::vector> dynamic_shapes = {{ - {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, - {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, - {PartialShape{-1, -1, -1, 128}, {{1, 4, 64, 128}, {2, 2, 16, 128}, {1, 4, 72, 128}}}, - {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, - }, - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 16, 2, 100}, {1, 128, 3, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 128, 2, 100}, {1, 128, 1, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {2, 2, 16, 128}, {2, 1, 128, 128}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 128, 2, 100}, {1, 128, 3, 64}}}, - }, - { - {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, - {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, - }}; + std::vector> dynamic_shapes = { + { + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + {PartialShape{-1, -1, -1, 128}, {{1, 4, 64, 128}, {2, 2, 16, 128}, {1, 4, 72, 128}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 16, 2, 100}, {1, 128, 3, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 128, 2, 100}, {1, 128, 1, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {2, 2, 16, 128}, {2, 1, 128, 128}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 128, 2, 100}, {1, 128, 3, 64}}}, + }, + { + {PartialShape{-1, -1, 12, 64}, + {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, + {PartialShape{-1, -1, 12, 64}, + {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, + {PartialShape{-1, 12, -1, -1}, + {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, + {PartialShape{-1, -1, 12, 64}, + {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, + }}; shapes.insert(shapes.end(), dynamic_shapes.begin(), dynamic_shapes.end()); } return shapes; @@ -74,7 +83,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_WithScalarMul, MHA, - ::testing::Combine(::testing::ValuesIn(transposedShape_4D(false)), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(true, false)), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), ::testing::Values(true), @@ -137,6 +146,80 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), MHA::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_Without_Multiply, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), + ::testing::ValuesIn(precision_fp16_if_supported(4)), + ::testing::Values(ov::element::f16), + ::testing::ValuesIn({false}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Static, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(true, false)), + ::testing::ValuesIn(precision_fp16_if_supported(4)), + ::testing::Values(ov::element::f16), + ::testing::ValuesIn({true}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); +// 3 nodes and 2 subgraph for dynamic with multiply case. +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Dynamic, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(false, true)), + ::testing::ValuesIn(precision_fp16_if_supported(4)), + ::testing::Values(ov::element::f16), + ::testing::ValuesIn({true}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), + ::testing::Values(2), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_Without_Multiply, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f16), + ::testing::ValuesIn({false}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), + MHA::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Static, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(true, false)), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f16), + ::testing::ValuesIn({true}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(2), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), + MHA::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Dynamic, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(false, true)), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f16), + ::testing::ValuesIn({true}), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), + ::testing::Values(2), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), + MHA::getTestCaseName); } // namespace } // namespace snippets } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp index 6c0d54da973086..6815cdab671cea 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp @@ -16,6 +16,10 @@ static inline bool is_bf16_supported_by_brgemm() { return ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16(); } +static inline bool is_fp16_supported_by_brgemm() { + return ov::with_cpu_x86_avx512_core_amx_fp16(); +} + static inline bool is_i8_supported_by_brgemm() { return ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8(); } @@ -33,6 +37,13 @@ static inline std::vector> precision_bf16_if_supporte return prc; } +static inline std::vector> precision_fp16_if_supported(size_t count) { + std::vector> prc; + if (is_fp16_supported_by_brgemm()) + prc.emplace_back(std::vector(count, element::f16)); + return prc; +} + static inline std::vector> quantized_precisions_if_supported() { std::vector> prc = {}; // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp index 8d0cb8613bc47e..0a8fcc77717c42 100644 --- a/src/tests/functional/plugin/shared/src/snippets/mha.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp @@ -65,6 +65,8 @@ void MHABase::SetUp() { #endif if (inType == ov::element::bf16) rel_threshold = 0.05f; + if (inType == ov::element::f16) + abs_threshold = 2e-2; } std::string MHA::getTestCaseName(testing::TestParamInfo obj) { From adf097b31f033535c61101767ea66c06f717fedd Mon Sep 17 00:00:00 2001 From: Wilson Seok Date: Tue, 17 Dec 2024 16:27:30 +0900 Subject: [PATCH 17/17] [GPU] Add ConvolutionBackpropData in is_decompression_multiply() of MarkDequantizationSubgraph callback (#28075) ### Details: - Add ConvolutionBackpropData in target_consumers,convolutions of is_decompression_multiply() ### Tickets: - 159207 --- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index a53b6ddac7332a..010318703dde09 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -181,10 +181,14 @@ static bool is_decompression_multiply(const std::shared_ptr node ov::op::v8::Gather::get_type_info_static(), ov::op::v1::Convolution::get_type_info_static(), ov::opset1::Convolution::get_type_info_static(), + ov::op::v1::ConvolutionBackpropData::get_type_info_static(), + ov::opset1::ConvolutionBackpropData::get_type_info_static(), ov::opset1::GroupConvolution::get_type_info_static() }; std::vector convolutions = { ov::op::v1::Convolution::get_type_info_static(), ov::opset1::Convolution::get_type_info_static(), + ov::op::v1::ConvolutionBackpropData::get_type_info_static(), + ov::opset1::ConvolutionBackpropData::get_type_info_static(), ov::opset1::GroupConvolution::get_type_info_static() }; auto all_has_types = [](const std::set>& consumers, const std::vector& types) {