diff --git a/src/core/include/openvino/core/parallel.hpp b/src/core/include/openvino/core/parallel.hpp index 6d3a243c95e7dc..a231c6833f9d84 100644 --- a/src/core/include/openvino/core/parallel.hpp +++ b/src/core/include/openvino/core/parallel.hpp @@ -461,8 +461,10 @@ void parallel_for(const T0& D0, const F& func) { for_1d(ithr, nthr, D0, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func); + { for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func); } #elif OV_THREAD == OV_THREAD_SEQ for_1d(0, 1, D0, func); #endif @@ -509,8 +511,10 @@ void parallel_for2d(const T0& D0, const T1& D1, const F& func) { for_2d(ithr, nthr, D0, D1, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func); + { for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func); } #elif OV_THREAD == OV_THREAD_SEQ for_2d(0, 1, D0, D1, func); #endif @@ -575,8 +579,10 @@ void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) { for_3d(ithr, nthr, D0, D1, D2, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. 
# pragma omp parallel - for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func); + { for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func); } #elif OV_THREAD == OV_THREAD_SEQ for_3d(0, 1, D0, D1, D2, func); #endif @@ -645,8 +651,10 @@ void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons for_4d(ithr, nthr, D0, D1, D2, D3, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func); + { for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func); } #elif OV_THREAD == OV_THREAD_SEQ for_4d(0, 1, D0, D1, D2, D3, func); #endif @@ -703,8 +711,10 @@ void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons for_5d(ithr, nthr, D0, D1, D2, D3, D4, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. # pragma omp parallel - for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func); + { for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func); } #elif OV_THREAD == OV_THREAD_SEQ for_5d(0, 1, D0, D1, D2, D3, D4, func); #endif @@ -763,8 +773,10 @@ void parallel_for6d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, cons for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func); }); #elif OV_THREAD == OV_THREAD_OMP +// Please note that this function does not guarantee execution on the same number of threads from call to call. +// Use the parallel_nt* functions if the procedure depends on a certain number of threads. 
# pragma omp parallel - for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func); + { for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func); } #elif OV_THREAD == OV_THREAD_SEQ for_6d(0, 1, D0, D1, D2, D3, D4, D5, func); #endif diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index ee6afa33827861..7eed5c1df9789b 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -2828,24 +2828,27 @@ void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t *in_ptr_ bool xPass = IW != OW; bool yPass = IH != OH; - parallel_for(B, [&](size_t b) { + auto b_loop = [&](size_t b) { auto arg = jit_interpolate_call_args(); arg.src_ptr[0] = in_ptr_ + (IW * IH * C * b) * srcDataSize; if (xPass && yPass) { - size_t threadsNum = parallel_get_num_threads(); - size_t parallelNum = B; + size_t parallel_num = B; // IH * OW * C buf needed size_t buffer_size = static_cast<size_t>(OW * IH * C); - if (parallelNum < threadsNum) { + if (parallel_num < m_threads_num) { arg.src_ptr[1] = static_cast<uint8_t*>(&pillow_working_buf[b * buffer_size * srcDataSize]); } else { - size_t threadsIdx = parallel_get_thread_num(); - arg.src_ptr[1] = static_cast<uint8_t*>(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); + size_t threads_idx = parallel_get_thread_num(); + arg.src_ptr[1] = static_cast<uint8_t*>(&pillow_working_buf[threads_idx * buffer_size * srcDataSize]); } } arg.dst = out_ptr_ + (OW * OH * C * b) * dstDataSize; arg.weight_ptr[0] = reinterpret_cast<float*>(&auxTable[2]); (*interpolateKernel)(&arg); + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_1d(ithr, nthr, B, b_loop); }); } @@ -3706,16 +3709,15 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint // | | // | | // ---- - parallel_for2d(B, C, [&](size_t b, size_t c) { + auto bc_loop = [&](size_t b, size_t 
c) { const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * b + IW * IH * c) * srcDataSize; uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * b + OW * OH * c) * dstDataSize; uint8_t *xpass_out_ptr_nc = nullptr; const uint8_t *ypass_in_ptr_nc = nullptr; if (xPass && yPass) { - size_t threadsNum = parallel_get_num_threads(); - size_t parallelNum = B * C; + size_t parallel_num = B * C; // IH * OW buf needed - if (parallelNum < threadsNum) { + if (parallel_num < m_threads_num) { xpass_out_ptr_nc = static_cast<uint8_t*>(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); ypass_in_ptr_nc = static_cast<uint8_t*>(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); } else { @@ -3770,6 +3772,10 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint } } } + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, B, C, bc_loop); }); } @@ -3777,16 +3783,16 @@ void Interpolate::InterpolateExecutorBase::create_pillow_working_buf(Interpolate if (srcDimPad5d[3] == dstDim5d[3] || srcDimPad5d[4] == dstDim5d[4]) return; size_t bufSize = srcDimPad5d[3] * dstDim5d[4] * srcDataSize; // IH * OW - size_t threadsNum = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); if (layout == InterpolateLayoutType::planar) { // B and C execute in parallel, need separate buf - size_t parallelNum = srcDimPad5d[0] * srcDimPad5d[1]; - bufSize *= std::min(threadsNum, parallelNum); + size_t parallel_num = srcDimPad5d[0] * srcDimPad5d[1]; + bufSize *= std::min(m_threads_num, parallel_num); } else { bufSize *= srcDimPad5d[1]; // *C // B execute in parallel, need separate buf - size_t parallelNum = srcDimPad5d[0]; - bufSize *= std::min(threadsNum, parallelNum); + size_t parallel_num = srcDimPad5d[0]; + bufSize *= std::min(m_threads_num, parallel_num); } pillow_working_buf.resize(bufSize); } diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.h b/src/plugins/intel_cpu/src/nodes/interpolate.h index 
11f0e3104e5085..a43b354aa0306a 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.h +++ b/src/plugins/intel_cpu/src/nodes/interpolate.h @@ -148,6 +148,7 @@ class Interpolate : public Node { size_t dataRank; std::vector<int> auxTable; std::vector<uint8_t> pillow_working_buf; + size_t m_threads_num = 0lu; }; std::shared_ptr<InterpolateExecutorBase> execPtr = nullptr;