diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index cb5911e90836fb..f22004a0d3e154 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -10,6 +10,11 @@ namespace intel_cpu { using namespace arm_compute; +static std::mutex & get_mtx_ifunc() { + static std::mutex mtx_ifunc; + return mtx_ifunc; +} + inline VectorDims reshape_sizes(VectorDims dims) { const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS; VectorDims result_dims(MAX_NUM_SHAPE - 1); @@ -494,6 +499,11 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto default: IE_THROW() << "Unsupported operation type for ACL Eltwise executor: " << static_cast(aclEltwiseAttrs.algorithm); } + + // We get a problem (seg. faults, data race etc) for eltwise operations when we use several configure(...) functions in parallel. + // We created issue about this problem here: https://github.com/ARM-software/ComputeLibrary/issues/1073 + // TODO: change it when we will get an answer to our question in issue + std::lock_guard _lock {get_mtx_ifunc()}; ifunc = exec_func(); return true; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp new file mode 100644 index 00000000000000..c617363aefebf6 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_ie_scheduler.hpp" + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include + +namespace ov { +namespace intel_cpu { + +using namespace arm_compute; + +ACLScheduler::ACLScheduler() = default; + +unsigned int ACLScheduler::num_threads() const { + return parallel_get_num_threads(); +} + +void ACLScheduler::set_num_threads(unsigned int num_threads) {} + +void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { + const Window & max_window = window; + const unsigned int num_iterations = max_window.num_iterations_total(); + const auto _num_threads = std::min(num_iterations, static_cast(parallel_get_num_threads())); + + if (num_iterations == 0) { + return; + } + + std::function main_run; + if (tensors.empty()) { + main_run = [&](const Window &window, const ThreadInfo &info) { + kernel->run(window, info); + }; + } else { + main_run = [&](const Window &window, const ThreadInfo &info) { + kernel->run_op(tensors, window, info); + }; + } + + if (!kernel->is_parallelisable() || _num_threads == 1) { + ThreadInfo info; + info.cpu_info = &cpu_info(); + main_run(max_window, info); + } else { + const auto num_windows = _num_threads; + const auto hints_split_dimension = hints.split_dimension(); + + InferenceEngine::parallel_for(num_windows, [&](int wid) { + Window win = max_window.split_window(hints_split_dimension, wid, num_windows); + win.validate(); + main_run(win, {wid, static_cast(_num_threads), &cpu_info()}); + }); + } +} + +void ACLScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { + ITensorPack tensors; + schedule_custom(kernel, hints, kernel->window(), tensors); +} + +void ACLScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) { + schedule_custom(kernel, hints, window, tensors); +} + +void ACLScheduler::run_workloads(std::vector &workloads) { + InferenceEngine::parallel_for(workloads.size(), [&](int wid) { + workloads[wid]({wid, static_cast(parallel_get_num_threads()), &cpu_info()}); + }); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp new file mode 100644 index 00000000000000..1148f4ad5edd69 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include "support/Mutex.h" + +namespace ov { +namespace intel_cpu { + +using namespace arm_compute; + +class ACLScheduler final : public IScheduler { +public: + ACLScheduler(); + ~ACLScheduler() override = default; + std::uint32_t num_threads() const override; + void set_num_threads(unsigned int num_threads) override; + void schedule(ICPPKernel *kernel, const Hints &hints) override; + void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override; +protected: + void run_workloads(std::vector &workloads) override; +private: + void schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors); +}; +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index ddf14ef59a7eab..96be8734ec0dce 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -38,6 +38,11 @@ #include #include +#if defined(OV_CPU_WITH_ACL) +#include "nodes/executors/acl/acl_ie_scheduler.hpp" +#include "arm_compute/runtime/CPP/CPPScheduler.h" +#endif + using namespace InferenceEngine; #define IE_CPU_PLUGIN_THROW(...) IE_THROW(__VA_ARGS__) << "CPU plugin: " @@ -137,11 +142,44 @@ class CPUSpecialSetup { }; #endif // __linux__ +#if defined(OV_CPU_WITH_ACL) +std::mutex Engine::SchedulerGuard::mutex; +std::weak_ptr Engine::SchedulerGuard::ptr; + +Engine::SchedulerGuard::SchedulerGuard() { +#if IE_THREAD == IE_THREAD_SEQ + // To save state for ACL cores in single-thread mode + arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST); +#else + arm_compute::Scheduler::set(std::make_shared()); +#endif +} + +std::shared_ptr Engine::SchedulerGuard::instance() { + std::lock_guard lock{SchedulerGuard::mutex}; + auto scheduler_guard_ptr = SchedulerGuard::ptr.lock(); + if (scheduler_guard_ptr == nullptr) { + SchedulerGuard::ptr = scheduler_guard_ptr = std::make_shared(); + } + return scheduler_guard_ptr; +} + +Engine::SchedulerGuard::~SchedulerGuard() { + // To save the state of scheduler after ACLScheduler has been executed + // TODO: find out the cause of the state + std::lock_guard lock{this->dest_mutex}; + arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST); +} +#endif + Engine::Engine() : deviceFullName(getDeviceFullName()), specialSetup(new CPUSpecialSetup) { _pluginName = "CPU"; extensionManager->AddExtension(std::make_shared()); +#if defined(OV_CPU_WITH_ACL) + scheduler_guard = SchedulerGuard::instance(); +#endif } Engine::~Engine() { diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h index 20c6d315a2c623..3e9d616dcec02c 100644 --- a/src/plugins/intel_cpu/src/plugin.h +++ b/src/plugins/intel_cpu/src/plugin.h @@ -63,6 +63,20 @@ class Engine : public InferenceEngine::IInferencePlugin { const std::string deviceFullName; std::shared_ptr specialSetup; + +#if defined(OV_CPU_WITH_ACL) + struct SchedulerGuard { + SchedulerGuard(); + ~SchedulerGuard(); + static std::shared_ptr instance(); + static std::mutex mutex; + // separate mutex for saving ACLScheduler state in destructor + mutable std::mutex dest_mutex; + static std::weak_ptr ptr; + }; + + std::shared_ptr scheduler_guard; +#endif }; } // namespace intel_cpu