diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index 1a196ea49e8e95..abcbe51c81fd22 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -131,6 +131,7 @@ struct program { topology const& topology, const ExecutionConfig& config, std::shared_ptr task_executor, + std::shared_ptr compilation_context, bool is_internal = false, bool no_optimizations = false, bool is_body_program = false); @@ -251,6 +252,14 @@ struct program { bool is_internal = false, bool no_optimizations = false, bool is_body_program = false); + static ptr build_program(engine& engine, + const topology& topology, + const ExecutionConfig& config, + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_internal = false, + bool no_optimizations = false, + bool is_body_program = false); static ptr build_program(engine& engine, const std::set>& nodes, const ExecutionConfig& config, @@ -269,6 +278,7 @@ struct program { void cancel_compilation_context(); static std::shared_ptr make_task_executor(const ExecutionConfig& config); + static std::shared_ptr make_compilation_context(const ExecutionConfig& config); private: uint32_t prog_id = 0; @@ -286,8 +296,7 @@ struct program { bool is_body_program; std::unique_ptr _impls_cache; const size_t _impls_cache_capacity = 10000; - const int _num_async_build_threads = 1; - std::unique_ptr _compilation_context; + std::shared_ptr _compilation_context; std::map> nodes_map; std::list optimized_out; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp index 22864106fb39f5..422451d096729b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp @@ -10,6 +10,7 @@ #include "intel_gpu/plugin/custom_layer.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/execution_config.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/graph/topology.hpp" #include "intel_gpu/graph/program.hpp" @@ -75,7 +76,9 @@ class ProgramBuilder final { public: ProgramBuilder(std::shared_ptr model, cldnn::engine& engine, const ExecutionConfig& config, bool createTopologyOnly = false, bool partialBuild = false, - std::shared_ptr task_executor = nullptr, bool innerProgram = false); + std::shared_ptr task_executor = nullptr, + std::shared_ptr compilation_context = nullptr, + bool innerProgram = false); ProgramBuilder(cldnn::engine& engine, const ExecutionConfig& config); static const cldnn::primitive_id m_preProcessTag; @@ -136,6 +139,7 @@ class ProgramBuilder final { bool requires_new_shape_infer(const ov::Node& op) const; std::shared_ptr get_task_executor() const { return m_task_executor; } + std::shared_ptr get_compilation_context() const { return m_compilation_context; } private: static factories_map_t factories_map; @@ -153,6 +157,7 @@ class ProgramBuilder final { bool queryMode; std::shared_ptr m_task_executor; + std::shared_ptr m_compilation_context; void EnableQueryMode() { queryMode = true; } void DisableQueryMode() { queryMode = false; } diff --git a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp similarity index 72% rename from src/plugins/intel_gpu/src/graph/include/compilation_context.hpp rename to src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp index be8d65c6aa5ecc..1163048b0cf08f 100644 --- a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/compilation_context.hpp @@ -4,10 +4,10 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" #include #include #include "intel_gpu/graph/kernel_impl_params.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" namespace cldnn { @@ -21,7 +21,8 @@ class ICompilationContext { virtual void cancel() = 0; virtual void wait_all() = 0; - static std::unique_ptr create(ov::threading::IStreamsExecutor::Config task_executor_config); + static std::shared_ptr create(ov::threading::IStreamsExecutor::Config task_executor_config); + static std::shared_ptr create(std::shared_ptr task_executor); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/compilation_context.cpp b/src/plugins/intel_gpu/src/graph/compilation_context.cpp index c1f483200c9a38..bcdbf845815a1d 100644 --- a/src/plugins/intel_gpu/src/graph/compilation_context.cpp +++ b/src/plugins/intel_gpu/src/graph/compilation_context.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "compilation_context.hpp" #include #include #include #include #include "intel_gpu/runtime/utils.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" + +#include "openvino/runtime/threading/cpu_streams_executor.hpp" namespace cldnn { class CompilationContext : public ICompilationContext { @@ -15,6 +17,7 @@ class CompilationContext : public ICompilationContext { CompilationContext(ov::threading::IStreamsExecutor::Config task_executor_config) : _task_executor_config(task_executor_config) { _task_executor = std::make_shared(_task_executor_config); } + CompilationContext(std::shared_ptr task_executor) : _task_executor(task_executor) { } void push_task(kernel_impl_params key, Task&& task) override { if (_stop_compilation) @@ -83,8 +86,11 @@ class CompilationContext : public ICompilationContext { std::vector> futures; }; -std::unique_ptr ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) { +std::shared_ptr ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) { return cldnn::make_unique(task_executor_config); } +std::shared_ptr ICompilationContext::create(std::shared_ptr task_executor) { + return cldnn::make_unique(task_executor); +} } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 240db96d5b4988..c8a081dadbc45f 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -13,6 +13,7 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -34,7 +35,6 @@ #include "program_helpers.h" #include "to_string_utils.h" #include "kernels_cache.hpp" -#include "compilation_context.hpp" // TODO: Remove once we have an abstraction for kernels_cache #include "kernel_base.h" diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index a81d0bd10ad58d..f92512da81ea48 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -25,7 +25,6 @@ #include "read_value_inst.h" #include "condition_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" -#include "compilation_context.hpp" #include "implementation_map.hpp" #include "graph_optimizer/prepare_buffer_fusing.h" @@ -36,6 +35,7 @@ #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/error_handler.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "json_object.h" #include diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 59af7125f9e4dc..dde29dc1e32504 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -8,6 +8,7 @@ #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/graph/program.hpp" #include "auto_tuner.h" @@ -17,7 +18,6 @@ #include "program_dump_graph.h" #include "sliding_window_utils.hpp" #include "program_helpers.h" -#include "compilation_context.hpp" #include "matrix_nms_inst.h" #include "roi_pooling_inst.h" @@ -145,10 +145,17 @@ std::shared_ptr program::make_task_executor(con return std::make_shared(task_executor_config); } +std::shared_ptr program::make_compilation_context(const ExecutionConfig& config) { + const int _num_async_build_threads = 1; + return ICompilationContext::create(make_task_executor_config(config, + "Task executor config for CompilationContext in GPU plugin", _num_async_build_threads)); +} + program::program(engine& engine_ref, topology const& topology, const ExecutionConfig& config, std::shared_ptr task_executor, + std::shared_ptr compilation_context, bool is_internal, bool no_optimizations, bool is_body_program) @@ -158,7 +165,8 @@ program::program(engine& engine_ref, _task_executor(std::move(task_executor)), processing_order(), is_internal(is_internal), - is_body_program(is_body_program) { + is_body_program(is_body_program), + _compilation_context(compilation_context) { _config.apply_user_properties(_engine.get_device_info()); init_primitives(); GPU_DEBUG_INFO << "Program config\n" << config.to_string(); @@ -214,8 +222,8 @@ void program::init_program() { _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, kernel_selector::KernelBase::get_db().get_batch_header_str())); - _compilation_context = ICompilationContext::create(make_task_executor_config(_config, - "Task executor config for CompilationContext in GPU plugin", _num_async_build_threads)); + if (!_compilation_context) + _compilation_context = program::make_compilation_context(_config); _impls_cache = cldnn::make_unique(_impls_cache_capacity); // Remove items of compilation context's internal queue when some impl is popped in kernels_cache @@ -253,7 +261,18 @@ program::ptr program::build_program(engine& engine, bool is_internal, bool no_optimizations, bool is_body_program) { - return std::make_shared(engine, topology, config, task_executor, is_internal, no_optimizations, is_body_program); + return std::make_shared(engine, topology, config, task_executor, nullptr, is_internal, no_optimizations, is_body_program); +} + +program::ptr program::build_program(engine& engine, + const topology& topology, + const ExecutionConfig& config, + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_internal, + bool no_optimizations, + bool is_body_program) { + return std::make_shared(engine, topology, config, task_executor, compilation_context, is_internal, no_optimizations, is_body_program); } program::ptr program::build_program(engine& engine, @@ -262,7 +281,7 @@ program::ptr program::build_program(engine& engine, bool is_internal, bool no_optimizations, bool is_body_program) { - return std::make_shared(engine, topology, config, nullptr, is_internal, no_optimizations, is_body_program); + return std::make_shared(engine, topology, config, nullptr, nullptr, is_internal, no_optimizations, is_body_program); } program::ptr program::build_program(engine& engine, diff --git a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp index c25726f673a2f8..7d47d1127fe57d 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp @@ -31,7 +31,7 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_ config.set_property(ov::intel_gpu::max_dynamic_batch(1)); config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic())); - ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), true); + ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true); branch.inner_program = prog.get_compiled_program(); auto& input_map = branch.input_map; diff --git a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp index 628b0d7c37d9aa..af93885a5d949c 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp @@ -280,7 +280,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr& op) { ProgramBuilder::ProgramBuilder(std::shared_ptr model, cldnn::engine& engine, const ExecutionConfig& config, bool create_topology_only, bool partial_build, - std::shared_ptr task_executor, bool is_inner_program) + std::shared_ptr task_executor, + std::shared_ptr compilation_context, + bool is_inner_program) : m_config(config) , m_engine(engine) , queryMode(false) - , m_task_executor(task_executor) { + , m_task_executor(task_executor) + , m_compilation_context(compilation_context) { if (m_task_executor == nullptr) m_task_executor = cldnn::program::make_task_executor(m_config); + if (m_compilation_context == nullptr) { + m_compilation_context = cldnn::program::make_compilation_context(m_config); + } // locate global custom kernel config // and auto-load kernels from it #ifdef _WIN32 @@ -158,7 +164,14 @@ std::shared_ptr ProgramBuilder::build(const std::vector #include #include +#include "intel_gpu/runtime/compilation_context.hpp" #include "fully_connected_inst.h" -#include "compilation_context.hpp" #include "program_wrapper.h" diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index dc23440c48af67..2f684a40f7f5ec 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -14,7 +14,7 @@ #include #include -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "fully_connected_inst.h" #include diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index a90edc00a2db98..247453944e3a4a 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -10,7 +10,7 @@ #include #include "openvino/reference/matmul.hpp" -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" #include "gemm_inst.h" #include diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp index a13c1d1550882f..ed52f276fa5960 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp @@ -7,7 +7,7 @@ #include #include #include "openvino/reference/group_normalization.hpp" -#include "compilation_context.hpp" +#include "intel_gpu/runtime/compilation_context.hpp" using namespace cldnn;