[GPU] Share compilation context with sub-networks
vladimir-paramuzov committed Oct 26, 2023
1 parent d532d14 commit 6db5ed1
Showing 14 changed files with 77 additions and 24 deletions.
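The gist of the change: the compilation context (the task queue and thread pool used for asynchronous kernel builds) can now be handed to program::build_program and ProgramBuilder from the outside, so inner programs built for loop and condition bodies share the parent's context instead of each creating their own. The sketch below illustrates the sharing pattern in isolation; the CompilationContext and Program classes here are simplified stand-ins for illustration, not the plugin's actual API.

// Minimal, self-contained sketch of the sharing pattern (stand-in types).
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Stand-in for cldnn::ICompilationContext: queues asynchronous build tasks.
class CompilationContext {
public:
    void push_task(std::function<void()> task) {
        futures_.push_back(std::async(std::launch::async, std::move(task)));
    }
    void wait_all() {
        for (auto& f : futures_) f.wait();
        futures_.clear();
    }
private:
    std::vector<std::future<void>> futures_;
};

// Stand-in for cldnn::program: uses a supplied context when given one,
// otherwise creates its own (mirrors the fallback added to init_program()).
class Program {
public:
    explicit Program(std::shared_ptr<CompilationContext> ctx = nullptr)
        : ctx_(ctx ? std::move(ctx) : std::make_shared<CompilationContext>()) {}

    std::shared_ptr<CompilationContext> compilation_context() const { return ctx_; }

    void compile_async(const std::string& kernel) {
        ctx_->push_task([kernel] { std::cout << "building " << kernel << "\n"; });
    }

private:
    std::shared_ptr<CompilationContext> ctx_;
};

int main() {
    Program parent;                                        // parent owns the context
    Program loop_body(parent.compilation_context());       // sub-network shares it
    Program condition_body(parent.compilation_context());  // so does this one

    parent.compile_async("parent_kernel");
    loop_body.compile_async("body_kernel");
    condition_body.compile_async("branch_kernel");
    parent.compilation_context()->wait_all();              // single queue to drain
    return 0;
}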
13 changes: 11 additions & 2 deletions src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -131,6 +131,7 @@ struct program {
topology const& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
@@ -251,6 +252,14 @@ struct program {
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
static ptr build_program(engine& engine,
const topology& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
static ptr build_program(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const ExecutionConfig& config,
@@ -269,6 +278,7 @@ struct program {
void cancel_compilation_context();

static std::shared_ptr<ov::threading::IStreamsExecutor> make_task_executor(const ExecutionConfig& config);
static std::shared_ptr<ICompilationContext> make_compilation_context(const ExecutionConfig& config);

private:
uint32_t prog_id = 0;
@@ -286,8 +296,7 @@ struct program {
bool is_body_program;
std::unique_ptr<ImplementationsCache> _impls_cache;
const size_t _impls_cache_capacity = 10000;
const int _num_async_build_threads = 1;
std::unique_ptr<ICompilationContext> _compilation_context;
std::shared_ptr<ICompilationContext> _compilation_context;

std::map<primitive_id, std::shared_ptr<program_node>> nodes_map;
std::list<primitive_id> optimized_out;
@@ -10,6 +10,7 @@
#include "intel_gpu/plugin/custom_layer.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "intel_gpu/graph/topology.hpp"
#include "intel_gpu/graph/program.hpp"

@@ -75,7 +76,9 @@ class ProgramBuilder final {
public:
ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine& engine, const ExecutionConfig& config,
bool createTopologyOnly = false, bool partialBuild = false,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor = nullptr, bool innerProgram = false);
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor = nullptr,
std::shared_ptr<cldnn::ICompilationContext> compilation_context = nullptr,
bool innerProgram = false);
ProgramBuilder(cldnn::engine& engine, const ExecutionConfig& config);

static const cldnn::primitive_id m_preProcessTag;
@@ -136,6 +139,7 @@ class ProgramBuilder final {
bool requires_new_shape_infer(const ov::Node& op) const;

std::shared_ptr<ov::threading::IStreamsExecutor> get_task_executor() const { return m_task_executor; }
std::shared_ptr<cldnn::ICompilationContext> get_compilation_context() const { return m_compilation_context; }

private:
static factories_map_t factories_map;
@@ -153,6 +157,7 @@ class ProgramBuilder final {
bool queryMode;

std::shared_ptr<ov::threading::IStreamsExecutor> m_task_executor;
std::shared_ptr<cldnn::ICompilationContext> m_compilation_context;

void EnableQueryMode() { queryMode = true; }
void DisableQueryMode() { queryMode = false; }
@@ -4,10 +4,10 @@

#pragma once

#include "openvino/runtime/threading/cpu_streams_executor.hpp"
#include <functional>
#include <memory>
#include "intel_gpu/graph/kernel_impl_params.hpp"
#include "openvino/runtime/threading/istreams_executor.hpp"

namespace cldnn {

@@ -21,7 +21,8 @@ class ICompilationContext {
virtual void cancel() = 0;
virtual void wait_all() = 0;

static std::unique_ptr<ICompilationContext> create(ov::threading::IStreamsExecutor::Config task_executor_config);
static std::shared_ptr<ICompilationContext> create(ov::threading::IStreamsExecutor::Config task_executor_config);
static std::shared_ptr<ICompilationContext> create(std::shared_ptr<ov::threading::IStreamsExecutor> task_executor);
};

} // namespace cldnn
10 changes: 8 additions & 2 deletions src/plugins/intel_gpu/src/graph/compilation_context.cpp
@@ -2,19 +2,22 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "compilation_context.hpp"
#include <mutex>
#include <atomic>
#include <unordered_set>
#include <future>
#include "intel_gpu/runtime/utils.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"

#include "openvino/runtime/threading/cpu_streams_executor.hpp"

namespace cldnn {
class CompilationContext : public ICompilationContext {
public:
CompilationContext(ov::threading::IStreamsExecutor::Config task_executor_config) : _task_executor_config(task_executor_config) {
_task_executor = std::make_shared<ov::threading::CPUStreamsExecutor>(_task_executor_config);
}
CompilationContext(std::shared_ptr<ov::threading::IStreamsExecutor> task_executor) : _task_executor(task_executor) { }

void push_task(kernel_impl_params key, Task&& task) override {
if (_stop_compilation)
@@ -83,8 +86,11 @@ class CompilationContext : public ICompilationContext {
std::vector<std::future<void>> futures;
};

std::unique_ptr<ICompilationContext> ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) {
std::shared_ptr<ICompilationContext> ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) {
return cldnn::make_unique<CompilationContext>(task_executor_config);
}
std::shared_ptr<ICompilationContext> ICompilationContext::create(std::shared_ptr<ov::threading::IStreamsExecutor> task_executor) {
return cldnn::make_unique<CompilationContext>(task_executor);
}

} // namespace cldnn
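Design note on the factory above: create() now returns a shared_ptr in both overloads, which is what lets several programs hold the same context, and the new overload accepts an executor the caller already owns rather than always constructing a fresh CPUStreamsExecutor from a config. A hedged sketch of the two call paths follows; the executor name and stream count are assumptions for illustration, not values taken from this commit.

// Hedged usage sketch (illustrative config values only).
ov::threading::IStreamsExecutor::Config cfg("gpu_async_build", 1);

// Path 1: the context builds and owns its own CPUStreamsExecutor.
auto owned_ctx = cldnn::ICompilationContext::create(cfg);

// Path 2: the context wraps an executor the caller already has,
// e.g. one shared by a parent program with its sub-networks.
auto executor = std::make_shared<ov::threading::CPUStreamsExecutor>(cfg);
auto wrapped_ctx = cldnn::ICompilationContext::create(executor);

In the commit itself, program::make_compilation_context() covers the config-based path, while sub-network builds receive an already-created context through the new build_program and ProgramBuilder parameters.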
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/network.cpp
@@ -13,6 +13,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/itt.hpp"

@@ -34,7 +35,6 @@
#include "program_helpers.h"
#include "to_string_utils.h"
#include "kernels_cache.hpp"
#include "compilation_context.hpp"

// TODO: Remove once we have an abstraction for kernels_cache
#include "kernel_base.h"
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -25,7 +25,6 @@
#include "read_value_inst.h"
#include "condition_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "compilation_context.hpp"
#include "implementation_map.hpp"
#include "graph_optimizer/prepare_buffer_fusing.h"

@@ -36,6 +35,7 @@
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"

#include "json_object.h"
#include <string>
31 changes: 25 additions & 6 deletions src/plugins/intel_gpu/src/graph/program.cpp
@@ -8,6 +8,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "intel_gpu/graph/program.hpp"

#include "auto_tuner.h"
@@ -17,7 +18,6 @@
#include "program_dump_graph.h"
#include "sliding_window_utils.hpp"
#include "program_helpers.h"
#include "compilation_context.hpp"

#include "matrix_nms_inst.h"
#include "roi_pooling_inst.h"
@@ -145,10 +145,17 @@ std::shared_ptr<ov::threading::IStreamsExecutor> program::make_task_executor(con
return std::make_shared<ov::threading::CPUStreamsExecutor>(task_executor_config);
}

std::shared_ptr<ICompilationContext> program::make_compilation_context(const ExecutionConfig& config) {
const int _num_async_build_threads = 1;
return ICompilationContext::create(make_task_executor_config(config,
"Task executor config for CompilationContext in GPU plugin", _num_async_build_threads));
}

program::program(engine& engine_ref,
topology const& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal,
bool no_optimizations,
bool is_body_program)
@@ -158,7 +165,8 @@ program::program(engine& engine_ref,
_task_executor(std::move(task_executor)),
processing_order(),
is_internal(is_internal),
is_body_program(is_body_program) {
is_body_program(is_body_program),
_compilation_context(compilation_context) {
_config.apply_user_properties(_engine.get_device_info());
init_primitives();
GPU_DEBUG_INFO << "Program config\n" << config.to_string();
@@ -214,8 +222,8 @@ void program::init_program() {
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));

_compilation_context = ICompilationContext::create(make_task_executor_config(_config,
"Task executor config for CompilationContext in GPU plugin", _num_async_build_threads));
if (!_compilation_context)
_compilation_context = program::make_compilation_context(_config);

_impls_cache = cldnn::make_unique<ImplementationsCache>(_impls_cache_capacity);
// Remove items of compilation context's internal queue when some impl is popped in kernels_cache
@@ -253,7 +261,18 @@ program::ptr program::build_program(engine& engine,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, config, task_executor, is_internal, no_optimizations, is_body_program);
return std::make_shared<program>(engine, topology, config, task_executor, nullptr, is_internal, no_optimizations, is_body_program);
}

program::ptr program::build_program(engine& engine,
const topology& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, config, task_executor, compilation_context, is_internal, no_optimizations, is_body_program);
}

program::ptr program::build_program(engine& engine,
@@ -262,7 +281,7 @@ program::ptr program::build_program(engine& engine,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, config, nullptr, is_internal, no_optimizations, is_body_program);
return std::make_shared<program>(engine, topology, config, nullptr, nullptr, is_internal, no_optimizations, is_body_program);
}

program::ptr program::build_program(engine& engine,
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/plugin/ops/condition.cpp
@@ -31,7 +31,7 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_
config.set_property(ov::intel_gpu::max_dynamic_batch(1));
config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic()));

ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), true);
ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
branch.inner_program = prog.get_compiled_program();

auto& input_map = branch.input_map;
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/plugin/ops/loop.cpp
@@ -280,7 +280,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::
config.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));

// get body program from ov::Model
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), true);
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
auto body_program = prog.get_compiled_program();

GPU_DEBUG_LOG << "* trip_count_id : " << trip_count_id << std::endl;
19 changes: 16 additions & 3 deletions src/plugins/intel_gpu/src/plugin/program_builder.cpp
@@ -55,14 +55,20 @@ std::string layer_type_name_ID(const std::shared_ptr<ov::Node>& op) {

ProgramBuilder::ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine& engine, const ExecutionConfig& config,
bool create_topology_only, bool partial_build,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor, bool is_inner_program)
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<cldnn::ICompilationContext> compilation_context,
bool is_inner_program)
: m_config(config)
, m_engine(engine)
, queryMode(false)
, m_task_executor(task_executor) {
, m_task_executor(task_executor)
, m_compilation_context(compilation_context) {
if (m_task_executor == nullptr)
m_task_executor = cldnn::program::make_task_executor(m_config);

if (m_compilation_context == nullptr) {
m_compilation_context = cldnn::program::make_compilation_context(m_config);
}
// locate global custom kernel config
// and auto-load kernels from it
#ifdef _WIN32
@@ -158,7 +164,14 @@ std::shared_ptr<cldnn::program> ProgramBuilder::build(const std::vector<std::sha
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "ProgramBuilder::CreateProgram");
cldnn::program::ptr program;
try {
program = cldnn::program::build_program(m_engine, *m_topology, m_config, get_task_executor(), false, false, is_inner_program);
program = cldnn::program::build_program(m_engine,
*m_topology,
m_config,
get_task_executor(),
get_compilation_context(),
false,
false,
is_inner_program);
} catch (std::exception& e) {
OPENVINO_ASSERT(false, "[GPU] ProgramBuilder build failed!\n", e.what());
}
@@ -7,9 +7,9 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "intel_gpu/runtime/compilation_context.hpp"

#include "fully_connected_inst.h"
#include "compilation_context.hpp"

#include "program_wrapper.h"

@@ -14,7 +14,7 @@
#include <intel_gpu/primitives/quantize.hpp>
#include <intel_gpu/primitives/data.hpp>

#include "compilation_context.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "fully_connected_inst.h"

#include <cmath>
@@ -10,7 +10,7 @@
#include <intel_gpu/primitives/crop.hpp>
#include "openvino/reference/matmul.hpp"

#include "compilation_context.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "gemm_inst.h"

#include <cstddef>
@@ -7,7 +7,7 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/group_normalization.hpp>
#include "openvino/reference/group_normalization.hpp"
#include "compilation_context.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"


using namespace cldnn;