initial command list
vladimir-paramuzov committed Oct 7, 2024
1 parent e2ea9dd commit 4ac41f0
Showing 14 changed files with 148 additions and 8 deletions.
@@ -41,7 +41,7 @@ struct ExecutionGroup {
event::ptr run(const std::vector<event::ptr>& dep_events);

private:
std::unique_ptr<command_list> m_list = nullptr;
std::shared_ptr<command_list> m_list = nullptr;

void build_list();
bool requires_update();
@@ -14,6 +14,7 @@


namespace cldnn {
class kernel;

class command_list {
public:
@@ -25,6 +26,8 @@ class command_list {
virtual void start() = 0;
virtual void close() = 0;

virtual void add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) = 0;

bool is_mutable() { return true; }
};

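The command_list interface above is abstract. As a rough illustration of the contract a backend has to satisfy, a do-nothing implementation covering only the members visible in this hunk might look like the sketch below; the my_backend/null_command_list names and the header path are hypothetical, not part of this commit.

#include "intel_gpu/runtime/command_list.hpp"  // assumed location of the interface above

namespace cldnn {
namespace my_backend {

// Records nothing; shows only the virtuals introduced in this diff.
class null_command_list : public command_list {
public:
    void start() override {}   // begin recording
    void close() override {}   // finish recording; the list can now be replayed
    void add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override {
        // record a single kernel dispatch described by args_desc/args
        (void)k; (void)args_desc; (void)args;
    }
};

}  // namespace my_backend
}  // namespace cldnn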
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
@@ -80,6 +80,7 @@ class stream {
#endif

virtual command_list::ptr create_command_list() const { OPENVINO_NOT_IMPLEMENTED; }
virtual event::ptr enqueue_command_list(command_list& list) { OPENVINO_NOT_IMPLEMENTED; }

protected:
QueueTypes m_queue_type;
18 changes: 14 additions & 4 deletions src/plugins/intel_gpu/src/graph/execution_group.cpp
@@ -27,8 +27,16 @@ event::ptr ExecutionGroup::run(const std::vector<event::ptr>& dep_events) {
}

void ExecutionGroup::build_list() {

m_list = m_stream->create_command_list();
m_list->start();
for (size_t i = m_interval.start; i < m_interval.end; i++) {
m_exec_order[i]->prepare_primitive({});
m_exec_order[i]->add_to_command_list(m_list.get());
}
m_list->close();
}


bool ExecutionGroup::requires_update() {
return false;
}
@@ -38,9 +46,11 @@ void ExecutionGroup::mutate() {
}
event::ptr ExecutionGroup::execute(const std::vector<event::ptr>& dep_events) {
std::vector<event::ptr> ret_events;
for (size_t i = m_interval.start; i < m_interval.end; i++) {
ret_events.push_back(m_exec_order[i]->execute(dep_events));
}
// for (size_t i = m_interval.start; i < m_interval.end; i++) {
// ret_events.push_back(m_exec_order[i]->execute(dep_events));
// }

m_stream->enqueue_command_list(*m_list);

return m_stream->enqueue_marker(ret_events);
}
@@ -50,6 +50,7 @@ class wait_for_events_impl : public primitive_impl {
}

void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { }
event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& events, primitive_inst& instance) override { return nullptr; }
};

namespace detail {
44 changes: 44 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -224,6 +224,50 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
}
}

event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& events, typed_primitive_inst<PType>& instance) override {
stream& stream = instance.get_network().get_stream();
if (instance.can_be_optimized()) {
return stream.aggregate_events(events, false, instance.is_output());
}
std::vector<event::ptr> tmp_events(events);
std::vector<event::ptr> all_events;
OPENVINO_ASSERT(_kernels.size() == _kernel_data.kernels.size(), "[GPU] Mismatch between compiled kernels count and expected kernels data\n",
"[GPU] Compiled kernels count: ", _kernels.size(), "\n",
"[GPU] KernelData count: ", _kernel_data.kernels.size(), "\n",
"[GPU] Likely some issue with empty tensor handling happened");
for (size_t kd_idx = 0; kd_idx < _kernel_data.kernels.size(); ++kd_idx) {
if (_kernel_data.kernels[kd_idx].skip_execution)
continue;
// If any user of the prim's users is CPU implementation or network's output, set prim as a output event (event won't be nullptr)
bool needs_completion_event = instance.needs_completion_event();

auto& params = _kernel_data.kernels[kd_idx].params;
auto args = get_arguments(instance);
args.scalars = &params.scalars;

for (const auto& m : instance.get_intermediates_memories()) {
args.intermediates.push_back(m);
}

const auto& gws = params.workGroups.global;
const auto& lws = params.workGroups.local;

GPU_DEBUG_TRACE_DETAIL << "Add kernel " << kd_idx << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] "
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]"
<< (needs_completion_event ? " has_completion_event=true" : "") << std::endl;


stream.set_arguments(*_kernels[kd_idx], _kernel_data.kernels[kd_idx].params, args);
list->add(*_kernels[kd_idx], params, args);
}

if ((all_events.size() == 0) && (tmp_events.size() > 0))
return stream.aggregate_events(tmp_events);

bool group_events = (all_events.size() > 1);
return stream.aggregate_events(all_events, group_events);
}

event::ptr execute_impl(const std::vector<event::ptr>& events,
typed_primitive_inst<PType>& instance) override {
stream& stream = instance.get_network().get_stream();
18 changes: 18 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -11,6 +11,7 @@
#include "intel_gpu/runtime/tensor_accessor.hpp"
#include "intel_gpu/graph/network.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "openvino/core/except.hpp"
#include "program_node.h"
#include "primitive_type.h"
#include "intel_gpu/graph/serialization/binary_buffer.hpp"
@@ -61,6 +62,7 @@ struct primitive_impl {
virtual void set_arguments(primitive_inst& instance) = 0;
virtual void set_arguments(primitive_inst& instance, kernel_arguments_data& args) = 0;
virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
virtual event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& events, primitive_inst& instance) { OPENVINO_NOT_IMPLEMENTED; }
const std::string& get_kernel_name() const { return _kernel_name; }

// class typed_primitive_gpu_impl override this with return false;
@@ -326,6 +328,9 @@ class primitive_inst {
virtual void update_shape_info_tensor(const kernel_impl_params& params);
kernel_impl_params get_fake_aligned_params_if_possible(kernel_impl_params const& orig_impl_param);

std::vector<event::ptr> prepare_primitive(const std::vector<event::ptr>& events);
void add_to_command_list(command_list* list);

protected:
primitive_inst(network& network, program_node const& node, bool allocate_memory);

@@ -505,6 +510,16 @@ struct typed_primitive_impl : public primitive_impl {
return execute_impl(event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
}

event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& event, primitive_inst& instance) override {
if (instance.type() != PType::type_id())
throw std::invalid_argument("Implementation type does not match primitive type");
if (instance.get_impl() != this)
throw std::invalid_argument(
"Trying to execute primitive implementation with mismatching primitive instance");

return add_to_cmd_list(list, event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
}

std::vector<layout> get_internal_buffer_layouts() const override {
return get_internal_buffer_layouts_impl();
}
@@ -536,6 +551,9 @@ struct typed_primitive_impl : public primitive_impl {
virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/, kernel_arguments_data& /*args*/) {}
virtual event::ptr execute_impl(const std::vector<event::ptr>& event, typed_primitive_inst<PType>& instance) = 0;
virtual event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& event, typed_primitive_inst<PType>& instance) {
OPENVINO_NOT_IMPLEMENTED;
}
};

template <class PType>
16 changes: 13 additions & 3 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1483,7 +1483,11 @@ bool primitive_inst::has_inner_networks() const {
return (_impl_params->inner_nets.size() > 0);
}

event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
void primitive_inst::add_to_command_list(command_list* list) {
_impl->add_to_cmd_list(list, {}, *this);
}

std::vector<event::ptr> primitive_inst::prepare_primitive(const std::vector<event::ptr>& events) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("primitive_inst::execute: " + id()));
const auto& primitive_id = id();
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
@@ -1528,7 +1532,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
if (can_skip_execution) {
auto ev = get_network().get_stream().create_user_event(true);
update_shape_done_by_other = false; // reset
return ev;
return { ev };
}

// Check successor reorder if layouts are same
@@ -1569,7 +1573,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
_outputs[0] = outputs.at(last_prim_id).get_memory();

_impl_params->output_layouts[0] = subgraph->get_output_layout(last_prim_id);
return outputs.at(last_prim_id).get_event();
return { outputs.at(last_prim_id).get_event() };
}

// Try update impl if current impl is dynamic because opt kernel may be added to impl cache through async compilation.
Expand Down Expand Up @@ -1675,10 +1679,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
dependencies = {grouped_ev};
}

return dependencies;
}

event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
auto dependencies = prepare_primitive(events);
{
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
auto ev = _impl->execute(dependencies, *this);

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
get_network().get_stream().wait_for_events({ev});

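Taken together with the ExecutionGroup and command_list changes above, the prepare/execute split is meant to let a group of primitives be recorded once and replayed in a single submission. Below is a rough sketch of that flow using only names introduced in this commit; the free function and its arguments are illustrative glue, not part of the diff.

static event::ptr run_group_via_list(stream& s, const std::vector<std::shared_ptr<primitive_inst>>& insts) {
    auto list = s.create_command_list();
    list->start();
    for (auto& inst : insts) {
        inst->prepare_primitive({});            // shape/impl/memory updates only, no enqueue
        inst->add_to_command_list(list.get());  // the impl records its kernels into the list
    }
    list->close();
    return s.enqueue_command_list(*list);       // replay the whole group in one call
}

The immediate path is unchanged: primitive_inst::execute() now simply calls prepare_primitive() and hands the aggregated dependencies to _impl->execute().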
32 changes: 32 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_command_list.cpp
@@ -3,6 +3,9 @@
//

#include "ze_command_list.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "ze/ze_kernel.hpp"
#include "ze/ze_memory.hpp"
#include "ze_api.h"

namespace cldnn {
@@ -21,6 +24,17 @@ void ze_command_list::start() {
ZE_CHECK(zeCommandListCreate(m_engine.get_context(), m_engine.get_device(), &command_list_desc, &m_command_list));
}

void ze_command_list::add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) {
auto& casted = downcast<ze_kernel>(k);
casted._cmd_id = get_command_id();
auto& ze_handle = casted.get_handle();

auto& lws = args_desc.workGroups.local;
ze_group_count_t ze_lws {static_cast<uint32_t>(lws[0]), static_cast<uint32_t>(lws[1]), static_cast<uint32_t>(lws[2])};

ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, ze_handle, &ze_lws, nullptr, 0, nullptr));
}

void ze_command_list::close() {
ZE_CHECK(zeCommandListClose(m_command_list));
}
@@ -33,6 +47,24 @@ ze_command_list::~ze_command_list() {
reset();
}

uint64_t ze_command_list::get_command_id() {
if (is_mutable() && 0) {
ze_mutable_command_exp_flags_t flags =
ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS |
ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT |
ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE;

ze_mutable_command_id_exp_desc_t cmd_id_desc = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_ID_EXP_DESC, nullptr, flags };
uint64_t cmd_id = 0;
ZE_CHECK(zeCommandListGetNextCommandIdExp(m_command_list, &cmd_id_desc, &cmd_id));
return cmd_id;
} else {
thread_local uint64_t cmd_id = 0;
cmd_id++;

return cmd_id;
}
}

} // namespace ze
} // namespace cldnn
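For reference, zeCommandListAppendLaunchKernel takes a workgroup count per dimension (ze_group_count_t), while the workgroup size itself is set on the kernel via zeKernelSetGroupSize; the add() implementation above passes the local size directly as the launch argument. A hedged sketch of the usual dispatch-size computation, assuming gws/lws come from args_desc.workGroups and that lws evenly divides gws:

// Reference only (not part of this commit): typical group-count computation for Level Zero.
const auto& gws = args_desc.workGroups.global;   // assumed available alongside lws
ZE_CHECK(zeKernelSetGroupSize(ze_handle,
                              static_cast<uint32_t>(lws[0]),
                              static_cast<uint32_t>(lws[1]),
                              static_cast<uint32_t>(lws[2])));
ze_group_count_t group_count{static_cast<uint32_t>(gws[0] / lws[0]),
                             static_cast<uint32_t>(gws[1] / lws[1]),
                             static_cast<uint32_t>(gws[2] / lws[2])};
ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, ze_handle, &group_count, nullptr, 0, nullptr));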
9 changes: 9 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_command_list.hpp
@@ -10,6 +10,9 @@
#include "ze_event.hpp"

namespace cldnn {



namespace ze {

class ze_command_list : public command_list {
@@ -21,10 +24,16 @@ class ze_command_list : public command_list {
void start() override;
void close() override;

void add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override;


ze_command_list_handle_t get_handle() const { return m_command_list; }


private:

void reset();
uint64_t get_command_id();

const ze_engine& m_engine;
ze_command_list_handle_t m_command_list = nullptr;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -19,6 +19,7 @@ class ze_kernel : public kernel {
std::string _kernel_id;

public:
uint64_t _cmd_id;
ze_kernel(ze_kernel_handle_t compiled_kernel, ze_module_handle_t module, const std::string& kernel_id)
: _compiled_kernel(compiled_kernel)
, _module(module)
6 changes: 6 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -330,5 +330,11 @@ command_list::ptr ze_stream::create_command_list() const {
return std::make_shared<ze_command_list>(_engine);
}

event::ptr ze_stream::enqueue_command_list(command_list& list) {
auto ze_list = downcast<ze_command_list>(list).get_handle();
ZE_CHECK(zeCommandListImmediateAppendCommandListsExp(m_command_list, 1, &ze_list, nullptr, 0, nullptr));
return create_user_event(true);
}

} // namespace ze
} // namespace cldnn
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -51,6 +51,7 @@ class ze_stream : public stream {
#endif

command_list::ptr create_command_list() const override;
event::ptr enqueue_command_list(command_list& list) override;

private:
void sync_events(std::vector<event::ptr> const& deps, bool is_output = false);
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
@@ -302,7 +302,11 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine,
}

std::shared_ptr<cldnn::engine> create_test_engine() {
#ifdef OV_GPU_WITH_ZE_RT
auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze);
#else
auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl);
#endif
#ifdef ENABLE_ONEDNN_FOR_GPU
if (ret->get_device_info().supports_immad)
ret->create_onednn_engine({});
