initial command list
vladimir-paramuzov committed Oct 7, 2024
1 parent e2ea9dd commit 4ac41f0
Showing 14 changed files with 148 additions and 8 deletions.
@@ -41,7 +41,7 @@ struct ExecutionGroup {
event::ptr run(const std::vector<event::ptr>& dep_events);

private:
std::unique_ptr<command_list> m_list = nullptr;
std::shared_ptr<command_list> m_list = nullptr;

void build_list();
bool requires_update();
@@ -14,6 +14,7 @@


namespace cldnn {
class kernel;

class command_list {
public:
@@ -25,6 +26,8 @@ class command_list {
virtual void start() = 0;
virtual void close() = 0;

virtual void add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) = 0;

bool is_mutable() { return true; }
};

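The command_list interface above is abstract. As a rough illustration of the contract a backend has to satisfy, a do-nothing implementation covering only the members visible in this hunk might look like the sketch below; the my_backend/null_command_list names and the header path are hypothetical, not part of this commit.

#include "intel_gpu/runtime/command_list.hpp"  // assumed location of the interface above

namespace cldnn {
namespace my_backend {

// Records nothing; shows only the virtuals introduced in this diff.
class null_command_list : public command_list {
public:
    void start() override {}   // begin recording
    void close() override {}   // finish recording; the list can now be replayed
    void add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override {
        // record a single kernel dispatch described by args_desc/args
        (void)k; (void)args_desc; (void)args;
    }
};

}  // namespace my_backend
}  // namespace cldnn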
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
@@ -80,6 +80,7 @@ class stream {
#endif

virtual command_list::ptr create_command_list() const { OPENVINO_NOT_IMPLEMENTED; }
virtual event::ptr enqueue_command_list(command_list& list) { OPENVINO_NOT_IMPLEMENTED; }

protected:
QueueTypes m_queue_type;
18 changes: 14 additions & 4 deletions src/plugins/intel_gpu/src/graph/execution_group.cpp
@@ -27,8 +27,16 @@ event::ptr ExecutionGroup::run(const std::vector<event::ptr>& dep_events) {
}

void ExecutionGroup::build_list() {

m_list = m_stream->create_command_list();
m_list->start();
for (size_t i = m_interval.start; i < m_interval.end; i++) {
m_exec_order[i]->prepare_primitive({});
m_exec_order[i]->add_to_command_list(m_list.get());
}
m_list->close();
}


bool ExecutionGroup::requires_update() {
return false;
}
@@ -38,9 +46,11 @@ void ExecutionGroup::mutate() {
}
event::ptr ExecutionGroup::execute(const std::vector<event::ptr>& dep_events) {
std::vector<event::ptr> ret_events;
for (size_t i = m_interval.start; i < m_interval.end; i++) {
ret_events.push_back(m_exec_order[i]->execute(dep_events));
}
// for (size_t i = m_interval.start; i < m_interval.end; i++) {
// ret_events.push_back(m_exec_order[i]->execute(dep_events));
// }

m_stream->enqueue_command_list(*m_list);

return m_stream->enqueue_marker(ret_events);
}
@@ -50,6 +50,7 @@ class wait_for_events_impl : public primitive_impl {
}

void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { }
event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& events, primitive_inst& instance) override { return nullptr; }
};

namespace detail {
44 changes: 44 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -224,6 +224,50 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
}
}

event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& events, typed_primitive_inst<PType>& instance) override {
stream& stream = instance.get_network().get_stream();
if (instance.can_be_optimized()) {
return stream.aggregate_events(events, false, instance.is_output());
}
std::vector<event::ptr> tmp_events(events);
std::vector<event::ptr> all_events;
OPENVINO_ASSERT(_kernels.size() == _kernel_data.kernels.size(), "[GPU] Mismatch between compiled kernels count and expected kernels data\n",
"[GPU] Compiled kernels count: ", _kernels.size(), "\n",
"[GPU] KernelData count: ", _kernel_data.kernels.size(), "\n",
"[GPU] Likely some issue with empty tensor handling happened");
for (size_t kd_idx = 0; kd_idx < _kernel_data.kernels.size(); ++kd_idx) {
if (_kernel_data.kernels[kd_idx].skip_execution)
continue;
// If any user of the prim's users is CPU implementation or network's output, set prim as a output event (event won't be nullptr)
bool needs_completion_event = instance.needs_completion_event();

auto& params = _kernel_data.kernels[kd_idx].params;
auto args = get_arguments(instance);
args.scalars = &params.scalars;

for (const auto& m : instance.get_intermediates_memories()) {
args.intermediates.push_back(m);
}

const auto& gws = params.workGroups.global;
const auto& lws = params.workGroups.local;

GPU_DEBUG_TRACE_DETAIL << "Add kernel " << kd_idx << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] "
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]"
<< (needs_completion_event ? " has_completion_event=true" : "") << std::endl;


stream.set_arguments(*_kernels[kd_idx], _kernel_data.kernels[kd_idx].params, args);
list->add(*_kernels[kd_idx], params, args);
}

if ((all_events.size() == 0) && (tmp_events.size() > 0))
return stream.aggregate_events(tmp_events);

bool group_events = (all_events.size() > 1);
return stream.aggregate_events(all_events, group_events);
}

event::ptr execute_impl(const std::vector<event::ptr>& events,
typed_primitive_inst<PType>& instance) override {
stream& stream = instance.get_network().get_stream();
18 changes: 18 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -11,6 +11,7 @@
#include "intel_gpu/runtime/tensor_accessor.hpp"
#include "intel_gpu/graph/network.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "openvino/core/except.hpp"
#include "program_node.h"
#include "primitive_type.h"
#include "intel_gpu/graph/serialization/binary_buffer.hpp"
@@ -61,6 +62,7 @@ struct primitive_impl {
virtual void set_arguments(primitive_inst& instance) = 0;
virtual void set_arguments(primitive_inst& instance, kernel_arguments_data& args) = 0;
virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
virtual event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& events, primitive_inst& instance) { OPENVINO_NOT_IMPLEMENTED; }
const std::string& get_kernel_name() const { return _kernel_name; }

// class typed_primitive_gpu_impl override this with return false;
@@ -326,6 +328,9 @@ class primitive_inst {
virtual void update_shape_info_tensor(const kernel_impl_params& params);
kernel_impl_params get_fake_aligned_params_if_possible(kernel_impl_params const& orig_impl_param);

std::vector<event::ptr> prepare_primitive(const std::vector<event::ptr>& events);
void add_to_command_list(command_list* list);

protected:
primitive_inst(network& network, program_node const& node, bool allocate_memory);

@@ -505,6 +510,16 @@ struct typed_primitive_impl : public primitive_impl {
return execute_impl(event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
}

event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& event, primitive_inst& instance) override {
if (instance.type() != PType::type_id())
throw std::invalid_argument("Implementation type does not match primitive type");
if (instance.get_impl() != this)
throw std::invalid_argument(
"Trying to execute primitive implementation with mismatching primitive instance");

return add_to_cmd_list(list, event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
}

std::vector<layout> get_internal_buffer_layouts() const override {
return get_internal_buffer_layouts_impl();
}
@@ -536,6 +551,9 @@ struct typed_primitive_impl : public primitive_impl {
virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/, kernel_arguments_data& /*args*/) {}
virtual event::ptr execute_impl(const std::vector<event::ptr>& event, typed_primitive_inst<PType>& instance) = 0;
virtual event::ptr add_to_cmd_list(command_list* list, const std::vector<event::ptr>& event, typed_primitive_inst<PType>& instance) {
OPENVINO_NOT_IMPLEMENTED;
}
};

template <class PType>
16 changes: 13 additions & 3 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1483,7 +1483,11 @@ bool primitive_inst::has_inner_networks() const {
return (_impl_params->inner_nets.size() > 0);
}

event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
void primitive_inst::add_to_command_list(command_list* list) {
_impl->add_to_cmd_list(list, {}, *this);
}

std::vector<event::ptr> primitive_inst::prepare_primitive(const std::vector<event::ptr>& events) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("primitive_inst::execute: " + id()));
const auto& primitive_id = id();
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
@@ -1528,7 +1532,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
if (can_skip_execution) {
auto ev = get_network().get_stream().create_user_event(true);
update_shape_done_by_other = false; // reset
return ev;
return { ev };
}

// Check successor reorder if layouts are same
@@ -1569,7 +1573,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
_outputs[0] = outputs.at(last_prim_id).get_memory();

_impl_params->output_layouts[0] = subgraph->get_output_layout(last_prim_id);
return outputs.at(last_prim_id).get_event();
return { outputs.at(last_prim_id).get_event() };
}

// Try update impl if current impl is dynamic because opt kernel may be added to impl cache through async compilation.
Expand Down Expand Up @@ -1675,10 +1679,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
dependencies = {grouped_ev};
}

return dependencies;
}

event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
auto dependencies = prepare_primitive(events);
{
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
auto ev = _impl->execute(dependencies, *this);

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
get_network().get_stream().wait_for_events({ev});

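Taken together with the ExecutionGroup and command_list changes above, the prepare/execute split is meant to let a group of primitives be recorded once and replayed in a single submission. Below is a rough sketch of that flow using only names introduced in this commit; the free function and its arguments are illustrative glue, not part of the diff.

static event::ptr run_group_via_list(stream& s, const std::vector<std::shared_ptr<primitive_inst>>& insts) {
    auto list = s.create_command_list();
    list->start();
    for (auto& inst : insts) {
        inst->prepare_primitive({});            // shape/impl/memory updates only, no enqueue
        inst->add_to_command_list(list.get());  // the impl records its kernels into the list
    }
    list->close();
    return s.enqueue_command_list(*list);       // replay the whole group in one call
}

The immediate path is unchanged: primitive_inst::execute() now simply calls prepare_primitive() and hands the aggregated dependencies to _impl->execute().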
32 changes: 32 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_command_list.cpp
@@ -3,6 +3,9 @@
//

#include "ze_command_list.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "ze/ze_kernel.hpp"
#include "ze/ze_memory.hpp"
#include "ze_api.h"

namespace cldnn {
@@ -21,6 +24,17 @@ void ze_command_list::start() {
ZE_CHECK(zeCommandListCreate(m_engine.get_context(), m_engine.get_device(), &command_list_desc, &m_command_list));
}

void ze_command_list::add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) {
auto& casted = downcast<ze_kernel>(k);
casted._cmd_id = get_command_id();
auto& ze_handle = casted.get_handle();

auto& lws = args_desc.workGroups.local;
ze_group_count_t ze_lws {static_cast<uint32_t>(lws[0]), static_cast<uint32_t>(lws[1]), static_cast<uint32_t>(lws[2])};

ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, ze_handle, &ze_lws, nullptr, 0, nullptr));
}

void ze_command_list::close() {
ZE_CHECK(zeCommandListClose(m_command_list));
}
@@ -33,6 +47,24 @@ ze_command_list::~ze_command_list() {
reset();
}

uint64_t ze_command_list::get_command_id() {
if (is_mutable() && 0) {
ze_mutable_command_exp_flags_t flags =
ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS |
ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT |
ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE;

ze_mutable_command_id_exp_desc_t cmd_id_desc = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_ID_EXP_DESC, nullptr, flags };
uint64_t cmd_id = 0;
ZE_CHECK(zeCommandListGetNextCommandIdExp(m_command_list, &cmd_id_desc, &cmd_id));
return cmd_id;
} else {
thread_local uint64_t cmd_id = 0;
cmd_id++;

return cmd_id;
}
}

} // namespace ze
} // namespace cldnn
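For reference, zeCommandListAppendLaunchKernel takes a workgroup count per dimension (ze_group_count_t), while the workgroup size itself is set on the kernel via zeKernelSetGroupSize; the add() implementation above passes the local size directly as the launch argument. A hedged sketch of the usual dispatch-size computation, assuming gws/lws come from args_desc.workGroups and that lws evenly divides gws:

// Reference only (not part of this commit): typical group-count computation for Level Zero.
const auto& gws = args_desc.workGroups.global;   // assumed available alongside lws
ZE_CHECK(zeKernelSetGroupSize(ze_handle,
                              static_cast<uint32_t>(lws[0]),
                              static_cast<uint32_t>(lws[1]),
                              static_cast<uint32_t>(lws[2])));
ze_group_count_t group_count{static_cast<uint32_t>(gws[0] / lws[0]),
                             static_cast<uint32_t>(gws[1] / lws[1]),
                             static_cast<uint32_t>(gws[2] / lws[2])};
ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, ze_handle, &group_count, nullptr, 0, nullptr));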
9 changes: 9 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_command_list.hpp
@@ -10,6 +10,9 @@
#include "ze_event.hpp"

namespace cldnn {



namespace ze {

class ze_command_list : public command_list {
@@ -21,10 +24,16 @@ class ze_command_list : public command_list {
void start() override;
void close() override;

void add(kernel& k, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override;


ze_command_list_handle_t get_handle() const { return m_command_list; }


private:

void reset();
uint64_t get_command_id();

const ze_engine& m_engine;
ze_command_list_handle_t m_command_list = nullptr;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -19,6 +19,7 @@ class ze_kernel : public kernel {
std::string _kernel_id;

public:
uint64_t _cmd_id;
ze_kernel(ze_kernel_handle_t compiled_kernel, ze_module_handle_t module, const std::string& kernel_id)
: _compiled_kernel(compiled_kernel)
, _module(module)
6 changes: 6 additions & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -330,5 +330,11 @@ command_list::ptr ze_stream::create_command_list() const {
return std::make_shared<ze_command_list>(_engine);
}

event::ptr ze_stream::enqueue_command_list(command_list& list) {
auto ze_list = downcast<ze_command_list>(list).get_handle();
ZE_CHECK(zeCommandListImmediateAppendCommandListsExp(m_command_list, 1, &ze_list, nullptr, 0, nullptr));
return create_user_event(true);
}

} // namespace ze
} // namespace cldnn
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -51,6 +51,7 @@ class ze_stream : public stream {
#endif

command_list::ptr create_command_list() const override;
event::ptr enqueue_command_list(command_list& list) override;

private:
void sync_events(std::vector<event::ptr> const& deps, bool is_output = false);
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
@@ -302,7 +302,11 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine,
}

std::shared_ptr<cldnn::engine> create_test_engine() {
#ifdef OV_GPU_WITH_ZE_RT
auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze);
#else
auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl);
#endif
#ifdef ENABLE_ONEDNN_FOR_GPU
if (ret->get_device_info().supports_immad)
ret->create_onednn_engine({});
