[NPU] Adding Workload Type (openvinotoolkit#25382)
### Details:
 - Adding a `workload_type` property (`ov::WorkloadType` enum with `DEFAULT` and `EFFICIENT` values) to control execution priority, exposed through the Python bindings and wired into the NPU plugin
 - *...*

### Tickets:
 - CVS-143714

---------

Co-authored-by: Anastasia Kuporosova <[email protected]>
David Pava and akuporos authored Jul 10, 2024
1 parent 5fd3ba9 commit 3f98a75
Showing 30 changed files with 419 additions and 93 deletions.
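
From application code, the new property can be used roughly as follows (an illustrative C++ sketch, not part of this diff; the model path is hypothetical):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // hypothetical model path

    // Request lower execution priority for this compiled model on NPU.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::workload_type(ov::WorkloadType::EFFICIENT));

    // The property is read-write, so it can also be changed after compilation.
    compiled.set_property(ov::workload_type(ov::WorkloadType::DEFAULT));
    return 0;
}
```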
1 change: 1 addition & 0 deletions src/bindings/python/src/openvino/properties/__init__.py
@@ -5,6 +5,7 @@
# Enums
from openvino._pyopenvino.properties import Affinity
from openvino._pyopenvino.properties import CacheMode
from openvino._pyopenvino.properties import WorkloadType

# Properties
import openvino._pyopenvino.properties as __properties
@@ -5,6 +5,7 @@
# Enums
from openvino._pyopenvino.properties import Affinity
from openvino._pyopenvino.properties import CacheMode
from openvino._pyopenvino.properties import WorkloadType

# Properties
from openvino._pyopenvino.properties import enable_profiling
@@ -21,13 +21,18 @@ void regmodule_properties(py::module m) {
.value("NUMA", ov::Affinity::NUMA)
.value("HYBRID_AWARE", ov::Affinity::HYBRID_AWARE);

py::enum_<ov::WorkloadType>(m_properties, "WorkloadType", py::arithmetic())
.value("DEFAULT", ov::WorkloadType::DEFAULT)
.value("EFFICIENT", ov::WorkloadType::EFFICIENT);

py::enum_<ov::CacheMode>(m_properties, "CacheMode", py::arithmetic())
.value("OPTIMIZE_SIZE", ov::CacheMode::OPTIMIZE_SIZE)
.value("OPTIMIZE_SPEED", ov::CacheMode::OPTIMIZE_SPEED);

// Submodule properties - properties
wrap_property_RW(m_properties, ov::enable_profiling, "enable_profiling");
wrap_property_RW(m_properties, ov::cache_dir, "cache_dir");
wrap_property_RW(m_properties, ov::workload_type, "workload_type");
wrap_property_RW(m_properties, ov::cache_mode, "cache_mode");
wrap_property_RW(m_properties, ov::auto_batch_timeout, "auto_batch_timeout");
wrap_property_RW(m_properties, ov::num_streams, "num_streams");
5 changes: 5 additions & 0 deletions src/bindings/python/src/pyopenvino/utils/utils.cpp
@@ -17,6 +17,7 @@
#include "openvino/core/meta_data.hpp"
#include "openvino/frontend/decoder.hpp"
#include "openvino/frontend/graph_iterator.hpp"
#include "openvino/runtime/properties.hpp"

using Version = ov::pass::Serialize::Version;

@@ -218,6 +219,8 @@ py::object from_ov_any(const ov::Any& any) {
return py::cast(any.as<ov::streams::Num>());
} else if (any.is<ov::Affinity>()) {
return py::cast(any.as<ov::Affinity>());
} else if (any.is<ov::WorkloadType>()) {
return py::cast(any.as<ov::WorkloadType>());
} else if (any.is<ov::CacheMode>()) {
return py::cast(any.as<ov::CacheMode>());
} else if (any.is<ov::device::UUID>()) {
@@ -401,6 +404,8 @@ ov::Any py_object_to_any(const py::object& py_obj) {
return py::cast<ov::streams::Num>(py_obj);
} else if (py::isinstance<ov::Affinity>(py_obj)) {
return py::cast<ov::Affinity>(py_obj);
} else if (py::isinstance<ov::WorkloadType>(py_obj)) {
return py::cast<ov::WorkloadType>(py_obj);
} else if (py::isinstance<ov::Tensor>(py_obj)) {
return py::cast<ov::Tensor>(py_obj);
} else if (py::isinstance<ov::Output<ov::Node>>(py_obj)) {
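
These two branches are what let the enum cross the C++/Python boundary through `ov::Any`. A small C++-side sketch of the round trip they rely on (illustrative only):

```cpp
#include <openvino/runtime/properties.hpp>

#include <cassert>

int main() {
    // Store the enum in an Any, as the bindings do when passing properties.
    ov::Any any = ov::WorkloadType::EFFICIENT;

    // Typed extraction, mirroring the new from_ov_any() branch.
    assert(any.as<ov::WorkloadType>() == ov::WorkloadType::EFFICIENT);
    return 0;
}
```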
7 changes: 7 additions & 0 deletions src/bindings/python/tests/test_runtime/test_properties.py
@@ -61,6 +61,13 @@ def test_properties_rw_base():
(props.CacheMode.OPTIMIZE_SPEED, "CacheMode.OPTIMIZE_SPEED", 1),
),
),
(
props.WorkloadType,
(
(props.WorkloadType.DEFAULT, "WorkloadType.DEFAULT", 0),
(props.WorkloadType.EFFICIENT, "WorkloadType.EFFICIENT", 1),
),
),
(
hints.Priority,
(
53 changes: 50 additions & 3 deletions src/inference/include/openvino/runtime/properties.hpp
@@ -10,7 +10,9 @@
*/
#pragma once

#include <algorithm>
#include <array>
#include <cctype>
#include <iomanip>
#include <istream>
#include <map>
@@ -690,6 +692,52 @@ static constexpr Property<std::string> cache_dir{"CACHE_DIR"};
*/
static constexpr Property<bool, PropertyMutability::RO> loaded_from_cache{"LOADED_FROM_CACHE"};

/**
* @brief Enum to define possible workload types
*
* Workload type represents the execution priority for an inference.
*
* @ingroup ov_runtime_cpp_prop_api
*/
enum class WorkloadType {
DEFAULT = 0, // Default execution priority
EFFICIENT = 1, // Lower execution priority
};

/** @cond INTERNAL */
inline std::ostream& operator<<(std::ostream& os, const WorkloadType& mode) {
switch (mode) {
case WorkloadType::DEFAULT:
return os << "Default";
case WorkloadType::EFFICIENT:
return os << "Efficient";
default:
OPENVINO_THROW("Unsupported workload type");
}
}

inline std::istream& operator>>(std::istream& is, WorkloadType& mode) {
std::string str;
is >> str;
std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); });
if (str == "default") {
mode = WorkloadType::DEFAULT;
} else if (str == "efficient") {
mode = WorkloadType::EFFICIENT;
} else {
OPENVINO_THROW("Unsupported workload type: ", str);
}
return is;
}
/** @endcond */

/**
* @brief Read-write property to select the mode in which the workload will be executed.
* This is only supported by the NPU plugin.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<WorkloadType, PropertyMutability::RW> workload_type{"WORKLOAD_TYPE"};

/**
* @brief Enum to define possible cache modes
* @ingroup ov_runtime_cpp_prop_api
@@ -810,7 +858,6 @@ static constexpr Property<bool, PropertyMutability::RW> enable_mmap{"ENABLE_MMAP"};
* @brief Namespace with device properties
*/
namespace device {

/**
* @brief the property for setting of required device to execute on
* values: device id starts from "0" - first device, "1" - second device, etc
@@ -1042,8 +1089,8 @@ inline std::istream& operator>>(std::istream& is, Type& device_type) {
static constexpr Property<Type, PropertyMutability::RO> type{"DEVICE_TYPE"};

/**
* @brief Read-only property which defines Giga OPS per second count (GFLOPS or GIOPS) for a set of precisions supported
* by specified device
* @brief Read-only property which defines Giga OPS per second count (GFLOPS or GIOPS) for a set of precisions
* supported by specified device
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<std::map<element::Type, float>, PropertyMutability::RO> gops{"DEVICE_GOPS"};
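
The stream operators added above give the enum a textual form, which is what lets the property travel through string-based configuration. A quick sketch of the round trip (note the writer emits capitalized names while the reader is case-insensitive):

```cpp
#include <openvino/runtime/properties.hpp>

#include <iostream>
#include <sstream>

int main() {
    std::stringstream ss;
    ss << ov::WorkloadType::EFFICIENT;  // writes "Efficient"

    ov::WorkloadType parsed;
    ss >> parsed;  // input is lowercased, so "Efficient" parses back

    std::cout << (parsed == ov::WorkloadType::EFFICIENT) << "\n";  // prints 1
    return 0;
}
```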
@@ -182,4 +182,26 @@ struct ENABLE_CPU_PINNING final : OptionBase<ENABLE_CPU_PINNING, bool> {
return OptionMode::RunTime;
}
};

//
// WORKLOAD_TYPE
//

struct WORKLOAD_TYPE final : OptionBase<WORKLOAD_TYPE, ov::WorkloadType> {
static std::string_view key() {
return ov::workload_type.name();
}

static ov::WorkloadType defaultValue() {
return ov::WorkloadType::DEFAULT;
}

static constexpr std::string_view getTypeName() {
return "ov::WorkloadType";
}

static ov::WorkloadType parse(std::string_view val);

static std::string toString(const ov::WorkloadType& val);
};
} // namespace intel_npu
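
Declared this way, the option participates in the plugin's generic option machinery. A hedged sketch of how plugin code would typically read it, assuming the `Config::get<Option>()` pattern used elsewhere in intel_npu (the function name here is illustrative):

```cpp
#include "intel_npu/al/config/common.hpp"   // assumed location of intel_npu::Config
#include "intel_npu/al/config/runtime.hpp"  // declares WORKLOAD_TYPE (this diff)

// Illustrative only: "config" is an intel_npu::Config populated from user properties.
ov::WorkloadType resolveWorkloadType(const intel_npu::Config& config) {
    return config.get<intel_npu::WORKLOAD_TYPE>();
}
```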
4 changes: 4 additions & 0 deletions src/plugins/intel_npu/src/al/include/npu.hpp
@@ -35,6 +35,8 @@ class IEngineBackend : public std::enable_shared_from_this<IEngineBackend> {
virtual const std::string getName() const = 0;
/** @brief Backend has support for concurrency batching */
virtual bool isBatchingSupported() const = 0;
/** @brief Backend has support for workload type */
virtual bool isWorkloadTypeSupported() const = 0;
/** @brief Register backend-specific options */
virtual void registerOptions(OptionsDesc& options) const;

@@ -47,6 +49,8 @@ class IEngineBackend : public std::enable_shared_from_this<IEngineBackend> {
class IExecutor {
public:
virtual ~IExecutor() = default;

virtual void setWorkloadType(const ov::WorkloadType workloadType) const = 0;
};

//------------------------------------------------------------------------------
23 changes: 23 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/runtime.cpp
@@ -4,7 +4,10 @@

#include "intel_npu/al/config/runtime.hpp"

#include <sstream>

#include "intel_npu/al/config/common.hpp"
#include "openvino/runtime/properties.hpp"

using namespace intel_npu;
using namespace ov::intel_npu;
@@ -20,6 +23,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {
desc.add<CREATE_EXECUTOR>();
desc.add<NUM_STREAMS>();
desc.add<ENABLE_CPU_PINNING>();
desc.add<WORKLOAD_TYPE>();
}

// Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT
@@ -128,3 +132,22 @@ std::string intel_npu::NUM_STREAMS::toString(const ov::streams::Num& val) {

return stringStream.str();
}

//
// WORKLOAD_TYPE
//

ov::WorkloadType intel_npu::WORKLOAD_TYPE::parse(std::string_view val) {
std::istringstream ss = std::istringstream(std::string(val));
ov::WorkloadType workloadType;

ss >> workloadType;

return workloadType;
}

std::string intel_npu::WORKLOAD_TYPE::toString(const ov::WorkloadType& val) {
std::ostringstream ss;
ss << val;
return ss.str();
}
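
Given the stream operators defined in properties.hpp, these helpers round-trip as expected; for instance (illustrative):

```cpp
#include <cassert>

#include "intel_npu/al/config/runtime.hpp"

int main() {
    // parse() is case-insensitive because operator>> lowercases its input.
    const ov::WorkloadType wt = intel_npu::WORKLOAD_TYPE::parse("EFFICIENT");
    assert(wt == ov::WorkloadType::EFFICIENT);

    // toString() goes through operator<<, so it yields "Efficient".
    assert(intel_npu::WORKLOAD_TYPE::toString(wt) == "Efficient");
    return 0;
}
```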
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/backend/include/zero_backend.hpp
@@ -26,6 +26,7 @@ class ZeroEngineBackend final : public IEngineBackend {
uint32_t getDriverExtVersion() const override;

bool isBatchingSupported() const override;
bool isWorkloadTypeSupported() const override;

private:
std::shared_ptr<ZeroInitStructsHolder> _instance;
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_executor.hpp
@@ -9,6 +9,7 @@

#include "intel_npu/utils/logger/logger.hpp"
#include "npu.hpp"
#include "openvino/runtime/properties.hpp"
#include "zero_init.hpp"
#include "zero_wrappers.hpp"

@@ -32,6 +33,7 @@ class ZeroExecutor final : public IExecutor {
};

void setArgumentValue(uint32_t argi_, const void* argv_) const;
void setWorkloadType(const ov::WorkloadType workloadType) const override;
inline ze_graph_handle_t graph() const {
return _graph;
}
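
The override declared here is presumably where the OpenVINO-level enum gets translated into the Level Zero extension's queue workload type. A heavily hedged sketch of that mapping (the `ZE_WORKLOAD_TYPE_*` value names are assumptions, not taken from this diff):

```cpp
#include "openvino/core/except.hpp"
#include "openvino/runtime/properties.hpp"
#include "ze_command_queue_npu_ext.h"

// Sketch only: the ZE_WORKLOAD_TYPE_* value names below are assumed.
ze_command_queue_workload_type_t toZeWorkloadType(const ov::WorkloadType type) {
    switch (type) {
    case ov::WorkloadType::DEFAULT:
        return ZE_WORKLOAD_TYPE_DEFAULT;
    case ov::WorkloadType::EFFICIENT:
        return ZE_WORKLOAD_TYPE_BACKGROUND;  // lower priority
    default:
        OPENVINO_THROW("Unsupported workload type");
    }
}
```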
7 changes: 6 additions & 1 deletion src/plugins/intel_npu/src/backend/include/zero_init.hpp
@@ -10,7 +10,8 @@
#include <memory>

#include "intel_npu/utils/logger/logger.hpp"
#include "ze_intel_vpu_uuid.h"
#include "ze_command_queue_npu_ext.h"
#include "ze_intel_npu_uuid.h"
#include "zero_types.hpp"

namespace intel_npu {
@@ -39,6 +40,9 @@ class ZeroInitStructsHolder final {
inline ze_graph_dditable_ext_curr_t* getGraphDdiTable() const {
return graph_dditable_ext_decorator.get();
}
inline ze_command_queue_npu_dditable_ext_curr_t* getCommandQueueDdiTable() const {
return _command_queue_npu_dditable_ext;
}
inline ze_graph_profiling_dditable_ext_t* getProfilingDdiTable() const {
return _graph_profiling_ddi_table_ext;
}
@@ -57,6 +61,7 @@ class ZeroInitStructsHolder final {
ze_device_handle_t device_handle = nullptr;
ze_context_handle_t context = nullptr;
std::unique_ptr<ze_graph_dditable_ext_decorator> graph_dditable_ext_decorator;
ze_command_queue_npu_dditable_ext_curr_t* _command_queue_npu_dditable_ext = nullptr;
ze_graph_profiling_dditable_ext_t* _graph_profiling_ddi_table_ext = nullptr;

ze_driver_properties_t driver_properties = {};
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_types.hpp
@@ -8,6 +8,7 @@
#include <ze_graph_ext.h>

#include "intel_npu/al/config/runtime.hpp"
#include "ze_command_queue_npu_ext.h"

/**
* @brief Last version of Table of Graph Extension functions used within plugin
@@ -127,3 +128,4 @@ struct ze_graph_dditable_ext_decorator final {
};

using ze_graph_dditable_ext_curr_t = ze_graph_dditable_ext_decorator;
using ze_command_queue_npu_dditable_ext_curr_t = ze_command_queue_npu_dditable_ext_1_0_t;
3 changes: 3 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp
@@ -131,6 +131,7 @@ class CommandQueue {
CommandQueue(const ze_device_handle_t& device_handle,
const ze_context_handle_t& context,
const ze_command_queue_priority_t& priority,
ze_command_queue_npu_dditable_ext_curr_t* command_queue_npu_dditable_ext,
const Config& config,
const uint32_t& group_ordinal);
CommandQueue(const CommandQueue&) = delete;
@@ -140,6 +141,7 @@

void executeCommandList(CommandList& command_list) const;
void executeCommandList(CommandList& command_list, Fence& fence) const;
void setWorkloadType(ze_command_queue_workload_type_t workloadType) const;
~CommandQueue();
inline ze_command_queue_handle_t handle() const {
return _handle;
@@ -148,6 +150,7 @@
private:
ze_command_queue_handle_t _handle = nullptr;
ze_context_handle_t _context = nullptr;
ze_command_queue_npu_dditable_ext_curr_t* _command_queue_npu_dditable_ext = nullptr;

Logger _log;
};
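
With the DDI-table pointer now stored on the queue, the new `setWorkloadType` presumably forwards to the driver extension. A hedged sketch of the implementation (the `pfnSetWorkloadType` entry-point name is an assumption):

```cpp
// Sketch only: the extension entry-point name is assumed, and the real
// implementation likely also checks the returned ze_result_t.
void CommandQueue::setWorkloadType(ze_command_queue_workload_type_t workloadType) const {
    if (_command_queue_npu_dditable_ext == nullptr) {
        OPENVINO_THROW("Workload type is not supported by the current driver");
    }
    _command_queue_npu_dditable_ext->pfnSetWorkloadType(_handle, workloadType);
}
```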
4 changes: 4 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_backend.cpp
@@ -34,6 +34,10 @@ bool ZeroEngineBackend::isBatchingSupported() const {
return _instance->getDriverExtVersion() >= ZE_GRAPH_EXT_VERSION_1_6;
}

bool ZeroEngineBackend::isWorkloadTypeSupported() const {
return _instance->getCommandQueueDdiTable() != nullptr;
}

ZeroEngineBackend::~ZeroEngineBackend() = default;

const std::shared_ptr<IDevice> ZeroEngineBackend::getDevice() const {
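
`isWorkloadTypeSupported()` keys off whether the driver exposed the command-queue extension table, so callers can use it as a capability guard before applying the property. An illustrative fragment (`backend` and `executor` are assumed handles, not names from this diff):

```cpp
// Illustrative guard, tying the new capability query to the executor call.
if (!backend->isWorkloadTypeSupported()) {
    OPENVINO_THROW("WORKLOAD_TYPE is not supported by the current NPU driver");
}
executor->setWorkloadType(ov::WorkloadType::EFFICIENT);
```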