openvinotoolkit · zhangYiIntel · Oct 31, 2024 · Nov 6, 2024 · Nov 8, 2024 · Nov 13, 2024
@@ -23,4 +23,8 @@
 from openvino._pyopenvino.properties.hint import allow_auto_batching
 from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
 from openvino._pyopenvino.properties.hint import kv_cache_precision
+from openvino._pyopenvino.properties.hint import key_cache_precision
+from openvino._pyopenvino.properties.hint import value_cache_precision
+from openvino._pyopenvino.properties.hint import key_cache_group_size
+from openvino._pyopenvino.properties.hint import value_cache_group_size
 from openvino._pyopenvino.properties.hint import activations_scale_factor
@@ -101,6 +101,10 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
     wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
     wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
+    wrap_property_RW(m_hint, ov::hint::key_cache_precision, "key_cache_precision");
+    wrap_property_RW(m_hint, ov::hint::value_cache_precision, "value_cache_precision");
+    wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size");
+    wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size");
     wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");
 
     // Submodule intel_cpu

@@ -334,7 +334,19 @@ def test_properties_ro(ov_property_ro, expected_value):
             "DYNAMIC_QUANTIZATION_GROUP_SIZE",
             ((64, 64),),
         ),
+        (
+            hints.key_cache_group_size,
+            "KEY_CACHE_GROUP_SIZE",
+            ((64, 64),),
+        ),
+        (
+            hints.value_cache_group_size,
+            "VALUE_CACHE_GROUP_SIZE",
+            ((64, 64),),
+        ),
         (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
+        (hints.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)),
+        (hints.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)),
         (
             hints.activations_scale_factor,
             "ACTIVATIONS_SCALE_FACTOR",

@@ -580,6 +580,30 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
  */
 static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};
 
+/**
+ * @brief Hint for device to use specified precision for key cache compression
+ * The property value is an ov::element::Type and the property is read-write.
+ * NOTE(review): the accompanying Config::readProperties accepts only f32, f16, bf16 and u8 for this
+ * property and throws for any other precision - keep this note in sync with that check.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<element::Type, PropertyMutability::RW> key_cache_precision{"KEY_CACHE_PRECISION"};
+
+/**
+ * @brief Hint for device to use specified precision for value cache compression
+ * The property value is an ov::element::Type and the property is read-write.
+ * NOTE(review): the accompanying Config::readProperties accepts f32, f16, bf16, u8, u4 and i4 for this
+ * property and throws for any other precision - keep this note in sync with that check.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<element::Type, PropertyMutability::RW> value_cache_precision{"VALUE_CACHE_PRECISION"};
+
+/**
+ * @brief Hint for device to use the specified group size for key cache compression
+ * The property value is an unsigned 64-bit integer and the property is read-write.
+ * NOTE(review): the accompanying Config defaults the corresponding field to 0; what a group size of 0
+ * means is not visible from this declaration - confirm and document it here.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};
+
+/**
+ * @brief Hint for device to use the specified group size for value cache compression
+ * The property value is an unsigned 64-bit integer and the property is read-write.
+ * NOTE(review): the accompanying Config defaults the corresponding field to 0; what a group size of 0
+ * means is not visible from this declaration - confirm and document it here.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};
+
 /**
  * @brief This property scales down activations to prevent overflows when inference precision is f16.
  * @ingroup ov_runtime_cpp_prop_api

@@ -257,6 +257,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
             RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
             RO_property(ov::hint::dynamic_quantization_group_size.name()),
             RO_property(ov::hint::kv_cache_precision.name()),
+            RO_property(ov::hint::key_cache_precision.name()),
+            RO_property(ov::hint::value_cache_precision.name()),
+            RO_property(ov::hint::key_cache_group_size.name()),
+            RO_property(ov::hint::value_cache_group_size.name()),
         };
 
         OPENVINO_SUPPRESS_DEPRECATED_START
@@ -333,6 +337,14 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
         return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize);
     } else if (name == ov::hint::kv_cache_precision) {
         return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
+    } else if (name == ov::hint::key_cache_precision) {
+        return decltype(ov::hint::key_cache_precision)::value_type(config.keyCachePrecision);
+    } else if (name == ov::hint::value_cache_precision) {
+        return decltype(ov::hint::value_cache_precision)::value_type(config.valueCachePrecision);
+    } else if (name == ov::hint::key_cache_group_size) {
+        return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize);
+    } else if (name == ov::hint::value_cache_group_size) {
+        return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize);
     }
     OPENVINO_THROW("Unsupported property: ", name);
 }

@@ -373,6 +373,59 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                                ov::hint::kv_cache_precision.name(),
                                ". Supported values: u8, bf16, f16, f32");
             }
+        } else if (key == ov::hint::key_cache_precision.name() || key == ov::hint::value_cache_precision.name()) {
+            // Key and value cache precisions share one handler; they differ only in the set of
+            // accepted element types (the value cache additionally allows the 4-bit u4 and i4).
+            const bool isKeyCache = key == ov::hint::key_cache_precision.name();
+            try {
+                // Mark the precision as user-provided so that the f32 fallback applied later in
+                // readProperties (guarded by !kvCachePrecisionSetExplicitly) is skipped.
+                kvCachePrecisionSetExplicitly = true;
+                auto const prec = val.as<ov::element::Type>();
+                if (isKeyCache) {
+                    if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
+                        keyCachePrecision = prec;
+                    } else {
+                        OPENVINO_THROW("keyCachePrecision doesn't support value ", prec);
+                    }
+                } else {
+                    if (one_of(prec,
+                               ov::element::f32,
+                               ov::element::f16,
+                               ov::element::bf16,
+                               ov::element::u8,
+                               ov::element::u4,
+                               ov::element::i4)) {
+                        valueCachePrecision = prec;
+                    } else {
+                        OPENVINO_THROW("valueCachePrecision doesn't support value ", prec);
+                    }
+                }
+            } catch (ov::Exception&) {
+                // Both conversion failures and the rejections raised above end up here, so the
+                // user always gets the message that lists the supported values for the property.
+                if (isKeyCache) {
+                    OPENVINO_THROW("Wrong value ",
+                                   val.as<std::string>(),
+                                   " for property key ",
+                                   ov::hint::key_cache_precision.name(),
+                                   ". Supported values: u8, bf16, f16, f32");
+                } else {
+                    // The list must name the types exactly as the check above accepts them
+                    // (ov::element::i4, not "s4").
+                    OPENVINO_THROW("Wrong value ",
+                                   val.as<std::string>(),
+                                   " for property key ",
+                                   ov::hint::value_cache_precision.name(),
+                                   ". Supported values: u4, i4, u8, bf16, f16, f32");
+                }
+            }
+        } else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) {
+            // Key and value cache group sizes are parsed identically (as uint64_t) and differ
+            // only in the Config field that receives the parsed value.
+            try {
+                auto const groupSize = val.as<uint64_t>();
+                if (key == ov::hint::key_cache_group_size.name()) {
+                    keyCacheGroupSize = groupSize;
+                } else {
+                    valueCacheGroupSize = groupSize;
+                }
+            } catch (ov::Exception&) {
+                // Report the offending value together with the property name it was given for.
+                OPENVINO_THROW("Wrong value ",
+                               val.as<std::string>(),
+                               " for property key ",
+                               key,
+                               ". Expected only unsigned integer numbers");
+            }
         } else if (key == ov::cache_encryption_callbacks.name()) {
             try {
                 auto encryption_callbacks = val.as<EncryptionCallbacks>();
@@ -415,6 +468,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
         }
         if (!kvCachePrecisionSetExplicitly) {
             kvCachePrecision = ov::element::f32;
+            valueCachePrecision = ov::element::f32;
+            keyCachePrecision = ov::element::f32;
         }
     }
 

@@ -53,12 +53,18 @@ struct Config {
 #endif
 #if defined(OPENVINO_ARCH_X86_64)
     ov::element::Type kvCachePrecision = ov::element::u8;
+    ov::element::Type keyCachePrecision = ov::element::u8;
+    ov::element::Type valueCachePrecision = ov::element::u8;
     size_t rtCacheCapacity = 5000ul;
 #else
     ov::element::Type kvCachePrecision = ov::element::f16;
+    ov::element::Type keyCachePrecision = ov::element::f16;
+    ov::element::Type valueCachePrecision = ov::element::f16;
     // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives
     size_t rtCacheCapacity = 0ul;
 #endif
+    size_t keyCacheGroupSize = 0ul;
+    size_t valueCacheGroupSize = 0ul;
     ov::threading::IStreamsExecutor::Config streamExecutorConfig;
     int streams = 1;
     bool streamsChanged = false;