openvinotoolkit · dmitry-gorokhov · Nov 13, 2024 · Nov 7, 2024 · Nov 8, 2024 · Nov 11, 2024
@@ -358,6 +358,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
             }
         } else if (key == ov::hint::kv_cache_precision.name()) {
             try {
+                kvCachePrecisionSetExplicitly = true;
                 auto const prec = val.as<ov::element::Type>();
                 if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
                     kvCachePrecision = prec;
@@ -411,6 +412,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
         if (!fcDynamicQuantizationGroupSizeSetExplicitly) {
             fcDynamicQuantizationGroupSize = 0;
         }
+        if (!kvCachePrecisionSetExplicitly) {
+            kvCachePrecision = ov::element::f32;
+        }
     }
 
     if (!prop.empty())

@@ -51,14 +51,16 @@ struct Config {
     std::string device_id = {};
     float fcSparseWeiDecompressionRate = 1.0f;
     uint64_t fcDynamicQuantizationGroupSize = 32;
-    ov::element::Type kvCachePrecision = ov::element::f16;
     bool fcDynamicQuantizationGroupSizeSetExplicitly = false;
+    bool kvCachePrecisionSetExplicitly = false;
 #if defined(OV_CPU_WITH_ACL)
     bool aclFastMath = false;
 #endif
 #if defined(OPENVINO_ARCH_X86_64)
+    ov::element::Type kvCachePrecision = ov::element::u8;
     size_t rtCacheCapacity = 5000ul;
 #else
+    ov::element::Type kvCachePrecision = ov::element::f16;
     // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives
     size_t rtCacheCapacity = 0ul;
 #endif

@@ -297,18 +297,19 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr<ov::ITensor>& state) {
         auto S = internal.size(3);
         auto nthr = parallel_get_max_threads();
         std::vector<PlainTensor> buffers(nthr);
+        m_scale_zp.resize<float>({L0, B, H, 2});
         parallel_for3d(B, H, L0, [&](size_t ithr, size_t b, size_t h, size_t m) {
             buffers[ithr].resize<float>({S});
-            cpu_convert(external.ptr_v(b, h, m),
+            cpu_convert(external.ptr_v(m, b, h),
                         buffers[ithr].ptr<float>(),
                         external.m_dt,
                         element::f32,
                         S);
             attn_quant_u8(buffers[ithr].ptr<float>(),
-                          internal.ptr<uint8_t>(b, h, m),
+                          internal.ptr<uint8_t>(m, b, h),
                           S,
-                          m_scale_zp.at<float>({b, h, m, size_t{0}}),
-                          m_scale_zp.at<float>({b, h, m, size_t{1}}));
+                          m_scale_zp.at<float>({m, b, h, size_t{0}}),
+                          m_scale_zp.at<float>({m, b, h, size_t{1}}));
         });
     } else {
         m_internal_mem->load(external_mem);

@@ -194,6 +194,17 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeDynamicQuantiz
     ASSERT_EQ(groupSize, 0);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeKVCachePrecision) {  // Verifies the kv_cache_precision reported by a model compiled after requesting ACCURACY execution mode.
+    ov::Core core;
+
+    ASSERT_NO_THROW(core.set_property(deviceName, ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)));  // Request ACCURACY mode on the device before compiling; no kv_cache_precision is set explicitly in this test.
+    ov::CompiledModel compiledModel = core.compile_model(model, deviceName);  // `model` and `deviceName` are not declared here; presumably fixture members of OVClassConfigTestCPU.
+
+    auto kv_cache_precision_value = ov::element::undefined;  // Placeholder value that is overwritten by the property query below.
+    ASSERT_NO_THROW(kv_cache_precision_value = compiledModel.get_property(ov::hint::kv_cache_precision));  // Read back the precision the compiled model reports.
+    ASSERT_EQ(kv_cache_precision_value, ov::element::f32);  // Expected result: f32 when only the ACCURACY hint was provided.
+}
+
 const auto bf16_if_can_be_emulated = ov::with_cpu_x86_avx512_core() ? ov::element::bf16 : ov::element::f32;
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableInCoreAndModel) {