From 0f67bd88f0497836d64b4c0484b527307b78768a Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Thu, 7 Nov 2024 19:40:55 +0800 Subject: [PATCH 1/4] default enable u8 kv cache --- src/plugins/intel_cpu/src/config.cpp | 4 ++++ src/plugins/intel_cpu/src/config.h | 3 ++- .../behavior/ov_executable_network/properties.cpp | 11 +++++++++++ .../src/common/concat_transpose_sdp_transpose.cpp | 1 + 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index adcaeaaaa31a6f..92470ca063a4c0 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -358,6 +358,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { } } else if (key == ov::hint::kv_cache_precision.name()) { try { + kvCachePrecisionSetExplicitly = true; auto const prec = val.as(); if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) { kvCachePrecision = prec; @@ -411,6 +412,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { if (!fcDynamicQuantizationGroupSizeSetExplicitly) { fcDynamicQuantizationGroupSize = 0; } + if (!kvCachePrecisionSetExplicitly) { + kvCachePrecision = ov::element::f32; + } } if (!prop.empty()) diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 79cdf3a5e827ec..4a88a670149e12 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -51,8 +51,9 @@ struct Config { std::string device_id = {}; float fcSparseWeiDecompressionRate = 1.0f; uint64_t fcDynamicQuantizationGroupSize = 32; - ov::element::Type kvCachePrecision = ov::element::f16; + ov::element::Type kvCachePrecision = ov::element::u8; bool fcDynamicQuantizationGroupSizeSetExplicitly = false; + bool kvCachePrecisionSetExplicitly = false; #if defined(OV_CPU_WITH_ACL) bool aclFastMath = false; #endif diff --git 
a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 365e7c56dcef82..8ec0900bc7d176 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -194,6 +194,17 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeDynamicQuantiz ASSERT_EQ(groupSize, 0); } +TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeKVCachePrecision) { + ov::Core core; + + ASSERT_NO_THROW(core.set_property(deviceName, ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY))); + ov::CompiledModel compiledModel = core.compile_model(model, deviceName); + + auto kv_cache_precision_value = ov::element::undefined; + ASSERT_NO_THROW(kv_cache_precision_value = compiledModel.get_property(ov::hint::kv_cache_precision)); + ASSERT_EQ(kv_cache_precision_value, ov::element::f32); +} + const auto bf16_if_can_be_emulated = ov::with_cpu_x86_avx512_core() ? 
ov::element::bf16 : ov::element::f32; TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableInCoreAndModel) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp index 65bc379c78b540..0f968e0668a093 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp @@ -365,6 +365,7 @@ class ConcatSDPTransposeTestSetState : public ConcatSDPTransposeTestBase { } std::vector run_test(std::shared_ptr model) { function = model; + configuration[ov::hint::kv_cache_precision.name()] = "f16"; prepare(); std::vector outputs; // case 1: initialization + pastkv reaches limitation, remove some state From f37b571e5108dda75aafebfa72efad132d29f13a Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 8 Nov 2024 09:44:50 +0800 Subject: [PATCH 2/4] disable u8 kvcache default config on non-x86 --- src/plugins/intel_cpu/src/config.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 4a88a670149e12..5f4bb25ede350e 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -51,15 +51,16 @@ struct Config { std::string device_id = {}; float fcSparseWeiDecompressionRate = 1.0f; uint64_t fcDynamicQuantizationGroupSize = 32; - ov::element::Type kvCachePrecision = ov::element::u8; bool fcDynamicQuantizationGroupSizeSetExplicitly = false; bool kvCachePrecisionSetExplicitly = false; #if defined(OV_CPU_WITH_ACL) bool aclFastMath = false; #endif #if defined(OPENVINO_ARCH_X86_64) + ov::element::Type kvCachePrecision = ov::element::u8; size_t rtCacheCapacity = 5000ul; #else + ov::element::Type kvCachePrecision = 
ov::element::f16; // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives size_t rtCacheCapacity = 0ul; #endif From 81d34114b218633a9fd4f670077870dc0da7225f Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Mon, 11 Nov 2024 09:50:29 +0800 Subject: [PATCH 3/4] apply review comment: fix u8 set_state failure --- src/plugins/intel_cpu/src/memory_state.cpp | 9 +++++---- .../src/common/concat_transpose_sdp_transpose.cpp | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_cpu/src/memory_state.cpp b/src/plugins/intel_cpu/src/memory_state.cpp index bf77917497de77..aa06f4ebd82957 100644 --- a/src/plugins/intel_cpu/src/memory_state.cpp +++ b/src/plugins/intel_cpu/src/memory_state.cpp @@ -297,18 +297,19 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { auto S = internal.size(3); auto nthr = parallel_get_max_threads(); std::vector buffers(nthr); + m_scale_zp.resize({L0, B, H, 2}); parallel_for3d(B, H, L0, [&](size_t ithr, size_t b, size_t h, size_t m) { buffers[ithr].resize({S}); - cpu_convert(external.ptr_v(b, h, m), + cpu_convert(external.ptr_v(m, b, h), buffers[ithr].ptr(), external.m_dt, element::f32, S); attn_quant_u8(buffers[ithr].ptr(), - internal.ptr(b, h, m), + internal.ptr(m, b, h), S, - m_scale_zp.at({b, h, m, size_t{0}}), - m_scale_zp.at({b, h, m, size_t{1}})); + m_scale_zp.at({m, b, h, size_t{0}}), + m_scale_zp.at({m, b, h, size_t{1}})); }); } else { m_internal_mem->load(external_mem); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp index 0f968e0668a093..bbfe619f3f0c35 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp @@ -361,11 
+361,10 @@ class ConcatSDPTransposeTestSetState : public ConcatSDPTransposeTestBase { val += 0.13f; state.set_state(new_state); - } + } } std::vector run_test(std::shared_ptr model) { function = model; - configuration[ov::hint::kv_cache_precision.name()] = "f16"; prepare(); std::vector outputs; // case 1: initialization + pastkv reaches limitation, remove some state From 45eb699928801076befd57be77159c44193258d8 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Tue, 12 Nov 2024 15:02:13 +0800 Subject: [PATCH 4/4] cover bf16/f32/u8 for get/set_state --- .../common/concat_transpose_sdp_transpose.cpp | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp index 65bc379c78b540..f4166544af2bf2 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/core/type/float16.hpp" #include "openvino/opsets/opset13.hpp" #include "openvino/pass/manager.hpp" #include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" @@ -207,6 +208,10 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface(t.data()), t.get_size(), val, 0.1f); inputs.insert({param, t}); + } else if (param->get_element_type() == ov::element::f16) { + ov::Tensor t{ov::element::f16, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); } else { ASSERT_TRUE(param->get_element_type() == ov::element::bf16); ov::Tensor t{ov::element::bf16, shape}; @@ -365,6 +370,15 @@ class ConcatSDPTransposeTestSetState : public ConcatSDPTransposeTestBase { } 
std::vector run_test(std::shared_ptr model) { function = model; + // on spr, all kvcache precisions will be covered and all paths for get/set_state will be tested + auto input_type = model->get_parameters()[0]->get_element_type(); + if (input_type == ov::element::f32) { + configuration[ov::hint::kv_cache_precision.name()] = "f32"; + } else if (input_type == ov::element::bf16) { + configuration[ov::hint::kv_cache_precision.name()] = "bf16"; + } else { + configuration[ov::hint::kv_cache_precision.name()] = "u8"; + } prepare(); std::vector outputs; // case 1: initialization + pastkv reaches limitation, remove some state @@ -407,6 +421,15 @@ class ConcatSDPTransposeTestSetState : public ConcatSDPTransposeTestBase { TEST_P(ConcatSDPTransposeTestSetState, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED(); + ElementType inType; + InputShapeAndTransposeOrder inputShapeAndOrders; + bool hasShapeOf; + std::tie(inType, inputShapeAndOrders, hasShapeOf) = this->GetParam(); + + // skip the bf16 test when ov::with_cpu_x86_bfloat16() reports no bf16 support on this CPU + if (inType == ElementType::bf16 && !ov::with_cpu_x86_bfloat16()) + GTEST_SKIP(); + auto actualOutputs = run_test(function); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); CheckNumberOfNodesWithType(compiledModel, "Concatenation", 0); @@ -438,7 +461,7 @@ const std::vector inputShapeAndReordersSetState = { INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTransposeTestSetState, ConcatSDPTransposeTestSetState, - ::testing::Combine(::testing::Values(ElementType::f32), + ::testing::Combine(::testing::Values(ElementType::f32, ElementType::bf16, ElementType::f16), ::testing::ValuesIn(inputShapeAndReordersSetState), ::testing::Values(false)), ConcatSDPTransposeTest::getTestCaseName);