From 2d148ec7ec201e4b01f512bed0e4a470559c3415 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Wed, 13 Nov 2024 09:14:27 +0800
Subject: [PATCH] [CPU] Enable u8 kv cache by default (#27454)

### Details:
 - *Enable u8 kv cache by default*
 - *...*

### Tickets:
 - *[152621](https://jira.devtools.intel.com/browse/CVS-152621)*
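### Example:
A minimal usage sketch of the behavior this patch establishes (not part of the original description; the model path is a placeholder, the expected values assume an x86-64 machine):

```cpp
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder path

    // With this patch, the CPU plugin on x86-64 defaults the KV cache to u8.
    auto compiled = core.compile_model(model, "CPU");
    std::cout << compiled.get_property(ov::hint::kv_cache_precision) << "\n";  // u8 on x86-64

    // An explicit hint still overrides the default.
    auto compiled_f16 = core.compile_model(model, "CPU",
                                           ov::hint::kv_cache_precision(ov::element::f16));

    // ACCURACY mode falls back to f32 unless the precision was set explicitly.
    auto compiled_acc = core.compile_model(model, "CPU",
                                           ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
    std::cout << compiled_acc.get_property(ov::hint::kv_cache_precision) << "\n";  // f32
    return 0;
}
```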
---
 src/plugins/intel_cpu/src/config.cpp                     |  4 +++
 src/plugins/intel_cpu/src/config.h                       |  4 ++-
 src/plugins/intel_cpu/src/memory_state.cpp               |  9 ++++---
 .../ov_executable_network/properties.cpp                 | 11 ++++++++
 .../common/concat_transpose_sdp_transpose.cpp            | 25 ++++++++++++++++++-
 5 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index adcaeaaaa31a6f..92470ca063a4c0 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -358,6 +358,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
             }
         } else if (key == ov::hint::kv_cache_precision.name()) {
             try {
+                kvCachePrecisionSetExplicitly = true;
                 auto const prec = val.as<ov::element::Type>();
                 if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
                     kvCachePrecision = prec;
@@ -411,6 +412,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
         if (!fcDynamicQuantizationGroupSizeSetExplicitly) {
             fcDynamicQuantizationGroupSize = 0;
         }
+        if (!kvCachePrecisionSetExplicitly) {
+            kvCachePrecision = ov::element::f32;
+        }
     }
 
     if (!prop.empty())
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 79cdf3a5e827ec..5f4bb25ede350e 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -51,14 +51,16 @@ struct Config {
     std::string device_id = {};
     float fcSparseWeiDecompressionRate = 1.0f;
     uint64_t fcDynamicQuantizationGroupSize = 32;
-    ov::element::Type kvCachePrecision = ov::element::f16;
     bool fcDynamicQuantizationGroupSizeSetExplicitly = false;
+    bool kvCachePrecisionSetExplicitly = false;
 #if defined(OV_CPU_WITH_ACL)
     bool aclFastMath = false;
 #endif
 #if defined(OPENVINO_ARCH_X86_64)
+    ov::element::Type kvCachePrecision = ov::element::u8;
     size_t rtCacheCapacity = 5000ul;
 #else
+    ov::element::Type kvCachePrecision = ov::element::f16;
     // TODO: Executor cache may lead to incorrect behavior on oneDNN ACL primitives
     size_t rtCacheCapacity = 0ul;
 #endif
diff --git a/src/plugins/intel_cpu/src/memory_state.cpp b/src/plugins/intel_cpu/src/memory_state.cpp
index bf77917497de77..aa06f4ebd82957 100644
--- a/src/plugins/intel_cpu/src/memory_state.cpp
+++ b/src/plugins/intel_cpu/src/memory_state.cpp
@@ -297,18 +297,19 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr<ov::ITensor>& state) {
         auto S = internal.size(3);
         auto nthr = parallel_get_max_threads();
         std::vector<PlainTensor> buffers(nthr);
+        m_scale_zp.resize<float>({L0, B, H, 2});
         parallel_for3d(B, H, L0, [&](size_t ithr, size_t b, size_t h, size_t m) {
             buffers[ithr].resize<float>({S});
-            cpu_convert(external.ptr_v(b, h, m),
+            cpu_convert(external.ptr_v(m, b, h),
                         buffers[ithr].ptr<float>(),
                         external.m_dt,
                         element::f32,
                         S);
             attn_quant_u8(buffers[ithr].ptr<float>(),
-                          internal.ptr<uint8_t>(b, h, m),
+                          internal.ptr<uint8_t>(m, b, h),
                           S,
-                          m_scale_zp.at<float>({b, h, m, size_t{0}}),
-                          m_scale_zp.at<float>({b, h, m, size_t{1}}));
+                          m_scale_zp.at<float>({m, b, h, size_t{0}}),
+                          m_scale_zp.at<float>({m, b, h, size_t{1}}));
         });
     } else {
         m_internal_mem->load(external_mem);
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index 365e7c56dcef82..8ec0900bc7d176 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -194,6 +194,17 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeDynamicQuantiz
     ASSERT_EQ(groupSize, 0);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckAccuracyModeKVCachePrecision) {
+    ov::Core core;
+
+    ASSERT_NO_THROW(core.set_property(deviceName, ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)));
+    ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
+
+    auto kv_cache_precision_value = ov::element::undefined;
+    ASSERT_NO_THROW(kv_cache_precision_value = compiledModel.get_property(ov::hint::kv_cache_precision));
+    ASSERT_EQ(kv_cache_precision_value, ov::element::f32);
+}
+
 const auto bf16_if_can_be_emulated = ov::with_cpu_x86_avx512_core() ? ov::element::bf16 : ov::element::f32;
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableInCoreAndModel) {
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp
index 65bc379c78b540..f4166544af2bf2 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "openvino/core/type/float16.hpp"
 #include "openvino/opsets/opset13.hpp"
 #include "openvino/pass/manager.hpp"
 #include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp"
@@ -207,6 +208,10 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface<ConcatSDPTransposeTestParams>,
             ov::Tensor t{ov::element::f32, shape};
             strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
             inputs.insert({param, t});
+        } else if (param->get_element_type() == ov::element::f16) {
+            ov::Tensor t{ov::element::f16, shape};
+            strided_iota(static_cast<ov::float16*>(t.data()), t.get_size(), val, 0.1f);
+            inputs.insert({param, t});
         } else {
             ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
             ov::Tensor t{ov::element::bf16, shape};
@@ -365,6 +370,15 @@ class ConcatSDPTransposeTestSetState : public ConcatSDPTransposeTestBase {
     }
     std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
         function = model;
+        // on SPR all KV cache precisions are covered, so every get/set_state path is tested
+        auto input_type = model->get_parameters()[0]->get_element_type();
+        if (input_type == ov::element::f32) {
+            configuration[ov::hint::kv_cache_precision.name()] = "f32";
+        } else if (input_type == ov::element::bf16) {
+            configuration[ov::hint::kv_cache_precision.name()] = "bf16";
+        } else {
+            configuration[ov::hint::kv_cache_precision.name()] = "u8";
+        }
         prepare();
         std::vector<ov::Tensor> outputs;
         // case 1: initialization + pastkv reaches limitation, remove some state
@@ -407,6 +421,15 @@
 
 TEST_P(ConcatSDPTransposeTestSetState, CompareWithRefs) {
     SKIP_IF_CURRENT_TEST_IS_DISABLED();
+    ElementType inType;
+    InputShapeAndTransposeOrder inputShapeAndOrders;
+    bool hasShapeOf;
+    std::tie(inType, inputShapeAndOrders, hasShapeOf) = this->GetParam();
+
+    // skip bf16 tests on platforms without bf16 support
+    if (inType == ElementType::bf16 && !ov::with_cpu_x86_bfloat16())
+        GTEST_SKIP();
+
     auto actualOutputs = run_test(function);
     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
     CheckNumberOfNodesWithType(compiledModel, "Concatenation", 0);
@@ -438,7 +461,7 @@ const std::vector<InputShapeAndTransposeOrder> inputShapeAndReordersSetState = {
 
 INSTANTIATE_TEST_SUITE_P(smoke_ConcatSDPTransposeTestSetState,
                          ConcatSDPTransposeTestSetState,
-                         ::testing::Combine(::testing::Values(ElementType::f32),
+                         ::testing::Combine(::testing::Values(ElementType::f32, ElementType::bf16, ElementType::f16),
                                             ::testing::ValuesIn(inputShapeAndReordersSetState),
                                             ::testing::Values(false)),
                          ConcatSDPTransposeTest::getTestCaseName);
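Background for the memory_state.cpp hunk: it only reorders the per-row scale/zero-point indexing from (b, h, m) to (m, b, h) and sizes m_scale_zp to {L0, B, H, 2}; the u8 scheme itself stores one (scale, zp) pair per row. A standalone sketch of that style of per-row affine quantization (illustrative only, with made-up names; this is not the plugin's actual attn_quant_u8 implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine-quantize one row of S floats to u8, producing the (scale, zero-point)
// pair that a KV cache state would keep per (m, b, h) position.
void quantize_row_u8(const float* src, uint8_t* dst, size_t S, float& scale, float& zp) {
    const auto [min_it, max_it] = std::minmax_element(src, src + S);
    const float min = *min_it;
    const float max = *max_it;
    scale = (max - min) / 255.0f;  // spread [min, max] over the u8 range
    if (scale == 0.0f)
        scale = 1.0f;              // guard against constant rows
    zp = -min / scale;             // zero-point expressed in the u8 domain
    for (size_t i = 0; i < S; i++) {
        const float q = std::round(src[i] / scale + zp);
        dst[i] = static_cast<uint8_t>(std::clamp(q, 0.0f, 255.0f));
    }
}

// Read path: recover the approximate float value as x ≈ (q - zp) * scale.
float dequantize_u8(uint8_t q, float scale, float zp) {
    return (static_cast<float>(q) - zp) * scale;
}
```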