Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPU]PageAttn with 4bit-quantization #27992

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,8 @@
from openvino._pyopenvino.properties.hint import allow_auto_batching
from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
from openvino._pyopenvino.properties.hint import kv_cache_precision
from openvino._pyopenvino.properties.hint import key_cache_precision
from openvino._pyopenvino.properties.hint import value_cache_precision
from openvino._pyopenvino.properties.hint import key_cache_group_size
from openvino._pyopenvino.properties.hint import value_cache_group_size
from openvino._pyopenvino.properties.hint import activations_scale_factor
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ void regmodule_properties(py::module m) {
wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
wrap_property_RW(m_hint, ov::hint::key_cache_precision, "key_cache_precision");
wrap_property_RW(m_hint, ov::hint::value_cache_precision, "value_cache_precision");
wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size");
wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size");
wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");

// Submodule intel_cpu
Expand Down
12 changes: 12 additions & 0 deletions src/bindings/python/tests/test_runtime/test_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,19 @@ def test_properties_ro(ov_property_ro, expected_value):
"DYNAMIC_QUANTIZATION_GROUP_SIZE",
((64, 64),),
),
(
hints.key_cache_group_size,
"KEY_CACHE_GROUP_SIZE",
((64, 64),),
),
(
hints.value_cache_group_size,
"VALUE_CACHE_GROUP_SIZE",
((64, 64),),
),
(hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
(hints.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)),
(hints.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)),
(
hints.activations_scale_factor,
"ACTIVATIONS_SCALE_FACTOR",
Expand Down
24 changes: 24 additions & 0 deletions src/inference/include/openvino/runtime/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,30 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
*/
static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};

/**
* @brief Hint for device to use specified precision for key cache compression
* @details Read-write property of type ov::element::Type, addressed by the string key
* "KEY_CACHE_PRECISION". It targets the key cache only; the value cache has its own
* value_cache_precision property.
* @note NOTE(review): the accepted element types are decided by the device plugin. The CPU plugin
* validation accompanying this declaration accepts u8, bf16, f16 and f32 - confirm before treating
* that list as a cross-device contract.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<element::Type, PropertyMutability::RW> key_cache_precision{"KEY_CACHE_PRECISION"};

/**
* @brief Hint for device to use specified precision for value cache compression
* @details Read-write property of type ov::element::Type, addressed by the string key
* "VALUE_CACHE_PRECISION". It targets the value cache only; the key cache has its own
* key_cache_precision property.
* @note NOTE(review): the accepted element types are decided by the device plugin. The CPU plugin
* validation accompanying this declaration accepts u4, i4, u8, bf16, f16 and f32 - confirm before
* treating that list as a cross-device contract.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<element::Type, PropertyMutability::RW> value_cache_precision{"VALUE_CACHE_PRECISION"};

/**
* @brief Hint for device to use group_size for key cache compression
* @details Read-write property of type uint64_t, addressed by the string key "KEY_CACHE_GROUP_SIZE".
* @note NOTE(review): the unit of the group size (presumably elements per quantization group) and the
* meaning of the value 0 are not stated here - confirm against the plugin implementation.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};

/**
* @brief Hint for device to use group_size for value cache compression
* @details Read-write property of type uint64_t, addressed by the string key "VALUE_CACHE_GROUP_SIZE".
* @note NOTE(review): the unit of the group size (presumably elements per quantization group) and the
* meaning of the value 0 are not stated here - confirm against the plugin implementation.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};

/**
* @brief This property scales down activations to prevent overflows when inference precision is f16.
* @ingroup ov_runtime_cpp_prop_api
Expand Down
12 changes: 12 additions & 0 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
RO_property(ov::hint::dynamic_quantization_group_size.name()),
RO_property(ov::hint::kv_cache_precision.name()),
RO_property(ov::hint::key_cache_precision.name()),
RO_property(ov::hint::value_cache_precision.name()),
RO_property(ov::hint::key_cache_group_size.name()),
RO_property(ov::hint::value_cache_group_size.name()),
};

OPENVINO_SUPPRESS_DEPRECATED_START
Expand Down Expand Up @@ -333,6 +337,14 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize);
} else if (name == ov::hint::kv_cache_precision) {
return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
} else if (name == ov::hint::key_cache_precision) {
return decltype(ov::hint::key_cache_precision)::value_type(config.keyCachePrecision);
} else if (name == ov::hint::value_cache_precision) {
return decltype(ov::hint::value_cache_precision)::value_type(config.valueCachePrecision);
} else if (name == ov::hint::key_cache_group_size) {
return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize);
} else if (name == ov::hint::value_cache_group_size) {
return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize);
}
OPENVINO_THROW("Unsupported property: ", name);
}
Expand Down
55 changes: 55 additions & 0 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,59 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
ov::hint::kv_cache_precision.name(),
". Supported values: u8, bf16, f16, f32");
}
} else if (key == ov::hint::key_cache_precision.name() || key == ov::hint::value_cache_precision.name()) {
    // Key cache and value cache precisions share one parsing path; only the set of accepted
    // element types differs (the value cache additionally allows the 4-bit types u4 and i4).
    const bool isKeyCache = key == ov::hint::key_cache_precision.name();
    try {
        // An explicit per-cache precision also counts as an explicit KV cache precision, so the
        // f32 fallback guarded by this flag later in readProperties is skipped.
        kvCachePrecisionSetExplicitly = true;
        auto const prec = val.as<ov::element::Type>();
        if (isKeyCache) {
            if (!one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
                OPENVINO_THROW("keyCachePrecision doesn't support value ", prec);
            }
            keyCachePrecision = prec;
        } else {
            if (!one_of(prec,
                        ov::element::f32,
                        ov::element::f16,
                        ov::element::bf16,
                        ov::element::u8,
                        ov::element::u4,
                        ov::element::i4)) {
                OPENVINO_THROW("valueCachePrecision doesn't support value ", prec);
            }
            valueCachePrecision = prec;
        }
    } catch (ov::Exception&) {
        // Both a failed conversion to ov::element::Type and an unsupported precision end up here
        // and are reported uniformly. The supported list names the types exactly as validated
        // above (i4, not "s4"), so a user can copy a value straight from the message.
        OPENVINO_THROW("Wrong value ",
                       val.as<std::string>(),
                       " for property key ",
                       key,
                       ". Supported values: ",
                       isKeyCache ? "u8, bf16, f16, f32" : "u4, i4, u8, bf16, f16, f32");
    }
} else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) {
    // Group sizes for the key and value caches are parsed identically and stored in separate fields.
    try {
        auto const groupSize = val.as<uint64_t>();
        if (key == ov::hint::key_cache_group_size.name()) {
            keyCacheGroupSize = groupSize;
        } else {
            valueCacheGroupSize = groupSize;
        }
    } catch (ov::Exception&) {
        // The value could not be converted to uint64_t; report it together with the offending key.
        OPENVINO_THROW("Wrong value ",
                       val.as<std::string>(),
                       " for property key ",
                       key,
                       ". Expected only unsigned integer numbers");
    }
} else if (key == ov::cache_encryption_callbacks.name()) {
try {
auto encryption_callbacks = val.as<EncryptionCallbacks>();
Expand Down Expand Up @@ -415,6 +468,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
}
if (!kvCachePrecisionSetExplicitly) {
kvCachePrecision = ov::element::f32;
valueCachePrecision = ov::element::f32;
keyCachePrecision = ov::element::f32;
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,18 @@ struct Config {
#endif
#if defined(OPENVINO_ARCH_X86_64)
ov::element::Type kvCachePrecision = ov::element::u8;
ov::element::Type keyCachePrecision = ov::element::u8;
ov::element::Type valueCachePrecision = ov::element::u8;
size_t rtCacheCapacity = 5000ul;
#else
ov::element::Type kvCachePrecision = ov::element::f16;
ov::element::Type keyCachePrecision = ov::element::f16;
ov::element::Type valueCachePrecision = ov::element::f16;
// TODO: Executor cache may lead to incorrect behavior on oneDNN ACL primitives
size_t rtCacheCapacity = 0ul;
#endif
// Group sizes for key / value cache compression. Config::readProperties writes them from
// ov::hint::key_cache_group_size and ov::hint::value_cache_group_size, and
// CompiledModel::get_property reports them back under the same property names.
// NOTE(review): the meaning of the default 0 is not visible here - confirm.
size_t keyCacheGroupSize = 0ul;
size_t valueCacheGroupSize = 0ul;
ov::threading::IStreamsExecutor::Config streamExecutorConfig;
int streams = 1;
bool streamsChanged = false;
Expand Down
Loading
Loading