Skip to content

Commit

Permalink
[GPU] Disable KV cache compression and FC scaling for GPU with systol…
Browse files Browse the repository at this point in the history
…ic (openvinotoolkit#27988)

### Details:
- GPU with systolic does not need FC scaling
- GPU with systolic does not support KV cache compression yet
  • Loading branch information
isanghao authored and 11happy committed Dec 23, 2024
1 parent f5bfea9 commit caff123
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ class ExecutionConfig {

// Note that RT info property value has lower priority than values set by user via core.set_property or passed to compile_model call
// So this method should be called after setting all user properties, but before apply_user_properties() call.
void apply_rt_info(const ov::RTMap& rt_info);
void apply_rt_info(const cldnn::device_info& info, const ov::RTMap& rt_info);

std::string to_string() const;

Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/plugin/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
ExecutionConfig config = m_configs_map.at(device_id);
config.set_user_property(orig_config);
if (model->has_rt_info("runtime_options"))
config.apply_rt_info(model->get_rt_info<ov::AnyMap>("runtime_options"));
config.apply_rt_info(context->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"));
config.apply_user_properties(context->get_engine().get_device_info());

set_cache_info(model, config);
Expand Down Expand Up @@ -281,7 +281,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
ExecutionConfig config = m_configs_map.at(device_id);
config.set_user_property(orig_config);
if (model->has_rt_info("runtime_options"))
config.apply_rt_info(model->get_rt_info<ov::AnyMap>("runtime_options"));
config.apply_rt_info(ctx->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"));
config.apply_user_properties(ctx->get_engine().get_device_info());

ProgramBuilder prog(ctx->get_engine(), config);
Expand Down
8 changes: 5 additions & 3 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,12 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
user_properties.clear();
}

void ExecutionConfig::apply_rt_info(const ov::RTMap& rt_info) {
apply_rt_info_property(ov::hint::kv_cache_precision, rt_info);
void ExecutionConfig::apply_rt_info(const cldnn::device_info& info, const ov::RTMap& rt_info) {
if (!info.supports_immad) {
apply_rt_info_property(ov::hint::kv_cache_precision, rt_info);
apply_rt_info_property(ov::hint::activations_scale_factor, rt_info);
}
apply_rt_info_property(ov::hint::dynamic_quantization_group_size, rt_info);
apply_rt_info_property(ov::hint::activations_scale_factor, rt_info);
}

std::string ExecutionConfig::to_string() const {
Expand Down
12 changes: 10 additions & 2 deletions src/plugins/intel_gpu/tests/functional/behavior/properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//

#include <algorithm>
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
#include "base/ov_behavior_test_utils.hpp"
#include "openvino/runtime/core.hpp"
#include "common_test_utils/subgraph_builders/conv_pool_relu.hpp"
Expand Down Expand Up @@ -43,11 +45,17 @@ TEST_F(TestPropertiesGPU, RTInfoPropertiesWithDefault) {
model->set_rt_info("8.0", "runtime_options", ov::hint::activations_scale_factor.name());

OV_ASSERT_NO_THROW(compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU));
OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision));
OV_ASSERT_NO_THROW(size = compiled_model.get_property(ov::hint::dynamic_quantization_group_size));
ASSERT_EQ(size.as<uint64_t>(), 0);

// GPU with systolic does not support some of rt_info
auto capabilities = core.get_property(ov::test::utils::DEVICE_GPU, ov::device::capabilities);
if (find(capabilities.cbegin(), capabilities.cend(), ov::intel_gpu::capability::HW_MATMUL) != capabilities.cend())
return;

OV_ASSERT_NO_THROW(type = compiled_model.get_property(ov::hint::kv_cache_precision));
OV_ASSERT_NO_THROW(scale = compiled_model.get_property(ov::hint::activations_scale_factor));
ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f16);
ASSERT_EQ(size.as<uint64_t>(), 0);
ASSERT_EQ(scale.as<float>(), 8.0f);
}

Expand Down

0 comments on commit caff123

Please sign in to comment.