Skip to content

Commit

Permalink
[GPU] Enable KV-cache compression by default for non-systolic platforms (#27410)
Browse files Browse the repository at this point in the history

### Details:
 - Enable KV-cache compression by default for non-systolic platforms
  • Loading branch information
sshlyapn authored Nov 7, 2024
1 parent 9226bbe commit 5fad805
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ KERNEL(dynamic_quantize_gpu_kv_cache)(
#if ASYMMETRIC_QUANTIZATION
min_value = work_group_reduce_min(min_value);
max_value = work_group_reduce_max(max_value);
OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value));
OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX;
ACCUMULATOR_TYPE scale = (ACCUMULATOR_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value));
ACCUMULATOR_TYPE zp = (ACCUMULATOR_TYPE)(-min_value * scale) - CHAR_MAX;
#else
max_value = work_group_reduce_max(max_value);
OUTPUT1_TYPE scale = 127.0h / max_value;
ACCUMULATOR_TYPE scale = 127.0h / max_value;
#endif

#ifdef APPEND_MODE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ JitConstants DynamicQuantizeKernelKVCache::GetJitConstants(const dynamic_quantiz
jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization));
jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp));

// Use FP32 accumulator type for scale/zp calculation
jit.Merge(MakeTypeJitConstants(Datatype::F32, "ACCUMULATOR"));

bool rearrange_scales_order = false;
const auto& scales_output_order = params.scales_output_order;
if (!scales_output_order.empty()) {
Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
}

// Enable KV-cache compression by default for non-systolic platforms
if (!is_set_by_user(ov::hint::kv_cache_precision) && !info.supports_immad) {
set_property(ov::hint::kv_cache_precision(ov::element::i8));
}

user_properties.clear();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI
ov::AnyMap properties = {ov::hint::inference_precision(ov::element::f16),
ov::intel_gpu::hint::enable_sdpa_optimization(true)};

if (p.compressed)
if (p.compressed) {
properties.emplace(ov::hint::kv_cache_precision(ov::element::i8));
} else {
properties.emplace(ov::hint::kv_cache_precision(ov::element::undefined));
}

const size_t n_heads = 16;
const size_t n_features = 64;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,10 @@ class dynamic_quantization_gpu_tests: public ::testing::Test {
auto outputs = network->execute();

auto output_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<uint8_t> output_ptr (output_mem, get_test_stream());
cldnn::mem_lock<ov::float16> output_ptr (output_mem, get_test_stream());

auto ref_output_mem = get_ref_results();
cldnn::mem_lock<uint8_t> output_ptr_ref (ref_output_mem, get_test_stream());

cldnn::mem_lock<ov::float16> output_ptr_ref (ref_output_mem, get_test_stream());
size_t count = 0;
float max_diff = 0.f;
float avg = 0.f;
Expand All @@ -135,7 +134,7 @@ class dynamic_quantization_gpu_tests: public ::testing::Test {
max_diff = abs_diff;
avg += abs_diff;
count++;
OPENVINO_ASSERT(abs_diff < 1);
ASSERT_LE(abs_diff, 1);
}
GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl;
}
Expand Down

0 comments on commit 5fad805

Please sign in to comment.