diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index 895fd86bb01e5f..8602717c08df30 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -44,10 +44,14 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveGetUpdateDispatchDataFunc(_kernels_data[default_sdpa]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto bt_kernel_impl = kernel_selector.GetImplementation(_kernels_data[indirect_sdpa].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[indirect_sdpa]); } + if (_kernels_data.size() == 3) { + auto bt_kernel_impl = kernel_selector.GetImplementation(_kernels_data[2].kernelName); + bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[2]); + } } } @@ -58,13 +62,15 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive layouts; - if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) { - auto dtype = from_data_type(_kernels_data[0].internalBufferDataType); - const auto bpp = data_type_traits::size_of(dtype); - for (auto size : _kernels_data[0].internalBufferSizes) { - layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flattern to x channel) - {1, 1, 1, (tensor::value_type)(size / bpp)}}; - layouts.push_back(inbuf_layout); + for (size_t i = 0; i < _kernels_data.size(); i++) { + if (!_kernels_data[i].internalBufferSizes.empty()) { + auto dtype = from_data_type(_kernels_data[i].internalBufferDataType); + const auto bpp = data_type_traits::size_of(dtype); + for (auto size : _kernels_data[i].internalBufferSizes) { + layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flattern to x channel) + {1, 1, 1, (tensor::value_type)(size / bpp)}}; + layouts.push_back(inbuf_layout); + } } } @@ -176,11 +182,37 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveget_input_layout(0); + + auto get_reordered_dimension = [](const ov::PartialShape& pshape, const std::vector& order, size_t idx) -> const ov::Dimension& { + if (order.empty()) + return pshape[idx]; + + return pshape[order[idx]]; + }; + + const auto& desc = instance.get_impl_params()->typed_desc(); + const auto dim_L = get_reordered_dimension(query_layout.get_partial_shape(), desc->input_q_transpose_order, 2 /* y */); + + bool is_generate = dim_L.get_length() == 1; // L + return is_generate; + } + event::ptr execute_impl(const std::vector& events, scaled_dot_product_attention_inst& instance) override { - if (need_indirect_load(instance)) + if (need_indirect_load(instance)) { return execute_stage(events, instance, indirect_sdpa); - else + } else if (need_sdpa_opt_load(instance)) { + return execute_stage(events, instance, _kernels_data.size() -1 /* the last */); + } else { return execute_stage(events, instance, default_sdpa); + } } static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param) { @@ -317,6 +349,12 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive(kernels_data); } @@ -328,13 +366,16 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive= 2) { if (_kernels_data[indirect_sdpa].params == nullptr) { _kernels_data[indirect_sdpa].params = std::make_shared(get_kernel_params(impl_param, true)); } update_shapes(*_kernels_data[indirect_sdpa].params, impl_param); (_kernels_data[indirect_sdpa].update_dispatch_data_func)(*_kernels_data[indirect_sdpa].params, _kernels_data[indirect_sdpa]); } + if (_kernels_data.size() == 3) { + (_kernels_data[2].update_dispatch_data_func)(*_kernels_data[default_sdpa].params, _kernels_data[2]); + } } }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 493bd0acedea32..5cd9c384ff2709 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -120,6 +120,7 @@ struct sdpa_params : public base_params { DataTensor value_cache_comp_zp; sdpa_configuration conf; + bool should_use_sdpa_opt = false; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 838d34bbf85404..467dd71da37944 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -316,6 +316,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { const sdpa_params& params = static_cast(p); + if (params.should_use_sdpa_opt) + return false; + if (params.conf.is_paged_attention) return false;