diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp index a35b2f5905c079..543b88b9295299 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp @@ -6,9 +6,8 @@ #include #include "buffer.hpp" -#include "helpers.hpp" #include "kernel_selector_common.h" -#include "intel_gpu/runtime/kernel_args.hpp" + namespace cldnn { @@ -16,17 +15,7 @@ template class Serializer, BufferType>::value>::type> { public: static void save(BufferType& buffer, const kernel_selector::clKernelData& data) { - const auto& params = data.params; - buffer(params.workGroups.global, params.workGroups.local); - buffer << params.arguments.size(); - for (const auto& arg : params.arguments) { - buffer << make_data(&arg.t, sizeof(argument_desc::Types)) << arg.index; - } - buffer << params.scalars.size(); - for (const auto& scalar : params.scalars) { - buffer << make_data(&scalar.t, sizeof(scalar_desc::Types)) << make_data(&scalar.v, sizeof(scalar_desc::ValueT)); - } - buffer << params.layerID; + data.save(buffer); } }; @@ -34,24 +23,7 @@ template class Serializer, BufferType>::value>::type> { public: static void load(BufferType& buffer, kernel_selector::clKernelData& data) { - auto& params = data.params; - buffer(params.workGroups.global, params.workGroups.local); - - typename arguments_desc::size_type arguments_desc_size = 0UL; - buffer >> arguments_desc_size; - params.arguments.resize(arguments_desc_size); - for (auto& arg : params.arguments) { - buffer >> make_data(&arg.t, sizeof(argument_desc::Types)) >> arg.index; - } - - typename scalars_desc::size_type scalars_desc_size = 0UL; - buffer >> scalars_desc_size; - params.scalars.resize(scalars_desc_size); - for (auto& scalar : params.scalars) { - buffer >> 
make_data(&scalar.t, sizeof(scalar_desc::Types)) >> make_data(&scalar.v, sizeof(scalar_desc::ValueT)); - } - - buffer >> params.layerID; + data.load(buffer); } }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp index 67687868bbc92f..debc9ca4841356 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp @@ -3,9 +3,14 @@ // #include "kernel_selector_common.h" +#include "intel_gpu/graph/serialization/string_serializer.hpp" #include #include +#ifdef ENABLE_ONEDNN_FOR_GPU +#include "micro_utils.hpp" +#endif + namespace kernel_selector { std::string GetStringEnv(const char* varName) { std::string str; @@ -572,4 +577,54 @@ std::string toString(ReduceMode mode) { } } +void clKernelData::save(cldnn::BinaryOutputBuffer& ob) const { + ob(params.workGroups.global, params.workGroups.local); + ob << params.arguments.size(); + for (const auto& arg : params.arguments) { + ob << make_data(&arg.t, sizeof(cldnn::argument_desc::Types)) << arg.index; + } + ob << params.scalars.size(); + for (const auto& scalar : params.scalars) { + ob << make_data(&scalar.t, sizeof(cldnn::scalar_desc::Types)) << make_data(&scalar.v, sizeof(cldnn::scalar_desc::ValueT)); + } + ob << params.layerID; +#ifdef ENABLE_ONEDNN_FOR_GPU + ob << micro_kernels.size(); + for (const auto& microkernel : micro_kernels) { + microkernel->save(ob); + } +#endif +} + +void clKernelData::load(cldnn::BinaryInputBuffer& ib) { + ib(params.workGroups.global, params.workGroups.local); + + typename cldnn::arguments_desc::size_type arguments_desc_size = 0UL; + ib >> arguments_desc_size; + params.arguments.resize(arguments_desc_size); + for (auto& arg : params.arguments) { + ib >> make_data(&arg.t, sizeof(cldnn::argument_desc::Types)) >> arg.index; + } + + typename cldnn::scalars_desc::size_type scalars_desc_size = 0UL; + ib >> 
scalars_desc_size;
+    params.scalars.resize(scalars_desc_size);
+    for (auto& scalar : params.scalars) {
+        ib >> make_data(&scalar.t, sizeof(cldnn::scalar_desc::Types)) >> make_data(&scalar.v, sizeof(cldnn::scalar_desc::ValueT));
+    }
+
+    ib >> params.layerID;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    size_t n_microkernels;
+    ib >> n_microkernels;
+    micro_kernels.clear();
+    for (size_t i = 0; i < n_microkernels; i++) {
+        auto microkernel = std::make_shared<micro::MicroKernelPackage>();
+        microkernel->load(ib);
+        micro_kernels.push_back(microkernel);
+    }
+#endif
+}
+
 }  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h
index 40ac211b1d1026..d9b132ac1dcc43 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h
@@ -70,6 +70,9 @@ struct clKernelData {
     KernelParams params;
     std::vector<std::shared_ptr<micro::MicroKernelPackage>> micro_kernels;
     bool skip_execution = false;
+
+    void save(cldnn::BinaryOutputBuffer& ob) const;
+    void load(cldnn::BinaryInputBuffer& ib);
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp
index e3604a481a8f09..46c536ac0bd0af 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp
@@ -212,6 +212,8 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
         default: break;
     }
 
+    OPENVINO_ASSERT(config != nullptr);
+
     /* Get device information */
     micro::HWInformation hw_info;
     hw_info.euCount = params.engineInfo.computeUnitsCount;
@@ -334,6 +336,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const {
     if
(Q_num_heads_dim.is_dynamic || K_num_heads_dim.is_dynamic || V_num_heads_dim.is_dynamic || K_num_heads_dim.v != V_num_heads_dim.v) return false; + if (params.conf.head_size > 256) + return false; + return true; } @@ -389,8 +394,9 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m if (d_full) { if (ldq % 4 == 0) jit.AddConstant(MakeJitConstant("BLOCK_Q", 1)); - if (lda % 4 == 0 && v_full) - jit.AddConstant(MakeJitConstant("BLOCK_A", 1)); + // TODO: Causes accuracy drop for static SD model. Enable back once the issue is resolved + // if (lda % 4 == 0 && v_full) + // jit.AddConstant(MakeJitConstant("BLOCK_A", 1)); jit.AddConstant(MakeJitConstant("REMAINDER_Q", !q_full)); } else if (params.engineInfo.arch >= gpu_arch::xe_hpc) { auto vbytes = n_values.v * V.ElementSize(); @@ -436,7 +442,7 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m }; for (size_t i = 0; i < target_definitions.size(); i++) { - definitions.AddConstant(MakeJitConstant(target_definitions[order[i]], source_definitions[i])); + definitions.AddConstant(MakeJitConstant(target_definitions[i], source_definitions[order[i]])); } return definitions; @@ -559,7 +565,7 @@ clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is } KernelsData SDPAKernelMicro::GetKernelsData(const Params& params) const { - const size_t num_kernels = 2; + const size_t num_kernels = params.is_shape_agnostic ? 
2 : 1;
     KernelData kd = KernelData::Default(params, num_kernels);
     const auto& prim_params = dynamic_cast<const sdpa_params&>(params);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp
index 828c9016d8669e..055892aca6c547 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp
@@ -6,6 +6,9 @@
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
 
+#include "intel_gpu/graph/serialization/binary_buffer.hpp"
+#include "intel_gpu/graph/serialization/string_serializer.hpp"
+
 #ifdef UNUSED
 # undef UNUSED
 #endif
@@ -32,13 +35,37 @@ using SizeParams = dnnl::impl::gpu::intel::jit::SizeParams;
 using StrategyRequirement = dnnl::impl::gpu::intel::jit::StrategyRequirement;
 using ShimOptions = dnnl::impl::gpu::intel::micro::ShimOptions;
 using HostLanguage = dnnl::impl::gpu::intel::micro::HostLanguage;
+using Setting = dnnl::impl::gpu::intel::micro::Setting;
 
 // Wrapper for Package which is used in clKernelData with forward declaration
 // to avoid including this header in many places in plugin
 // which may cause symbols conflicts with oneDNN
 struct MicroKernelPackage {
+    MicroKernelPackage() = default;
     explicit MicroKernelPackage(Package _p) : p(_p) {}
     Package p;
+
+    // WARNING: We serialize only microkernel settings, so after deserialization
+    // other struct fields are not initialized properly and can't be used
+    void save(cldnn::BinaryOutputBuffer& ob) const {
+        ob << p.settings.size();
+        for (auto& s : p.settings) {
+            ob << s.name;
+            ob << s.value;
+        }
+    }
+
+    void load(cldnn::BinaryInputBuffer& ib) {
+        size_t n_settings;
+        ib >> n_settings;
+        p.settings.clear();
+        for (size_t i = 0; i < n_settings; i++) {
+            Setting s;
+            ib >> s.name;
+            ib >> s.value;
+            p.settings.push_back(s);
+        }
+    }
 };
 
 inline Package select_gemm_microkernel(GEMMProtocol protocol, HWInformation hw_info, SizeParams sizes, const GEMMProblem &problem,
diff --git 
a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index bb7385cbe5dbb1..af0f100382c416 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -345,15 +345,14 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return false; } - // For platforms with DPAS support we don't have any other shape-based limitations besides head_size being static and equal for QKV - if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config)) + const auto head_size = query_ps[query_ps.size() - 1].get_length(); + if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config) && head_size <= 256) return true; // - Head size should be 128 for any model type; or should be in the range of 64 to 256 for stateful LLMs because of performance reasons. // This limitations is recommended to prevent performance drop in models with small head size, such as SD, // until the SDPA operation is optimized for these cases const auto optimal_subgroup_size = 16; - const auto head_size = query_ps[query_ps.size() - 1].get_length(); bool valid_head_size = head_size % optimal_subgroup_size == 0; valid_head_size &= (head_size == 128) || (func->get_variables().size() > 0 && head_size >= 64 && head_size <= 256); if (!valid_head_size) {