[GPU] Serialize micro kernel settings (openvinotoolkit#25319)
### Details:
 - Add serialization/deserialization logic for the micro-kernel settings so that they can be used by the update-dispatch-data functions after model import.
 - Limit the head size for micro SDPA to 256.

### Tickets:
 - *CVS-145786*
 - *CVS-145787*
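For context, a minimal, self-contained sketch of the settings round trip this change implements: a count-prefixed list of entries, each holding a name and a value. It uses std::stringstream in place of the plugin's BinaryOutputBuffer/BinaryInputBuffer, and the Setting struct and the setting names in main() are illustrative stand-ins, not the oneDNN micro::Setting type:

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Stand-in for micro::Setting: a named integer value.
struct Setting {
    std::string name;
    int value = 0;
};

// Write a count-prefixed list of settings; strings are length-prefixed.
static void save_settings(std::ostream& os, const std::vector<Setting>& settings) {
    uint64_t n = settings.size();
    os.write(reinterpret_cast<const char*>(&n), sizeof(n));
    for (const auto& s : settings) {
        uint64_t len = s.name.size();
        os.write(reinterpret_cast<const char*>(&len), sizeof(len));
        os.write(s.name.data(), static_cast<std::streamsize>(len));
        os.write(reinterpret_cast<const char*>(&s.value), sizeof(s.value));
    }
}

// Read the list back in the same order it was written.
static std::vector<Setting> load_settings(std::istream& is) {
    uint64_t n = 0;
    is.read(reinterpret_cast<char*>(&n), sizeof(n));
    std::vector<Setting> settings(n);
    for (auto& s : settings) {
        uint64_t len = 0;
        is.read(reinterpret_cast<char*>(&len), sizeof(len));
        s.name.resize(len);
        is.read(&s.name[0], static_cast<std::streamsize>(len));
        is.read(reinterpret_cast<char*>(&s.value), sizeof(s.value));
    }
    return settings;
}

int main() {
    std::stringstream buf;
    save_settings(buf, {{"sg_tile_m", 32}, {"sg_tile_n", 64}});  // hypothetical setting names
    for (const auto& s : load_settings(buf))
        std::cout << s.name << " = " << s.value << "\n";
    return 0;
}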
Authored by vladimir-paramuzov on Jul 2, 2024 · 1 parent c816a2e · commit 626966b
Showing 6 changed files with 100 additions and 38 deletions.
@@ -6,52 +6,24 @@

#include <type_traits>
#include "buffer.hpp"
#include "helpers.hpp"
#include "kernel_selector_common.h"
#include "intel_gpu/runtime/kernel_args.hpp"


namespace cldnn {

template <typename BufferType>
class Serializer<BufferType, kernel_selector::clKernelData, typename std::enable_if<std::is_base_of<OutputBuffer<BufferType>, BufferType>::value>::type> {
public:
    static void save(BufferType& buffer, const kernel_selector::clKernelData& data) {
-        const auto& params = data.params;
-        buffer(params.workGroups.global, params.workGroups.local);
-        buffer << params.arguments.size();
-        for (const auto& arg : params.arguments) {
-            buffer << make_data(&arg.t, sizeof(argument_desc::Types)) << arg.index;
-        }
-        buffer << params.scalars.size();
-        for (const auto& scalar : params.scalars) {
-            buffer << make_data(&scalar.t, sizeof(scalar_desc::Types)) << make_data(&scalar.v, sizeof(scalar_desc::ValueT));
-        }
-        buffer << params.layerID;
+        data.save(buffer);
    }
};

template <typename BufferType>
class Serializer<BufferType, kernel_selector::clKernelData, typename std::enable_if<std::is_base_of<InputBuffer<BufferType>, BufferType>::value>::type> {
public:
    static void load(BufferType& buffer, kernel_selector::clKernelData& data) {
-        auto& params = data.params;
-        buffer(params.workGroups.global, params.workGroups.local);
-
-        typename arguments_desc::size_type arguments_desc_size = 0UL;
-        buffer >> arguments_desc_size;
-        params.arguments.resize(arguments_desc_size);
-        for (auto& arg : params.arguments) {
-            buffer >> make_data(&arg.t, sizeof(argument_desc::Types)) >> arg.index;
-        }
-
-        typename scalars_desc::size_type scalars_desc_size = 0UL;
-        buffer >> scalars_desc_size;
-        params.scalars.resize(scalars_desc_size);
-        for (auto& scalar : params.scalars) {
-            buffer >> make_data(&scalar.t, sizeof(scalar_desc::Types)) >> make_data(&scalar.v, sizeof(scalar_desc::ValueT));
-        }
-
-        buffer >> params.layerID;
+        data.load(buffer);
    }
};

@@ -3,9 +3,14 @@
//

#include "kernel_selector_common.h"
#include "intel_gpu/graph/serialization/string_serializer.hpp"
#include <sstream>
#include <string>

#ifdef ENABLE_ONEDNN_FOR_GPU
#include "micro_utils.hpp"
#endif

namespace kernel_selector {
std::string GetStringEnv(const char* varName) {
    std::string str;
@@ -572,4 +577,54 @@ std::string toString(ReduceMode mode) {
}
}

void clKernelData::save(cldnn::BinaryOutputBuffer& ob) const {
    ob(params.workGroups.global, params.workGroups.local);
    ob << params.arguments.size();
    for (const auto& arg : params.arguments) {
        ob << make_data(&arg.t, sizeof(cldnn::argument_desc::Types)) << arg.index;
    }
    ob << params.scalars.size();
    for (const auto& scalar : params.scalars) {
        ob << make_data(&scalar.t, sizeof(cldnn::scalar_desc::Types)) << make_data(&scalar.v, sizeof(cldnn::scalar_desc::ValueT));
    }
    ob << params.layerID;
#ifdef ENABLE_ONEDNN_FOR_GPU
    ob << micro_kernels.size();
    for (const auto& microkernel : micro_kernels) {
        microkernel->save(ob);
    }
#endif
}

void clKernelData::load(cldnn::BinaryInputBuffer& ib) {
    ib(params.workGroups.global, params.workGroups.local);

    typename cldnn::arguments_desc::size_type arguments_desc_size = 0UL;
    ib >> arguments_desc_size;
    params.arguments.resize(arguments_desc_size);
    for (auto& arg : params.arguments) {
        ib >> make_data(&arg.t, sizeof(cldnn::argument_desc::Types)) >> arg.index;
    }

    typename cldnn::scalars_desc::size_type scalars_desc_size = 0UL;
    ib >> scalars_desc_size;
    params.scalars.resize(scalars_desc_size);
    for (auto& scalar : params.scalars) {
        ib >> make_data(&scalar.t, sizeof(cldnn::scalar_desc::Types)) >> make_data(&scalar.v, sizeof(cldnn::scalar_desc::ValueT));
    }

    ib >> params.layerID;

#ifdef ENABLE_ONEDNN_FOR_GPU
    size_t n_microkernels = 0;
    ib >> n_microkernels;
    micro_kernels.clear();
    for (size_t i = 0; i < n_microkernels; i++) {
        auto microkernel = std::make_shared<micro::MicroKernelPackage>();
        microkernel->load(ib);
        micro_kernels.push_back(microkernel);
    }
#endif
}

} // namespace kernel_selector
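For reference, the byte layout produced by clKernelData::save above, as a sketch inferred from the field order in the code (the micro-kernel block exists only when ENABLE_ONEDNN_FOR_GPU is defined, so it must be defined consistently on the writing and reading side):

// global/local work-group sizes
// size_t argument count, then {argument type, index} per argument
// size_t scalar count, then {scalar type, value} per scalar
// layerID string
// [ENABLE_ONEDNN_FOR_GPU only] size_t micro-kernel count, then the settings of each MicroKernelPackage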
@@ -70,6 +70,9 @@ struct clKernelData {
    KernelParams params;
    std::vector<std::shared_ptr<micro::MicroKernelPackage>> micro_kernels;
    bool skip_execution = false;

    void save(cldnn::BinaryOutputBuffer& ob) const;
    void load(cldnn::BinaryInputBuffer& ib);
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -212,6 +212,8 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
        default: break;
    }

    OPENVINO_ASSERT(config != nullptr);

    /* Get device information */
    micro::HWInformation hw_info;
    hw_info.euCount = params.engineInfo.computeUnitsCount;
@@ -334,6 +336,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const {
    if (Q_num_heads_dim.is_dynamic || K_num_heads_dim.is_dynamic || V_num_heads_dim.is_dynamic || K_num_heads_dim.v != V_num_heads_dim.v)
        return false;

    if (params.conf.head_size > 256)
        return false;

    return true;
}

@@ -389,8 +394,9 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m
    if (d_full) {
        if (ldq % 4 == 0)
            jit.AddConstant(MakeJitConstant("BLOCK_Q", 1));
-        if (lda % 4 == 0 && v_full)
-            jit.AddConstant(MakeJitConstant("BLOCK_A", 1));
+        // TODO: Causes accuracy drop for static SD model. Enable back once the issue is resolved
+        // if (lda % 4 == 0 && v_full)
+        //     jit.AddConstant(MakeJitConstant("BLOCK_A", 1));
        jit.AddConstant(MakeJitConstant("REMAINDER_Q", !q_full));
    } else if (params.engineInfo.arch >= gpu_arch::xe_hpc) {
        auto vbytes = n_values.v * V.ElementSize();
@@ -436,7 +442,7 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m
    };

    for (size_t i = 0; i < target_definitions.size(); i++) {
-        definitions.AddConstant(MakeJitConstant(target_definitions[order[i]], source_definitions[i]));
+        definitions.AddConstant(MakeJitConstant(target_definitions[i], source_definitions[order[i]]));
    }

    return definitions;
@@ -559,7 +565,7 @@ clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is
}

KernelsData SDPAKernelMicro::GetKernelsData(const Params& params) const {
-    const size_t num_kernels = 2;
+    const size_t num_kernels = params.is_shape_agnostic ? 2 : 1;
    KernelData kd = KernelData::Default<sdpa_params>(params, num_kernels);
    const auto& prim_params = dynamic_cast<const sdpa_params&>(params);

src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp: 27 additions & 0 deletions
@@ -6,6 +6,9 @@

#ifdef ENABLE_ONEDNN_FOR_GPU

#include "intel_gpu/graph/serialization/binary_buffer.hpp"
#include "intel_gpu/graph/serialization/string_serializer.hpp"

#ifdef UNUSED
# undef UNUSED
#endif
@@ -32,13 +35,37 @@ using SizeParams = dnnl::impl::gpu::intel::jit::SizeParams;
using StrategyRequirement = dnnl::impl::gpu::intel::jit::StrategyRequirement;
using ShimOptions = dnnl::impl::gpu::intel::micro::ShimOptions;
using HostLanguage = dnnl::impl::gpu::intel::micro::HostLanguage;
using Setting = dnnl::impl::gpu::intel::micro::Setting;

// Wrapper for Package, which is used in clKernelData via forward declaration
// to avoid including this header in many places in the plugin,
// which may cause symbol conflicts with oneDNN
struct MicroKernelPackage {
    MicroKernelPackage() = default;
    explicit MicroKernelPackage(Package _p) : p(_p) {}
    Package p;

    // WARNING: We serialize only the microkernel settings, so after deserialization
    // the other struct fields are not initialized properly and can't be used
    void save(cldnn::BinaryOutputBuffer& ob) const {
        ob << p.settings.size();
        for (auto& s : p.settings) {
            ob << s.name;
            ob << s.value;
        }
    }

    void load(cldnn::BinaryInputBuffer& ib) {
        size_t n_settings = 0;
        ib >> n_settings;
        p.settings.clear();
        for (size_t i = 0; i < n_settings; i++) {
            Setting s;
            ib >> s.name;
            ib >> s.value;
            p.settings.push_back(s);
        }
    }
};

inline Package select_gemm_microkernel(GEMMProtocol protocol, HWInformation hw_info, SizeParams sizes, const GEMMProblem &problem,
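Since only p.settings survives import (see the WARNING above), any dispatch-update code has to read values back by name. A hypothetical, self-contained lookup helper in that spirit; the Setting struct below is a stand-in for micro::Setting, which exposes name/value as used in save()/load() above:

#include <optional>
#include <string>
#include <vector>

struct Setting { std::string name; int value = 0; };  // stand-in for micro::Setting

// Return the value of the first setting with the given name, if present.
static std::optional<int> find_setting(const std::vector<Setting>& settings, const std::string& name) {
    for (const auto& s : settings)
        if (s.name == name)
            return s.value;
    return std::nullopt;
}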
src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp: 2 additions & 3 deletions
@@ -345,15 +345,14 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
            return false;
        }

        // For platforms with DPAS support we don't have any other shape-based limitations besides head_size being static and equal for QKV
-        if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config))
+        const auto head_size = query_ps[query_ps.size() - 1].get_length();
+        if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config) && head_size <= 256)
            return true;

        // - Head size should be 128 for any model type; or should be in the range of 64 to 256 for stateful LLMs because of performance reasons.
        // This limitation is recommended to prevent performance drops in models with small head sizes, such as SD,
        // until the SDPA operation is optimized for these cases
        const auto optimal_subgroup_size = 16;
-        const auto head_size = query_ps[query_ps.size() - 1].get_length();
        bool valid_head_size = head_size % optimal_subgroup_size == 0;
        valid_head_size &= (head_size == 128) || (func->get_variables().size() > 0 && head_size >= 64 && head_size <= 256);
        if (!valid_head_size) {
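A standalone restatement of the head-size gating above, under assumed inputs; the function name and boolean flags are illustrative, not plugin API, and has_microkernels folds together the device_info.supports_immad and query_microkernels_supported() checks:

#include <cstdio>

static bool sdpa_head_size_ok(long head_size, bool has_microkernels, bool stateful) {
    // Micro-SDPA path: allowed up to the new 256 cap.
    if (has_microkernels && head_size <= 256)
        return true;
    // Generic path: multiple of the subgroup size, and either exactly 128
    // or 64..256 for stateful models.
    const long optimal_subgroup_size = 16;
    if (head_size % optimal_subgroup_size != 0)
        return false;
    return head_size == 128 || (stateful && head_size >= 64 && head_size <= 256);
}

int main() {
    std::printf("%d\n", sdpa_head_size_ok(128, false, false));  // 1: allowed for any model
    std::printf("%d\n", sdpa_head_size_ok(96, false, true));    // 1: stateful, in 64..256
    std::printf("%d\n", sdpa_head_size_ok(512, true, false));   // 0: above the 256 micro-SDPA cap
    return 0;
}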
