diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag index 956897c9270ff9..ae48310adafe6f 100644 --- a/.github/dockerfiles/docker_tag +++ b/.github/dockerfiles/docker_tag @@ -1 +1 @@ -pr-25130 +pr-25303 diff --git a/.github/dockerfiles/ov_build/fedora_33/Dockerfile b/.github/dockerfiles/ov_build/fedora_33/Dockerfile new file mode 100644 index 00000000000000..fc94c37d67a321 --- /dev/null +++ b/.github/dockerfiles/ov_build/fedora_33/Dockerfile @@ -0,0 +1,23 @@ +FROM openvinogithubactions.azurecr.io/dockerio/library/fedora:33 + +USER root + +RUN yum update -y && yum install -y git + +# Install build dependencies +ADD install_build_dependencies.sh /install_build_dependencies.sh +RUN chmod +x /install_build_dependencies.sh && \ + /install_build_dependencies.sh && \ + rm -rf /var/lib/apt/lists/* + +# Install sccache +ARG SCCACHE_VERSION="v0.7.5" +ENV SCCACHE_HOME="/opt/sccache" \ + SCCACHE_PATH="/opt/sccache/sccache" + +RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \ + SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \ + curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \ + tar -xzf ${SCCACHE_ARCHIVE} --strip-components=1 && rm ${SCCACHE_ARCHIVE} + +ENV PATH="$SCCACHE_HOME:$PATH" diff --git a/.github/dockerfiles/ov_build/webassembly/Dockerfile b/.github/dockerfiles/ov_build/webassembly/Dockerfile new file mode 100644 index 00000000000000..66765ed9341efe --- /dev/null +++ b/.github/dockerfiles/ov_build/webassembly/Dockerfile @@ -0,0 +1,33 @@ +FROM openvinogithubactions.azurecr.io/dockerio/emscripten/emsdk:3.1.61 + +USER root + +# APT configuration +RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \ + echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf + +ENV DEBIAN_FRONTEND="noninteractive" \ + 
TZ="Europe/London" + +RUN apt-get update && \ + apt-get install software-properties-common && \ + add-apt-repository --yes --no-update ppa:git-core/ppa && \ + apt-get update && \ + apt-get install \ + git \ + ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# Install sccache +ARG SCCACHE_VERSION="v0.7.5" +ENV SCCACHE_HOME="/opt/sccache" \ + SCCACHE_PATH="/opt/sccache/sccache" + +RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \ + SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \ + curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \ + tar -xzf ${SCCACHE_ARCHIVE} --strip-components=1 && rm ${SCCACHE_ARCHIVE} + +ENV PATH="$SCCACHE_HOME:$PATH" diff --git a/.github/workflows/fedora.yml b/.github/workflows/fedora.yml index 5835815e0d9e39..02cd0abf018319 100644 --- a/.github/workflows/fedora.yml +++ b/.github/workflows/fedora.yml @@ -20,6 +20,7 @@ jobs: runs-on: ubuntu-latest outputs: affected_components: "${{ steps.smart_ci.outputs.affected_components }}" + changed_components: "${{ steps.smart_ci.outputs.changed_components }}" skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action @@ -40,15 +41,42 @@ jobs: skip_when_only_listed_labels_set: 'docs' skip_when_only_listed_files_changed: '*.md,*.rst,*.png,*.jpg,*.svg,*/layer_tests_summary/*,*/conformance/*' - Build: + - name: Show affected components + run: | + echo "${{ toJSON(steps.smart_ci.outputs.affected_components) }}" + shell: bash + + Docker: needs: Smart_CI + runs-on: aks-linux-4-cores-16gb-docker-build + container: + image: openvinogithubactions.azurecr.io/docker_build:0.2 + volumes: + - /mount:/mount + outputs: + images: "${{ steps.handle_docker.outputs.images }}" + steps: + - name: Checkout + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + + - uses: ./.github/actions/handle_docker + id: handle_docker + with: + images: | + ov_build/fedora_33 + registry: 
'openvinogithubactions.azurecr.io' + dockerfiles_root_dir: '.github/dockerfiles' + changed_components: ${{ needs.smart_ci.outputs.changed_components }} + + Build: + needs: [Docker, Smart_CI] timeout-minutes: 150 defaults: run: shell: bash runs-on: aks-linux-16-cores-32gb container: - image: fedora:33 + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.fedora_33 }} volumes: - /mount:/mount options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING @@ -69,9 +97,6 @@ jobs: SCCACHE_AZURE_KEY_PREFIX: fedora33_x86_64_Release if: "!needs.smart_ci.outputs.skip_workflow" steps: - - name: Install git - run: yum update -y && yum install -y git - - name: Clone OpenVINO uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: @@ -89,14 +114,6 @@ jobs: # Dependencies # - - name: Install build dependencies - run: bash ${OPENVINO_REPO}/install_build_dependencies.sh - - - name: Install sccache - uses: mozilla-actions/sccache-action@89e9040de88b577a072e3760aaf59f585da083af # v0.0.5 - with: - version: "v0.7.5" - - name: Install python dependencies run: | python3 -m pip install -U pip @@ -204,14 +221,14 @@ jobs: if-no-files-found: 'error' RPM_Packages: - needs: Build + needs: [Docker, Build] timeout-minutes: 10 defaults: run: shell: bash - runs-on: ubuntu-20.04 + runs-on: aks-linux-4-cores-16gb container: - image: fedora:33 + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.fedora_33 }} env: RPM_PACKAGES_DIR: /__w/openvino/packages/ diff --git a/.github/workflows/webassembly.yml b/.github/workflows/webassembly.yml index 469ccda02f6944..902fb0dfcb00f0 100644 --- a/.github/workflows/webassembly.yml +++ b/.github/workflows/webassembly.yml @@ -20,6 +20,7 @@ jobs: runs-on: ubuntu-latest outputs: affected_components: "${{ steps.smart_ci.outputs.affected_components }}" + changed_components: "${{ steps.smart_ci.outputs.changed_components }}" skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" steps: - name: checkout action @@ 
-40,14 +41,41 @@ jobs: skip_when_only_listed_labels_set: 'docs' skip_when_only_listed_files_changed: '*.md,*.rst,*.png,*.jpg,*.svg,*/layer_tests_summary/*,*/conformance/*' - Build: + - name: Show affected components + run: | + echo "${{ toJSON(steps.smart_ci.outputs.affected_components) }}" + shell: bash + + Docker: needs: Smart_CI + runs-on: aks-linux-4-cores-16gb-docker-build + container: + image: openvinogithubactions.azurecr.io/docker_build:0.2 + volumes: + - /mount:/mount + outputs: + images: "${{ steps.handle_docker.outputs.images }}" + steps: + - name: Checkout + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + + - uses: ./.github/actions/handle_docker + id: handle_docker + with: + images: | + ov_build/webassembly + registry: 'openvinogithubactions.azurecr.io' + dockerfiles_root_dir: '.github/dockerfiles' + changed_components: ${{ needs.smart_ci.outputs.changed_components }} + + Build: + needs: [Docker, Smart_CI] defaults: run: shell: bash runs-on: aks-linux-16-cores-32gb container: - image: emscripten/emsdk + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.webassembly }} volumes: - /mount:/mount options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING @@ -62,20 +90,12 @@ jobs: SCCACHE_AZURE_KEY_PREFIX: webassembly_Release if: "!needs.smart_ci.outputs.skip_workflow" steps: - - name: Install git - run: apt-get update && apt-get install --assume-yes --no-install-recommends git ca-certificates - - name: Clone OpenVINO uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: 'openvino' submodules: 'true' - - name: Install sccache - uses: mozilla-actions/sccache-action@89e9040de88b577a072e3760aaf59f585da083af # v0.0.5 - with: - version: "v0.7.5" - - name: emcmake cmake - configure run: | emcmake cmake \ diff --git a/src/common/snippets/include/snippets/lowered/loop_manager.hpp b/src/common/snippets/include/snippets/lowered/loop_manager.hpp index f0718107ca30a2..53765ed2a47959 
100644 --- a/src/common/snippets/include/snippets/lowered/loop_manager.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_manager.hpp @@ -102,7 +102,7 @@ class LoopManager { bool set_default_handlers = true, bool is_work_amount_const = false) { const auto normalized_increment = utils::is_dynamic_value(work_amount) || work_amount == 0 ? increment : std::min(increment, work_amount); - const auto handlers = set_default_handlers + const auto& handlers = set_default_handlers ? SpecificIterationHandlers(work_amount, normalized_increment) : SpecificIterationHandlers(); const auto loop_info = std::make_shared(work_amount, normalized_increment, entries, exits, handlers, is_work_amount_const); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index 224e1add666948..6fec8c87dba180 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -123,7 +123,7 @@ std::pair LoopManager::get_loop_bo } LoopPort LoopManager::get_loop_port_by_expr_port(const ExpressionPort& expr_port, const size_t loop_id) { - auto get_loop_port = [&](const std::vector& ports) { + auto get_loop_port = [&](const std::vector& ports) -> const LoopPort& { auto it = std::find_if(ports.cbegin(), ports.cend(), [&](const LoopPort& p) { return *p.expr_port == expr_port; }); if (it == ports.cend()) OPENVINO_THROW("Expression has not been found among loop ports. 
Loop id: " + std::to_string(loop_id)); @@ -272,7 +272,7 @@ void LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, LinearIR:: auto input_ports_upper = loop_info_upper->get_input_ports(); auto output_ports_upper = loop_info_upper->get_output_ports(); auto input_ports_lower = loop_info_lower->get_input_ports(); - auto output_ports_lower = loop_info_lower->get_output_ports(); + const auto& output_ports_lower = loop_info_lower->get_output_ports(); fuse_loop_ports(output_ports_upper, input_ports_lower, loop_id_upper); const auto& from = fuse_into_upper ? loop_id_lower : loop_id_upper; @@ -285,9 +285,9 @@ void LoopManager::fuse_loops(LinearIR::constExprIt loop_begin_target, LinearIR:: const auto handlers = SpecificIterationHandlers::merge_handlers(loop_info_upper->get_handlers(), loop_info_lower->get_handlers()); const auto is_work_amount_const = loop_info_upper->is_work_amount_const() || loop_info_lower->is_work_amount_const(); - auto new_entries = input_ports_upper; + auto new_entries = std::move(input_ports_upper); new_entries.insert(new_entries.end(), input_ports_lower.begin(), input_ports_lower.end()); - auto new_exits = output_ports_upper; + auto new_exits = std::move(output_ports_upper); new_exits.insert(new_exits.end(), output_ports_lower.begin(), output_ports_lower.end()); m_map[to] = std::make_shared(work_amount, increment, new_entries, new_exits, handlers, is_work_amount_const); diff --git a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp index 2205b60ea9bacf..cebe581dbda443 100644 --- a/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_expanded_loops.cpp @@ -52,7 +52,10 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) const auto& expanded_loop_info = ov::as_type_ptr(p.second); INFORMATIVE_ASSERT(expanded_loop_info, "expects only ExpandedLoopInfo in LoopManager"); 
- if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) { + const auto& unified_loop_info = expanded_loop_info->get_unified_loop_info(); + INFORMATIVE_ASSERT(unified_loop_info, "expects non nullptr UnifiedLoopInfo in ExpandedLoopInfo"); + + if (unified_loop_info != current_unified_loop_info) { // If there is `current_unified_loop_info` - the previos loop is finished and need to validate total information if (current_unified_loop_info) { INFORMATIVE_ASSERT(current_work_amount == current_unified_loop_info->get_work_amount(), @@ -61,7 +64,7 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) "total finalization offsets are not equal to finalization offsets of undefined loop"); } - current_unified_loop_info = expanded_loop_info->get_unified_loop_info(); + current_unified_loop_info = unified_loop_info; INFORMATIVE_ASSERT(current_unified_loop_info->get_input_count() == expanded_loop_info->get_input_count() && current_unified_loop_info->get_output_count() == expanded_loop_info->get_output_count(), @@ -74,6 +77,7 @@ void ValidateExpandedLoops::validate_loop_information(const LinearIR& linear_ir) } current_work_amount = utils::dynamic_safe_add(current_work_amount, expanded_loop_info->get_work_amount()); + INFORMATIVE_ASSERT(current_unified_loop_info, "expects non nullptr current UnifiedLoopInfo"); INFORMATIVE_ASSERT(current_unified_loop_info->get_ptr_increments() == expanded_loop_info->get_ptr_increments(), "incompatible pointer increments with UnifiedLoopInfo"); diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp index a35b2f5905c079..543b88b9295299 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp @@ -6,9 +6,8 @@ #include #include 
"buffer.hpp" -#include "helpers.hpp" #include "kernel_selector_common.h" -#include "intel_gpu/runtime/kernel_args.hpp" + namespace cldnn { @@ -16,17 +15,7 @@ template class Serializer, BufferType>::value>::type> { public: static void save(BufferType& buffer, const kernel_selector::clKernelData& data) { - const auto& params = data.params; - buffer(params.workGroups.global, params.workGroups.local); - buffer << params.arguments.size(); - for (const auto& arg : params.arguments) { - buffer << make_data(&arg.t, sizeof(argument_desc::Types)) << arg.index; - } - buffer << params.scalars.size(); - for (const auto& scalar : params.scalars) { - buffer << make_data(&scalar.t, sizeof(scalar_desc::Types)) << make_data(&scalar.v, sizeof(scalar_desc::ValueT)); - } - buffer << params.layerID; + data.save(buffer); } }; @@ -34,24 +23,7 @@ template class Serializer, BufferType>::value>::type> { public: static void load(BufferType& buffer, kernel_selector::clKernelData& data) { - auto& params = data.params; - buffer(params.workGroups.global, params.workGroups.local); - - typename arguments_desc::size_type arguments_desc_size = 0UL; - buffer >> arguments_desc_size; - params.arguments.resize(arguments_desc_size); - for (auto& arg : params.arguments) { - buffer >> make_data(&arg.t, sizeof(argument_desc::Types)) >> arg.index; - } - - typename scalars_desc::size_type scalars_desc_size = 0UL; - buffer >> scalars_desc_size; - params.scalars.resize(scalars_desc_size); - for (auto& scalar : params.scalars) { - buffer >> make_data(&scalar.t, sizeof(scalar_desc::Types)) >> make_data(&scalar.v, sizeof(scalar_desc::ValueT)); - } - - buffer >> params.layerID; + data.load(buffer); } }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp index 67687868bbc92f..debc9ca4841356 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp @@ -3,9 +3,14 @@ // #include "kernel_selector_common.h" +#include "intel_gpu/graph/serialization/string_serializer.hpp" #include #include +#ifdef ENABLE_ONEDNN_FOR_GPU +#include "micro_utils.hpp" +#endif + namespace kernel_selector { std::string GetStringEnv(const char* varName) { std::string str; @@ -572,4 +577,54 @@ std::string toString(ReduceMode mode) { } } +void clKernelData::save(cldnn::BinaryOutputBuffer& ob) const { + ob(params.workGroups.global, params.workGroups.local); + ob << params.arguments.size(); + for (const auto& arg : params.arguments) { + ob << make_data(&arg.t, sizeof(cldnn::argument_desc::Types)) << arg.index; + } + ob << params.scalars.size(); + for (const auto& scalar : params.scalars) { + ob << make_data(&scalar.t, sizeof(cldnn::scalar_desc::Types)) << make_data(&scalar.v, sizeof(cldnn::scalar_desc::ValueT)); + } + ob << params.layerID; +#ifdef ENABLE_ONEDNN_FOR_GPU + ob << micro_kernels.size(); + for (const auto& microkernel : micro_kernels) { + microkernel->save(ob); + } +#endif +} + +void clKernelData::load(cldnn::BinaryInputBuffer& ib) { + ib(params.workGroups.global, params.workGroups.local); + + typename cldnn::arguments_desc::size_type arguments_desc_size = 0UL; + ib >> arguments_desc_size; + params.arguments.resize(arguments_desc_size); + for (auto& arg : params.arguments) { + ib >> make_data(&arg.t, sizeof(cldnn::argument_desc::Types)) >> arg.index; + } + + typename cldnn::scalars_desc::size_type scalars_desc_size = 0UL; + ib >> scalars_desc_size; + params.scalars.resize(scalars_desc_size); + for (auto& scalar : params.scalars) { + ib >> make_data(&scalar.t, sizeof(cldnn::scalar_desc::Types)) >> make_data(&scalar.v, sizeof(cldnn::scalar_desc::ValueT)); + } + + ib >> params.layerID; + +#ifdef ENABLE_ONEDNN_FOR_GPU + size_t n_microkernels; + ib >> n_microkernels; + micro_kernels.clear(); + for (size_t i = 0; i < n_microkernels; i++) { + auto microkernel = 
std::make_shared(); + microkernel->load(ib); + micro_kernels.push_back(microkernel); + } +#endif +} + } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index 40ac211b1d1026..d9b132ac1dcc43 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -70,6 +70,9 @@ struct clKernelData { KernelParams params; std::vector> micro_kernels; bool skip_execution = false; + + void save(cldnn::BinaryOutputBuffer& ob) const; + void load(cldnn::BinaryInputBuffer& ib); }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index e3604a481a8f09..46c536ac0bd0af 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -212,6 +212,8 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag default: break; } + OPENVINO_ASSERT(config != nullptr); + /* Get device information */ micro::HWInformation hw_info; hw_info.euCount = params.engineInfo.computeUnitsCount; @@ -334,6 +336,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if (Q_num_heads_dim.is_dynamic || K_num_heads_dim.is_dynamic || V_num_heads_dim.is_dynamic || K_num_heads_dim.v != V_num_heads_dim.v) return false; + if (params.conf.head_size > 256) + return false; + return true; } @@ -389,8 +394,9 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m if (d_full) { if (ldq % 4 == 0) jit.AddConstant(MakeJitConstant("BLOCK_Q", 1)); - if (lda % 4 == 0 && v_full) - 
jit.AddConstant(MakeJitConstant("BLOCK_A", 1)); + // TODO: Causes accuracy drop for static SD model. Enable back once the issue is resolved + // if (lda % 4 == 0 && v_full) + // jit.AddConstant(MakeJitConstant("BLOCK_A", 1)); jit.AddConstant(MakeJitConstant("REMAINDER_Q", !q_full)); } else if (params.engineInfo.arch >= gpu_arch::xe_hpc) { auto vbytes = n_values.v * V.ElementSize(); @@ -436,7 +442,7 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m }; for (size_t i = 0; i < target_definitions.size(); i++) { - definitions.AddConstant(MakeJitConstant(target_definitions[order[i]], source_definitions[i])); + definitions.AddConstant(MakeJitConstant(target_definitions[i], source_definitions[order[i]])); } return definitions; @@ -559,7 +565,7 @@ clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is } KernelsData SDPAKernelMicro::GetKernelsData(const Params& params) const { - const size_t num_kernels = 2; + const size_t num_kernels = params.is_shape_agnostic ? 
2 : 1; KernelData kd = KernelData::Default(params, num_kernels); const auto& prim_params = dynamic_cast(params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp index 828c9016d8669e..055892aca6c547 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp +++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp @@ -6,6 +6,9 @@ #ifdef ENABLE_ONEDNN_FOR_GPU +#include "intel_gpu/graph/serialization/binary_buffer.hpp" +#include "intel_gpu/graph/serialization/string_serializer.hpp" + #ifdef UNUSED # undef UNUSED #endif @@ -32,13 +35,37 @@ using SizeParams = dnnl::impl::gpu::intel::jit::SizeParams; using StrategyRequirement = dnnl::impl::gpu::intel::jit::StrategyRequirement; using ShimOptions = dnnl::impl::gpu::intel::micro::ShimOptions; using HostLanguage = dnnl::impl::gpu::intel::micro::HostLanguage; +using Setting = dnnl::impl::gpu::intel::micro::Setting; // Wrapper for Package which is used in clKernelData with forward declaration // to avoid including this header in many places in plugin // which may cause symbols conflicts with oneDNN struct MicroKernelPackage { + MicroKernelPackage() = default; explicit MicroKernelPackage(Package _p) : p(_p) {} Package p; + + // WARNING: We serialize only microkernels settings, so after deserialization + // other struct fields are not initialized properly and can't be used + void save(cldnn::BinaryOutputBuffer& ob) const { + ob << p.settings.size(); + for (auto& s : p.settings) { + ob << s.name; + ob << s.value; + } + } + + void load(cldnn::BinaryInputBuffer& ib) { + size_t n_settings; + ib >> n_settings; + p.settings.clear(); + for (size_t i = 0; i < n_settings; i++) { + Setting s; + ib >> s.name; + ib >> s.value; + p.settings.push_back(s); + } + } }; inline Package select_gemm_microkernel(GEMMProtocol protocol, HWInformation hw_info, SizeParams sizes, const GEMMProblem &problem, diff --git 
a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index bb7385cbe5dbb1..af0f100382c416 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -345,15 +345,14 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return false; } - // For platforms with DPAS support we don't have any other shape-based limitations besides head_size being static and equal for QKV - if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config)) + const auto head_size = query_ps[query_ps.size() - 1].get_length(); + if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config) && head_size <= 256) return true; // - Head size should be 128 for any model type; or should be in the range of 64 to 256 for stateful LLMs because of performance reasons. // This limitations is recommended to prevent performance drop in models with small head size, such as SD, // until the SDPA operation is optimized for these cases const auto optimal_subgroup_size = 16; - const auto head_size = query_ps[query_ps.size() - 1].get_length(); bool valid_head_size = head_size % optimal_subgroup_size == 0; valid_head_size &= (head_size == 128) || (func->get_variables().size() > 0 && head_size >= 64 && head_size <= 256); if (!valid_head_size) {