From 45ea4669107eccfc8c010b4654511ecde06b8285 Mon Sep 17 00:00:00 2001 From: Lyamin-Roman Date: Tue, 2 Jul 2024 03:27:06 +0900 Subject: [PATCH] [GPU] Optimize update dispatch data --- .../src/graph/impls/ocl/activation.cpp | 9 +++-- .../src/graph/impls/ocl/arg_max_min.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/border.cpp | 9 +++-- .../src/graph/impls/ocl/concatenation.cpp | 9 +++-- .../src/graph/impls/ocl/convolution.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/crop.cpp | 26 ++++++++------ .../intel_gpu/src/graph/impls/ocl/cum_sum.cpp | 9 +++-- .../src/graph/impls/ocl/fully_connected.cpp | 28 ++++++++++++--- .../intel_gpu/src/graph/impls/ocl/gather.cpp | 9 +++-- .../src/graph/impls/ocl/gather_elements.cpp | 9 +++-- .../src/graph/impls/ocl/gather_nd.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/gemm.cpp | 15 +++++--- .../graph/impls/ocl/group_normalization.cpp | 9 +++-- .../src/graph/impls/ocl/multinomial.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/mvn.cpp | 9 +++-- .../graph/impls/ocl/non_max_suppression.cpp | 11 +++--- .../src/graph/impls/ocl/non_zero.cpp | 18 +++++++--- .../intel_gpu/src/graph/impls/ocl/permute.cpp | 9 +++-- .../src/graph/impls/ocl/quantize.cpp | 30 ++++++++++------ .../intel_gpu/src/graph/impls/ocl/reduce.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/reorder.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/rms.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/rope.cpp | 9 +++-- .../ocl/scaled_dot_product_attention.cpp | 15 +++++--- .../src/graph/impls/ocl/scatter_nd_update.cpp | 9 +++-- .../src/graph/impls/ocl/scatter_update.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/select.cpp | 9 +++-- .../src/graph/impls/ocl/shape_of.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/slice.cpp | 34 +++++++++++++------ .../src/graph/impls/ocl/space_to_batch.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/swiglu.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/tile.cpp | 9 +++-- .../intel_gpu/src/graph/impls/ocl/unique.cpp | 18 +++++++--- 33 files changed, 304 insertions(+), 107 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp index d58d19e2b1018a..0cda4e14905d06 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp @@ -73,8 +73,13 @@ struct activation_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp index d7750e85828e99..a1a869f131b3f4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp @@ -119,8 +119,13 @@ struct arg_max_min_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp index b8f341f7442ac8..5efc6fdab36b62 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp @@ -105,8 +105,13 @@ struct border_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } void save(BinaryOutputBuffer& ob) const override { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp index 50e48f66df5f05..811a6d968bef31 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp @@ -82,8 +82,13 @@ struct concatenation_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp index 2c05c3c8ae26d5..86b1071ad7d077 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp @@ -261,8 +261,13 @@ struct convolution_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp index c4b11b4ababd3c..6fc4d5dd0dfa9e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp @@ -47,16 +47,22 @@ struct crop_impl : typed_primitive_impl_ocl { } return params; } - void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset(); - kernel_selector::ScalarDescriptor s; - s.t = kernel_selector::ScalarDescriptor::Types::UINT32; - s.v.u32 = static_cast(runtime_offset); - OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1, - "[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl"); - _kernel_data.kernels[0].params.scalars[0] = s; - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset(); + kernel_selector::ScalarDescriptor s; + s.t = kernel_selector::ScalarDescriptor::Types::UINT32; + s.v.u32 = static_cast(runtime_offset); + OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1, + "[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl"); + _kernel_data.kernels[0].params.scalars[0] = s; + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp index 414d70e3b03c2f..c806a785c530f2 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp @@ -78,8 +78,13 @@ struct cum_sum_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp index 96bafc7e431f3b..a3e6140d8636ca 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp @@ -77,7 +77,7 @@ struct fully_connected_impl : typed_primitive_impl_ocl { } public: - static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + static kernel_impl_params update_impl_params(const kernel_impl_params& impl_param) { const auto& primitive = impl_param.typed_desc(); auto get_fc_input_layouts = [primitive](const std::vector& input_layouts, bool allow_new_shape_infer) { @@ -157,6 +157,12 @@ struct fully_connected_impl : typed_primitive_impl_ocl { updated_impl_param.output_layouts[0] = get_fc_output_layout(input_layouts, impl_param.get_output_layout()); + return updated_impl_param; + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto updated_impl_param = update_impl_params(impl_param); auto params = get_weights_bias_default_params(updated_impl_param, false, is_shape_agnostic); params.allowInputReordering = true; @@ -164,10 +170,10 @@ struct fully_connected_impl : typed_primitive_impl_ocl { bool with_zp = !primitive->decompression_zero_point.empty(); if (commpressed) { params.compressed = true; - params.decompression_scale = convert_data_tensor(input_layouts[2]); + params.decompression_scale = convert_data_tensor(updated_impl_param.input_layouts[2]); if (with_zp) { params.has_decompression_zp = true; - params.decompression_zero_point = convert_data_tensor(input_layouts[3]); + params.decompression_zero_point = convert_data_tensor(updated_impl_param.input_layouts[3]); } else if (primitive->decompression_zero_point_scalar.has_value()) { params.has_decompression_zp = true; params.scalar_zp = true; @@ -194,8 +200,20 @@ struct fully_connected_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + auto& params = static_cast(*_kernel_data.params); + auto updated_impl_param = update_impl_params(impl_param); + update_shapes(params, updated_impl_param); + + if (impl_param.typed_desc()->input_size != 3) { + params.outputs = { params.outputs[0].FlattenFeatureAndSpatials() }; + } + + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp index af69439d7aa51a..0d20294c6bc33b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp @@ -167,8 +167,13 @@ struct gather_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp index 317db10b1463b3..6a683090a7f4ac 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp @@ -75,8 +75,13 @@ struct gather_elements_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp index ac7c8da1ac15eb..8ea57b56614cc9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp @@ -45,8 +45,13 @@ struct gather_nd_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp index f7220dad387348..979d1efb8cd76c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp @@ -303,12 +303,19 @@ struct gemm_impl : multi_stage_primitive { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true, false); - (_kernels_data[default_gemm].update_dispatch_data_func)(kernel_params, _kernels_data[default_gemm]); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernels_data[default_gemm].params == nullptr) { + _kernels_data[default_gemm].params = std::make_shared(get_kernel_params(impl_param, true, false)); + } + update_shapes(*_kernels_data[default_gemm].params, impl_param); + (_kernels_data[default_gemm].update_dispatch_data_func)(*_kernels_data[default_gemm].params, _kernels_data[default_gemm]); if (_kernels_data.size() == 2) { - auto kernel_params = get_kernel_params(impl_param, true, true); - (_kernels_data[indirect_gemm].update_dispatch_data_func)(kernel_params, _kernels_data[indirect_gemm]); + if (_kernels_data[indirect_gemm].params == nullptr) { + _kernels_data[indirect_gemm].params = std::make_shared(get_kernel_params(impl_param, true, true)); + } + update_shapes(*_kernels_data[indirect_gemm].params, impl_param); + (_kernels_data[indirect_gemm].update_dispatch_data_func)(*_kernels_data[indirect_gemm].params, _kernels_data[indirect_gemm]); } } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp index 827f05c2d2e20f..d79e47e8a114e0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp @@ -32,8 +32,13 @@ struct group_normalization_impl : typed_primitive_impl_ocl } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp index bcc2fa15be20b3..45607326ff2925 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp @@ -32,8 +32,13 @@ struct multinomial_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp index c370bd8dff4c40..a3de617405fbad 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp @@ -88,8 +88,13 @@ struct mvn_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp index 81194cb8ba9c5a..65bfa94173bf11 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_max_suppression.cpp @@ -60,8 +60,9 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl } public: - static std::unique_ptr create(const non_max_suppression_node& arg, const kernel_impl_params& impl_param) { +static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { const auto& primitive = impl_param.typed_desc(); + const auto& arg = impl_param.prog->get_node(impl_param.desc->id).as(); auto params = get_default_params(impl_param); const auto input_scores_idx = 1; @@ -154,11 +155,7 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl params.reuse_internal_buffer = true; } - params.set_dynamic_shape_offsets(); - auto& kernel_selector = kernel_selector::non_max_suppression_kernel_selector::Instance(); - auto best_kernel = kernel_selector.get_best_kernel(params); - - return make_unique(best_kernel); + return params; } private: @@ -200,7 +197,7 @@ namespace detail { attach_non_max_suppression_impl::attach_non_max_suppression_impl() { implementation_map::add(impl_types::ocl, - non_max_suppression_impl::create, + typed_primitive_impl_ocl::create, { std::make_tuple(data_types::i32, format::bfyx), diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp index c8f0a5cbae10c0..d8f0e45c25146f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp @@ -48,8 +48,13 @@ struct count_nonzero_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; @@ -82,8 +87,13 @@ struct gather_nonzero_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp index 06ea0a215d12cf..e253fef751b091 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp @@ -71,8 +71,13 @@ struct permute_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp index f1517b13824c42..73a668966ceeb5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp @@ -54,8 +54,9 @@ struct quantize_impl : typed_primitive_impl_ocl { } public: - static std::unique_ptr create(const quantize_node& arg, const kernel_impl_params& impl_param) { + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { auto quantize_params = get_default_params(impl_param); + const auto& arg = impl_param.prog->get_node(impl_param.desc->id).as(); quantize_params.levels = arg.get_levels(); quantize_params.scale_shift_opt = arg.get_scale_shift_opt(); @@ -86,17 +87,17 @@ struct quantize_impl : typed_primitive_impl_ocl { quantize_params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[i])); } - quantize_params.is_shape_agnostic = impl_param.is_dynamic(); - quantize_params.set_dynamic_shape_offsets(); - auto& kernel_selector = kernel_selector::quantize_kernel_selector::Instance(); - auto best_kernel = kernel_selector.get_best_kernel(quantize_params); - - return make_unique(best_kernel); + return quantize_params; } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto quantize_params = get_default_params(impl_param); - (_kernel_data.update_dispatch_data_func)(quantize_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; @@ -147,9 +148,16 @@ attach_quantize_impl::attach_quantize_impl() { keys.emplace(data_types::f16, format::yxfb); keys.emplace(data_types::f32, format::yxfb); - implementation_map::add(impl_types::ocl, shape_types::static_shape, quantize_impl::create, keys); + implementation_map::add(impl_types::ocl, + shape_types::static_shape, + typed_primitive_impl_ocl::create, + keys); - implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, quantize_impl::create, types, dyn_formats); + implementation_map::add(impl_types::ocl, + shape_types::dynamic_shape, + typed_primitive_impl_ocl::create, + types, + dyn_formats); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp index c004b63c4f23fd..d937d7fc8a190c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reduce.cpp @@ -90,8 +90,13 @@ struct reduce_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp index 02d2e9e19827ec..aa030af71dfc3d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp @@ -119,8 +119,13 @@ struct reorder_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } static std::unique_ptr create(const reorder_node& arg, const kernel_impl_params& impl_param) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp index 946334f7c59d90..34e379564ee10e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp @@ -43,8 +43,13 @@ struct rms_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } static kernel_impl_params static_canonicalize_shapes(const kernel_impl_params& impl_params) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp index 4706227e35cff2..f65768b8e6eb20 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rope.cpp @@ -91,8 +91,13 @@ struct rope_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index 4bc6e16ce55fa9..fd2270fa366ec6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -275,12 +275,19 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive(get_kernel_params(impl_param, true)); + } + update_shapes(*_kernels_data[default_sdpa].params, impl_param); + (_kernels_data[default_sdpa].update_dispatch_data_func)(*_kernels_data[default_sdpa].params, _kernels_data[default_sdpa]); if (_kernels_data.size() == 2) { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernels_data[indirect_sdpa].update_dispatch_data_func)(kernel_params, _kernels_data[indirect_sdpa]); + if (_kernels_data[indirect_sdpa].params == nullptr) { + _kernels_data[indirect_sdpa].params = std::make_shared(get_kernel_params(impl_param, true)); + } + update_shapes(*_kernels_data[indirect_sdpa].params, impl_param); + (_kernels_data[indirect_sdpa].update_dispatch_data_func)(*_kernels_data[indirect_sdpa].params, _kernels_data[indirect_sdpa]); } } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp index 8709b8be75c0ea..e5f1ebd04b1d4a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_nd_update.cpp @@ -45,8 +45,13 @@ struct scatter_nd_update_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp index 3e59ecbf63691e..af1029aacb2036 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scatter_update.cpp @@ -70,8 +70,13 @@ struct scatter_update_impl : typed_primitive_impl_ocl { } void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data); + // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future + if (_kernel_data.params == nullptr) { + _kernel_data.params = std::make_shared(get_kernel_params(impl_param, true)); + } + + update_shapes(*_kernel_data.params, impl_param); + (_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data); } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp index 90a369006715d7..0f69ab377ed3fe 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp @@ -61,8 +61,13 @@ struct select_impl : typed_primitive_impl_ocl