Skip to content

Commit

Permalink
[GPU] Optimize update dispatch data
Browse files Browse the repository at this point in the history
  • Loading branch information
Lyamin-Roman committed Jul 2, 2024
1 parent 626966b commit cd827aa
Show file tree
Hide file tree
Showing 33 changed files with 307 additions and 109 deletions.
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,13 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): this span is a rendered diff hunk without +/- markers. The first two statements
// appear to be the pre-change lines and the rest the post-change lines — confirm against the
// repository before reading it as a single function body.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: builds a fresh kernel-params object on every call and hands it to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so build them once here
// and reuse the same object on later calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: run update_shapes on the cached params (helper defined outside this view), then
// invoke the update callback with the cached object instead of a freshly built one.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,13 @@ struct arg_max_min_impl : typed_primitive_impl_ocl<arg_max_min> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,13 @@ struct border_impl : typed_primitive_impl_ocl<border> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}

void save(BinaryOutputBuffer& ob) const override {
Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/concatenation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,13 @@ struct concatenation_impl : typed_primitive_impl_ocl<concatenation> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,13 @@ struct convolution_impl : typed_primitive_impl_ocl<convolution> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
26 changes: 16 additions & 10 deletions src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,22 @@ struct crop_impl : typed_primitive_impl_ocl<crop> {
}
return params;
}
// Updates the dispatch data for crop, including the runtime offset scalar of kernel 0.
// NOTE(review): rendered diff hunk without +/- markers. The first update_dispatch_data (up to its
// update callback call) appears to be the pre-change version and the second one the post-change
// version; only one closing brace is visible, so this span is not a single well-formed body.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt on every call.
auto kernel_params = get_kernel_params(impl_param, true);
// First-element offset of the input tensor, derived from the input layout and input_offsets[0].
auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset();
// Wrap the offset in a UINT32 scalar descriptor.
kernel_selector::ScalarDescriptor s;
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
s.v.u32 = static_cast<uint32_t>(runtime_offset);
// Kernel 0 must already carry exactly one scalar; that slot is overwritten with the new offset.
OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1,
"[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl");
_kernel_data.kernels[0].params.scalars[0] = s;
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);

void update_dispatch_data(const kernel_impl_params& impl_param) override {
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params before the offset scalar is refreshed.
update_shapes(*_kernel_data.params, impl_param);
// First-element offset of the input tensor, derived from the input layout and input_offsets[0].
auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset();
// Wrap the offset in a UINT32 scalar descriptor.
kernel_selector::ScalarDescriptor s;
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
s.v.u32 = static_cast<uint32_t>(runtime_offset);
// Kernel 0 must already carry exactly one scalar; that slot is overwritten with the new offset.
OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1,
"[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl");
_kernel_data.kernels[0].params.scalars[0] = s;
// The update callback now receives the cached params object.
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/cum_sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,13 @@ struct cum_sum_impl : typed_primitive_impl_ocl<cum_sum> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
33 changes: 26 additions & 7 deletions src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ struct fully_connected_impl : typed_primitive_impl_ocl<fully_connected> {
}

public:
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
static kernel_impl_params update_impl_params(const kernel_impl_params& impl_param) {
const auto& primitive = impl_param.typed_desc<fully_connected>();

auto get_fc_input_layouts = [primitive](const std::vector<layout>& input_layouts, bool allow_new_shape_infer) {
Expand Down Expand Up @@ -151,23 +151,30 @@ struct fully_connected_impl : typed_primitive_impl_ocl<fully_connected> {
auto updated_impl_param = impl_param;

const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts, allow_new_shape_infer);
updated_impl_param.input_layouts[0] = input_layouts[0];
updated_impl_param.input_layouts[1] = input_layouts[1];
for (size_t i = 0; i < input_layouts.size(); ++i) {
updated_impl_param.input_layouts[i] = input_layouts[i];
}
updated_impl_param.weights_layout = input_layouts[1];

updated_impl_param.output_layouts[0] = get_fc_output_layout(input_layouts, impl_param.get_output_layout());

return updated_impl_param;
}

static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<fully_connected>();
auto updated_impl_param = update_impl_params(impl_param);
auto params = get_weights_bias_default_params<kernel_selector::fully_connected_params>(updated_impl_param, false, is_shape_agnostic);
params.allowInputReordering = true;

bool commpressed = !primitive->decompression_scale.empty();
bool with_zp = !primitive->decompression_zero_point.empty();
if (commpressed) {
params.compressed = true;
params.decompression_scale = convert_data_tensor(input_layouts[2]);
params.decompression_scale = convert_data_tensor(updated_impl_param.input_layouts[2]);
if (with_zp) {
params.has_decompression_zp = true;
params.decompression_zero_point = convert_data_tensor(input_layouts[3]);
params.decompression_zero_point = convert_data_tensor(updated_impl_param.input_layouts[3]);
} else if (primitive->decompression_zero_point_scalar.has_value()) {
params.has_decompression_zp = true;
params.scalar_zp = true;
Expand All @@ -194,8 +201,20 @@ struct fully_connected_impl : typed_primitive_impl_ocl<fully_connected> {
}

// Updates the dispatch data stored in _kernel_data for fully_connected.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// View the cached params as this impl's concrete kernel_params_t so `outputs` can be modified.
auto& params = static_cast<kernel_params_t&>(*_kernel_data.params);
// Shapes are taken from the adjusted copy returned by update_impl_params, not from impl_param itself.
auto updated_impl_param = update_impl_params(impl_param);
update_shapes(params, updated_impl_param);

// When the primitive's input_size is not 3, the output list is replaced by the first output
// passed through FlattenFeatureAndSpatials().
if (impl_param.typed_desc<fully_connected>()->input_size != 3) {
params.outputs = { params.outputs[0].FlattenFeatureAndSpatials() };
}

// The update callback receives the cached params object.
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,13 @@ struct gather_impl : typed_primitive_impl_ocl<gather> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,13 @@ struct gather_elements_impl : typed_primitive_impl_ocl<gather_elements> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,13 @@ struct gather_nd_impl : typed_primitive_impl_ocl<gather_nd> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
// NOTE(review): get_kernel_params is called here with impl_param only, whereas the other impls
// in this commit pass `true` as a second argument — confirm that relying on the default is intended.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
15 changes: 11 additions & 4 deletions src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,12 +303,19 @@ struct gemm_impl : multi_stage_primitive<gemm> {
}

// Updates the dispatch data of the default_gemm kernel and, when a second kernel data entry
// exists, of the indirect_gemm kernel as well.
// NOTE(review): rendered diff hunk without +/- markers; in each of the two sections the first
// two statements appear to be the pre-change lines and the rest the post-change lines — confirm
// against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path (default_gemm): params are rebuilt on every call; the third argument is false here.
auto kernel_params = get_kernel_params(impl_param, true, false);
(_kernels_data[default_gemm].update_dispatch_data_func)(kernel_params, _kernels_data[default_gemm]);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernels_data[default_gemm].params == nullptr) {
_kernels_data[default_gemm].params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true, false));
}
// New path (default_gemm): update_shapes on the cached params, then call the update callback with them.
update_shapes(*_kernels_data[default_gemm].params, impl_param);
(_kernels_data[default_gemm].update_dispatch_data_func)(*_kernels_data[default_gemm].params, _kernels_data[default_gemm]);

// The indirect_gemm entry is handled only when exactly two kernel data entries are present.
if (_kernels_data.size() == 2) {
// Old path (indirect_gemm): params are rebuilt on every call; the third argument is true here.
auto kernel_params = get_kernel_params(impl_param, true, true);
(_kernels_data[indirect_gemm].update_dispatch_data_func)(kernel_params, _kernels_data[indirect_gemm]);
// Same lazy creation as above, for the indirect_gemm entry.
if (_kernels_data[indirect_gemm].params == nullptr) {
_kernels_data[indirect_gemm].params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true, true));
}
// New path (indirect_gemm): update_shapes on the cached params, then call the update callback with them.
update_shapes(*_kernels_data[indirect_gemm].params, impl_param);
(_kernels_data[indirect_gemm].update_dispatch_data_func)(*_kernels_data[indirect_gemm].params, _kernels_data[indirect_gemm]);
}
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ struct group_normalization_impl : typed_primitive_impl_ocl<group_normalization>
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/multinomial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ struct multinomial_impl : typed_primitive_impl_ocl<multinomial> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,13 @@ struct mvn_impl : typed_primitive_impl_ocl<mvn> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,9 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl<non_max_suppression>
}

public:
static std::unique_ptr<primitive_impl> create(const non_max_suppression_node& arg, const kernel_impl_params& impl_param) {
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<non_max_suppression>();
const auto& arg = impl_param.prog->get_node(impl_param.desc->id).as<non_max_suppression>();
auto params = get_default_params<kernel_selector::non_max_suppression_params>(impl_param);

const auto input_scores_idx = 1;
Expand Down Expand Up @@ -154,11 +155,7 @@ struct non_max_suppression_impl : typed_primitive_impl_ocl<non_max_suppression>
params.reuse_internal_buffer = true;
}

params.set_dynamic_shape_offsets();
auto& kernel_selector = kernel_selector::non_max_suppression_kernel_selector::Instance();
auto best_kernel = kernel_selector.get_best_kernel(params);

return make_unique<non_max_suppression_impl>(best_kernel);
return params;
}

private:
Expand Down Expand Up @@ -200,7 +197,7 @@ namespace detail {

attach_non_max_suppression_impl::attach_non_max_suppression_impl() {
implementation_map<non_max_suppression>::add(impl_types::ocl,
non_max_suppression_impl::create,
typed_primitive_impl_ocl<non_max_suppression>::create<non_max_suppression_impl>,
{
std::make_tuple(data_types::i32, format::bfyx),

Expand Down
18 changes: 14 additions & 4 deletions src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,13 @@ struct count_nonzero_impl : typed_primitive_impl_ocl<count_nonzero> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down Expand Up @@ -82,8 +87,13 @@ struct gather_nonzero_impl : typed_primitive_impl_ocl<gather_nonzero> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
9 changes: 7 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/permute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,13 @@ struct permute_impl : typed_primitive_impl_ocl<permute> {
}

// Updates the dispatch data stored in _kernel_data for the given impl_param.
// NOTE(review): rendered diff hunk without +/- markers; the first two statements appear to be
// the pre-change lines and the remainder the post-change lines — confirm against the repository.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Old path: kernel params are rebuilt unconditionally and passed to the update callback.
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params, _kernel_data);
// If the model was loaded from cache, params are not initialized, so create them on first use
// and keep them for subsequent calls.
if (_kernel_data.params == nullptr) {
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(impl_param, true));
}

// New path: update_shapes is applied to the cached params, which are then passed to the callback.
update_shapes(*_kernel_data.params, impl_param);
(_kernel_data.update_dispatch_data_func)(*_kernel_data.params, _kernel_data);
}
};

Expand Down
Loading

0 comments on commit cd827aa

Please sign in to comment.