Skip to content

Commit

Permalink
[GPU] CPU Reduce impl + ScatterNDUpdate fixes to avoid shape flow synchronizations (openvinotoolkit#24966)
Browse files Browse the repository at this point in the history

### Details:
- Added a CPU implementation of Reduce so that this op can be handled in the shape flow
- Overrode the `get_shape_infer_dependencies` method for the scatter_nd_update primitive
- Together these avoid redundant synchronizations during shape inference for llama3
  • Loading branch information
vladimir-paramuzov authored Jun 12, 2024
1 parent 65bc499 commit e6fe146
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 3 deletions.
189 changes: 189 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "register.hpp"
#include "reduce_inst.h"
#include "implementation_map.hpp"

#include "openvino/op/reduce_max.hpp"
#include "openvino/op/reduce_sum.hpp"
#include "openvino/op/reduce_prod.hpp"
#include "openvino/op/reduce_l1.hpp"
#include "openvino/op/reduce_l2.hpp"
#include "openvino/op/reduce_logical_and.hpp"
#include "openvino/op/reduce_logical_or.hpp"
#include "openvino/op/reduce_mean.hpp"
#include "openvino/op/reduce_min.hpp"

namespace cldnn {
namespace cpu {

namespace {

// Instantiates the requested ov reduction op type and applies the
// keep_dims setting before returning it as a generic ov::op::Op.
template <typename ReduceOpType>
std::shared_ptr<ov::op::Op> make_reduce(bool keep_dims) {
    auto reduce_op = std::make_shared<ReduceOpType>();
    reduce_op->set_keep_dims(keep_dims);
    return reduce_op;
}

}  // namespace

// CPU (host-side) implementation of the reduce primitive. It evaluates the
// reduction via the reference ov op so that reduce nodes inside shape_of
// subgraphs can run without a device kernel / queue synchronization.
struct reduce_impl : public typed_primitive_impl<reduce> {
    using parent = typed_primitive_impl<reduce>;
    using parent::parent;

    // Primitive parameters captured from the node (or restored by load()).
    reduce_mode mode = reduce_mode::sum;
    std::vector<int64_t> axes = {};
    bool keep_dims = false;

    // Reference ov op used to evaluate the reduction on host; created lazily
    // on the first execute_impl() call and cached afterwards.
    std::shared_ptr<ov::op::Op> op;

    DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::reduce_impl)

    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<reduce_impl>(*this);
    }

    reduce_impl() : parent("reduce_cpu_impl") {}

    explicit reduce_impl(const reduce_node& outer) {
        set_node_params(outer);
    }

    // Copies mode/axes/keep_dims from the program node's reduce primitive.
    void set_node_params(const program_node& arg) override {
        OPENVINO_ASSERT(arg.is_type<reduce>(), "[GPU] Incorrect program_node type");
        const auto& node = arg.as<reduce>();
        mode = node.get_primitive()->mode;
        axes = node.get_primitive()->axes;
        keep_dims = node.get_primitive()->keep_dims;
    }

    void save(BinaryOutputBuffer& ob) const override {
        parent::save(ob);
        ob << make_data(&mode, sizeof(reduce_mode));
        ob << axes;
        ob << keep_dims;
    }

    void load(BinaryInputBuffer& ib) override {
        parent::load(ib);
        ib >> make_data(&mode, sizeof(reduce_mode));
        ib >> axes;
        ib >> keep_dims;
    }

    // Maps a cldnn reduce_mode to the matching ov reduction op.
    // Throws for modes with no ov counterpart handled here.
    static std::shared_ptr<ov::op::Op> create_op(reduce_mode mode, bool keep_dims) {
        switch (mode) {
            case reduce_mode::max:
                return make_reduce<ov::op::v1::ReduceMax>(keep_dims);
            case reduce_mode::min:
                return make_reduce<ov::op::v1::ReduceMin>(keep_dims);
            case reduce_mode::mean:
                return make_reduce<ov::op::v1::ReduceMean>(keep_dims);
            case reduce_mode::prod:
                return make_reduce<ov::op::v1::ReduceProd>(keep_dims);
            case reduce_mode::sum:
                return make_reduce<ov::op::v1::ReduceSum>(keep_dims);
            case reduce_mode::logical_and:
                return make_reduce<ov::op::v1::ReduceLogicalAnd>(keep_dims);
            case reduce_mode::logical_or:
                return make_reduce<ov::op::v1::ReduceLogicalOr>(keep_dims);
            case reduce_mode::l1:
                return make_reduce<ov::op::v4::ReduceL1>(keep_dims);
            case reduce_mode::l2:
                return make_reduce<ov::op::v4::ReduceL2>(keep_dims);
            default:
                OPENVINO_THROW("[GPU] Couldn't create reduce operation: unsupported reduce mode (", static_cast<size_t>(mode), ")");
        }
    }

    event::ptr execute_impl(const std::vector<event::ptr>& events, reduce_inst& instance) override {
        OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "reduce::execute_impl");
        auto& stream = instance.get_network().get_stream();

        // For out-of-order queues inside a shape_of subgraph we can forward the
        // incoming events to the consumer instead of blocking on them here.
        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();

        if (!pass_through_events) {
            // const ref: avoid copying (atomic refcount) each event shared_ptr
            for (const auto& e : events) {
                e->wait();
            }
        }

        auto params = instance.get_impl_params();

        ov::TensorVector input_host_tensors;
        ov::TensorVector output_host_tensors;

        // Lazily build the reference op once; no redundant pre-assignment —
        // create_op() covers every supported mode and throws otherwise.
        if (!op) {
            op = create_op(mode, keep_dims);
        }

        cldnn::mem_lock<uint8_t, mem_lock_type::write> output_lock(instance.output_memory_ptr(), stream);
        cldnn::mem_lock<uint8_t, mem_lock_type::read> input_lock(instance.dep_memory_ptr(0), stream);

        // Input 0: data tensor; input 1: reduction axes wrapped as an i64 host tensor.
        input_host_tensors.push_back(make_tensor(params->input_layouts[0], input_lock.data()));
        input_host_tensors.push_back(ov::Tensor(ov::element::i64, ov::Shape{axes.size()}, static_cast<void*>(axes.data())));

        output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data()));

        OPENVINO_ASSERT(op->evaluate(output_host_tensors, input_host_tensors),
                        "[GPU] Couldn't execute reduce primitive with id ", instance.id());

        if (pass_through_events) {
            if (events.size() > 1) {
                return stream.group_events(events);
            } else if (events.size() == 1) {
                return events[0];
            }
        }

        return stream.create_user_event(true);
    }

    void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}

    void update_dispatch_data(const kernel_impl_params& impl_param) override {}

public:
    static std::unique_ptr<primitive_impl> create(const reduce_node& arg, const kernel_impl_params& impl_param) {
        return make_unique<reduce_impl>();
    }
};


namespace detail {

// Registers the CPU reduce implementation for every supported
// format/data-type combination, for both static and dynamic shapes.
attach_reduce_impl::attach_reduce_impl() {
    auto supported_formats = {
        format::bfyx,
        format::bfzyx,
        format::bfwzyx,
        format::bfuwzyx,
        format::bfvuwzyx,
    };

    auto supported_types = {
        data_types::f32,
        data_types::f16,
        data_types::i32,
        data_types::i64,
        data_types::i8,
        data_types::u8,
    };

    implementation_map<reduce>::add(impl_types::cpu, shape_types::static_shape, reduce_impl::create, supported_types, supported_formats);
    implementation_map<reduce>::add(impl_types::cpu, shape_types::dynamic_shape, reduce_impl::create, supported_types, supported_formats);
}

}  // namespace detail
} // namespace cpu
} // namespace cldnn

BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::reduce_impl)
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ void register_implementations() {
REGISTER_CPU(broadcast);
REGISTER_CPU(tile);
REGISTER_CPU(select);
REGISTER_CPU(reduce);
}

} // namespace cpu
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "intel_gpu/primitives/broadcast.hpp"
#include "intel_gpu/primitives/tile.hpp"
#include "intel_gpu/primitives/select.hpp"
#include "intel_gpu/primitives/reduce.hpp"

namespace cldnn {
namespace cpu {
Expand Down Expand Up @@ -53,6 +54,7 @@ REGISTER_CPU(reorder);
REGISTER_CPU(broadcast);
REGISTER_CPU(tile);
REGISTER_CPU(select);
REGISTER_CPU(reduce);

#undef REGISTER_CPU

Expand Down
12 changes: 12 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/scatter_nd_update_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@

namespace cldnn {

// Program-node specialization for scatter_nd_update.
// Its main purpose is to override get_shape_infer_dependencies() to return
// an empty list: output shape equals input0 shape, so no dependency data
// needs to be read back on host — avoiding shape-flow synchronizations.
template <>
struct typed_program_node<scatter_nd_update> : public typed_program_node_base<scatter_nd_update> {
private:
using parent = typed_program_node_base<scatter_nd_update>;

public:
using parent::parent;
// Convenience accessor for the i-th dependency (input 0 by default).
program_node& input(std::size_t i = 0) const { return get_dependency(i); }

// No inputs are needed on host for shape inference of this primitive.
std::vector<size_t> get_shape_infer_dependencies() const override { return {}; }
};

using scatter_nd_update_node = typed_program_node<scatter_nd_update>;

template <>
Expand Down
6 changes: 3 additions & 3 deletions src/plugins/intel_gpu/src/graph/scatter_nd_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ layout scatter_nd_update_inst::calc_output_layout(scatter_nd_update_node const&

template<typename ShapeType>
std::vector<layout> scatter_nd_update_inst::calc_output_layouts(scatter_nd_update_node const& /*node*/, const kernel_impl_params& impl_param) {
auto input0_layout = impl_param.get_input_layout(0);
auto input1_layout = impl_param.get_input_layout(1);
auto input2_layout = impl_param.get_input_layout(2);
const auto& input0_layout = impl_param.get_input_layout(0);
const auto& input1_layout = impl_param.get_input_layout(1);
const auto& input2_layout = impl_param.get_input_layout(2);

std::vector<ShapeType> input_shapes = {
input0_layout.get<ShapeType>(), // inputs_shape
Expand Down
32 changes: 32 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1652,6 +1652,38 @@ TEST(reduce_gpu, common_bfwzyx_log_sum_exp_keepdims) {
}
}

// Checks that the forced CPU implementation of reduce produces the correct
// product reduction for an i32 input: prod([1,2,3,4]) == 24.
TEST(reduce_gpu, cpu_impl_int32) {
    auto& engine = get_test_engine();

    auto input_mem = engine.allocate_memory({{4}, data_types::i32, format::bfyx});
    set_values<int32_t>(input_mem, {1, 2, 3, 4});

    topology tp;
    tp.add(input_layout("input", input_mem->get_layout()));
    tp.add(reduce("reduce", input_info("input"), reduce_mode::prod, {0}, true));

    auto cfg = get_test_default_config(engine);
    cfg.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"reduce", {format::bfyx, "", impl_types::cpu}}}));

    network net(engine, tp, cfg);
    net.set_input_data("input", input_mem);

    auto results = net.execute();

    ASSERT_EQ(results.size(), size_t(1));
    ASSERT_EQ(results.begin()->first, "reduce");

    auto out_mem = results.at("reduce").get_memory();
    cldnn::mem_lock<int32_t> out_ptr(out_mem, get_test_stream());

    const std::vector<int32_t> expected = {24};
    for (size_t i = 0; i < expected.size(); ++i) {
        ASSERT_EQ(expected[i], out_ptr[i]);
    }
}

TEST(reduce_gpu, dynamic) {
auto& engine = get_test_engine();
auto input = engine.allocate_memory({data_types::f32, format::bfwzyx, {2, 3, 1, 1, 1, 1}});
Expand Down

0 comments on commit e6fe146

Please sign in to comment.