Skip to content

Commit

Permalink
[GPU] memory reuse for dynamic models (#18228)
Browse files Browse the repository at this point in the history
* enable memory reuse for dynamic models

* updated to return dependent events for the shape_of primitive

* fixed memory_pool.release_memory()

* fixed a lint error

* fixed missing default value

* updated to use reset flag for dynamic models

* changed to use is_dynamic_output_layout instead of is_dynamic

* updated to use get_internal_params instead of buffer_ptr

* added a memory reuse test for dynamic models
  • Loading branch information
e-ddykim authored Jul 5, 2023
1 parent 8c64891 commit 4c072ac
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 22 deletions.
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ struct memory {

return true;
}
// Marks this memory object as reused (callers pass true when the buffer is
// handed out again, e.g. before it is reinterpreted for a new layout).
void set_reused(bool reused = true) {
    _reused = reused;
}

virtual event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool blocking = true) = 0;
virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */, bool blocking = true) = 0;
Expand Down
12 changes: 7 additions & 5 deletions src/plugins/intel_gpu/src/graph/impls/cpu/shape_of.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ struct shape_of_impl : public typed_primitive_impl<shape_of> {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "shape_of::execute_impl");
auto& stream = instance.get_network().get_stream();

auto ev = stream.create_user_event(false);

auto output_mem_ptr = instance.output_memory_ptr();

auto output_dt = instance.get_impl_params()->get_output_layout().data_type;
Expand All @@ -60,9 +58,13 @@ struct shape_of_impl : public typed_primitive_impl<shape_of> {
OPENVINO_THROW("[GPU] Couldn't execute shape_of operation: unsupported output data type (", output_dt , ")");
}

ev->set();

return ev;
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
} else {
return stream.create_user_event(true);
}
}

// No-op: this shape_of implementation runs on the host (CPU impl), so there
// are no GPU kernels to compile or cache.
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,8 @@ class primitive_inst {
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
bool has_inner_networks() const;
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false);
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false, memory* curr_memory = nullptr, bool runtime_alloc = false);

std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }

Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/reduce_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ class typed_primitive_inst<reduce> : public typed_primitive_inst_base<reduce> {
static layout calc_output_layout(reduce_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(reduce_node const& node);

// Returns true when this reduce's input buffer must be cleared before execution.
// Non-simple (blocked) input formats whose feature count is not a multiple of 16
// carry padded elements; presumably the reduce kernels read across that padding,
// so the buffer is zero-filled first — TODO(review): confirm against kernel behavior.
bool need_reset_input_memory() const override {
    const auto& input_layout = _deps[0].first->_impl_params->get_output_layout(_deps[0].second);
    // Note: the previous `format::format::` spelling compiled only via the
    // injected-class-name; the single qualification is the idiomatic form.
    return !format::is_simple_data_format(input_layout.format) && input_layout.feature() % 16 != 0;
}

typed_primitive_inst(network& network, reduce_node const& desc);
};

Expand Down
26 changes: 16 additions & 10 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,10 @@ event::ptr primitive_inst::realloc_if_needed() {

if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
if (_outputs[0]->get_layout() != actual_layout)
if (_outputs[0]->get_layout() != actual_layout) {
_outputs[0]->set_reused(true);
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
}
if (need_reset_output_memory()) {
ev = _outputs[0]->fill(_network.get_stream());
}
Expand Down Expand Up @@ -1056,8 +1058,6 @@ event::ptr primitive_inst::update_weights() {

static bool user_requesting_mem_reuse_false(const program_node& node) {
for (auto& user : node.get_users()) {
if (user->is_dynamic())
return true;
if ((user->get_selected_impl() != nullptr) && (user->get_selected_impl()->can_reuse_memory == false)) {
return true;
} else if (user->get_selected_impl() == nullptr) {
Expand All @@ -1070,14 +1070,17 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
}

memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer) {
uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer, memory* curr_memory, bool runtime_alloc) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable, bool reset = true) {
allocation_type type, bool reusable, bool reset = true, memory* curr_memory = nullptr) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
if (curr_memory != nullptr)
pool.release_memory(curr_memory, id, net_id);
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
}
return pool.get_memory(static_layout, type, reset);
};

Expand All @@ -1097,7 +1100,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
usm_device_allocatable = false;

bool memory_reuse_by_user = !user_requesting_mem_reuse_false(_node);
bool memory_reuse_by_user = (runtime_alloc && _node.is_dynamic_output_layout()) ? !reset : !user_requesting_mem_reuse_false(_node);

// For outputs, cpu prim we want to have lockable alloc type
// Also if the successor of a node is an cpu, then memory needs to be lockable.
Expand All @@ -1123,7 +1126,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
_node.get_memory_dependencies(),
alloc_type,
false,
reset);
reset,
curr_memory);
} else {
if ((_node.is_output() && _node.is_type<generic_layer>()) || (!_node.is_output() && _node.is_type<input_layout>()))
reset = false;
Expand All @@ -1140,7 +1144,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
_node.get_memory_dependencies(),
alloc_type,
memory_reuse_by_user,
reset);
reset,
curr_memory);
}
}

Expand All @@ -1149,7 +1154,8 @@ std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* up
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
*_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
get_network_id(), _network.is_internal(), i, reset_mem, is_output_buffer(this, runtime_alloc)));
get_network_id(), _network.is_internal(), i, reset_mem, is_output_buffer(this, runtime_alloc),
(_outputs.size() > i) ? output_memory_ptr(i).get() : nullptr, runtime_alloc));
}
return outputs;
}
Expand Down
7 changes: 3 additions & 4 deletions src/plugins/intel_gpu/src/runtime/memory_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,12 @@ void memory_pool::release_memory(memory* mem, const primitive_id& id, uint32_t n
auto type = mem->get_allocation_type();

{
auto range = _non_padded_pool.equal_range(_layout.bytes_count());
auto it = range.first;
auto it = _non_padded_pool.lower_bound(_layout.bytes_count());

while (it != range.second && it != _non_padded_pool.end()) {
while (it != _non_padded_pool.end()) {
if (it->second._network_id == network_id &&
it->second._type == type &&
it->second._memory.get() == mem) {
it->second._memory->get_internal_params().mem == mem->get_internal_params().mem) {
auto user_it = it->second._users.find({ id, network_id });

// normally there should be only one entry
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2573,7 +2573,7 @@ class fc_random_types_test
std::tie(b, in_f, in_x, in_y, out_f, in_fmt) = GetParam();

quantization_t quant_data;
quant_data.output_low = std::numeric_limits<WeightsT>::min();
quant_data.output_low = std::numeric_limits<WeightsT>::lowest();
quant_data.output_high = std::numeric_limits<WeightsT>::max();

VVVVF<InputT> input_data = generate_random_4d<InputT>(b, in_f, in_y, in_x, 0, 127);
Expand Down
60 changes: 60 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/memory_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
#include <intel_gpu/primitives/pooling.hpp>
#include <intel_gpu/primitives/concatenation.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "intel_gpu/primitives/fully_connected.hpp"
#include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/crop.hpp>
#include <intel_gpu/primitives/eltwise.hpp>
#include <fully_connected_inst.h>

using namespace cldnn;
using namespace ::tests;
Expand Down Expand Up @@ -537,6 +539,60 @@ class memory_pool: public ::testing::Test {
ASSERT_EQ(out2_ptr[2], 7.0f);
ASSERT_EQ(out2_ptr[3], 8.0f);
}

// Verifies intermediate-buffer reuse in a dynamic-shaped network: "relu1" and
// "relu2" are never simultaneously alive, so the memory pool should hand both
// primitives the same underlying buffer — on the first execution and again
// after the input shape changes.
void test_dynamic_mem_reuse() {
    auto& engine = get_test_engine();

    const int32_t input_f = 3, weight_b = 4;

    // Dynamic batch dimension; two concrete shapes are fed below.
    auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(), input_f }, data_types::f32, format::bfyx };
    auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f }, data_types::f32, format::bfyx };
    auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f }, data_types::f32, format::bfyx };
    auto input_data1 = engine.allocate_memory(input_actual_layout1);
    auto input_data2 = engine.allocate_memory(input_actual_layout2);
    auto fc_weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32, format::bfyx });

    // Bug fix: the two value sets were swapped — input_data1 is 2x3 (6 elements)
    // but was given only 3 values, while input_data2 is 1x3 (3 elements) and was
    // given 6 values, writing past the end of its buffer.
    set_values(input_data1, { 0.5f, -2.0f, -0.5f,
                              -0.5f, 2.0f, 0.5f });
    set_values(input_data2, { -0.5f, 2.0f, 0.5f });
    set_values(fc_weights_data, { 1.5f, 1.0f, 0.5f,
                                  -1.0f, 0.0f, 0.5f,
                                  0.5f, -0.5f, -2.0f,
                                  -0.5f, 1.0f, 1.5f });

    // Chain of primitives whose intermediate outputs can be recycled.
    cldnn::topology topology{
        input_layout("input", input_dyn_layout),
        activation("relu1", input_info("input"), activation_func::relu),
        eltwise("elt1", { input_info("input"), input_info("relu1") }, eltwise_mode::prod),
        activation("relu2", input_info("elt1"), activation_func::sqrt),
        eltwise("elt2", { input_info("elt1"), input_info("relu2") }, eltwise_mode::prod),
        data("fc_weights", fc_weights_data),
        fully_connected("fc", input_info("elt2"), "fc_weights")
    };

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));  // enable dynamic shapes
    network network(engine, topology, config);

    {
        network.set_input_data("input", input_data1);

        auto outputs = network.execute();

        // relu1's output buffer should be recycled for relu2.
        ASSERT_EQ(std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu1"))->output_memory_ptr()->buffer_ptr(),
                  std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu2"))->output_memory_ptr()->buffer_ptr());
    }

    {
        // Second run with a different batch size: reuse must still hold.
        network.set_input_data("input", input_data2);

        auto outputs = network.execute();

        ASSERT_EQ(std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu1"))->output_memory_ptr()->buffer_ptr(),
                  std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu2"))->output_memory_ptr()->buffer_ptr());
    }
}
};

TEST_F(memory_pool, basic_non_padded_relu_pipe) {
Expand Down Expand Up @@ -579,6 +635,10 @@ TEST_F(memory_pool, add_mem_dep_test) {
this->test_add_mem_dep(false);
}

// Exercises buffer reuse for dynamic-shaped models (see test_dynamic_mem_reuse).
TEST_F(memory_pool, dynamic_mem_reuse) {
    this->test_dynamic_mem_reuse();
}

#ifdef RUN_ALL_MODEL_CACHING_TESTS
TEST_F(memory_pool, basic_non_padded_relu_pipe_cached) {
this->test_basic_non_padded_relu_pipe(true);
Expand Down

0 comments on commit 4c072ac

Please sign in to comment.