Skip to content

Commit

Permalink
[GPU] memory reuse for dynamic models (#18228)
Browse files Browse the repository at this point in the history
* enable memory reuse for dynamic models

* updated to return dependent events for the shape_of primitive

* fixed memory_pool.release_memory()

* fixed a lint error

* fixed missing default value

* updated to use reset flag for dynamic models

* changed to use is_dynamic_output_layout instead of is_dynamic

* updated to use get_internal_params instead of buffer_ptr

* added a memory reuse test for dynamic models
  • Loading branch information
e-ddykim authored Jul 5, 2023
1 parent 8c64891 commit 4c072ac
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 22 deletions.
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ struct memory {

return true;
}
// Marks this memory object as reused (callers pass true when the buffer is
// handed out again, e.g. before it is reinterpreted for a new layout).
void set_reused(bool reused = true) {
    _reused = reused;
}

virtual event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool blocking = true) = 0;
virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */, bool blocking = true) = 0;
Expand Down
12 changes: 7 additions & 5 deletions src/plugins/intel_gpu/src/graph/impls/cpu/shape_of.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ struct shape_of_impl : public typed_primitive_impl<shape_of> {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "shape_of::execute_impl");
auto& stream = instance.get_network().get_stream();

auto ev = stream.create_user_event(false);

auto output_mem_ptr = instance.output_memory_ptr();

auto output_dt = instance.get_impl_params()->get_output_layout().data_type;
Expand All @@ -60,9 +58,13 @@ struct shape_of_impl : public typed_primitive_impl<shape_of> {
OPENVINO_THROW("[GPU] Couldn't execute shape_of operation: unsupported output data type (", output_dt , ")");
}

ev->set();

return ev;
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
} else {
return stream.create_user_event(true);
}
}

// No-op: this shape_of implementation runs on the host (CPU impl), so there
// are no GPU kernels to compile or cache.
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,8 @@ class primitive_inst {
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
bool has_inner_networks() const;
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false);
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false, memory* curr_memory = nullptr, bool runtime_alloc = false);

std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }

Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/reduce_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ class typed_primitive_inst<reduce> : public typed_primitive_inst_base<reduce> {
static layout calc_output_layout(reduce_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(reduce_node const& node);

// Returns true when this reduce's input buffer must be cleared before execution.
// Non-simple (blocked) input formats whose feature count is not a multiple of 16
// carry padded elements; presumably the reduce kernels read across that padding,
// so the buffer is zero-filled first — TODO(review): confirm against kernel behavior.
bool need_reset_input_memory() const override {
    const auto& input_layout = _deps[0].first->_impl_params->get_output_layout(_deps[0].second);
    // Note: the previous `format::format::` spelling compiled only via the
    // injected-class-name; the single qualification is the idiomatic form.
    return !format::is_simple_data_format(input_layout.format) && input_layout.feature() % 16 != 0;
}

typed_primitive_inst(network& network, reduce_node const& desc);
};

Expand Down
26 changes: 16 additions & 10 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,10 @@ event::ptr primitive_inst::realloc_if_needed() {

if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
if (_outputs[0]->get_layout() != actual_layout)
if (_outputs[0]->get_layout() != actual_layout) {
_outputs[0]->set_reused(true);
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
}
if (need_reset_output_memory()) {
ev = _outputs[0]->fill(_network.get_stream());
}
Expand Down Expand Up @@ -1056,8 +1058,6 @@ event::ptr primitive_inst::update_weights() {

static bool user_requesting_mem_reuse_false(const program_node& node) {
for (auto& user : node.get_users()) {
if (user->is_dynamic())
return true;
if ((user->get_selected_impl() != nullptr) && (user->get_selected_impl()->can_reuse_memory == false)) {
return true;
} else if (user->get_selected_impl() == nullptr) {
Expand All @@ -1070,14 +1070,17 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
}

memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer) {
uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer, memory* curr_memory, bool runtime_alloc) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable, bool reset = true) {
allocation_type type, bool reusable, bool reset = true, memory* curr_memory = nullptr) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
if (curr_memory != nullptr)
pool.release_memory(curr_memory, id, net_id);
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
}
return pool.get_memory(static_layout, type, reset);
};

Expand All @@ -1097,7 +1100,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
usm_device_allocatable = false;

bool memory_reuse_by_user = !user_requesting_mem_reuse_false(_node);
bool memory_reuse_by_user = (runtime_alloc && _node.is_dynamic_output_layout()) ? !reset : !user_requesting_mem_reuse_false(_node);

// For outputs, cpu prim we want to have lockable alloc type
// Also if the successor of a node is an cpu, then memory needs to be lockable.
Expand All @@ -1123,7 +1126,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
_node.get_memory_dependencies(),
alloc_type,
false,
reset);
reset,
curr_memory);
} else {
if ((_node.is_output() && _node.is_type<generic_layer>()) || (!_node.is_output() && _node.is_type<input_layout>()))
reset = false;
Expand All @@ -1140,7 +1144,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
_node.get_memory_dependencies(),
alloc_type,
memory_reuse_by_user,
reset);
reset,
curr_memory);
}
}

Expand All @@ -1149,7 +1154,8 @@ std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* up
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
*_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
get_network_id(), _network.is_internal(), i, reset_mem, is_output_buffer(this, runtime_alloc)));
get_network_id(), _network.is_internal(), i, reset_mem, is_output_buffer(this, runtime_alloc),
(_outputs.size() > i) ? output_memory_ptr(i).get() : nullptr, runtime_alloc));
}
return outputs;
}
Expand Down
7 changes: 3 additions & 4 deletions src/plugins/intel_gpu/src/runtime/memory_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,12 @@ void memory_pool::release_memory(memory* mem, const primitive_id& id, uint32_t n
auto type = mem->get_allocation_type();

{
auto range = _non_padded_pool.equal_range(_layout.bytes_count());
auto it = range.first;
auto it = _non_padded_pool.lower_bound(_layout.bytes_count());

while (it != range.second && it != _non_padded_pool.end()) {
while (it != _non_padded_pool.end()) {
if (it->second._network_id == network_id &&
it->second._type == type &&
it->second._memory.get() == mem) {
it->second._memory->get_internal_params().mem == mem->get_internal_params().mem) {
auto user_it = it->second._users.find({ id, network_id });

// normally there should be only one entry
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2573,7 +2573,7 @@ class fc_random_types_test
std::tie(b, in_f, in_x, in_y, out_f, in_fmt) = GetParam();

quantization_t quant_data;
quant_data.output_low = std::numeric_limits<WeightsT>::min();
quant_data.output_low = std::numeric_limits<WeightsT>::lowest();
quant_data.output_high = std::numeric_limits<WeightsT>::max();

VVVVF<InputT> input_data = generate_random_4d<InputT>(b, in_f, in_y, in_x, 0, 127);
Expand Down
60 changes: 60 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/memory_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
#include <intel_gpu/primitives/pooling.hpp>
#include <intel_gpu/primitives/concatenation.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "intel_gpu/primitives/fully_connected.hpp"
#include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/crop.hpp>
#include <intel_gpu/primitives/eltwise.hpp>
#include <fully_connected_inst.h>

using namespace cldnn;
using namespace ::tests;
Expand Down Expand Up @@ -537,6 +539,60 @@ class memory_pool: public ::testing::Test {
ASSERT_EQ(out2_ptr[2], 7.0f);
ASSERT_EQ(out2_ptr[3], 8.0f);
}

// Verifies intermediate-buffer reuse in a dynamic-shaped network: "relu1" and
// "relu2" are never simultaneously alive, so the memory pool should hand both
// primitives the same underlying buffer — on the first execution and again
// after the input shape changes.
void test_dynamic_mem_reuse() {
    auto& engine = get_test_engine();

    const int32_t input_f = 3, weight_b = 4;

    // Dynamic batch dimension; two concrete shapes are fed below.
    auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(), input_f }, data_types::f32, format::bfyx };
    auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f }, data_types::f32, format::bfyx };
    auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f }, data_types::f32, format::bfyx };
    auto input_data1 = engine.allocate_memory(input_actual_layout1);
    auto input_data2 = engine.allocate_memory(input_actual_layout2);
    auto fc_weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32, format::bfyx });

    // Bug fix: the two value sets were swapped — input_data1 is 2x3 (6 elements)
    // but was given only 3 values, while input_data2 is 1x3 (3 elements) and was
    // given 6 values, writing past the end of its buffer.
    set_values(input_data1, { 0.5f, -2.0f, -0.5f,
                              -0.5f, 2.0f, 0.5f });
    set_values(input_data2, { -0.5f, 2.0f, 0.5f });
    set_values(fc_weights_data, { 1.5f, 1.0f, 0.5f,
                                  -1.0f, 0.0f, 0.5f,
                                  0.5f, -0.5f, -2.0f,
                                  -0.5f, 1.0f, 1.5f });

    // Chain of primitives whose intermediate outputs can be recycled.
    cldnn::topology topology{
        input_layout("input", input_dyn_layout),
        activation("relu1", input_info("input"), activation_func::relu),
        eltwise("elt1", { input_info("input"), input_info("relu1") }, eltwise_mode::prod),
        activation("relu2", input_info("elt1"), activation_func::sqrt),
        eltwise("elt2", { input_info("elt1"), input_info("relu2") }, eltwise_mode::prod),
        data("fc_weights", fc_weights_data),
        fully_connected("fc", input_info("elt2"), "fc_weights")
    };

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));  // enable dynamic shapes
    network network(engine, topology, config);

    {
        network.set_input_data("input", input_data1);

        auto outputs = network.execute();

        // relu1's output buffer should be recycled for relu2.
        ASSERT_EQ(std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu1"))->output_memory_ptr()->buffer_ptr(),
                  std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu2"))->output_memory_ptr()->buffer_ptr());
    }

    {
        // Second run with a different batch size: reuse must still hold.
        network.set_input_data("input", input_data2);

        auto outputs = network.execute();

        ASSERT_EQ(std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu1"))->output_memory_ptr()->buffer_ptr(),
                  std::static_pointer_cast<fully_connected_inst>(network.get_primitive("relu2"))->output_memory_ptr()->buffer_ptr());
    }
}
};

TEST_F(memory_pool, basic_non_padded_relu_pipe) {
Expand Down Expand Up @@ -579,6 +635,10 @@ TEST_F(memory_pool, add_mem_dep_test) {
this->test_add_mem_dep(false);
}

// Exercises buffer reuse for dynamic-shaped models (see test_dynamic_mem_reuse).
TEST_F(memory_pool, dynamic_mem_reuse) {
    this->test_dynamic_mem_reuse();
}

#ifdef RUN_ALL_MODEL_CACHING_TESTS
TEST_F(memory_pool, basic_non_padded_relu_pipe_cached) {
this->test_basic_non_padded_relu_pipe(true);
Expand Down

0 comments on commit 4c072ac

Please sign in to comment.