Skip to content

Commit

Permalink
[GPU] Test fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
vladimir-paramuzov committed Nov 21, 2023
1 parent 6c1ffbd commit 582f72c
Show file tree
Hide file tree
Showing 10 changed files with 99 additions and 19 deletions.
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ struct network {
}

void set_variable(const std::string& name, const std::shared_ptr<ov::intel_gpu::VariableState>& variable);
bool has_variable(const std::string &variable_id) const;
ov::intel_gpu::VariableState& get_variable(const std::string &variable_id) const;
const ov::intel_gpu::VariableStateInfo& get_variable_info(const std::string &variable_id) const;
const ov::intel_gpu::VariablesMap& get_variables() const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "prepare_buffer_fusing.h"
#include "intel_gpu/primitives/read_value.hpp"
#include "pooling_inst.h"
#include "primitive_inst.h"
#include "activation_inst.h"
#include "concatenation_inst.h"
#include "crop_inst.h"
#include "eltwise_inst.h"
#include "read_value_inst.h"
#include "reshape_inst.h"
#include "depth_to_space_inst.h"
#include "resample_inst.h"
Expand Down Expand Up @@ -602,5 +604,36 @@ void prepare_buffer_fusing::run(program& p) {

node.can_be_optimized(can_reshape_be_optimized(node));
});
program_helpers::do_for_types<read_value>(*node, [](read_value_node& node) {
// Current implementation allows to avoid copy on read_value primitive
// only in cases when it has single user
// Otherwise we may face an issue with execution of read_value users and assign to the same variable
// Graph below is an example of unsupported case
// ┌────────┐ ┌───────┐
// │ Param1 │ │ Const │
// └───┬────┘ └───┬───┘
// │ │
// │ ┌────┴──────┐
// .......│.........│ ReadValue │
// . │ └────┬─────┬┘
// . │ │ │
// . │ ┌─────┐ │ │
// . └───┤ Add ├────┘ │
// . └──┬──┘ │
// . │ │
// . │ │
// . ┌────────┐ │ ┌─────┐ │
// ..│ Assign ├──┴────┤ Add ├──┘
// └────────┘ └──┬──┘
//
//
// ┌────┴──────┐
// │ Result │
// └───────────┘
// If read_value here returns variable memory w/o copy, then based on Add-s and Assign execution order we may have different results
// TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful
// topological sort (i.e. if we ensure that all read_value users are completed before assign is run)
node.can_be_optimized(node.get_users().size() == 1);
});
}
}
15 changes: 10 additions & 5 deletions src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,22 @@ struct read_value_impl : public typed_primitive_impl<read_value> {
}

auto& variable = instance.get_network().get_variable(variable_id);
auto &stream = instance.get_network().get_stream();

OPENVINO_ASSERT(variable.get_layout() == instance.get_output_layout(),
"[GPU] Layout mismatch: variable layout: ", variable.get_layout().to_short_string(),
" read_value output layout: ", instance.get_output_layout().to_short_string());

instance.set_output_memory(variable.get_memory(), false, 0);

if (!variable.is_set()) {
auto &stream = instance.get_network().get_stream();
const auto ev_set_output = instance.output_memory().fill(stream, 0);
return ev_set_output;
if (instance.get_impl_params()->input_layouts.size() > 0) {
variable.get_memory()->copy_from(stream, instance.dep_memory(0), true);
} else {
variable.get_memory()->fill(stream, 0);
}
}

if (!instance.can_be_optimized()) {
return instance.output_memory(0).copy_from(stream, *variable.get_memory(), false);
}

return instance.get_network().get_stream().create_user_event(true);
Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/read_value_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ class typed_primitive_inst<read_value> : public typed_primitive_inst_base<read_v

void save(cldnn::BinaryOutputBuffer& ob) const override;
void load(cldnn::BinaryInputBuffer& ib) override;

void update_output_memory() override;

protected:
void on_execute() override;
};

using read_value_inst = typed_primitive_inst<read_value>;
Expand Down
14 changes: 9 additions & 5 deletions src/plugins/intel_gpu/src/graph/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ void network::set_arguments() {
// In that case some_op is static and we may want to set arguments once,
// but dynamic optimized out reshape means that output buffer of reshape is unavailable
// and attempt to set args will fail.
if (dep.first->can_be_optimized() && dep.first->is_dynamic())
if (dep.first->can_be_optimized() && (dep.first->is_dynamic() || dep.first->get_node().is_type<read_value>()))
can_set_args = false;
}

Expand Down Expand Up @@ -1649,10 +1649,14 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
// Registers (or replaces) a variable state under `name` and, when the layout is
// already static, binds the variable's memory as the output of every primitive
// that reads this state. Dynamic layouts are resolved later
// (see read_value_inst::update_output_memory).
void network::set_variable(const std::string& name, const std::shared_ptr<ov::intel_gpu::VariableState>& variable) {
    GPU_DEBUG_TRACE_DETAIL << "Set variable " << name << " " << variable->get_layout().to_short_string() << std::endl;
    _variables_states[name] = variable;
    // NOTE(review): a commented-out duplicate of this loop was removed here; if the
    // intent was to disable the eager binding entirely, confirm and delete the loop.
    for (auto& inst : _variable_state_primitives.at(name)) {
        if (variable->get_layout().is_static())
            inst->set_output_memory(variable->get_memory(), false, 0);
    }
}

// Returns true when a variable state with the given id has been registered
// on this network via set_variable().
bool network::has_variable(const std::string &variable_id) const {
    return _variables_states.count(variable_id) != 0;
}

ov::intel_gpu::VariableState& network::get_variable(const std::string &variable_id) const {
Expand Down
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,6 @@ event::ptr primitive_inst::realloc_if_needed() {
std::string variable_id = stateful_prim->variable_id();
auto variable = get_network().get_variable(variable_id);
variable.set_layout(actual_layout);
return ev;
}

bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;
Expand Down
14 changes: 13 additions & 1 deletion src/plugins/intel_gpu/src/graph/read_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(read_value)

// read_value instance constructor. Diff residue left two conflicting `parent(...)`
// initializers here; only the conditional-allocation form is kept.
read_value_inst::typed_primitive_inst(network& network, const read_value_node& node) :
    // Allocate an output buffer only when the node is not optimized out and the output
    // layout is static (or at least has an upper bound sized for allocation). When the
    // node is optimized out, the output is rebound to the variable's memory instead
    // (see update_output_memory()).
    parent(network, node, !node.can_be_optimized() && (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound())),
    memory_state::variable{node.get_primitive()->variable_id} {
}

Expand All @@ -31,6 +31,18 @@ std::string read_value_inst::to_string(const read_value_node& node) {
return primitive_description.str();
}

// Called before the primitive runs: rebind the output to the state variable's
// memory (no-op unless this instance is optimized out — see update_output_memory()).
void read_value_inst::on_execute() {
    update_output_memory();
}

// Points this instance's output at the state variable's memory so no copy is
// needed. Applies only when the instance is optimized out and the network has
// already registered the variable; otherwise the output buffer is left untouched.
void read_value_inst::update_output_memory() {
    if (can_be_optimized() && get_network().has_variable(variable_id())) {
        const auto& variable = get_network().get_variable(variable_id());
        set_output_memory(variable.get_memory(), false, 0);
    }
}

void read_value_inst::save(cldnn::BinaryOutputBuffer& ob) const {
parent::save(ob);

Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ void SyncInferRequest::allocate_states() {
const auto& network = m_graph->get_network();
const auto& variables_info = network->get_variables_info();
for (auto& vi : variables_info) {
auto variable = std::make_shared<VariableState>(vi.second, network->get_engine(), network->get_shape_predictor());
auto variable = std::make_shared<VariableState>(vi.second, network->get_engine(), *network->get_shape_predictor());
m_variables.emplace(vi.first, variable);
}
}
Expand Down
21 changes: 21 additions & 0 deletions src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,24 @@ TEST(TensorTest, smoke_canReallocateDeviceInputForHostTensor) {
ASSERT_NO_THROW(inf_req.infer());
}

// Checks that a compiled GPU model exposes its single variable state ("v0") with
// the expected default state shape, and that inference runs without throwing.
TEST(VariablesTest, smoke_canSetStateTensor) {
    auto ov = ov::Core();
    const ov::Shape variable_shape = {1, 3, 2, 4};  // fixed typo: was "virable_shape"
    const ov::Shape input_shape = {1, 3, 2, 4};
    const ov::element::Type et = ov::element::f16;
    auto model = ngraph::builder::subgraph::makeReadConcatSplitAssign(input_shape, et);
    auto compiled_model = ov.compile_model(model, ov::test::utils::DEVICE_GPU);
    auto request = compiled_model.create_infer_request();

    ov::Tensor variable_tensor(et, variable_shape);
    ov::Tensor input_tensor(et, input_shape);

    auto variables = request.query_state();
    ASSERT_EQ(variables.size(), 1);
    auto variable = variables.front();
    ASSERT_EQ(variable.get_name(), "v0");
    auto default_state_tensor = variable.get_state();
    ASSERT_EQ(default_state_tensor.get_shape(), variable_shape);

    // NOTE(review): variable_tensor/input_tensor are constructed but never used; the
    // test name suggests a variable.set_state(variable_tensor) call is missing before
    // infer() — confirm intent.
    ASSERT_NO_THROW(request.infer());
}
12 changes: 6 additions & 6 deletions src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ struct variable_test : public ::testing::TestWithParam<VariableParams<T>> {

cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);

auto variable = std::make_shared<VariableState>(VariableStateInfo{"v0", variable_layout}, engine, network->get_shape_predictor());
auto variable = std::make_shared<VariableState>(VariableStateInfo{"v0", variable_layout}, engine, *network->get_shape_predictor());
network->set_variable("v0", variable);
network->set_input_data("input", input_data);

Expand Down Expand Up @@ -129,7 +129,7 @@ void test_exception_on_wrong_layout(bool is_caching_test) {

cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);

auto variable = std::make_shared<VariableState>(VariableStateInfo{"v0", variable_layout}, engine, network->get_shape_predictor());
auto variable = std::make_shared<VariableState>(VariableStateInfo{"v0", variable_layout}, engine, *network->get_shape_predictor());
network->set_variable("v0", variable);
network->set_input_data("input", input_data);
network->set_input_data("wrong_input", wrong_input_data);
Expand Down Expand Up @@ -167,7 +167,7 @@ void test_different_output_data_type(bool is_caching_test) {
config.set_property(ov::intel_gpu::optimize_data(true));
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);

auto variable = std::make_shared<VariableState>(VariableStateInfo{"v0", variable_layout}, engine, network->get_shape_predictor());
auto variable = std::make_shared<VariableState>(VariableStateInfo{"v0", variable_layout}, engine, *network->get_shape_predictor());
network->set_variable("v0", variable);
network->set_input_data("input", input_data);

Expand Down Expand Up @@ -223,9 +223,9 @@ void test_variables_are_preserved_across_inferences(bool is_caching_test) {
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);


auto variable1 = std::make_shared<VariableState>(VariableStateInfo{"v1", variable_layout}, engine, network->get_shape_predictor());
auto variable2 = std::make_shared<VariableState>(VariableStateInfo{"v2", variable_layout}, engine, network->get_shape_predictor());
auto variable3 = std::make_shared<VariableState>(VariableStateInfo{"v_result", variable_layout}, engine, network->get_shape_predictor());
auto variable1 = std::make_shared<VariableState>(VariableStateInfo{"v1", variable_layout}, engine, *network->get_shape_predictor());
auto variable2 = std::make_shared<VariableState>(VariableStateInfo{"v2", variable_layout}, engine, *network->get_shape_predictor());
auto variable3 = std::make_shared<VariableState>(VariableStateInfo{"v_result", variable_layout}, engine, *network->get_shape_predictor());
network->set_variable("v1", variable1);
network->set_variable("v2", variable2);
network->set_variable("v_result", variable3);
Expand Down

0 comments on commit 582f72c

Please sign in to comment.