
Commit

Fixes
vladimir-paramuzov committed Aug 23, 2024
1 parent 11e99dd commit 36f795b
Showing 60 changed files with 714 additions and 473 deletions.
@@ -6,7 +6,6 @@
#include "program_node.h"

#ifdef ENABLE_ONEDNN_FOR_GPU
#include "fully_connected_inst.h"
#include <impls/onednn/utils.hpp>
#endif

@@ -52,6 +52,7 @@ void add_required_reorders::add_reorder(program& p, program_node* node, program_
throw std::runtime_error("Internal Error: container index out of range exception.");
}
p.add_intermediate(new_reorder_node, *usr, idx);
new_reorder_node.recalc_output_layouts(false);
}

bool add_required_reorders::test_format(cldnn::program_node& node, format requested_format) {
@@ -65,12 +66,17 @@ bool add_required_reorders::test_format(cldnn::program_node& node, format requested_format) {
const auto& dep_with_port = node.get_dependency_with_port(i);
auto& dep = dep_with_port.first;

auto current_format = dep->get_output_layout(false, dep_with_port.second).format;

if (format::is_weights_format(current_format))
continue;

if (dep->is_type<reorder>()) {
auto& port = dep_with_port.second;
auto new_layout = dep->get_output_layout(false, port);
new_layout.format = requested_format;
dep->set_output_layout(new_layout, false, port);
} else {
} else if (current_format != requested_format) {
add_reorder(node.get_program(), dep_with_port.first, &node, true);
}
}
@@ -88,6 +94,10 @@ void add_required_reorders::run(program& p) {
if (usr->is_type<data>())
continue;

if (!usr->is_all_valid_output_layouts()) {
usr->get_output_layouts(false);
}

// If usr is assign and its input and output data types differ,
// add a reorder with usr's output data type between dep and usr
if (usr->is_type<assign>()) {
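
A pattern that recurs throughout this commit (see also the handle_reshape and reorder_inputs hunks below): whenever a node is spliced into the graph with add_intermediate, its cached output layout is immediately recalculated so later passes don't consume a stale layout. A minimal sketch of the idea with mock stand-ins — the type and method names below are illustrative, not the actual cldnn API:

#include <cstddef>
#include <cstdio>
#include <vector>

struct layout { bool valid = false; };

struct mock_node {
    std::vector<mock_node*> deps;
    layout out_layout;
    // Recompute this node's output layout from its dependencies; the bool
    // mirrors the optional flag seen in the real calls (an assumption).
    void recalc_output_layouts(bool invalidate_users = true) {
        (void)invalidate_users;
        out_layout.valid = true;
    }
};

struct mock_program {
    // Splice 'inter' between 'usr' and its idx-th dependency. The spliced
    // node's cached layout is stale until the caller recalculates it.
    void add_intermediate(mock_node& inter, mock_node& usr, std::size_t idx) {
        inter.deps.push_back(usr.deps[idx]);
        usr.deps[idx] = &inter;
    }
};

int main() {
    mock_program p;
    mock_node producer, usr, reorder;
    usr.deps.push_back(&producer);

    p.add_intermediate(reorder, usr, 0);
    reorder.recalc_output_layouts(false); // keep cached layouts consistent
    std::printf("reorder layout valid: %d\n", reorder.out_layout.valid ? 1 : 0);
    return 0;
}
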
@@ -28,14 +28,16 @@ void compile_graph::run(program& p) {
std::vector<ov::threading::Task> tasks;
std::exception_ptr exception;

auto forcing_map = p.get_config().get_property(ov::intel_gpu::force_implementations);

for (size_t idx = 0; idx < proc_order.size(); idx++) {
auto& node = *(std::next(proc_order.begin(), idx));

bool can_select_impl = !node->is_type<data>() &&
!(node->is_type<mutable_data>() && node->get_dependencies().empty());

if (can_select_impl) {
tasks.push_back([node, &exception] {
tasks.push_back([node, &exception, &forcing_map] {
try {
const auto& params = node->get_kernel_impl_params();
auto shape_type = ImplementationManager::get_shape_type(*params);
@@ -46,6 +48,11 @@
if (impl_type != impl_types::cpu) {
impl_type = impl_types::any;
}
if (forcing_map.count(node->id())) {
auto forced_impl = forcing_map.at(node->id()).impl_type;
if (forced_impl != impl_types::any)
impl_type = forced_impl;
}
auto selected_impl_manager = node->type()->choose_impl(*node, *node->get_kernel_impl_params(), impl_type, shape_type);
if (selected_impl_manager) {
node->selected_impl = selected_impl_manager->create(*node, *params);
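
The compile_graph change reads the force_implementations property once, before the parallel tasks are created, and captures the resulting map by reference in each task, so a per-node user override can replace the deduced impl type. A sketch of the lookup logic under assumed type names (forced_impl_desc here is hypothetical):

#include <cstdio>
#include <map>
#include <string>

enum class impl_types { any, cpu, ocl, onednn };

// Hypothetical stand-in for the per-primitive forcing entry; only the
// impl_type member is suggested by the diff above.
struct forced_impl_desc { impl_types impl_type = impl_types::any; };

impl_types resolve_impl_type(const std::string& node_id,
                             impl_types deduced,
                             const std::map<std::string, forced_impl_desc>& forcing_map) {
    auto it = forcing_map.find(node_id);
    if (it != forcing_map.end() && it->second.impl_type != impl_types::any)
        return it->second.impl_type; // explicit user override wins
    return deduced;
}

int main() {
    const std::map<std::string, forced_impl_desc> forcing = {{"conv1", {impl_types::onednn}}};
    auto chosen = resolve_impl_type("conv1", impl_types::any, forcing);
    std::printf("forced impl selected: %d\n", static_cast<int>(chosen));
    return 0;
}
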
@@ -165,6 +165,7 @@ void handle_reshape::run(program& p) {
auto& new_reshape_node = p.get_or_create(new_reshape);
user->replace_dependency(0, input_node);
p.add_intermediate(new_reshape_node, *user, 0);
new_reshape_node.recalc_output_layouts();
if (new_reshape->input_size() == 2) {
p.add_connection(prim_node.get_dependency(1), new_reshape_node);
}
@@ -198,6 +199,7 @@
reshape_input_node.get_dependencies().empty());
reshape_reorder_id++;
reshape_input_node.recalc_output_layout();
node->recalc_output_layouts();
}
}

@@ -223,6 +225,7 @@
auto& reshape_input_node = p.get_or_create(reshape_input);
p.add_intermediate(reshape_input_node, *node, 0, reshape_input_node.get_dependencies().empty());
reshape_input_node.recalc_output_layout();
node->recalc_output_layouts();
}

// Check whether output reorder is required for format change
@@ -770,6 +770,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)

if (new_input.first) {
p.add_intermediate(new_input.first, detection_output_node, i, !new_input.second);
detection_output_node.recalc_output_layouts();
}
}
}
@@ -784,6 +785,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
layout{ input_layout.get_partial_shape(), input_layout.data_type, new_format });
if (reorder.first) {
p.add_intermediate(reorder.first, deconv_node, 0, !reorder.second);
deconv_node.recalc_output_layouts();
}
}

@@ -907,6 +909,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto new_input = rf.get_reorder(input.id(), input_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, fc_node, 0, !new_input.second);
fc_node.recalc_output_layouts();
}
}

@@ -933,6 +936,7 @@
auto new_input = rf.get_reorder(input->id(), dep.second, input_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, pooling_node, 0);
pooling_node.recalc_output_layouts();
}
}
};
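
reorder_inputs obtains its reorders from a reorder_factory that returns a (node, flag) pair; judging by the call sites above, the flag appears to indicate whether a cached reorder was reused. An illustrative caching factory in that spirit — a sketch of the pattern, not the actual reorder_factory implementation:

#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <utility>

struct reorder_desc { std::string id, from_fmt, to_fmt; };

// Illustrative caching factory: get_reorder returns the node plus a flag
// telling the caller whether an existing (cached) reorder was reused.
class reorder_cache {
    std::map<std::string, std::shared_ptr<reorder_desc>> cache_;
public:
    std::pair<std::shared_ptr<reorder_desc>, bool>
    get_reorder(const std::string& input_id, const std::string& from, const std::string& to) {
        if (from == to)
            return {nullptr, false}; // formats already match: nothing to insert
        const auto key = input_id + ":" + from + "->" + to;
        auto it = cache_.find(key);
        if (it != cache_.end())
            return {it->second, true}; // reused from cache
        auto node = std::make_shared<reorder_desc>(reorder_desc{key, from, to});
        cache_.emplace(key, node);
        return {node, false}; // freshly created
    }
};

int main() {
    reorder_cache rf;
    auto r1 = rf.get_reorder("input", "bfyx", "b_fs_yx_fsv16");
    auto r2 = rf.get_reorder("input", "bfyx", "b_fs_yx_fsv16");
    std::printf("first reused=%d, second reused=%d\n", r1.second ? 1 : 0, r2.second ? 1 : 0);
    return 0;
}
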
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/registry/implementation_manager.hpp"
#include "pass_manager.h"
#include "program_node.h"
#include "openvino/core/except.hpp"
@@ -97,7 +98,13 @@ void select_preferred_formats::run(program& p) {

const auto& params = n->get_kernel_impl_params();
auto shape_type = ImplementationManager::get_shape_type(*params);
if (auto factory = n->type()->choose_impl(*n, *n->get_kernel_impl_params(), impl_type, shape_type)) {
// temporarily set format to any, as we need to query it from the impl and don't want the impl to be rejected
auto factory = test_format<std::shared_ptr<ImplementationManager>>(*n, format::any,
[&impl_type, &shape_type](const program_node& n) {
return n.type()->choose_impl(n, *n.get_kernel_impl_params(), impl_type, shape_type);
});

if (factory) {
try {
auto fmts = factory->query_formats(*n);
for (size_t i = 0; i < fmts.first.size(); i++) {
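
The new test_format<T> call wraps choose_impl so the node is queried with format::any and then restored, keeping the query free of side effects. A sketch of what such a helper plausibly looks like (the names and std::function signature are assumptions; production code would restore the format via RAII to stay exception safe):

#include <cstdio>
#include <functional>
#include <string>

// Hypothetical node with a mutable output format.
struct node_t { std::string fmt = "bfyx"; };

// Sketch of the test_format helper pattern: temporarily swap the node's
// format, evaluate the callback, then restore the original format.
template <typename T>
T test_format(node_t& node, const std::string& tmp_fmt,
              const std::function<T(const node_t&)>& func) {
    const auto prev = node.fmt;
    node.fmt = tmp_fmt;
    T result = func(node);
    node.fmt = prev; // restore the original format
    return result;
}

int main() {
    node_t n;
    bool saw_any = test_format<bool>(n, "any", [](const node_t& nn) {
        return nn.fmt == "any"; // the callback observes the temporary format
    });
    std::printf("queried with any: %d, restored to: %s\n", saw_any ? 1 : 0, n.fmt.c_str());
    return 0;
}
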
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/common/register.cpp
@@ -15,7 +15,6 @@ void register_implementations() {
REGISTER_COMMON(data);
REGISTER_COMMON(input_layout);
REGISTER_COMMON(loop);
REGISTER_COMMON(prior_box);
}

} // namespace common
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/common/register.hpp
@@ -26,7 +26,6 @@ REGISTER_COMMON(condition);
REGISTER_COMMON(data);
REGISTER_COMMON(input_layout);
REGISTER_COMMON(loop);
REGISTER_COMMON(prior_box);

#undef REGISTER_COMMON

@@ -49,11 +49,6 @@ class wait_for_events_impl : public primitive_impl {
return make_unique<wait_for_events_impl>(input);
}

static std::unique_ptr<primitive_impl> create_prior_box(const prior_box_node& prior_box, const kernel_impl_params&) {
// This primitive is being executed on CPU during network compilation.
return make_unique<wait_for_events_impl>(prior_box);
}

void update(primitive_inst& inst, const kernel_impl_params& impl_param) override { }
};

@@ -67,10 +62,6 @@ attach_input_layout_common::attach_input_layout_common() {
implementation_map<input_layout>::add(impl_types::common, shape_types::any, wait_for_events_impl::create_input_layout, {});
}

attach_prior_box_common::attach_prior_box_common() {
implementation_map<prior_box>::add(impl_types::common, wait_for_events_impl::create_prior_box, {});
}

} // namespace detail
} // namespace common
} // namespace cldnn
15 changes: 6 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/ocl/convolution.hpp
@@ -16,8 +16,8 @@ struct ConvolutionImplementationManager : public ImplementationManager {

std::unique_ptr<primitive_impl> create_impl(const program_node& node, const kernel_impl_params& params) const override;

bool validate(const program_node& node) const override {
OPENVINO_ASSERT(node.is_type<convolution>());
bool validate_impl(const program_node& node) const override {
assert(node.is_type<convolution>());

const auto& input_layout = node.get_input_layout(0);
const auto& weights_layout = node.as<convolution>().weights().get_output_layout();
@@ -85,21 +85,18 @@ struct ConvolutionImplementationManager : public ImplementationManager {
format::bs_fs_yx_bsv4_fsv2,
};

bool fp_case = data_type_traits::is_floating_point(in_dt) &&
bool fp_common_case = data_type_traits::is_floating_point(in_dt) &&
(one_of(input_fmt.value, supported_fp_only_formats) || one_of(input_fmt.value, supported_common_formats));
bool fp16_case = in_dt == ov::element::f16 && input_fmt == format::fs_b_yx_fsv32;
bool fp16_case = everyone_is(ov::element::f16, in_dt, wei_dt) && (input_fmt == format::fs_b_yx_fsv32 || output_fmt == format::fs_b_yx_fsv32);
bool i8u8_case = data_type_traits::is_i8_u8(in_dt) &&
(one_of(input_fmt.value, supported_int_only_formats) || one_of(input_fmt.value, supported_common_formats));

if (!fp_case && !fp16_case && !i8u8_case)
if (!fp_common_case && !fp16_case && !i8u8_case)
return false;
}

return ImplementationManager::validate(node);
return true;
}

in_out_fmts_t query_formats(const program_node&) const override { OPENVINO_NOT_IMPLEMENTED; }
bool support_shapes(const kernel_impl_params&) const override { return true; }
};

} // namespace ocl
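
The convolution manager's override is renamed from validate() to validate_impl() and no longer chains to ImplementationManager::validate(node), which suggests the base class now applies the common checks itself and delegates only the primitive-specific part — the classic template-method split. A sketch of that assumed structure (the base-class body is not part of this diff):

#include <cstdio>

// Assumed template-method split behind the validate -> validate_impl rename:
// the base class keeps the shared checks in a non-virtual validate() and
// derived managers override only the primitive-specific validate_impl().
struct implementation_manager_sketch {
    bool validate() const {
        if (!common_checks())
            return false;       // shared checks run exactly once, in the base
        return validate_impl(); // then the primitive-specific part
    }
    virtual ~implementation_manager_sketch() = default;

protected:
    virtual bool validate_impl() const { return true; }
    bool common_checks() const { return true; } // placeholder for base-class logic
};

struct convolution_manager_sketch : implementation_manager_sketch {
protected:
    bool validate_impl() const override {
        // format/data-type checks specific to convolution would go here;
        // note it no longer re-invokes the base validate() at the end.
        return true;
    }
};

int main() {
    convolution_manager_sketch m;
    std::printf("valid: %d\n", m.validate() ? 1 : 0);
    return 0;
}
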
18 changes: 4 additions & 14 deletions src/plugins/intel_gpu/src/graph/impls/ocl/detection_output.cpp
@@ -4,6 +4,7 @@

#include "primitive_base.hpp"

#include "detection_output.hpp"
#include "detection_output_inst.h"
#include "detection_output/detection_output_kernel_selector.h"
#include "detection_output/detection_output_kernel_ref.h"
@@ -62,22 +63,11 @@ struct detection_output_impl : typed_primitive_impl_ocl<detection_output> {
}
};

namespace detail {

attach_detection_output_impl::attach_detection_output_impl() {
std::vector<data_types> dt = {
data_types::f32,
data_types::f16,
};
std::vector<format::type> fmt = {
format::bfyx,
format::bs_fs_yx_bsv16_fsv32,
format::bs_fs_zyx_bsv16_fsv32,
};
implementation_map<detection_output>::add(impl_types::ocl, typed_primitive_impl_ocl<detection_output>::create<detection_output_impl>, dt, fmt);
std::unique_ptr<primitive_impl> DetectionOutputImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const {
assert(node.is_type<detection_output>());
return typed_primitive_impl_ocl<detection_output>::create<detection_output_impl>(static_cast<const detection_output_node&>(node), params);
}

} // namespace detail
} // namespace ocl
} // namespace cldnn

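
detection_output and gather_nd (below) follow the same migration: the detail-namespace attach_*_impl registration that enumerated supported data types and formats is deleted, and the impl is instead produced by a per-primitive manager's create_impl override, with the supported-configuration checks moving into validate_impl (see gather_nd.hpp below). A sketch of the resulting shape, with placeholder types:

#include <cstdio>
#include <memory>

// Placeholder types sketching the "after" shape of the registration change:
// each primitive gets a manager whose create_impl builds the impl, instead
// of a static implementation_map entry enumerating types and formats.
struct primitive_impl_sketch { virtual ~primitive_impl_sketch() = default; };
struct node_sketch { /* stands in for program_node */ };

struct manager_base {
    virtual ~manager_base() = default;
    virtual std::unique_ptr<primitive_impl_sketch> create_impl(const node_sketch&) const = 0;
};

struct detection_output_manager_sketch : manager_base {
    std::unique_ptr<primitive_impl_sketch> create_impl(const node_sketch&) const override {
        // the real override forwards to
        // typed_primitive_impl_ocl<detection_output>::create<detection_output_impl>
        return std::make_unique<primitive_impl_sketch>();
    }
};

int main() {
    detection_output_manager_sketch m;
    node_sketch n;
    auto impl = m.create_impl(n);
    std::printf("impl created: %d\n", impl ? 1 : 0);
    return 0;
}
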
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/dft.cpp
@@ -116,6 +116,7 @@ attach_dft_impl::attach_dft_impl() {
format::bfyx,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32,
format::bs_fs_yx_bsv16_fsv32,
format::bs_fs_yx_bsv16_fsv16,
format::bs_fs_yx_bsv32_fsv32,
format::bs_fs_yx_bsv32_fsv16,
37 changes: 4 additions & 33 deletions src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.cpp
@@ -4,6 +4,7 @@

#include "primitive_base.hpp"

#include "gather_nd.hpp"
#include "gather_nd_inst.h"
#include "gather/gather_nd_kernel_selector.h"
#include "gather/gather_nd_kernel_ref.h"
@@ -55,41 +56,11 @@ struct gather_nd_impl : typed_primitive_impl_ocl<gather_nd> {
}
};

namespace detail {

attach_gather_nd_impl::attach_gather_nd_impl() {
auto types = {
data_types::f32,
data_types::f16,
data_types::i32
};

auto static_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};

implementation_map<gather_nd>::add(impl_types::ocl,
shape_types::static_shape,
typed_primitive_impl_ocl<gather_nd>::create<gather_nd_impl>,
types,
static_formats);

auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};

implementation_map<gather_nd>::add(impl_types::ocl,
shape_types::dynamic_shape,
typed_primitive_impl_ocl<gather_nd>::create<gather_nd_impl>,
types,
dyn_formats);
std::unique_ptr<primitive_impl> GatherNDImplementationManager::create_impl(const program_node& node, const kernel_impl_params& params) const {
assert(node.is_type<gather_nd>());
return typed_primitive_impl_ocl<gather_nd>::create<gather_nd_impl>(static_cast<const gather_nd_node&>(node), params);
}

} // namespace detail
} // namespace ocl
} // namespace cldnn

54 changes: 54 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/gather_nd.hpp
@@ -0,0 +1,54 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/registry/implementation_manager.hpp"
#include "program_node.h"

#include <memory>
namespace cldnn {
namespace ocl {

struct GatherNDImplementationManager : public ImplementationManager {
OV_GPU_PRIMITIVE_IMPL("GatherNDImplementationOCL")
GatherNDImplementationManager(shape_types shape_type, ValidateFunc vf = nullptr) : ImplementationManager(impl_types::ocl, shape_type, vf) {}
std::unique_ptr<primitive_impl> create_impl(const program_node& node, const kernel_impl_params& params) const override;
bool validate_impl(const program_node& node) const override {
static const std::vector<format> supported_fmts = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};

static const std::vector<ov::element::Type_t> supported_in_types = {
ov::element::f32,
ov::element::f16,
ov::element::i32
};

static const std::vector<ov::element::Type_t> supported_out_types = {
ov::element::f32,
ov::element::f16,
ov::element::i32,
ov::element::i8,
ov::element::u8,
};

const auto& in0_layout = node.get_input_layout(0);
const auto& in1_layout = node.get_input_layout(1);
const auto& out_layout = node.get_output_layout(0);
if (!one_of(in0_layout.format, supported_fmts) || !one_of(out_layout.format, supported_fmts))
return false;

if (!one_of(in0_layout.data_type, supported_in_types) || !one_of(in1_layout.data_type, supported_in_types))
return false;

if (!one_of(out_layout.data_type, supported_out_types))
return false;

return true;
}
};

} // namespace ocl
} // namespace cldnn
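
For reference, a minimal sketch of the one_of membership helper that validate_impl above relies on. The semantics — a linear search over the supported values — are assumed from the call sites; strings stand in here for the real format and data-type enums:

#include <algorithm>
#include <cstdio>
#include <iterator>
#include <string>
#include <vector>

// Assumed one_of: true if 'value' appears in the candidate list.
template <typename T, typename C>
bool one_of(const T& value, const C& candidates) {
    return std::find(std::begin(candidates), std::end(candidates), value) != std::end(candidates);
}

int main() {
    const std::vector<std::string> supported_fmts = {"bfyx", "bfzyx", "bfwzyx"};
    std::printf("bfyx: %d\n", one_of(std::string("bfyx"), supported_fmts) ? 1 : 0);
    std::printf("fs_b_yx_fsv32: %d\n", one_of(std::string("fs_b_yx_fsv32"), supported_fmts) ? 1 : 0);
    return 0;
}
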
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/mvn.cpp
@@ -163,6 +163,8 @@ attach_mvn_impl::attach_mvn_impl() {

std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),

std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
});