[GPU] Support 7D and 8D tensors (openvinotoolkit#16810)
vladimir-paramuzov authored Apr 13, 2023
1 parent dcf6fb1 commit 67c07cc
Showing 58 changed files with 1,507 additions and 829 deletions.
@@ -19,14 +19,14 @@ namespace cldnn {
/// <tr><th>Data type <th>activation format <th>weights format
/// <tr><td rowspan="7">F32 <td rowspan="4">bfyx <td>yxfb
/// <tr> <td>fyxb
/// <tr> <td>bs_xs_xsv8_bsv8
/// <tr> <td>bs_x_bsv16
/// <tr> <td>bs_fs_fsv8_bsv8
/// <tr> <td>bs_f_bsv16
/// <tr> <td rowspan="3">yxfb <td>bfyx
/// <tr> <td>yxfb
/// <tr> <td>bs_xs_xsv8_bsv8
/// <tr> <td>bs_fs_fsv8_bsv8
/// <tr><td rowspan="4">F16 <td rowspan="3">bfyx <td>yxfb
/// <tr> <td>fyxb
/// <tr> <td>bs_x_bsv16
/// <tr> <td>bs_f_bsv16
/// <tr> <td >yxfb <td>bfyx
/// </table>

20 changes: 15 additions & 5 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
@@ -57,6 +57,9 @@ struct format_traits {
static bool is_spatial_char(char c) { return std::string(spatial_chars()).find_first_of(c) != std::string::npos; }
/// @brief Checks if @p c represents group dimensions.
static bool is_group_char(char c) { return std::string(group_chars()).find_first_of(c) != std::string::npos; }

/// @brief Checks if order has @p c dimension.
bool has_dimension(char c) const { return order.find_first_of(c) != std::string::npos; }
};

/// @brief Represents memory formats (orders).
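The new has_dimension helper simply searches the traits' dimension-order string, so whether a format carries a given axis can be checked without enumerating format enums. A minimal sketch of the idea, using a simplified stand-in struct rather than the real format_traits (names below are illustrative only):

#include <iostream>
#include <string>

// Simplified stand-in for format_traits: `order` is the dimension-order
// string, e.g. "bfyx" for 4D data or "bfvuwzyx" for the new 8D layout.
struct traits_sketch {
    std::string order;
    bool has_dimension(char c) const {   // same logic as the helper added above
        return order.find_first_of(c) != std::string::npos;
    }
};

int main() {
    traits_sketch t8{"bfvuwzyx"};
    std::cout << t8.has_dimension('v') << " "    // 1: the 8D order has a 'v' axis
              << t8.has_dimension('g') << "\n";  // 0: no group axis in this order
    return 0;
}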
@@ -73,6 +76,8 @@ struct format {
bfyx, ///< the most common format for activations in clDNN.
bfzyx, ///< format for 5d data tensors
bfwzyx, ///< batch, feature, 4D spatial
bfuwzyx, ///< 7d tensor
bfvuwzyx, ///< 8d tensor
yxfb, ///< batch first, feature and than spatials
byxf, ///< used in bitmaps, input from user i.e b images of RGB format
fyxb, ///< format not used inside clDNN, but supported in reorder as extension
@@ -107,10 +112,10 @@ struct format {
bs_fs_zyx_bsv32_fsv32, ///< format used for big batches (batch and features blocked by 32)
bs_fs_zyx_bsv32_fsv16, ///< format used for big batches (batch blocked by 32, features blocked by 16)
fs_b_yx_fsv32, ///< format for input for fp16 primitives
bs_xs_xsv8_bsv8, ///< format used only for fully connected
bs_xs_xsv8_bsv16, ///< format used only for fully connected
bs_x_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx)
bs_fs_fsv8_bsv8, ///< format used only for fully connected
bs_fs_fsv8_bsv16, ///< format used only for fully connected
bs_f_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
///< (responses slice), bsv16 - 16 values of single batch slice, f - flattened plane of (fyx)
b_fs_yx_32fp, ///< format for data for binary convolutions
winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
nv12, ///< format for media nv12 input
@@ -326,7 +331,8 @@ struct format {
static bool is_simple_data_format(type fmt) {
return (fmt == yxfb || fmt == byxf ||
fmt == bfyx || fmt == fyxb ||
fmt == bfzyx || fmt == bfwzyx);
fmt == bfzyx || fmt == bfwzyx ||
fmt == bfuwzyx || fmt == bfvuwzyx);
}

static format get_default_format(size_t rank, bool is_weights = false, bool is_grouped = false);
@@ -393,6 +399,10 @@ struct format {
std::string to_string() const;
};

inline std::ostream& operator<<(std::ostream& os, const format& fmt) {
return os << fmt.to_string();
}

/// @}
/// @}
} // namespace cldnn
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp
@@ -527,7 +527,7 @@ struct layout {
// for smaller buffer which, currently, should always be performed
bool identical(const layout& other) const;

static size_t max_rank() { return 6; }
static size_t max_rank() { return 8; }
static ov::PartialShape transform(const ov::PartialShape& pshape, cldnn::format old_fmt, cldnn::format new_fmt);

size_t hash() const {
104 changes: 33 additions & 71 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp
@@ -31,7 +31,7 @@ namespace cldnn {

constexpr int32_t tensor_batch_dim_max = 1;
constexpr int32_t tensor_feature_dim_max = 1;
constexpr int32_t tensor_spatial_dim_max = 4;
constexpr int32_t tensor_spatial_dim_max = 6;
constexpr int32_t tensor_group_dim_max = 1;
constexpr int32_t tensor_dim_max = tensor_batch_dim_max + tensor_feature_dim_max + tensor_spatial_dim_max + tensor_group_dim_max;

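With the spatial limit raised from 4 to 6, the compile-time dimension budget in this header covers the new 7D/8D data formats plus one group axis. A quick arithmetic check of those constants (a standalone snippet, not code from this commit):

#include <cstdint>

constexpr int32_t tensor_batch_dim_max   = 1;
constexpr int32_t tensor_feature_dim_max = 1;
constexpr int32_t tensor_spatial_dim_max = 6;   // was 4 before this change
constexpr int32_t tensor_group_dim_max   = 1;
// 1 + 1 + 6 + 1 = 9 internal dims: enough for bfvuwzyx (8D) plus a group axis.
constexpr int32_t tensor_dim_max = tensor_batch_dim_max + tensor_feature_dim_max +
                                   tensor_spatial_dim_max + tensor_group_dim_max;
static_assert(tensor_dim_max == 9, "b + f + 6 spatial + g");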
@@ -346,7 +346,7 @@ struct tensor {
delim = ",";
}

std::vector<std::string> spatial_dim_names = {", x", ", y", ", z", ", w"};
std::vector<std::string> spatial_dim_names = {", x", ", y", ", z", ", w", ", u", ", v"};
for (size_t i = 0; i < spatial.size(); ++i) {
out << spatial_dim_names[i] << ":" << spatial[i];
}
@@ -471,84 +471,46 @@ struct tensor {
* @endcode
*/
tensor transform(cldnn::format new_fmt, value_type default_size) const {
cldnn::format format = cldnn::format::bfwzyx;
cldnn::format format = cldnn::format::bfvuwzyx;
auto val_order = format.internal_order();
auto new_order = new_fmt.internal_order();
std::vector<value_type> old_sizes = sizes();
std::vector<value_type> new_sizes(old_sizes.size(), default_size);
auto tmp = 1;
auto tmp_z = 1;
auto tmp_w = 1;
for (size_t i = 0; i < format.order().size(); i++) {
auto c = val_order[i];
// skip f and y, z for the formats that do not have it
if (((new_fmt == format::bs_xs_xsv8_bsv8) ||
(new_fmt == format::bs_xs_xsv8_bsv16) ||
(new_fmt == format::os_i_osv8__ai8) ||
(new_fmt == format::os_i_osv16__ai8) ||
(new_fmt == format::bs_x_bsv16)) &&
((c == 'f') ||
(c == 'y') ||
(c == 'z') ||
(c == 'w'))) {
if (new_order[i] == '?')
new_sizes[i] = default_size;

tmp *= old_sizes[i];
continue;
}

// skip z for the formats that do not have it
if (((new_fmt != format::bfzyx && new_fmt != format::b_fs_zyx_fsv16 && new_fmt != format::b_fs_zyx_fsv32 && new_fmt != format::bzyxf &&
new_fmt != format::bfwzyx && new_fmt != format::bs_fs_zyx_bsv16_fsv16 && new_fmt != format::bs_fs_zyx_bsv16_fsv32 &&
new_fmt != format::bs_fs_zyx_bsv32_fsv16 && new_fmt != format::bs_fs_zyx_bsv32_fsv32 &&
new_fmt != format::b_fs_zyx_fsv2 && new_fmt != format::b_fs_zyx_fsv4 &&
new_fmt != format::bs_fs_zyx_bsv8_fsv2 && new_fmt != format::bs_fs_zyx_bsv8_fsv4 &&
new_fmt != format::bs_fs_zyx_bsv16_fsv2 && new_fmt != format::bs_fs_zyx_bsv16_fsv4)) && (c == 'z')) {
if (new_order[i] == '?')
new_sizes[i] = default_size;

tmp_z *= old_sizes[i];
continue;
const auto& new_traits = format::traits(new_fmt);
const cldnn::format default_fmt = cldnn::format::bfvuwzyx;
static const std::map<char, char> flatten_mapping = {
{ 'v', 'u'},
{ 'u', 'w'},
{ 'w', 'z'},
{ 'z', 'y'}
};

for (size_t i = 0; i < default_fmt.order().size(); i++) {
auto target_dim = val_order[i]; //bfxywzuv
while (!new_traits.has_dimension(target_dim)) {
if (flatten_mapping.find(target_dim) != flatten_mapping.end()) {
target_dim = flatten_mapping.at(target_dim);
} else {
target_dim = new_fmt.order().back();
}
}

if (new_fmt != format::bfwzyx && c == 'w') {
if (new_order[i] == '?')
new_sizes[i] = default_size;

if (new_fmt == format::bfzyx || new_fmt == format::b_fs_zyx_fsv16 ||
new_fmt == format::bs_fs_zyx_bsv16_fsv16 || new_fmt == format::b_fs_zyx_fsv32 ||
new_fmt == format::bs_fs_zyx_bsv16_fsv32)
tmp_w *= old_sizes[i];
else
tmp_z *= old_sizes[i];
continue;
auto new_pos = new_order.find(target_dim);
if (new_pos != std::string::npos) {
if (new_sizes[new_pos] == -1) {
new_sizes[new_pos] = old_sizes[i];
} else {
new_sizes[new_pos] *= old_sizes[i];
}
}

auto new_pos = new_order.find(c);
if (new_pos == std::string::npos)
throw std::invalid_argument("cannot convert to new format");
new_sizes[new_pos] = old_sizes[i];
}

// in case of formats with smaller number of dimensions than input, flatten is performed below
if (tmp != 1 || tmp_z != 1 || tmp_w != 1) {
for (size_t i = 0; i < format.order().size(); i++) {
auto c = val_order[i];
if (c == 'x') {
auto new_pos = new_order.find(c);
new_sizes[new_pos] *= tmp;
}
if (c == 'y') {
auto new_pos = new_order.find(c);
if (new_pos != std::string::npos)
new_sizes[new_pos] *= tmp_z;
}
if (c == 'z') {
auto new_pos = new_order.find(c);
if (new_pos != std::string::npos)
new_sizes[new_pos] *= tmp_w;
}
for (size_t i = 0; i < new_order.size(); i++) {
auto c = new_order[i]; //bfxywz
if (c == '?')
continue;
if (new_sizes[i] == -1) {
new_sizes[i] = 1;
}
}

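The rewritten transform above replaces the old per-format special cases with a single generic rule: walk the source dimensions in the default 8D order and, whenever the destination format lacks an axis, fold its size into the next lower spatial axis along the v -> u -> w -> z -> y chain, falling back to the last axis of the destination order if the chain runs out. A standalone sketch of that flattening rule, simplified to plain strings and vectors instead of the cldnn types (all names below are illustrative, and the '?' placeholder and default-size handling of the real code are omitted):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Fold per-axis sizes given in src_order into a (possibly lower-rank)
// dst_order: axes missing from the destination collapse into the next
// lower spatial axis following v -> u -> w -> z -> y.
std::vector<int64_t> transform_sketch(const std::string& src_order,
                                      const std::vector<int64_t>& src_sizes,
                                      const std::string& dst_order) {
    static const std::map<char, char> flatten_mapping = {
        {'v', 'u'}, {'u', 'w'}, {'w', 'z'}, {'z', 'y'}};

    std::vector<int64_t> dst_sizes(dst_order.size(), 1);
    for (size_t i = 0; i < src_order.size(); ++i) {
        char target_dim = src_order[i];
        // Walk down the mapping until the destination contains the axis;
        // if the chain runs out, fold the size into the last destination axis.
        while (dst_order.find(target_dim) == std::string::npos) {
            auto it = flatten_mapping.find(target_dim);
            target_dim = (it != flatten_mapping.end()) ? it->second : dst_order.back();
        }
        dst_sizes[dst_order.find(target_dim)] *= src_sizes[i];
    }
    return dst_sizes;
}

int main() {
    // 7D "bfuwzyx" shape folded into 4D "bfyx": u, w and z collapse into y.
    auto out = transform_sketch("bfuwzyx", {2, 3, 4, 5, 6, 7, 8}, "bfyx");
    for (auto v : out) std::cout << v << ' ';   // prints: 2 3 840 8
    std::cout << '\n';
    return 0;
}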
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -50,8 +50,8 @@ format::type get_preferred_format(fully_connected_node const& node, const kernel

if (data_type_traits::is_floating_point(input_layout.data_type) &&
(is_batch_after_spatial(input_layout.format.order()) ||
input_layout.format == format::bs_x_bsv16 ||
input_layout.format == format::bs_xs_xsv8_bsv8))
input_layout.format == format::bs_f_bsv16 ||
input_layout.format == format::bs_fs_fsv8_bsv8))
return format::yxfb;

bool no_spatial_padding = true;
@@ -20,8 +20,8 @@

using namespace cldnn;

#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << "Remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << "Replace node: " << (id) << std::endl;
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": replace node: " << (id) << std::endl;

remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
bool remove_output_reorders)
@@ -13,7 +13,6 @@
#include "mvn_inst.h"
#include "to_string_utils.h"
#include "pooling_inst.h"
#include "reshape_inst.h"

#ifdef ENABLE_ONEDNN_FOR_GPU
#include "gemm_inst.h"
@@ -285,6 +284,8 @@ void propagate_formats_rec(std::map<program_node*, format::type>& fmt_map,
for (auto next : travel_direction_wrapper<dir>::next_nodes(node)) {
if (!next->is_in_data_flow())
continue;
if (!can_propagate_formats_rec<dir>(fmt_map, lo, node, next, fmt))
continue;
propagate_formats_rec<dir>(fmt_map, lo, node, next, fmt);
}
}
@@ -511,6 +512,8 @@ void minimize_local_reorders(program& p, std::map<program_node*, format::type>&
continue;

for (auto new_fmt : local_formats) {
if (fmt_map.at(node) != format::any && format::dimension(fmt_map.at(node)) != format::dimension(new_fmt))
continue;
fmt_map.at(node) = new_fmt;

auto reorders_cnt = count_reorders(fmt_map, lo, node);
19 changes: 18 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/eltwise.cpp
@@ -133,6 +133,7 @@ struct eltwise_impl : typed_primitive_impl_ocl<eltwise> {
input_pshape = extend_shape_to_rank_from_begin(input_pshape, out_pshape.size());
}
input_layout.set_partial_shape(extend_shape_to_rank_from_end(input_pshape));
input_layout.format = format::adjust_to_rank(input_layout.format, input_pshape.size());
}

return updated_impl_params;
@@ -163,7 +164,9 @@ attach_eltwise_impl::attach_eltwise_impl() {
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
};

implementation_map<eltwise>::add(impl_types::ocl,
@@ -213,6 +216,20 @@ attach_eltwise_impl::attach_eltwise_impl() {
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i64, format::bfwzyx),

std::make_tuple(data_types::f32, format::bfuwzyx),
std::make_tuple(data_types::f16, format::bfuwzyx),
std::make_tuple(data_types::i8, format::bfuwzyx),
std::make_tuple(data_types::u8, format::bfuwzyx),
std::make_tuple(data_types::i32, format::bfuwzyx),
std::make_tuple(data_types::i64, format::bfuwzyx),

std::make_tuple(data_types::f32, format::bfvuwzyx),
std::make_tuple(data_types::f16, format::bfvuwzyx),
std::make_tuple(data_types::i8, format::bfvuwzyx),
std::make_tuple(data_types::u8, format::bfvuwzyx),
std::make_tuple(data_types::i32, format::bfvuwzyx),
std::make_tuple(data_types::i64, format::bfvuwzyx),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
@@ -151,6 +151,9 @@ attach_fully_connected_impl::attach_fully_connected_impl() {
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_fsv8_bsv8),
std::make_tuple(data_types::f16, format::bs_fs_fsv8_bsv8),
std::make_tuple(data_types::f16, format::bs_fs_fsv8_bsv16),
});
}

@@ -223,9 +223,9 @@ kernel_selector::data_layout to_data_layout(format f) {
return kernel_selector::data_layout::b_fs_zyx_fsv4;
case format::b_fs_zyx_fsv32:
return kernel_selector::data_layout::b_fs_zyx_fsv32;
case format::bs_x_bsv16:
case format::bs_f_bsv16:
return kernel_selector::data_layout::bs_f_bsv16__af8;
case format::bs_xs_xsv8_bsv8:
case format::bs_fs_fsv8_bsv8:
return kernel_selector::data_layout::bs_f_bsv8__af8;
case format::winograd_2x3_s1_data:
return kernel_selector::data_layout::winograd_2x3_s1_data;
@@ -239,6 +239,10 @@
return kernel_selector::data_layout::fs_b_yx_fsv32;
case format::bfwzyx:
return kernel_selector::data_layout::bfwzyx;
case format::bfuwzyx:
return kernel_selector::data_layout::bfuwzyx;
case format::bfvuwzyx:
return kernel_selector::data_layout::bfvuwzyx;
case format::b_fs_zyx_fsv16:
return kernel_selector::data_layout::b_fs_zyx_fsv16;
case format::bs_fs_yx_bsv16_fsv32:
@@ -282,7 +286,7 @@ kernel_selector::data_layout to_data_layout(format f) {
case format::image_2d_rgba:
return kernel_selector::data_layout::image_2d_rgba;
default:
throw std::invalid_argument("Format f (" + std::to_string((int32_t)f.value) + ") is not a proper data layout");
OPENVINO_THROW("[GPU] Can't convert tensor format to kernel selector format as f=", f, " is not handled");
}
}

@@ -311,9 +315,9 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
case kernel_selector::data_layout::b_fs_zyx_fsv32:
return cldnn::format::b_fs_zyx_fsv32;
case kernel_selector::data_layout::bs_f_bsv8__af8:
return cldnn::format::bs_xs_xsv8_bsv8;
return cldnn::format::bs_fs_fsv8_bsv8;
case kernel_selector::data_layout::bs_f_bsv16__af8:
return cldnn::format::bs_x_bsv16;
return cldnn::format::bs_f_bsv16;
case kernel_selector::data_layout::winograd_2x3_s1_data:
return cldnn::format::winograd_2x3_s1_data;
case kernel_selector::data_layout::b_fs_yx_32fp:
@@ -324,6 +328,10 @@
return cldnn::format::fs_b_yx_fsv32;
case kernel_selector::data_layout::bfwzyx:
return cldnn::format::bfwzyx;
case kernel_selector::data_layout::bfuwzyx:
return cldnn::format::bfuwzyx;
case kernel_selector::data_layout::bfvuwzyx:
return cldnn::format::bfvuwzyx;
case kernel_selector::data_layout::bs_fs_yx_bsv16_fsv16:
return cldnn::format::bs_fs_yx_bsv16_fsv16;
case kernel_selector::data_layout::bs_fs_zyx_bsv32_fsv16:
@@ -482,12 +490,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::oizyx;
case format::iozyx:
return kernel_selector::weights_layout::iozyx;
case format::bs_xs_xsv8_bsv8:
case format::bs_fs_fsv8_bsv8:
case format::os_i_osv8__ai8:
return kernel_selector::weights_layout::os_i_osv8__ai8;
case format::os_i_osv16__ai8:
return kernel_selector::weights_layout::os_i_osv16__ai8;
case format::bs_x_bsv16:
case format::bs_f_bsv16:
return kernel_selector::weights_layout::os_i_osv16;
case format::os_is_zyx_isv16_osv16:
return kernel_selector::weights_layout::os_is_zyx_isv16_osv16;
@@ -656,7 +664,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
case kernel_selector::weights_layout::os_iyx_osv64:
return cldnn::format::os_iyx_osv64;
case kernel_selector::weights_layout::os_i_osv16:
return cldnn::format::bs_x_bsv16;
return cldnn::format::bs_f_bsv16;
case kernel_selector::weights_layout::os_i_osv8__ai8:
return cldnn::format::os_i_osv8__ai8;
case kernel_selector::weights_layout::os_i_osv16__ai8:
