[GPU] Support 7D and 8D tensors (openvinotoolkit#16810)
vladimir-paramuzov authored Apr 13, 2023
1 parent dcf6fb1 commit 67c07cc
Showing 58 changed files with 1,507 additions and 829 deletions.
@@ -19,14 +19,14 @@ namespace cldnn {
/// <tr><th>Data type <th>activation format <th>weights format
/// <tr><td rowspan="7">F32 <td rowspan="4">bfyx <td>yxfb
/// <tr> <td>fyxb
/// <tr> <td>bs_xs_xsv8_bsv8
/// <tr> <td>bs_x_bsv16
/// <tr> <td>bs_fs_fsv8_bsv8
/// <tr> <td>bs_f_bsv16
/// <tr> <td rowspan="3">yxfb <td>bfyx
/// <tr> <td>yxfb
/// <tr> <td>bs_xs_xsv8_bsv8
/// <tr> <td>bs_fs_fsv8_bsv8
/// <tr><td rowspan="4">F16 <td rowspan="3">bfyx <td>yxfb
/// <tr> <td>fyxb
/// <tr> <td>bs_x_bsv16
/// <tr> <td>bs_f_bsv16
/// <tr> <td >yxfb <td>bfyx
/// </table>

20 changes: 15 additions & 5 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
@@ -57,6 +57,9 @@ struct format_traits {
static bool is_spatial_char(char c) { return std::string(spatial_chars()).find_first_of(c) != std::string::npos; }
/// @brief Checks if @p c represents group dimensions.
static bool is_group_char(char c) { return std::string(group_chars()).find_first_of(c) != std::string::npos; }

/// @brief Checks if order has @p c dimension.
bool has_dimension(char c) const { return order.find_first_of(c) != std::string::npos; }
};

/// @brief Represents memory formats (orders).
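The new has_dimension helper simply searches the traits' dimension-order string, so whether a format carries a given axis can be checked without enumerating format enums. A minimal sketch of the idea, using a simplified stand-in struct rather than the real format_traits (names below are illustrative only):

#include <iostream>
#include <string>

// Simplified stand-in for format_traits: `order` is the dimension-order
// string, e.g. "bfyx" for 4D data or "bfvuwzyx" for the new 8D layout.
struct traits_sketch {
    std::string order;
    bool has_dimension(char c) const {   // same logic as the helper added above
        return order.find_first_of(c) != std::string::npos;
    }
};

int main() {
    traits_sketch t8{"bfvuwzyx"};
    std::cout << t8.has_dimension('v') << " "    // 1: the 8D order has a 'v' axis
              << t8.has_dimension('g') << "\n";  // 0: no group axis in this order
    return 0;
}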
@@ -73,6 +76,8 @@ struct format {
bfyx, ///< the most common format for activations in clDNN.
bfzyx, ///< format for 5d data tensors
bfwzyx, ///< batch, feature, 4D spatial
bfuwzyx, ///< 7d tensor
bfvuwzyx, ///< 8d tensor
yxfb, ///< batch first, feature and than spatials
byxf, ///< used in bitmaps, input from user i.e b images of RGB format
fyxb, ///< format not used inside clDNN, but supported in reorder as extension
@@ -107,10 +112,10 @@ struct format {
bs_fs_zyx_bsv32_fsv32, ///< format used for big batches (batch and features blocked by 32)
bs_fs_zyx_bsv32_fsv16, ///< format used for big batches (batch blocked by 32, features blocked by 16)
fs_b_yx_fsv32, ///< format for input for fp16 primitives
bs_xs_xsv8_bsv8, ///< format used only for fully connected
bs_xs_xsv8_bsv16, ///< format used only for fully connected
bs_x_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx)
bs_fs_fsv8_bsv8, ///< format used only for fully connected
bs_fs_fsv8_bsv16, ///< format used only for fully connected
bs_f_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
///< (responses slice), bsv16 - 16 values of single batch slice, f - flattened plane of (fyx)
b_fs_yx_32fp, ///< format for data for binary convolutions
winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
nv12, ///< format for media nv12 input
@@ -326,7 +331,8 @@ struct format {
static bool is_simple_data_format(type fmt) {
return (fmt == yxfb || fmt == byxf ||
fmt == bfyx || fmt == fyxb ||
fmt == bfzyx || fmt == bfwzyx);
fmt == bfzyx || fmt == bfwzyx ||
fmt == bfuwzyx || fmt == bfvuwzyx);
}

static format get_default_format(size_t rank, bool is_weights = false, bool is_grouped = false);
@@ -393,6 +399,10 @@ struct format {
std::string to_string() const;
};

inline std::ostream& operator<<(std::ostream& os, const format& fmt) {
return os << fmt.to_string();
}

/// @}
/// @}
} // namespace cldnn
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp
@@ -527,7 +527,7 @@ struct layout {
// for smaller buffer which, currently, should always be performed
bool identical(const layout& other) const;

static size_t max_rank() { return 6; }
static size_t max_rank() { return 8; }
static ov::PartialShape transform(const ov::PartialShape& pshape, cldnn::format old_fmt, cldnn::format new_fmt);

size_t hash() const {
104 changes: 33 additions & 71 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp
@@ -31,7 +31,7 @@ namespace cldnn {

constexpr int32_t tensor_batch_dim_max = 1;
constexpr int32_t tensor_feature_dim_max = 1;
constexpr int32_t tensor_spatial_dim_max = 4;
constexpr int32_t tensor_spatial_dim_max = 6;
constexpr int32_t tensor_group_dim_max = 1;
constexpr int32_t tensor_dim_max = tensor_batch_dim_max + tensor_feature_dim_max + tensor_spatial_dim_max + tensor_group_dim_max;

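With the spatial limit raised from 4 to 6, the compile-time dimension budget in this header covers the new 7D/8D data formats plus one group axis. A quick arithmetic check of those constants (a standalone snippet, not code from this commit):

#include <cstdint>

constexpr int32_t tensor_batch_dim_max   = 1;
constexpr int32_t tensor_feature_dim_max = 1;
constexpr int32_t tensor_spatial_dim_max = 6;   // was 4 before this change
constexpr int32_t tensor_group_dim_max   = 1;
// 1 + 1 + 6 + 1 = 9 internal dims: enough for bfvuwzyx (8D) plus a group axis.
constexpr int32_t tensor_dim_max = tensor_batch_dim_max + tensor_feature_dim_max +
                                   tensor_spatial_dim_max + tensor_group_dim_max;
static_assert(tensor_dim_max == 9, "b + f + 6 spatial + g");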
@@ -346,7 +346,7 @@ struct tensor {
delim = ",";
}

std::vector<std::string> spatial_dim_names = {", x", ", y", ", z", ", w"};
std::vector<std::string> spatial_dim_names = {", x", ", y", ", z", ", w", ", u", ", v"};
for (size_t i = 0; i < spatial.size(); ++i) {
out << spatial_dim_names[i] << ":" << spatial[i];
}
@@ -471,84 +471,46 @@ struct tensor {
* @endcode
*/
tensor transform(cldnn::format new_fmt, value_type default_size) const {
cldnn::format format = cldnn::format::bfwzyx;
cldnn::format format = cldnn::format::bfvuwzyx;
auto val_order = format.internal_order();
auto new_order = new_fmt.internal_order();
std::vector<value_type> old_sizes = sizes();
std::vector<value_type> new_sizes(old_sizes.size(), default_size);
auto tmp = 1;
auto tmp_z = 1;
auto tmp_w = 1;
for (size_t i = 0; i < format.order().size(); i++) {
auto c = val_order[i];
// skip f and y, z for the formats that do not have it
if (((new_fmt == format::bs_xs_xsv8_bsv8) ||
(new_fmt == format::bs_xs_xsv8_bsv16) ||
(new_fmt == format::os_i_osv8__ai8) ||
(new_fmt == format::os_i_osv16__ai8) ||
(new_fmt == format::bs_x_bsv16)) &&
((c == 'f') ||
(c == 'y') ||
(c == 'z') ||
(c == 'w'))) {
if (new_order[i] == '?')
new_sizes[i] = default_size;

tmp *= old_sizes[i];
continue;
}

// skip z for the formats that do not have it
if (((new_fmt != format::bfzyx && new_fmt != format::b_fs_zyx_fsv16 && new_fmt != format::b_fs_zyx_fsv32 && new_fmt != format::bzyxf &&
new_fmt != format::bfwzyx && new_fmt != format::bs_fs_zyx_bsv16_fsv16 && new_fmt != format::bs_fs_zyx_bsv16_fsv32 &&
new_fmt != format::bs_fs_zyx_bsv32_fsv16 && new_fmt != format::bs_fs_zyx_bsv32_fsv32 &&
new_fmt != format::b_fs_zyx_fsv2 && new_fmt != format::b_fs_zyx_fsv4 &&
new_fmt != format::bs_fs_zyx_bsv8_fsv2 && new_fmt != format::bs_fs_zyx_bsv8_fsv4 &&
new_fmt != format::bs_fs_zyx_bsv16_fsv2 && new_fmt != format::bs_fs_zyx_bsv16_fsv4)) && (c == 'z')) {
if (new_order[i] == '?')
new_sizes[i] = default_size;

tmp_z *= old_sizes[i];
continue;
const auto& new_traits = format::traits(new_fmt);
const cldnn::format default_fmt = cldnn::format::bfvuwzyx;
static const std::map<char, char> flatten_mapping = {
{ 'v', 'u'},
{ 'u', 'w'},
{ 'w', 'z'},
{ 'z', 'y'}
};

for (size_t i = 0; i < default_fmt.order().size(); i++) {
auto target_dim = val_order[i]; //bfxywzuv
while (!new_traits.has_dimension(target_dim)) {
if (flatten_mapping.find(target_dim) != flatten_mapping.end()) {
target_dim = flatten_mapping.at(target_dim);
} else {
target_dim = new_fmt.order().back();
}
}

if (new_fmt != format::bfwzyx && c == 'w') {
if (new_order[i] == '?')
new_sizes[i] = default_size;

if (new_fmt == format::bfzyx || new_fmt == format::b_fs_zyx_fsv16 ||
new_fmt == format::bs_fs_zyx_bsv16_fsv16 || new_fmt == format::b_fs_zyx_fsv32 ||
new_fmt == format::bs_fs_zyx_bsv16_fsv32)
tmp_w *= old_sizes[i];
else
tmp_z *= old_sizes[i];
continue;
auto new_pos = new_order.find(target_dim);
if (new_pos != std::string::npos) {
if (new_sizes[new_pos] == -1) {
new_sizes[new_pos] = old_sizes[i];
} else {
new_sizes[new_pos] *= old_sizes[i];
}
}

auto new_pos = new_order.find(c);
if (new_pos == std::string::npos)
throw std::invalid_argument("cannot convert to new format");
new_sizes[new_pos] = old_sizes[i];
}

// in case of formats with smaller number of dimensions than input, flatten is performed below
if (tmp != 1 || tmp_z != 1 || tmp_w != 1) {
for (size_t i = 0; i < format.order().size(); i++) {
auto c = val_order[i];
if (c == 'x') {
auto new_pos = new_order.find(c);
new_sizes[new_pos] *= tmp;
}
if (c == 'y') {
auto new_pos = new_order.find(c);
if (new_pos != std::string::npos)
new_sizes[new_pos] *= tmp_z;
}
if (c == 'z') {
auto new_pos = new_order.find(c);
if (new_pos != std::string::npos)
new_sizes[new_pos] *= tmp_w;
}
for (size_t i = 0; i < new_order.size(); i++) {
auto c = new_order[i]; //bfxywz
if (c == '?')
continue;
if (new_sizes[i] == -1) {
new_sizes[i] = 1;
}
}

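The rewritten transform above replaces the old per-format special cases with a single generic rule: walk the source dimensions in the default 8D order and, whenever the destination format lacks an axis, fold its size into the next lower spatial axis along the v -> u -> w -> z -> y chain, falling back to the last axis of the destination order if the chain runs out. A standalone sketch of that flattening rule, simplified to plain strings and vectors instead of the cldnn types (all names below are illustrative, and the '?' placeholder and default-size handling of the real code are omitted):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Fold per-axis sizes given in src_order into a (possibly lower-rank)
// dst_order: axes missing from the destination collapse into the next
// lower spatial axis following v -> u -> w -> z -> y.
std::vector<int64_t> transform_sketch(const std::string& src_order,
                                      const std::vector<int64_t>& src_sizes,
                                      const std::string& dst_order) {
    static const std::map<char, char> flatten_mapping = {
        {'v', 'u'}, {'u', 'w'}, {'w', 'z'}, {'z', 'y'}};

    std::vector<int64_t> dst_sizes(dst_order.size(), 1);
    for (size_t i = 0; i < src_order.size(); ++i) {
        char target_dim = src_order[i];
        // Walk down the mapping until the destination contains the axis;
        // if the chain runs out, fold the size into the last destination axis.
        while (dst_order.find(target_dim) == std::string::npos) {
            auto it = flatten_mapping.find(target_dim);
            target_dim = (it != flatten_mapping.end()) ? it->second : dst_order.back();
        }
        dst_sizes[dst_order.find(target_dim)] *= src_sizes[i];
    }
    return dst_sizes;
}

int main() {
    // 7D "bfuwzyx" shape folded into 4D "bfyx": u, w and z collapse into y.
    auto out = transform_sketch("bfuwzyx", {2, 3, 4, 5, 6, 7, 8}, "bfyx");
    for (auto v : out) std::cout << v << ' ';   // prints: 2 3 840 8
    std::cout << '\n';
    return 0;
}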
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -50,8 +50,8 @@ format::type get_preferred_format(fully_connected_node const& node, const kernel

if (data_type_traits::is_floating_point(input_layout.data_type) &&
(is_batch_after_spatial(input_layout.format.order()) ||
input_layout.format == format::bs_x_bsv16 ||
input_layout.format == format::bs_xs_xsv8_bsv8))
input_layout.format == format::bs_f_bsv16 ||
input_layout.format == format::bs_fs_fsv8_bsv8))
return format::yxfb;

bool no_spatial_padding = true;
@@ -20,8 +20,8 @@

using namespace cldnn;

#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << "Remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << "Replace node: " << (id) << std::endl;
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": replace node: " << (id) << std::endl;

remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
bool remove_output_reorders)
@@ -13,7 +13,6 @@
#include "mvn_inst.h"
#include "to_string_utils.h"
#include "pooling_inst.h"
#include "reshape_inst.h"

#ifdef ENABLE_ONEDNN_FOR_GPU
#include "gemm_inst.h"
@@ -285,6 +284,8 @@ void propagate_formats_rec(std::map<program_node*, format::type>& fmt_map,
for (auto next : travel_direction_wrapper<dir>::next_nodes(node)) {
if (!next->is_in_data_flow())
continue;
if (!can_propagate_formats_rec<dir>(fmt_map, lo, node, next, fmt))
continue;
propagate_formats_rec<dir>(fmt_map, lo, node, next, fmt);
}
}
@@ -511,6 +512,8 @@ void minimize_local_reorders(program& p, std::map<program_node*, format::type>&
continue;

for (auto new_fmt : local_formats) {
if (fmt_map.at(node) != format::any && format::dimension(fmt_map.at(node)) != format::dimension(new_fmt))
continue;
fmt_map.at(node) = new_fmt;

auto reorders_cnt = count_reorders(fmt_map, lo, node);
19 changes: 18 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/eltwise.cpp
@@ -133,6 +133,7 @@ struct eltwise_impl : typed_primitive_impl_ocl<eltwise> {
input_pshape = extend_shape_to_rank_from_begin(input_pshape, out_pshape.size());
}
input_layout.set_partial_shape(extend_shape_to_rank_from_end(input_pshape));
input_layout.format = format::adjust_to_rank(input_layout.format, input_pshape.size());
}

return updated_impl_params;
@@ -163,7 +164,9 @@ attach_eltwise_impl::attach_eltwise_impl() {
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
};

implementation_map<eltwise>::add(impl_types::ocl,
@@ -213,6 +216,20 @@ attach_eltwise_impl::attach_eltwise_impl() {
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i64, format::bfwzyx),

std::make_tuple(data_types::f32, format::bfuwzyx),
std::make_tuple(data_types::f16, format::bfuwzyx),
std::make_tuple(data_types::i8, format::bfuwzyx),
std::make_tuple(data_types::u8, format::bfuwzyx),
std::make_tuple(data_types::i32, format::bfuwzyx),
std::make_tuple(data_types::i64, format::bfuwzyx),

std::make_tuple(data_types::f32, format::bfvuwzyx),
std::make_tuple(data_types::f16, format::bfvuwzyx),
std::make_tuple(data_types::i8, format::bfvuwzyx),
std::make_tuple(data_types::u8, format::bfvuwzyx),
std::make_tuple(data_types::i32, format::bfvuwzyx),
std::make_tuple(data_types::i64, format::bfvuwzyx),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
@@ -151,6 +151,9 @@ attach_fully_connected_impl::attach_fully_connected_impl() {
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_fsv8_bsv8),
std::make_tuple(data_types::f16, format::bs_fs_fsv8_bsv8),
std::make_tuple(data_types::f16, format::bs_fs_fsv8_bsv16),
});
}

@@ -223,9 +223,9 @@ kernel_selector::data_layout to_data_layout(format f) {
return kernel_selector::data_layout::b_fs_zyx_fsv4;
case format::b_fs_zyx_fsv32:
return kernel_selector::data_layout::b_fs_zyx_fsv32;
case format::bs_x_bsv16:
case format::bs_f_bsv16:
return kernel_selector::data_layout::bs_f_bsv16__af8;
case format::bs_xs_xsv8_bsv8:
case format::bs_fs_fsv8_bsv8:
return kernel_selector::data_layout::bs_f_bsv8__af8;
case format::winograd_2x3_s1_data:
return kernel_selector::data_layout::winograd_2x3_s1_data;
@@ -239,6 +239,10 @@
return kernel_selector::data_layout::fs_b_yx_fsv32;
case format::bfwzyx:
return kernel_selector::data_layout::bfwzyx;
case format::bfuwzyx:
return kernel_selector::data_layout::bfuwzyx;
case format::bfvuwzyx:
return kernel_selector::data_layout::bfvuwzyx;
case format::b_fs_zyx_fsv16:
return kernel_selector::data_layout::b_fs_zyx_fsv16;
case format::bs_fs_yx_bsv16_fsv32:
@@ -282,7 +286,7 @@ kernel_selector::data_layout to_data_layout(format f) {
case format::image_2d_rgba:
return kernel_selector::data_layout::image_2d_rgba;
default:
throw std::invalid_argument("Format f (" + std::to_string((int32_t)f.value) + ") is not a proper data layout");
OPENVINO_THROW("[GPU] Can't convert tensor format to kernel selector format as f=", f, " is not handled");
}
}

@@ -311,9 +315,9 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
case kernel_selector::data_layout::b_fs_zyx_fsv32:
return cldnn::format::b_fs_zyx_fsv32;
case kernel_selector::data_layout::bs_f_bsv8__af8:
return cldnn::format::bs_xs_xsv8_bsv8;
return cldnn::format::bs_fs_fsv8_bsv8;
case kernel_selector::data_layout::bs_f_bsv16__af8:
return cldnn::format::bs_x_bsv16;
return cldnn::format::bs_f_bsv16;
case kernel_selector::data_layout::winograd_2x3_s1_data:
return cldnn::format::winograd_2x3_s1_data;
case kernel_selector::data_layout::b_fs_yx_32fp:
@@ -324,6 +328,10 @@
return cldnn::format::fs_b_yx_fsv32;
case kernel_selector::data_layout::bfwzyx:
return cldnn::format::bfwzyx;
case kernel_selector::data_layout::bfuwzyx:
return cldnn::format::bfuwzyx;
case kernel_selector::data_layout::bfvuwzyx:
return cldnn::format::bfvuwzyx;
case kernel_selector::data_layout::bs_fs_yx_bsv16_fsv16:
return cldnn::format::bs_fs_yx_bsv16_fsv16;
case kernel_selector::data_layout::bs_fs_zyx_bsv32_fsv16:
@@ -482,12 +490,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::oizyx;
case format::iozyx:
return kernel_selector::weights_layout::iozyx;
case format::bs_xs_xsv8_bsv8:
case format::bs_fs_fsv8_bsv8:
case format::os_i_osv8__ai8:
return kernel_selector::weights_layout::os_i_osv8__ai8;
case format::os_i_osv16__ai8:
return kernel_selector::weights_layout::os_i_osv16__ai8;
case format::bs_x_bsv16:
case format::bs_f_bsv16:
return kernel_selector::weights_layout::os_i_osv16;
case format::os_is_zyx_isv16_osv16:
return kernel_selector::weights_layout::os_is_zyx_isv16_osv16;
@@ -656,7 +664,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
case kernel_selector::weights_layout::os_iyx_osv64:
return cldnn::format::os_iyx_osv64;
case kernel_selector::weights_layout::os_i_osv16:
return cldnn::format::bs_x_bsv16;
return cldnn::format::bs_f_bsv16;
case kernel_selector::weights_layout::os_i_osv8__ai8:
return cldnn::format::os_i_osv8__ai8;
case kernel_selector::weights_layout::os_i_osv16__ai8:
