Add support for 3D-pooling layers, remove partial legacy support for channels_first in pooling layers (#375)
* Add setup for 3d-pooling support

* No channel_first support for global 3d-pooling layers

* Revert renaming of non-global pooling layers

* Split 2d and 3d pooling layers

* Remove unused code

* Implement global 3d-pooling layers

* Add support for channel_first in global 3d pooling layers

* Implement channels_first support for global pooling layers

* Unify 2d and 3d pooling layers

* Remove FDEEP_FORCE_INLINE

* Refactor common 3d-pooling code into base class

* Reduce passing of channels_first argument and reduce inner-loop ifs

* Unify channels_first loop and channels_last loop

* Reformat code

* Remove legacy support for channels_first in pooling layers

* Combine global pooling layers 1D and 2D and 3D into 3D
Dobiasd authored Jan 22, 2023
1 parent cfad37c commit b776285
Showing 23 changed files with 564 additions and 685 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -43,7 +43,7 @@ Would you like to build/train a model using Keras/Python? And would you like to
Layer types typically used in image recognition/generation are supported, making many popular model architectures possible (see [Performance section](#performance)).

* `Add`, `Concatenate`, `Subtract`, `Multiply`, `Average`, `Maximum`, `Minimum`, `Dot`
* `AveragePooling1D/2D`, `GlobalAveragePooling1D/2D`
* `AveragePooling1D/2D/3D`, `GlobalAveragePooling1D/2D/3D`
* `Bidirectional`, `TimeDistributed`, `GRU`, `LSTM`, `CuDNNGRU`, `CuDNNLSTM`
* `Conv1D/2D`, `SeparableConv2D`, `DepthwiseConv2D`
* `Cropping1D/2D/3D`, `ZeroPadding1D/2D/3D`
@@ -52,7 +52,7 @@ Layer types typically used in image recognition/generation are supported, making
* `SpatialDropout1D`, `SpatialDropout2D`, `SpatialDropout3D`
* `RandomContrast`, `RandomFlip`, `RandomHeight`
* `RandomRotation`, `RandomTranslation`, `RandomWidth`, `RandomZoom`
* `MaxPooling1D/2D`, `GlobalMaxPooling1D/2D`
* `MaxPooling1D/2D/3D`, `GlobalMaxPooling1D/2D/3D`
* `ELU`, `LeakyReLU`, `ReLU`, `SeLU`, `PReLU`
* `Sigmoid`, `Softmax`, `Softplus`, `Tanh`
* `Exponential`, `GELU`, `Softsign`, `Rescaling`
@@ -73,13 +73,13 @@ Layer types typically used in image recognition/generation are supported, making

### Currently not supported are the following:

`ActivityRegularization`, `AdditiveAttention`, `Attention`, `AveragePooling3D`,
`ActivityRegularization`, `AdditiveAttention`, `Attention`,
`CategoryEncoding`, `CenterCrop`, `Conv2DTranspose` ([why](FAQ.md#why-are-conv2dtranspose-layers-not-supported)),
`Conv3D`, `ConvLSTM1D`, `ConvLSTM2D`, `Discretization`,
`GRUCell`, `Hashing`,
`IntegerLookup`, `Lambda` ([why](FAQ.md#why-are-lambda-layers-not-supported)),
`LayerNormalization`, `LocallyConnected1D`, `LocallyConnected2D`,
`LSTMCell`, `Masking`, `MaxPooling3D`, `MultiHeadAttention`,
`LSTMCell`, `Masking`, `MultiHeadAttention`,
`RepeatVector`, `Resizing`, `RNN`, `SimpleRNN`,
`SimpleRNNCell`, `StackedRNNCells`, `StringLookup`, `TextVectorization`,
`ThresholdedReLU`, `UnitNormalization`, `Upsampling3D`, `temporal` models
8 changes: 0 additions & 8 deletions include/fdeep/common.hpp
@@ -36,14 +36,6 @@
#include <string>
#include <stdexcept>

#if defined(__GNUC__) || defined(__GNUG__)
#define FDEEP_FORCE_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
#define FDEEP_FORCE_INLINE __forceinline
#else
#define FDEEP_FORCE_INLINE inline
#endif

namespace fdeep { namespace internal
{

125 changes: 125 additions & 0 deletions include/fdeep/convolution3d.hpp
@@ -0,0 +1,125 @@
// Copyright 2016, Tobias Hermann.
// https://github.com/Dobiasd/frugally-deep
// Distributed under the MIT License.
// (See accompanying LICENSE file or at
// https://opensource.org/licenses/MIT)

#pragma once

#include "fdeep/common.hpp"

#include "fdeep/filter.hpp"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

namespace fdeep { namespace internal
{

struct convolution3d_config
{
std::size_t pad_front_;
std::size_t pad_back_;
std::size_t pad_top_;
std::size_t pad_bottom_;
std::size_t pad_left_;
std::size_t pad_right_;
std::size_t out_size_d4_;
std::size_t out_height_;
std::size_t out_width_;
};

inline convolution3d_config preprocess_convolution_3d(
const shape3& filter_shape,
const shape3& strides,
padding pad_type,
std::size_t input_shape_size_d4,
std::size_t input_shape_height,
std::size_t input_shape_width)
{
const int filter_size_d4 = static_cast<int>(filter_shape.size_dim_4_);
const int filter_height = static_cast<int>(filter_shape.height_);
const int filter_width = static_cast<int>(filter_shape.width_);
const int in_size_d4 = static_cast<int>(input_shape_size_d4);
const int in_height = static_cast<int>(input_shape_height);
const int in_width = static_cast<int>(input_shape_width);
const int strides_d4 = static_cast<int>(strides.size_dim_4_);
const int strides_y = static_cast<int>(strides.height_);
const int strides_x = static_cast<int>(strides.width_);

int out_size_d4 = 0;
int out_height = 0;
int out_width = 0;

if (pad_type == padding::same || pad_type == padding::causal)
{
out_size_d4 = fplus::ceil(static_cast<float>(in_size_d4) / static_cast<float>(strides_d4) - 0.001);
out_height = fplus::ceil(static_cast<float>(in_height) / static_cast<float>(strides_y) - 0.001);
out_width = fplus::ceil(static_cast<float>(in_width) / static_cast<float>(strides_x) - 0.001);
}
else
{
out_size_d4 = fplus::ceil(static_cast<float>(in_size_d4 - filter_size_d4 + 1) / static_cast<float>(strides_d4) - 0.001);
out_height = fplus::ceil(static_cast<float>(in_height - filter_height + 1) / static_cast<float>(strides_y) - 0.001);
out_width = fplus::ceil(static_cast<float>(in_width - filter_width + 1) / static_cast<float>(strides_x) - 0.001);
}

int pad_front = 0;
int pad_back = 0;
int pad_top = 0;
int pad_bottom = 0;
int pad_left = 0;
int pad_right = 0;

if (pad_type == padding::same)
{
int pad_along_d4 = 0;
int pad_along_height = 0;
int pad_along_width = 0;

if (in_size_d4 % strides_d4 == 0)
pad_along_d4 = std::max(filter_size_d4 - strides_d4, 0);
else
pad_along_d4 = std::max(filter_size_d4 - (in_size_d4 % strides_d4), 0);
if (in_height % strides_y == 0)
pad_along_height = std::max(filter_height - strides_y, 0);
else
pad_along_height = std::max(filter_height - (in_height % strides_y), 0);
if (in_width % strides_x == 0)
pad_along_width = std::max(filter_width - strides_x, 0);
else
pad_along_width = std::max(filter_width - (in_width % strides_x), 0);

pad_front = pad_along_d4 / 2;
pad_back = pad_along_d4 - pad_front;
pad_top = pad_along_height / 2;
pad_bottom = pad_along_height - pad_top;
pad_left = pad_along_width / 2;
pad_right = pad_along_width - pad_left;
}
else if (pad_type == padding::causal)
{
pad_front = filter_size_d4 - 1;
pad_top = filter_height - 1;
pad_left = filter_width - 1;
}

std::size_t out_size_d4_size_t = fplus::integral_cast_throw<std::size_t>(out_size_d4);
std::size_t out_height_size_t = fplus::integral_cast_throw<std::size_t>(out_height);
std::size_t out_width_size_t = fplus::integral_cast_throw<std::size_t>(out_width);
std::size_t pad_front_size_t = fplus::integral_cast_throw<std::size_t>(pad_front);
std::size_t pad_back_size_t = fplus::integral_cast_throw<std::size_t>(pad_back);
std::size_t pad_top_size_t = fplus::integral_cast_throw<std::size_t>(pad_top);
std::size_t pad_bottom_size_t = fplus::integral_cast_throw<std::size_t>(pad_bottom);
std::size_t pad_left_size_t = fplus::integral_cast_throw<std::size_t>(pad_left);
std::size_t pad_right_size_t = fplus::integral_cast_throw<std::size_t>(pad_right);

return {pad_front_size_t, pad_back_size_t,
pad_top_size_t, pad_bottom_size_t,
pad_left_size_t, pad_right_size_t,
out_size_d4_size_t, out_height_size_t, out_width_size_t};
}

} } // namespace fdeep, namespace internal
1 change: 1 addition & 0 deletions include/fdeep/fdeep.hpp
@@ -14,6 +14,7 @@
#include "fdeep/tensor_pos.hpp"
#include "fdeep/node.hpp"
#include "fdeep/shape2.hpp"
#include "fdeep/shape3.hpp"
#include "fdeep/tensor_shape.hpp"
#include "fdeep/tensor_shape_variable.hpp"
#include "fdeep/recurrent_ops.hpp"
112 changes: 53 additions & 59 deletions include/fdeep/import_model.hpp
@@ -29,7 +29,7 @@

#include "fdeep/layers/add_layer.hpp"
#include "fdeep/layers/average_layer.hpp"
#include "fdeep/layers/average_pooling_2d_layer.hpp"
#include "fdeep/layers/average_pooling_3d_layer.hpp"
#include "fdeep/layers/batch_normalization_layer.hpp"
#include "fdeep/layers/bidirectional_layer.hpp"
#include "fdeep/layers/concatenate_layer.hpp"
@@ -42,10 +42,8 @@
#include "fdeep/layers/exponential_layer.hpp"
#include "fdeep/layers/flatten_layer.hpp"
#include "fdeep/layers/gelu_layer.hpp"
#include "fdeep/layers/global_average_pooling_1d_layer.hpp"
#include "fdeep/layers/global_max_pooling_1d_layer.hpp"
#include "fdeep/layers/global_average_pooling_2d_layer.hpp"
#include "fdeep/layers/global_max_pooling_2d_layer.hpp"
#include "fdeep/layers/global_average_pooling_3d_layer.hpp"
#include "fdeep/layers/global_max_pooling_3d_layer.hpp"
#include "fdeep/layers/hard_sigmoid_layer.hpp"
#include "fdeep/layers/input_layer.hpp"
#include "fdeep/layers/layer.hpp"
@@ -56,13 +56,13 @@
#include "fdeep/layers/permute_layer.hpp"
#include "fdeep/layers/prelu_layer.hpp"
#include "fdeep/layers/linear_layer.hpp"
#include "fdeep/layers/max_pooling_2d_layer.hpp"
#include "fdeep/layers/max_pooling_3d_layer.hpp"
#include "fdeep/layers/maximum_layer.hpp"
#include "fdeep/layers/minimum_layer.hpp"
#include "fdeep/layers/model_layer.hpp"
#include "fdeep/layers/multiply_layer.hpp"
#include "fdeep/layers/normalization_layer.hpp"
#include "fdeep/layers/pooling_2d_layer.hpp"
#include "fdeep/layers/pooling_3d_layer.hpp"
#include "fdeep/layers/relu_layer.hpp"
#include "fdeep/layers/repeat_vector_layer.hpp"
#include "fdeep/layers/rescaling_layer.hpp"
@@ -244,6 +242,26 @@ inline shape2 create_shape2(const nlohmann::json& data)
}
}

inline shape3 create_shape3(const nlohmann::json& data)
{
if (data.is_array())
{
assertion(data.size() == 1 || data.size() == 2 || data.size() == 3,
"invalid number of dimensions in shape3");
if (data.size() == 1)
return shape3(1, 1, data[0]);
if (data.size() == 2)
return shape3(1, data[0], data[1]);
else
return shape3(data[0], data[1], data[2]);
}
else
{
const std::size_t width = data;
return shape3(1, 1, width);
}
}

inline std::size_t create_size_t(const nlohmann::json& int_data)
{
const int val = int_data;
@@ -520,71 +538,43 @@ inline layer_ptr create_identity_layer(
return std::make_shared<linear_layer>(name);
}

inline layer_ptr create_max_pooling_2d_layer(
inline layer_ptr create_max_pooling_3d_layer(
const get_param_f&, const nlohmann::json& data,
const std::string& name)
{
const auto pool_size = create_shape2(data["config"]["pool_size"]);
const auto strides = create_shape2(data["config"]["strides"]);
const bool channels_first = json_object_get(data["config"], "data_format", std::string("channels_last")) == "channels_first";
const auto pool_size = create_shape3(data["config"]["pool_size"]);
const auto strides = create_shape3(data["config"]["strides"]);
const std::string padding_str = data["config"]["padding"];
const auto pad_type = create_padding(padding_str);
return std::make_shared<max_pooling_2d_layer>(name,
pool_size, strides, channels_first, pad_type);
return std::make_shared<max_pooling_3d_layer>(name,
pool_size, strides, pad_type);
}

inline layer_ptr create_average_pooling_2d_layer(
inline layer_ptr create_average_pooling_3d_layer(
const get_param_f&, const nlohmann::json& data,
const std::string& name)
{
const auto pool_size = create_shape2(data["config"]["pool_size"]);
const auto strides = create_shape2(data["config"]["strides"]);
const bool channels_first = json_object_get(data["config"], "data_format", std::string("channels_last")) == "channels_first";
const auto pool_size = create_shape3(data["config"]["pool_size"]);
const auto strides = create_shape3(data["config"]["strides"]);
const std::string padding_str = data["config"]["padding"];

const auto pad_type = create_padding(padding_str);
return std::make_shared<average_pooling_2d_layer>(name,
pool_size, strides, channels_first, pad_type);
return std::make_shared<average_pooling_3d_layer>(name,
pool_size, strides, pad_type);
}

inline layer_ptr create_global_max_pooling_1d_layer(
const get_param_f&, const nlohmann::json& data,
const std::string& name)
{
const bool channels_first = json_obj_has_member(data, "config")
&& json_object_get(data["config"], "data_format", std::string("channels_last")) == "channels_first";

return std::make_shared<global_max_pooling_1d_layer>(name, channels_first);
}

inline layer_ptr create_global_max_pooling_2d_layer(
const get_param_f&, const nlohmann::json& data,
const std::string& name)
{
const bool channels_first = json_obj_has_member(data, "config")
&& json_object_get(data["config"], "data_format", std::string("channels_last")) == "channels_first";

return std::make_shared<global_max_pooling_2d_layer>(name, channels_first);
}

inline layer_ptr create_global_average_pooling_1d_layer(
const get_param_f&, const nlohmann::json& data,
inline layer_ptr create_global_max_pooling_3d_layer(
const get_param_f&, const nlohmann::json&,
const std::string& name)
{
const bool channels_first = json_obj_has_member(data, "config")
&& json_object_get(data["config"], "data_format", std::string("channels_last")) == "channels_first";

return std::make_shared<global_average_pooling_1d_layer>(name, channels_first);
return std::make_shared<global_max_pooling_3d_layer>(name);
}

inline layer_ptr create_global_average_pooling_2d_layer(
const get_param_f&, const nlohmann::json& data,
inline layer_ptr create_global_average_pooling_3d_layer(
const get_param_f&, const nlohmann::json&,
const std::string& name)
{
const bool channels_first = json_obj_has_member(data, "config")
&& json_object_get(data["config"], "data_format", std::string("channels_last")) == "channels_first";

return std::make_shared<global_average_pooling_2d_layer>(name, channels_first);
return std::make_shared<global_average_pooling_3d_layer>(name);
}

inline layer_ptr create_upsampling_1d_layer(
@@ -1248,14 +1238,18 @@ inline layer_ptr create_layer(const get_param_f& get_param,
{"PReLU", create_prelu_layer },
{"ELU", create_elu_layer_isolated},
{"ReLU", create_relu_layer_isolated},
{"MaxPooling1D", create_max_pooling_2d_layer},
{"MaxPooling2D", create_max_pooling_2d_layer},
{"AveragePooling1D", create_average_pooling_2d_layer},
{"AveragePooling2D", create_average_pooling_2d_layer},
{"GlobalMaxPooling1D", create_global_max_pooling_1d_layer},
{"GlobalMaxPooling2D", create_global_max_pooling_2d_layer},
{"GlobalAveragePooling1D", create_global_average_pooling_1d_layer},
{"GlobalAveragePooling2D", create_global_average_pooling_2d_layer},
{"MaxPooling1D", create_max_pooling_3d_layer},
{"MaxPooling2D", create_max_pooling_3d_layer},
{"MaxPooling3D", create_max_pooling_3d_layer},
{"AveragePooling1D", create_average_pooling_3d_layer},
{"AveragePooling2D", create_average_pooling_3d_layer},
{"AveragePooling3D", create_average_pooling_3d_layer},
{"GlobalMaxPooling1D", create_global_max_pooling_3d_layer},
{"GlobalMaxPooling2D", create_global_max_pooling_3d_layer},
{"GlobalMaxPooling3D", create_global_max_pooling_3d_layer},
{"GlobalAveragePooling1D", create_global_average_pooling_3d_layer},
{"GlobalAveragePooling2D", create_global_average_pooling_3d_layer},
{"GlobalAveragePooling3D", create_global_average_pooling_3d_layer},
{"UpSampling1D", create_upsampling_1d_layer},
{"UpSampling2D", create_upsampling_2d_layer},
{"Dense", create_dense_layer},
