
[GPU/OpenCL] Initial version of Transpose (all axes) with OpenCL ops
Added a naive version of the OpenCL implementation for Transpose.
Incorporated the transpose kernels for the ops using blas_kernels.
Added a unit test for Transpose_cl.

Signed-off-by: Niket Agarwal <[email protected]>
niket-agarwal committed Oct 22, 2024
1 parent 72bdf54 commit 5d2614e
Showing 16 changed files with 1,152 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Applications/LLaMA/jni/transpose_layer.h
@@ -58,7 +58,7 @@ class TransposeLayer final : public nntrainer::Layer {
/**
* @copydoc bool supportBackwarding() const
*/
-  bool supportBackwarding() const override { return true; };
+  bool supportBackwarding() const override { return false; };

/**
* @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method)
14 changes: 12 additions & 2 deletions api/ccapi/include/layer.h
@@ -102,8 +102,9 @@ enum LayerType {
LAYER_LOSS_CONSTANT_DERIVATIVE, /**< Synthetic loss layer to feed constant
derivative */
LAYER_UPSAMPLE2D, /**< Upsample 2D Layer type */
-  LAYER_RMSNORM = ML_TRAIN_LAYER_TYPE_RMSNORM, /**<RMS NORM Layer */
-  LAYER_UNKNOWN = ML_TRAIN_LAYER_TYPE_UNKNOWN /**< Unknown */
+  LAYER_RMSNORM = ML_TRAIN_LAYER_TYPE_RMSNORM, /**<RMS NORM Layer */
+  LAYER_TRANSPOSE = ML_TRAIN_LAYER_TYPE_TRANSPOSE, /**< Transpose Layer type */
+  LAYER_UNKNOWN = ML_TRAIN_LAYER_TYPE_UNKNOWN /**< Unknown */
};

/**
@@ -332,6 +333,15 @@ RMSNormCl(const std::vector<std::string> &properties = {},
return createLayer(LayerType::LAYER_RMSNORM, properties, compute_engine);
}

/**
* @brief Helper function to create Transpose layer
*/
inline std::unique_ptr<Layer>
Transpose(const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
return createLayer(LayerType::LAYER_TRANSPOSE, properties, compute_engine);
}
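
A hedged usage sketch of this helper (the property list and engine choice are illustrative, not part of this diff):

#include <layer.h>

// Request the GPU compute engine so that the TransposeLayerCl registered
// in cl_context.cpp is the implementation that gets instantiated.
auto transpose = ml::train::layer::Transpose(
  {"name=transpose0"}, ml::train::LayerComputeEngine::GPU);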

/**
* @brief Helper function to create batch normalization layer
*/
7 changes: 4 additions & 3 deletions api/nntrainer-api-common.h
@@ -62,9 +62,10 @@ typedef enum {
27, /**< Layer Normalization Layer type (Since 7.0) */
ML_TRAIN_LAYER_TYPE_POSITIONAL_ENCODING =
28, /**< Positional Encoding Layer type (Since 7.0) */
-  ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
-  ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
-  ML_TRAIN_LAYER_TYPE_WEIGHT = 31, /**< Weight Layer type (Since 9.0)*/
+  ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
+  ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
+  ML_TRAIN_LAYER_TYPE_WEIGHT = 31, /**< Weight Layer type (Since 9.0)*/
+  ML_TRAIN_LAYER_TYPE_TRANSPOSE = 32, /**< Transpose Layer type */
ML_TRAIN_LAYER_TYPE_PREPROCESS_FLIP =
300, /**< Preprocess flip Layer (Since 6.5) */
ML_TRAIN_LAYER_TYPE_PREPROCESS_TRANSLATE =
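
A minimal sketch of reaching the new type from the C API (assuming the generic ml_train_layer_create() factory accepts the new enum value on this build; error handling abbreviated):

#include <nntrainer.h>

ml_train_layer_h layer = NULL;
int status = ml_train_layer_create(&layer, ML_TRAIN_LAYER_TYPE_TRANSPOSE);
if (status != ML_ERROR_NONE) {
  /* transpose may not be creatable through the C API on this build */
}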
5 changes: 5 additions & 0 deletions nntrainer/cl_context.cpp
@@ -23,6 +23,7 @@
#include <reshape_cl.h>
#include <rmsnorm_layer_cl.h>
#include <swiglu_cl.h>
#include <transpose_cl.h>

namespace nntrainer {

@@ -51,6 +52,10 @@ static void add_default_object(ClContext &cc) {

cc.registerFactory(nntrainer::createLayer<ConcatLayerCl>, ConcatLayerCl::type,
ml::train::LayerType::LAYER_CONCAT);

cc.registerFactory(nntrainer::createLayer<TransposeLayerCl>,
TransposeLayerCl::type,
ml::train::LayerType::LAYER_TRANSPOSE);
}

static void registerer(ClContext &cc) noexcept {
1 change: 1 addition & 0 deletions nntrainer/layers/cl_layers/meson.build
@@ -5,6 +5,7 @@ cl_layer_sources = [
'reshape_cl.cpp',
'rmsnorm_layer_cl.cpp',
'concat_cl.cpp',
'transpose_cl.cpp',
]

foreach s : cl_layer_sources
91 changes: 91 additions & 0 deletions nntrainer/layers/cl_layers/transpose_cl.cpp
@@ -0,0 +1,91 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Niket Agarwal <[email protected]>
*
* @file transpose_cl.cpp
* @date 31 July 2024
* @brief Implementation of transpose layer
* @see https://github.com/nnstreamer/nntrainer
* @author Niket Agarwal <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#include "transpose_cl.h"
#include <blas_kernel_interface.h>
#include <iostream>
#include <layer_context.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

void TransposeLayerCl::finalize(InitLayerContext &context) {
std::vector<TensorDim> dim = context.getInputDimensions();

for (unsigned int i = 0; i < dim.size(); ++i) {
if (dim[i].getDataLen() == 0) {
throw std::invalid_argument("Input dimension is not set");
} else {
// forwarding performs a channel-height transpose ("1:0:2"), so the
// output swaps the channel and height dimensions of the input
unsigned int channel = dim[i].channel();
dim[i].channel(dim[i].height());
dim[i].height(channel);
}
}

context.setOutputDimensions(dim);
}

void TransposeLayerCl::forwarding(RunLayerContext &context, bool training) {
Tensor &in = context.getInput(SINGLE_INOUT_IDX);
Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
transposeCl("1:0:2", in, out);
}

void TransposeLayerCl::incremental_forwarding(RunLayerContext &context,
unsigned int from,
unsigned int to, bool training) {
Tensor &in = context.getInput(SINGLE_INOUT_IDX);
Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
if (from) {
NNTR_THROW_IF(to - from != 1, std::invalid_argument)
<< "incremental step size is not 1";
from = 0;
to = 1;
}
transposeCl("1:0:2", in, out);
}

void TransposeLayerCl::calcDerivative(RunLayerContext &context) {
std::throw_with_nested(std::runtime_error("Training is not supported yet."));
}

void TransposeLayerCl::setProperty(const std::vector<std::string> &values) {
auto remain_props = loadProperties(values, transpose_props);
if (!remain_props.empty()) {
std::string msg = "[TransposeLayerCl] Unknown Layer Properties count " +
std::to_string(remain_props.size());
throw exception::not_supported(msg);
}
}

#ifdef PLUGGABLE

Layer *create_transpose_layer_cl() {
auto layer = new TransposeLayerCl();
return layer;
}

void destroy_transpose_layer_cl(Layer *layer) { delete layer; }

extern "C" {
LayerPluggable ml_train_layer_pluggable{create_transpose_layer_cl,
destroy_transpose_layer_cl};
}

#endif

} // namespace nntrainer
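
For reference, the hard-coded "1:0:2" direction used in forwarding above follows the Tensor::transpose convention and swaps the channel and height axes. A minimal CPU sketch of the equivalent operation on a BCHW-flattened buffer (illustrative only; the commit performs this on-device via transpose_cl_axis0):

// Reference semantics of transposeCl("1:0:2", in, out):
// the output is laid out as B x H x C x W, i.e. channel and height swapped.
static void transpose_channel_height_ref(const float *in, float *out,
                                         unsigned int batch,
                                         unsigned int channels,
                                         unsigned int height,
                                         unsigned int width) {
  for (unsigned int b = 0; b < batch; ++b)
    for (unsigned int c = 0; c < channels; ++c)
      for (unsigned int h = 0; h < height; ++h)
        for (unsigned int w = 0; w < width; ++w) {
          unsigned int src = ((b * channels + c) * height + h) * width + w;
          unsigned int dst = ((b * height + h) * channels + c) * width + w;
          out[dst] = in[src];
        }
}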
105 changes: 105 additions & 0 deletions nntrainer/layers/cl_layers/transpose_cl.h
@@ -0,0 +1,105 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Niket Agarwal <[email protected]>
*
* @file transpose_cl.h
* @date 31 July 2024
* @brief Implementation of transpose layer
* @see https://github.com/nnstreamer/nntrainer
* @author Niket Agarwal <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#ifndef __TRANSPOSE_LAYER_CL_H__
#define __TRANSPOSE_LAYER_CL_H__

#include <common_properties.h>
#include <layer_devel.h>
#include <opencl_buffer.h>
#include <opencl_kernel.h>

#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
do { \
if (tensor.empty()) \
tensor = Tensor(__VA_ARGS__); \
} while (0);

namespace nntrainer {

/**
* @brief A transpose layer.
*
*/
class TransposeLayerCl final : public Layer {
public:
/**
* @brief Construct a new transpose layer object
*
*/
TransposeLayerCl() : Layer(), transpose_props(props::Print()) {}

/**
* @brief Destroy the transpose layer object
*
*/
~TransposeLayerCl() {}

/**
* @copydoc Layer::finalize(InitLayerContext &context)
*/
void finalize(InitLayerContext &context) override;

/**
* @copydoc Layer::forwarding(RunLayerContext &context, bool training)
*/
void forwarding(RunLayerContext &context, bool training) override;

/**
* @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned
* int from, unsigned int to, bool training)
*/
void incremental_forwarding(RunLayerContext &context, unsigned int from,
unsigned int to, bool training) override;

/**
* @copydoc Layer::calcDerivative(RunLayerContext &context)
*/
void calcDerivative(RunLayerContext &context) override;

/**
* @copydoc bool supportBackwarding() const
*/
bool supportBackwarding() const override { return false; };

/**
* @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method)
*/
void exportTo(Exporter &exporter,
const ml::train::ExportMethods &method) const override{};

/**
* @copydoc Layer::getType()
*/
const std::string getType() const override { return TransposeLayerCl::type; };

/**
* @copydoc Layer::setProperty(const std::vector<std::string> &values)
*/
void setProperty(const std::vector<std::string> &values) override;

inline static const std::string type = "transpose";

/**
 * @brief OpenCL kernels for the three axis-pair transposes implemented in
 * blas_kernels: axis0 swaps channel and height, axis1 swaps height and
 * width, axis2 swaps channel and width; each has an FP16 variant.
 */
static opencl::Kernel kernel_transpose_axis0;
static opencl::Kernel kernel_transpose_fp16_axis0;
static opencl::Kernel kernel_transpose_axis1;
static opencl::Kernel kernel_transpose_fp16_axis1;
static opencl::Kernel kernel_transpose_axis2;
static opencl::Kernel kernel_transpose_fp16_axis2;

std::tuple<props::Print> transpose_props; /**< transpose layer properties :
currently only Print */
};
} // namespace nntrainer

#endif /* __TRANSPOSE_LAYER_CL_H__ */
54 changes: 54 additions & 0 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -235,4 +235,58 @@ void add_i_cl(Tensor const &input, Tensor &result) {
}
}

void transposeCl(const std::string &direction, Tensor const &in,
Tensor &result) {

unsigned int input_batch_size, input_height, input_width, input_channels;

input_batch_size = in.batch();
input_height = in.height();
input_width = in.width();
input_channels = in.channel();

if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
const float *data = in.getData();
float *rdata = result.getData();
// for transpose about channels and height
if (direction[0] == '1' && direction[2] == '0') {
transpose_cl_axis0(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about height and width
else if (direction[0] == '0' && direction[2] == '2') {
transpose_cl_axis1(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about channels and width
else if (direction[0] == '2' && direction[2] == '1') {
transpose_cl_axis2(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}

} else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data = in.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
// for transpose about channels and height
if (direction[0] == '1' && direction[2] == '0') {
transpose_cl_axis0(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about height and width
else if (direction[0] == '0' && direction[2] == '2') {
transpose_cl_axis1(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about channels and width
else if (direction[0] == '2' && direction[2] == '1') {
transpose_cl_axis2(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

} // namespace nntrainer
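
A hedged sketch of calling the new interface directly (tensor construction is illustrative; note that the result tensor must be pre-allocated with the transposed dimensions, matching the finalize() logic above):

// B=1, C=2, H=3, W=4 input; "1:0:2" swaps channel and height,
// so the result is allocated as B=1, C=3, H=2, W=4.
nntrainer::Tensor in(1, 2, 3, 4);
nntrainer::Tensor result(1, 3, 2, 4);
nntrainer::transposeCl("1:0:2", in, result);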
9 changes: 9 additions & 0 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.h
@@ -70,5 +70,14 @@ void multiplyCl(Tensor &input, float const &value);
*/
void add_i_cl(Tensor const &input, Tensor &result);

/**
 * @brief Process data and dimensions for the transpose operation
 * @param[in] direction transpose direction string (e.g. "1:0:2")
 * @param[in] in input Tensor
 * @param[out] result output Tensor
 */
void transposeCl(const std::string &direction, Tensor const &in,
Tensor &result);

} // namespace nntrainer
#endif /* __BLAS_KERNEL_INTERFACE_H__ */
