
[GPU/OpenCL] Initial version of Transpose (all axes) with OpenCL ops
Added a naive version of the OpenCL implementation for Transpose.
Incorporated the transpose kernels for the ops using blas_kernels.
Added a unit test for Transpose_cl.

Signed-off-by: Niket Agarwal <[email protected]>
niket-agarwal committed Oct 22, 2024
1 parent 72bdf54 commit 5d2614e
Showing 16 changed files with 1,152 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Applications/LLaMA/jni/transpose_layer.h
@@ -58,7 +58,7 @@ class TransposeLayer final : public nntrainer::Layer {
/**
* @copydoc bool supportBackwarding() const
*/
-  bool supportBackwarding() const override { return true; };
+  bool supportBackwarding() const override { return false; };

/**
* @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method)
14 changes: 12 additions & 2 deletions api/ccapi/include/layer.h
@@ -102,8 +102,9 @@ enum LayerType {
LAYER_LOSS_CONSTANT_DERIVATIVE, /**< Synthetic loss layer to feed constant
derivative */
LAYER_UPSAMPLE2D, /**< Upsample 2D Layer type */
-  LAYER_RMSNORM = ML_TRAIN_LAYER_TYPE_RMSNORM, /**<RMS NORM Layer */
-  LAYER_UNKNOWN = ML_TRAIN_LAYER_TYPE_UNKNOWN /**< Unknown */
+  LAYER_RMSNORM = ML_TRAIN_LAYER_TYPE_RMSNORM, /**<RMS NORM Layer */
+  LAYER_TRANSPOSE = ML_TRAIN_LAYER_TYPE_TRANSPOSE, /**< Transpose Layer type */
+  LAYER_UNKNOWN = ML_TRAIN_LAYER_TYPE_UNKNOWN /**< Unknown */
};

/**
@@ -332,6 +333,15 @@ RMSNormCl(const std::vector<std::string> &properties = {},
return createLayer(LayerType::LAYER_RMSNORM, properties, compute_engine);
}

/**
* @brief Helper function to create Transpose layer
*/
inline std::unique_ptr<Layer>
Transpose(const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
return createLayer(LayerType::LAYER_TRANSPOSE, properties, compute_engine);
}
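
A hedged usage sketch of this helper (the property list and engine choice are illustrative, not part of this diff):

#include <layer.h>

// Request the GPU compute engine so that the TransposeLayerCl registered
// in cl_context.cpp is the implementation that gets instantiated.
auto transpose = ml::train::layer::Transpose(
  {"name=transpose0"}, ml::train::LayerComputeEngine::GPU);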

/**
* @brief Helper function to create batch normalization layer
*/
7 changes: 4 additions & 3 deletions api/nntrainer-api-common.h
@@ -62,9 +62,10 @@ typedef enum {
27, /**< Layer Normalization Layer type (Since 7.0) */
ML_TRAIN_LAYER_TYPE_POSITIONAL_ENCODING =
28, /**< Positional Encoding Layer type (Since 7.0) */
-  ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
-  ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
-  ML_TRAIN_LAYER_TYPE_WEIGHT = 31, /**< Weight Layer type (Since 9.0)*/
+  ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
+  ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
+  ML_TRAIN_LAYER_TYPE_WEIGHT = 31, /**< Weight Layer type (Since 9.0)*/
+  ML_TRAIN_LAYER_TYPE_TRANSPOSE = 32, /**< Transpose Layer type */
ML_TRAIN_LAYER_TYPE_PREPROCESS_FLIP =
300, /**< Preprocess flip Layer (Since 6.5) */
ML_TRAIN_LAYER_TYPE_PREPROCESS_TRANSLATE =
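
A minimal sketch of reaching the new type from the C API (assuming the generic ml_train_layer_create() factory accepts the new enum value on this build; error handling abbreviated):

#include <nntrainer.h>

ml_train_layer_h layer = NULL;
int status = ml_train_layer_create(&layer, ML_TRAIN_LAYER_TYPE_TRANSPOSE);
if (status != ML_ERROR_NONE) {
  /* transpose may not be creatable through the C API on this build */
}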
5 changes: 5 additions & 0 deletions nntrainer/cl_context.cpp
@@ -23,6 +23,7 @@
#include <reshape_cl.h>
#include <rmsnorm_layer_cl.h>
#include <swiglu_cl.h>
#include <transpose_cl.h>

namespace nntrainer {

@@ -51,6 +52,10 @@ static void add_default_object(ClContext &cc) {

cc.registerFactory(nntrainer::createLayer<ConcatLayerCl>, ConcatLayerCl::type,
ml::train::LayerType::LAYER_CONCAT);

cc.registerFactory(nntrainer::createLayer<TransposeLayerCl>,
TransposeLayerCl::type,
ml::train::LayerType::LAYER_TRANSPOSE);
}

static void registerer(ClContext &cc) noexcept {
1 change: 1 addition & 0 deletions nntrainer/layers/cl_layers/meson.build
@@ -5,6 +5,7 @@ cl_layer_sources = [
'reshape_cl.cpp',
'rmsnorm_layer_cl.cpp',
'concat_cl.cpp',
'transpose_cl.cpp',
]

foreach s : cl_layer_sources
91 changes: 91 additions & 0 deletions nntrainer/layers/cl_layers/transpose_cl.cpp
@@ -0,0 +1,91 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Niket Agarwal <[email protected]>
*
* @file transpose_cl.cpp
* @date 31 July 2024
* @brief Implementation of transpose layer
* @see https://github.com/nnstreamer/nntrainer
* @author Niket Agarwal <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#include "transpose_cl.h"
#include <blas_kernel_interface.h>
#include <iostream>
#include <layer_context.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

void TransposeLayerCl::finalize(InitLayerContext &context) {
std::vector<TensorDim> dim = context.getInputDimensions();

for (unsigned int i = 0; i < dim.size(); ++i) {
if (dim[i].getDataLen() == 0) {
throw std::invalid_argument("Input dimension is not set");
} else {
// forwarding performs a channel-height transpose ("1:0:2"), so the
// output swaps the channel and height dimensions of the input
unsigned int channel = dim[i].channel();
dim[i].channel(dim[i].height());
dim[i].height(channel);
}
}

context.setOutputDimensions(dim);
}

void TransposeLayerCl::forwarding(RunLayerContext &context, bool training) {
Tensor &in = context.getInput(SINGLE_INOUT_IDX);
Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
transposeCl("1:0:2", in, out);
}

void TransposeLayerCl::incremental_forwarding(RunLayerContext &context,
unsigned int from,
unsigned int to, bool training) {
Tensor &in = context.getInput(SINGLE_INOUT_IDX);
Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
if (from) {
NNTR_THROW_IF(to - from != 1, std::invalid_argument)
<< "incremental step size is not 1";
from = 0;
to = 1;
}
transposeCl("1:0:2", in, out);
}

void TransposeLayerCl::calcDerivative(RunLayerContext &context) {
std::throw_with_nested(std::runtime_error("Training is not supported yet."));
}

void TransposeLayerCl::setProperty(const std::vector<std::string> &values) {
auto remain_props = loadProperties(values, transpose_props);
if (!remain_props.empty()) {
std::string msg = "[TransposeLayerCl] Unknown Layer Properties count " +
std::to_string(remain_props.size());
throw exception::not_supported(msg);
}
}

#ifdef PLUGGABLE

Layer *create_transpose_layer_cl() {
auto layer = new TransposeLayerCl();
return layer;
}

void destroy_transpose_layer_cl(Layer *layer) { delete layer; }

extern "C" {
LayerPluggable ml_train_layer_pluggable{create_transpose_layer_cl,
destroy_transpose_layer_cl};
}

#endif

} // namespace nntrainer
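
For reference, the hard-coded "1:0:2" direction used in forwarding above follows the Tensor::transpose convention and swaps the channel and height axes. A minimal CPU sketch of the equivalent operation on a BCHW-flattened buffer (illustrative only; the commit performs this on-device via transpose_cl_axis0):

// Reference semantics of transposeCl("1:0:2", in, out):
// the output is laid out as B x H x C x W, i.e. channel and height swapped.
static void transpose_channel_height_ref(const float *in, float *out,
                                         unsigned int batch,
                                         unsigned int channels,
                                         unsigned int height,
                                         unsigned int width) {
  for (unsigned int b = 0; b < batch; ++b)
    for (unsigned int c = 0; c < channels; ++c)
      for (unsigned int h = 0; h < height; ++h)
        for (unsigned int w = 0; w < width; ++w) {
          unsigned int src = ((b * channels + c) * height + h) * width + w;
          unsigned int dst = ((b * height + h) * channels + c) * width + w;
          out[dst] = in[src];
        }
}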
105 changes: 105 additions & 0 deletions nntrainer/layers/cl_layers/transpose_cl.h
@@ -0,0 +1,105 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Niket Agarwal <[email protected]>
*
* @file transpose_cl.h
* @date 31 July 2024
* @brief Implementation of transpose layer
* @see https://github.com/nnstreamer/nntrainer
* @author Niket Agarwal <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#ifndef __TRANSPOSE_LAYER_CL_H__
#define __TRANSPOSE_LAYER_CL_H__

#include <common_properties.h>
#include <layer_devel.h>
#include <opencl_buffer.h>
#include <opencl_kernel.h>

#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
do { \
if (tensor.empty()) \
tensor = Tensor(__VA_ARGS__); \
} while (0);

namespace nntrainer {

/**
* @brief A transpose layer.
*
*/
class TransposeLayerCl final : public Layer {
public:
/**
* @brief Construct a new transpose layer object
*
*/
TransposeLayerCl() : Layer(), transpose_props(props::Print()) {}

/**
* @brief Destroy the transpose layer object
*
*/
~TransposeLayerCl() {}

/**
* @copydoc Layer::finalize(InitLayerContext &context)
*/
void finalize(InitLayerContext &context) override;

/**
* @copydoc Layer::forwarding(RunLayerContext &context, bool training)
*/
void forwarding(RunLayerContext &context, bool training) override;

/**
* @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned
* int from, unsigned int to, bool training)
*/
void incremental_forwarding(RunLayerContext &context, unsigned int from,
unsigned int to, bool training) override;

/**
* @copydoc Layer::calcDerivative(RunLayerContext &context)
*/
void calcDerivative(RunLayerContext &context) override;

/**
* @copydoc bool supportBackwarding() const
*/
bool supportBackwarding() const override { return false; };

/**
* @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method)
*/
void exportTo(Exporter &exporter,
const ml::train::ExportMethods &method) const override{};

/**
* @copydoc Layer::getType()
*/
const std::string getType() const override { return TransposeLayerCl::type; };

/**
* @copydoc Layer::setProperty(const std::vector<std::string> &values)
*/
void setProperty(const std::vector<std::string> &values) override;

inline static const std::string type = "transpose";

/**
 * @brief OpenCL kernels for the three axis-pair transposes implemented in
 * blas_kernels: axis0 swaps channel and height, axis1 swaps height and
 * width, axis2 swaps channel and width; each has an FP16 variant.
 */
static opencl::Kernel kernel_transpose_axis0;
static opencl::Kernel kernel_transpose_fp16_axis0;
static opencl::Kernel kernel_transpose_axis1;
static opencl::Kernel kernel_transpose_fp16_axis1;
static opencl::Kernel kernel_transpose_axis2;
static opencl::Kernel kernel_transpose_fp16_axis2;

std::tuple<props::Print> transpose_props; /**< transpose layer properties :
currently only Print */
};
} // namespace nntrainer

#endif /* __TRANSPOSE_LAYER_CL_H__ */
54 changes: 54 additions & 0 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -235,4 +235,58 @@ void add_i_cl(Tensor const &input, Tensor &result) {
}
}

void transposeCl(const std::string &direction, Tensor const &in,
Tensor &result) {

unsigned int input_batch_size, input_height, input_width, input_channels;

input_batch_size = in.batch();
input_height = in.height();
input_width = in.width();
input_channels = in.channel();

if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
const float *data = in.getData();
float *rdata = result.getData();
// for transpose about channels and height
if (direction[0] == '1' && direction[2] == '0') {
transpose_cl_axis0(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about height and width
else if (direction[0] == '0' && direction[2] == '2') {
transpose_cl_axis1(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about channels and width
else if (direction[0] == '2' && direction[2] == '1') {
transpose_cl_axis2(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}

} else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data = in.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
// for transpose about channels and height
if (direction[0] == '1' && direction[2] == '0') {
transpose_cl_axis0(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about height and width
else if (direction[0] == '0' && direction[2] == '2') {
transpose_cl_axis1(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
// for transpose about channels and width
else if (direction[0] == '2' && direction[2] == '1') {
transpose_cl_axis2(data, rdata, input_batch_size, input_channels,
input_height, input_width);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

} // namespace nntrainer
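
A hedged sketch of calling the new interface directly (tensor construction is illustrative; note that the result tensor must be pre-allocated with the transposed dimensions, matching the finalize() logic above):

// B=1, C=2, H=3, W=4 input; "1:0:2" swaps channel and height,
// so the result is allocated as B=1, C=3, H=2, W=4.
nntrainer::Tensor in(1, 2, 3, 4);
nntrainer::Tensor result(1, 3, 2, 4);
nntrainer::transposeCl("1:0:2", in, result);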
9 changes: 9 additions & 0 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.h
@@ -70,5 +70,14 @@ void multiplyCl(Tensor &input, float const &value);
*/
void add_i_cl(Tensor const &input, Tensor &result);

/**
 * @brief Process data and dimensions for the transpose operation
 * @param[in] direction transpose direction string (e.g. "1:0:2")
 * @param[in] in input Tensor
 * @param[out] result output Tensor
 */
void transposeCl(const std::string &direction, Tensor const &in,
Tensor &result);

} // namespace nntrainer
#endif /* __BLAS_KERNEL_INTERFACE_H__ */
