[Tensor] Update newly added features
This commit updates the recently added tensor features, including add_i_partial() and ele_mul().
The newly added functions are implemented against the revised tensor structure.

**Changes proposed in this PR:**
- Update the FloatTensor/HalfTensor classes with the newly added function, add_i_partial().
- Apply BLAS operations to the basic arithmetic operations in Tensor (see the sketch after this list).
- Enable SIMD acceleration for the height-width transpose in half precision.
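
For reference, the ele_mul()/ele_add()/ele_div() calls in this diff replace hand-written strided loops. Below is a minimal sketch of the per-element semantics the removed loops implemented; the parameter names and ordering are illustrative only and are not the actual blas_interface declarations.

```cpp
// Illustrative reference only: mirrors the removed scalar loop in
// FloatTensor::multiply(), not nntrainer's actual (possibly vectorized) kernels.
// Multiply: out[k] = alpha * x[k] * y[k] + beta * out[k], with per-operand strides.
// The calls in this diff pass alpha = 1.
static void ele_mul_ref(unsigned int n, const float *x, const float *y, float *out,
                        float alpha, float beta, unsigned int x_stride,
                        unsigned int y_stride, unsigned int o_stride) {
  for (unsigned int k = 0; k < n; ++k) {
    *out = alpha * (*x) * (*y) + beta * (*out);
    x += x_stride;
    y += y_stride;
    out += o_stride;
  }
}
```

ele_add() and ele_div() follow the same strided pattern; the removed loops computed out[k] = x[k] + alpha * y[k] and out[k] = x[k] / y[k], respectively.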

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong <[email protected]>
djeong20 committed Jul 10, 2024
1 parent dd29ddc commit acb6d4c
Showing 7 changed files with 87 additions and 94 deletions.
49 changes: 14 additions & 35 deletions nntrainer/tensor/float_tensor.cpp
@@ -396,18 +396,8 @@ Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output,
const float beta) const {
auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
float *out_buf) {
if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 &&
std::fpclassify(beta) == FP_ZERO) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
std::multiplies<float>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf * *m_buf + beta * *out_buf;
buf += strides[3];
m_buf += e.strides[3];
out_buf += output.getStrides()[3];
}
}
ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
strides[3]);
};

NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
@@ -436,17 +426,7 @@ Tensor &FloatTensor::divide(float const &value, Tensor &output) const {
Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const {
auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
float *out_buf) {
if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
std::divides<float>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf / *m_buf;
buf += strides[3];
m_buf += e.strides[3];
out_buf += output.getStrides()[3];
}
}
ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
};

apply_broadcast(m, f, output);
@@ -522,6 +502,15 @@ int FloatTensor::add_i(Tensor const &m, Tensor &output, float const alpha) {
return ML_ERROR_NONE;
}

int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
Tensor &m, unsigned int incX, unsigned int incY,
const Tensor alphas, unsigned int alpha_idx) {
saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
(float *)getAddress(addr_idx), incY);

return ML_ERROR_NONE;
}

Tensor &FloatTensor::add(float const &value, Tensor &output) const {
auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
apply(f, output);
@@ -532,18 +521,8 @@ Tensor &FloatTensor::add(Tensor const &m, Tensor &output,
float const alpha) const {
auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
float *out_buf) {
if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 &&
std::fpclassify(alpha) == FP_ZERO) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
std::plus<float>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf + *m_buf * alpha;
buf += strides[3];
m_buf += e.strides[3];
out_buf += strides[3];
}
}
ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
strides[3]);
};
apply_broadcast(m, f, output);
return output;
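The FP32 and FP16 add_i_partial() implementations added in this commit are thin wrappers over saxpy. For reference, this is the textbook BLAS axpy semantics they rely on; a minimal scalar sketch, while nntrainer's actual kernel may be vectorized.

```cpp
// Reference semantics of saxpy(n, a, x, incX, y, incY): y[i * incY] += a * x[i * incX].
// Scalar sketch only; the real BLAS routine typically uses SIMD.
static void saxpy_ref(unsigned int n, float a, const float *x, unsigned int incX,
                      float *y, unsigned int incY) {
  for (unsigned int i = 0; i < n; ++i)
    y[i * incY] += a * x[i * incX];
}
```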
7 changes: 7 additions & 0 deletions nntrainer/tensor/float_tensor.h
@@ -267,6 +267,13 @@ class FloatTensor : public TensorBase {
*/
int add_i(Tensor const &m, Tensor &output, float const alpha) override;

/**
* @copydoc Tensor::add_i_partial()
*/
int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY, const Tensor alphas,
unsigned int alpha_idx) override;

/**
* @copydoc Tensor::add(float const &value, Tensor &output)
*/
55 changes: 22 additions & 33 deletions nntrainer/tensor/half_tensor.cpp
@@ -395,17 +395,8 @@ Tensor &HalfTensor::multiply(Tensor const &m, Tensor &output,
const float beta) const {
auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf,
_FP16 *out_buf) {
if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 &&
std::fpclassify(beta) == FP_ZERO) {
ele_mul(e.buffer_size, buf, m_buf, out_buf);
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf;
buf += strides[3];
m_buf += e.strides[3];
out_buf += output.getStrides()[3];
}
}
ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
strides[3]);
};

NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
@@ -495,6 +486,15 @@ int HalfTensor::add_i(Tensor const &m, Tensor &output, float const alpha) {
return ML_ERROR_NONE;
}

int HalfTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
Tensor &m, unsigned int incX, unsigned int incY,
const Tensor alphas, unsigned int alpha_idx) {
saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX,
(_FP16 *)getAddress(addr_idx), incY);

return ML_ERROR_NONE;
}

Tensor &HalfTensor::add(float const &value, Tensor &output) const {
auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1,
static_cast<_FP16>(value));
@@ -506,16 +506,8 @@ Tensor &HalfTensor::add(Tensor const &m, Tensor &output,
float const alpha) const {
auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf,
_FP16 *out_buf) {
if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 1) {
ele_add(e.buffer_size, buf, m_buf, out_buf);
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf + *m_buf * static_cast<_FP16>(alpha);
buf += strides[3];
m_buf += e.strides[3];
out_buf += strides[3];
}
}
ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
strides[3]);
};
apply_broadcast(m, f, output);
return output;
@@ -1035,17 +1027,7 @@ Tensor &HalfTensor::divide(float const &value, Tensor &output) const {
Tensor &HalfTensor::divide(Tensor const &m, Tensor &output) const {
auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf,
_FP16 *out_buf) {
if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) {
std::transform(buf, buf + e.buffer_size, m_buf, out_buf,
std::divides<_FP16>());
} else {
for (unsigned int i = 0; i < e.buffer_size; ++i) {
*out_buf = *buf / *m_buf;
buf += strides[3];
m_buf += e.strides[3];
out_buf += output.getStrides()[3];
}
}
ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
};

apply_broadcast(m, f, output);
@@ -1136,7 +1118,14 @@ Tensor &HalfTensor::transpose(const std::string &direction,
}
} else {
if (is_format_nchw) {
transposeloop(l, i, k, j, SL, SI, SK, SJ);
for (unsigned int b = 0; b < batch(); ++b) {
for (unsigned int c = 0; c < channel(); ++c) {
transpose_matrix(
height(), width(), (_FP16 *)getData() + getIndex(b, c, 0, 0),
width(), (_FP16 *)output.getData() + output.getIndex(b, c, 0, 0),
output.width());
}
}
} else {
transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
}
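The per-(batch, channel) transpose_matrix() calls above replace the generic transposeloop macro for the NCHW height-width case, which is what allows a SIMD kernel to be used for half precision. A reference sketch of the assumed row-major transpose semantics follows; the leading-dimension arguments and behavior are assumptions, not the verified signature.

```cpp
// Assumed semantics of transpose_matrix(M, N, src, ld_src, dst, ld_dst):
// dst[n * ld_dst + m] = src[m * ld_src + n] for an M x N row-major source.
// The half-precision SIMD kernel is expected to produce the same result, faster.
template <typename T>
void transpose_matrix_ref(unsigned int M, unsigned int N, const T *src,
                          unsigned int ld_src, T *dst, unsigned int ld_dst) {
  for (unsigned int m = 0; m < M; ++m)
    for (unsigned int n = 0; n < N; ++n)
      dst[n * ld_dst + m] = src[m * ld_src + n];
}
```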
7 changes: 7 additions & 0 deletions nntrainer/tensor/half_tensor.h
@@ -267,6 +267,13 @@ class HalfTensor : public TensorBase {
*/
int add_i(Tensor const &m, Tensor &output, float const alpha) override;

/**
* @copydoc Tensor::add_i_partial()
*/
int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY, const Tensor alphas,
unsigned int alpha_idx) override;

/**
* @copydoc Tensor::add(float const &value, Tensor &output)
*/
29 changes: 16 additions & 13 deletions nntrainer/tensor/tensor.cpp
@@ -251,6 +251,20 @@ Tensor Tensor::multiply(Tensor const &m, const float beta) const {

Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
const float beta) const {
NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
<< "Tensor Format of " << getName() << ":"
<< ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
<< ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";

NNTR_THROW_IF(!getContiguous() || !m.getContiguous() ||
!output.getContiguous(),
std::invalid_argument)
<< getName() << " is not contiguous, cannot multiply";

NNTR_THROW_IF(!getContiguous() || !m.getContiguous() ||
!output.getContiguous(),
std::invalid_argument)
<< getName() << " is not contiguous, cannot multiply";
itensor->multiply(m, output, beta);
return output;
}
@@ -355,19 +369,8 @@ int Tensor::add_i(Tensor const &m, float const alpha) {
int Tensor::add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY,
const Tensor alphas, unsigned int alpha_idx) {
if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) {
saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
getAddress<float>(addr_idx), incY);
} else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX,
getAddress<_FP16>(addr_idx), incY);
#else
ml_loge("%s", "Error: enable-fp16 is not enabled");
return ML_ERROR_INVALID_PARAMETER;
#endif
}
return ML_ERROR_NONE;
return itensor->add_i_partial(len, addr_idx, m, incX, incY, alphas,
alpha_idx);
}

Tensor Tensor::add(Tensor const &m, float const alpha) const {
27 changes: 14 additions & 13 deletions nntrainer/tensor/tensor.h
@@ -23,6 +23,7 @@

#include <cstddef>

#include <blas_interface.h>
#include <nntrainer_log.h>
#include <tensor_base.h>

@@ -735,19 +736,19 @@ class Tensor {
*/
int add_i(Tensor const &m, float const alpha = 1.F);

/**
* @brief Do add_i for specific section
*
* @param len Length of the specific section
* @param addr_idx Starting index of the specific section
* @param m Input Tensor to be added
* @param incX Incremental index of X
* @param incY Incremental index of Y
* @param alphas Vector of multiple alpha values
* @param alpha_idx Index of alpha in alpha vector
* @retval #ML_ERROR_NONE Successful
* @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
*/
/**
* @brief Do add_i for specific section
*
* @param len Length of the specific section
* @param addr_idx Starting index of the specific section
* @param m Input Tensor to be added
* @param incX Incremental index of X
* @param incY Incremental index of Y
* @param alphas Vector of multiple alpha values
* @param alpha_idx Index of alpha in alpha vector
* @retval #ML_ERROR_NONE Successful
* @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter
*/
int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY, const Tensor alphas,
unsigned int alpha_idx);
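
A hedged usage sketch of add_i_partial(); the helper name, include path, and call-site values below are hypothetical and only illustrate the intended call pattern.

```cpp
#include <tensor.h> // nntrainer Tensor (header path assumed for illustration)

// Hypothetical helper: accumulate a scaled `len`-element section of `x` into `y`,
// starting at flat element offset `offset` within `y`. With unit strides this is
//   y[offset + i] += alphas[a_idx] * x[i]   for i in [0, len)
// Returns ML_ERROR_NONE on success (ML_ERROR_INVALID_PARAMETER is documented above).
int addSection(nntrainer::Tensor &y, nntrainer::Tensor &x,
               const nntrainer::Tensor alphas, unsigned int len,
               unsigned int offset, unsigned int a_idx) {
  return y.add_i_partial(len, offset, x, /*incX=*/1, /*incY=*/1, alphas, a_idx);
}
```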
7 changes: 7 additions & 0 deletions nntrainer/tensor/tensor_base.h
@@ -283,6 +283,13 @@ class TensorBase {
*/
virtual int add_i(Tensor const &m, Tensor &output, float const alpha) = 0;

/**
* @copydoc Tensor::add_i_partial()
*/
virtual int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
unsigned int incX, unsigned int incY,
const Tensor alphas, unsigned int alpha_idx) = 0;

/**
* @copydoc Tensor::add(float const &value, Tensor &output)
*/
