Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds new CPU kernel for SGD op supporting BF16 data type #32162

Merged
merged 10 commits into from
Apr 14, 2021
6 changes: 5 additions & 1 deletion paddle/fluid/operators/optimizers/sgd_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include <string>

#include "paddle/fluid/operators/optimizers/sgd_op.h"

namespace paddle {
namespace operators {

Expand Down Expand Up @@ -127,4 +129,6 @@ REGISTER_OPERATOR(
ops::SGDOpInferVarType);
REGISTER_OP_CPU_KERNEL(
sgd, ops::SGDOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SGDOpKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>,
ops::SGDOpKernel<paddle::platform::CPUDeviceContext, double>);
303 changes: 210 additions & 93 deletions paddle/fluid/operators/optimizers/sgd_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,220 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/bfloat16.h"

namespace paddle {
namespace operators {

namespace detail {

template <typename T, int VariableTypeId>
struct sgd_dense_param_kernel {
void operator()() const {}
};

// LodTensor
template <typename T>
struct sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::LoDTensor>::kId> {
void operator()(const framework::ExecutionContext &ctx) const {
VLOG(4) << "[CPU]: sgd_dense_param_kernel<T, LoDTensor>";
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad = ctx.Input<framework::Tensor>("Grad");

const auto sz = param_out->numel();
jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
int64_t rows_idx = 0;
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());

auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
}
};

// SelectedRows
template <typename T>
struct sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::SelectedRows>::kId> {
void operator()(const framework::ExecutionContext &ctx) const {
VLOG(4) << "[CPU]: sgd_dense_param_kernel<T, SelectedRows>";
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");

const auto &grad_value = grad->value();
const auto &grad_rows = grad->rows();
const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>();
const T *lr = learning_rate->data<T>();
const int64_t *rows_data = grad_rows.data();
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());

jit::sgd_attr_t attr;
attr.param_height = param_out->dims()[0];
attr.param_width = param_out->numel() / attr.param_height;
attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size();

auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
}
};

// LodTensor
template <>
struct sgd_dense_param_kernel<
platform::bfloat16, framework::VarTypeTrait<framework::LoDTensor>::kId> {
void operator()(const framework::ExecutionContext &ctx) const {
VLOG(4) << "[CPU]: sgd_dense_param_kernel<bfloat16, LoDTensor>";
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad = ctx.Input<framework::Tensor>("Grad");
param_out->mutable_data<platform::bfloat16>(ctx.GetPlace());

auto p = framework::EigenVector<platform::bfloat16>::Flatten(*param);
auto g = framework::EigenVector<platform::bfloat16>::Flatten(*grad);
auto o = framework::EigenVector<platform::bfloat16>::Flatten(*param_out);
const auto *lr = learning_rate->data<platform::bfloat16>();

o = p - lr[0] * g;
}
};

// SelectedRows
template <>
struct sgd_dense_param_kernel<
platform::bfloat16, framework::VarTypeTrait<framework::SelectedRows>::kId> {
void operator()(const framework::ExecutionContext &ctx) const {
VLOG(4) << "[CPU]: sgd_dense_param_kernel<bfloat16, SelectedRows>";
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");

const auto &grad_value = grad->value();
const auto &grad_rows = grad->rows();
const auto grad_height = grad->height();
const int64_t grad_val_height = static_cast<int64_t>(grad_rows.size());
const auto grad_width = grad_value.numel() / grad_val_height;

const auto *grad_data = grad_value.data<platform::bfloat16>();
auto *out_data = param_out->data<platform::bfloat16>();
const auto *lr = learning_rate->data<platform::bfloat16>();

for (size_t i = 0; i < grad_rows.size(); ++i) {
PADDLE_ENFORCE_LT(
grad_rows[i], grad_height,
platform::errors::OutOfRange(
"Grad rows index value should be less than grad height."
"Got [%s], but expected less than [%s]",
grad_rows[i], grad_height));
const int64_t row = grad_rows[i];
for (int64_t j = 0; j < grad_width; ++j) {
out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
}
}
}
};

template <typename T>
void sgd_op_invoke_dense_param_kernel(const framework::ExecutionContext &ctx) {
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad_var = ctx.InputVar("Grad");

if (grad_var->IsType<framework::LoDTensor>()) {
const auto *grad = ctx.Input<framework::Tensor>("Grad");
const auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Param's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Param's "
"numel = [%s], ParamOut's numel = [%s]",
param->numel(), sz));
PADDLE_ENFORCE_EQ(grad->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Grad's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Grad's "
"numel = [%s], ParamOut's numel = [%s]",
grad->numel(), sz));

sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::LoDTensor>::kId>()(ctx);
} else if (grad_var->IsType<framework::SelectedRows>()) {
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp "
"should be equal with ParamOut if variable's "
"type is SelectedRows. "));
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");

// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad->rows().size() == 0) {
return;
}

auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(
grad->height(), out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp "
"should be equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
grad->height(), out_dims[0]));

auto &grad_value = grad->value();
auto &grad_rows = grad->rows();
const auto param_height = param_out->dims()[0];
const auto param_width = param_out->numel() / param_height;
// note: it is not grad->height()
const auto grad_height = static_cast<int64_t>(grad_rows.size());
const auto grad_width = grad_value.numel() / grad_height;

PADDLE_ENFORCE_EQ(
grad_width, param_width,
platform::errors::InvalidArgument(
"The grad_value's numel of SgdOp "
"should be equal with param_out's numel. But received "
"grad_value's numel [%s] and param_out's numel [%s]",
grad_width, param_width));

sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::SelectedRows>::kId>()(ctx);
} else {
PADDLE_ENFORCE_EQ(
false, true, platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}

} // namespace detail

template <typename DeviceContext, typename T>
class SGDOpKernel : public framework::OpKernel<T> {
public:
Expand All @@ -38,102 +244,12 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
const auto *grad_var = ctx.InputVar("Grad");

if (param_var->IsType<framework::LoDTensor>()) {
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
// Actually, all tensors are LoDTensor except SelectedRows.
if (grad_var->IsType<framework::LoDTensor>()) {
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Param's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Param's "
"numel = [%s], ParamOut's numel = [%s]",
param->numel(), sz));
PADDLE_ENFORCE_EQ(grad->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Grad's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Grad's "
"numel = [%s], ParamOut's numel = [%s]",
grad->numel(), sz));

jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
int64_t rows_idx = 0;
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());

auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
} else if (grad_var->IsType<framework::SelectedRows>()) {
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp "
"should be equal with ParamOut if variable's "
"type is SelectedRows. "));
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
auto &grad_rows = grad->rows();

// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad_rows.size() == 0) {
return;
}

auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(
grad->height(), out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp "
"should be equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
grad->height(), out_dims[0]));
auto &grad_value = grad->value();
const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>();
const T *lr = learning_rate->data<T>();
const int64_t *rows_data = grad_rows.data();
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());

jit::sgd_attr_t attr;
attr.param_height = out_dims[0];
attr.param_width = param_out->numel() / attr.param_height;
attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size();
PADDLE_ENFORCE_EQ(
attr.grad_width, attr.param_width,
platform::errors::InvalidArgument(
"The grad_value's numel of SgdOp "
"should be equal with param_out's numel. But received "
"grad_value's numel [%s] and param_out's numel [%s]",
attr.grad_width, attr.param_width));

auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
} else {
PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
detail::sgd_op_invoke_dense_param_kernel<T>(ctx);
} else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
platform::errors::InvalidArgument(
"when param is SelectedRows, "
"gradient should also be SelectedRows"));
"When param is SelectedRows, gradient should also "
"be SelectedRows"));
const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>();
Expand Down Expand Up @@ -179,5 +295,6 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
}
}
};

} // namespace operators
} // namespace paddle
Loading