Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMP OP&Test] arange op support fp16/bf16 #51106

Merged
merged 18 commits into from
Mar 9, 2023
Merged
34 changes: 25 additions & 9 deletions paddle/phi/kernels/gpu/arange_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "paddle/phi/kernels/arange_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/kernel_registry.h"
Expand All @@ -23,9 +26,11 @@

namespace phi {

template <typename T>
__global__ void Range(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
// Writes an arithmetic sequence into `out`: element i receives
// start + step * i, converted to OUT_TYPE on store.
//
// T        - type of `start` and `step` (the caller in this file passes the
//            MPTypeTrait-derived type).
// OUT_TYPE - element type of the destination buffer.
// size     - number of elements to produce.
//
// NOTE(review): CUDA_KERNEL_LOOP is a project macro not visible here;
// presumably it walks `idx` over [0, size) across the launched threads --
// confirm against its definition.
template <typename T, typename OUT_TYPE>
__global__ void Range(T start, T step, int64_t size, OUT_TYPE* out) {
  CUDA_KERNEL_LOOP(idx, size) {
    // Evaluate the term first, then narrow once when storing.
    const auto value = start + step * idx;
    out[idx] = static_cast<OUT_TYPE>(value);
  }
}

template <typename T, typename Context>
Expand All @@ -34,9 +39,11 @@ void ArangeKernel(const Context& dev_ctx,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
T start_value = GetValue<T, Context>(dev_ctx, start);
T end_value = GetValue<T, Context>(dev_ctx, end);
T step_value = GetValue<T, Context>(dev_ctx, step);
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType start_value =
static_cast<MPType>(GetValue<T, Context>(dev_ctx, start));
MPType end_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, end));
MPType step_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, step));

int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
Expand All @@ -49,7 +56,8 @@ void ArangeKernel(const Context& dev_ctx,
return;
}
int64_t grid = (size + block - 1) / block;
Range<T><<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
Range<MPType, T>
<<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}

template <typename T, typename Context>
Expand Down Expand Up @@ -78,8 +86,16 @@ template decltype(ArangeNullaryKernel<int, phi::GPUContext>)
ArangeNullaryKernel;
} // namespace phi

PD_REGISTER_KERNEL(
arange, GPU, ALL_LAYOUT, phi::ArangeKernel, float, double, int64_t, int) {
PD_REGISTER_KERNEL(arange,
GPU,
ALL_LAYOUT,
phi::ArangeKernel,
float,
double,
int64_t,
int,
phi::dtype::float16,
phi::dtype::bfloat16) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
Expand Down
46 changes: 45 additions & 1 deletion python/paddle/fluid/tests/unittests/test_arange.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import unittest

import numpy as np
from eager_op_test import OpTest
from eager_op_test import OpTest, convert_float_to_uint16

import paddle
from paddle.fluid import core
Expand Down Expand Up @@ -58,6 +58,50 @@ def init_config(self):
self.case = (0, 5, 1)


class TestFloa16ArangeOp(TestArangeOp):
    """float16 variant of the arange op test.

    Only the configuration differs from the base test: the dtype is
    ``np.float16`` and the sequence is described by ``case`` as
    ``(start, end, step)``.
    """

    def init_config(self):
        """Select float16, the public ``paddle.arange`` API and the 0..5 case."""
        first, stop, stride = 0, 5, 1
        self.case = (first, stop, stride)
        self.python_api = paddle.arange
        self.dtype = np.float16

    def test_check_output(self):
        """Run the output check provided by the test base class."""
        self.check_output()


@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
    "core is not compiled with CUDA or does not support bfloat16",
)
class TestBFloat16ArangeOp(OpTest):
    """bfloat16 test for the ``range`` op, run on ``CUDAPlace(0)``.

    Skipped unless Paddle is compiled with CUDA and device 0 reports
    bfloat16 support. Inputs and the expected output are built as float32
    and passed through ``convert_float_to_uint16``.

    NOTE(review): ``convert_float_to_uint16`` is imported from
    ``eager_op_test``; presumably it yields the uint16 bit pattern used to
    carry bfloat16 values -- confirm against that helper.
    """

    def setUp(self):
        """Build the op type, the three one-element inputs and the expected output."""
        self.op_type = "range"
        self.init_config()
        self.inputs = {
            'Start': convert_float_to_uint16(self.start),
            'End': convert_float_to_uint16(self.end),
            'Step': convert_float_to_uint16(self.step),
        }

        # start/end/step are one-element arrays. Extract real Python scalars
        # before calling np.arange: implicitly converting an array with
        # ndim > 0 to a scalar is deprecated since NumPy 1.25.
        expected = np.arange(
            self.start.item(), self.end.item(), self.step.item()
        ).astype(np.float32)

        self.outputs = {'Out': convert_float_to_uint16(expected)}

    def init_config(self):
        """Set dtype, the Python API wrapper and the (start, end, step) case."""
        self.dtype = np.uint16
        self.python_api = arange_wrapper
        self.case = (0, 5, 1)
        # Keep each bound as a float32 array of shape [1]; setUp converts
        # these to the op's input representation.
        self.start = np.array([self.case[0]]).astype(np.float32)
        self.end = np.array([self.case[1]]).astype(np.float32)
        self.step = np.array([self.case[2]]).astype(np.float32)

    def test_check_output(self):
        """Check the op output on the first CUDA device."""
        place = core.CUDAPlace(0)
        self.check_output_with_place(place)


class TestInt32ArangeOp(TestArangeOp):
def init_config(self):
self.dtype = np.int32
Expand Down
2 changes: 1 addition & 1 deletion python/paddle/tensor/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,7 +1234,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
check_dtype(
dtype,
'dtype',
['float32', 'float64', 'int32', 'int64'],
['float32', 'float64', 'int32', 'int64', 'float16', 'uint16'],
'range/arange',
)
helper = LayerHelper('range', **locals())
Expand Down