Skip to content

Commit

Permalink
[AMP OP&Test] arange op support fp16/bf16 (#51106)
Browse files Browse the repository at this point in the history
* AMP arange & Test

* fix arange bfloat16 dtype

* update for review

* update for review2

* fix tile

* update

* fix ci

* r

* f

* fix windows ci

* update bfloat data

* fix bfloat16 input

* add print

* Update test_where_op.py

* update kernel

* del repeat

* update review
  • Loading branch information
yangjianfengo1 authored Mar 9, 2023
1 parent 2727ddd commit f344897
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 11 deletions.
34 changes: 25 additions & 9 deletions paddle/phi/kernels/gpu/arange_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "paddle/phi/kernels/arange_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/kernel_registry.h"
Expand All @@ -23,9 +26,11 @@

namespace phi {

template <typename T>
__global__ void Range(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
// Fills `out` with an arithmetic sequence: for every index visited by
// CUDA_KERNEL_LOOP, out[idx] = start + step * idx.
//
// T        - type in which the sequence value is computed.
// OUT_TYPE - element type of the output buffer; each computed value is
//            converted to it with static_cast before being stored.
//
// start, step - first value and increment of the sequence.
// size        - bound handed to CUDA_KERNEL_LOOP.
// out         - destination buffer written at each visited index.
//
// NOTE(review): CUDA_KERNEL_LOOP is defined outside this view; it is assumed
// to visit every index in [0, size) exactly once across the launch - confirm.
template <typename T, typename OUT_TYPE>
__global__ void Range(T start, T step, int64_t size, OUT_TYPE* out) {
  CUDA_KERNEL_LOOP(idx, size) {
    // Evaluate in the arithmetic type first, then narrow once on store.
    const auto value = start + step * idx;
    out[idx] = static_cast<OUT_TYPE>(value);
  }
}

template <typename T, typename Context>
Expand All @@ -34,9 +39,11 @@ void ArangeKernel(const Context& dev_ctx,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
T start_value = GetValue<T, Context>(dev_ctx, start);
T end_value = GetValue<T, Context>(dev_ctx, end);
T step_value = GetValue<T, Context>(dev_ctx, step);
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType start_value =
static_cast<MPType>(GetValue<T, Context>(dev_ctx, start));
MPType end_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, end));
MPType step_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, step));

int64_t size = 0;
phi::funcs::GetSize(start_value, end_value, step_value, &size);
Expand All @@ -49,7 +56,8 @@ void ArangeKernel(const Context& dev_ctx,
return;
}
int64_t grid = (size + block - 1) / block;
Range<T><<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
Range<MPType, T>
<<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}

template <typename T, typename Context>
Expand Down Expand Up @@ -78,8 +86,16 @@ template decltype(ArangeNullaryKernel<int, phi::GPUContext>)
ArangeNullaryKernel;
} // namespace phi

PD_REGISTER_KERNEL(
arange, GPU, ALL_LAYOUT, phi::ArangeKernel, float, double, int64_t, int) {
PD_REGISTER_KERNEL(arange,
GPU,
ALL_LAYOUT,
phi::ArangeKernel,
float,
double,
int64_t,
int,
phi::dtype::float16,
phi::dtype::bfloat16) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
Expand Down
46 changes: 45 additions & 1 deletion python/paddle/fluid/tests/unittests/test_arange.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import unittest

import numpy as np
from eager_op_test import OpTest
from eager_op_test import OpTest, convert_float_to_uint16

import paddle
from paddle.fluid import core
Expand Down Expand Up @@ -58,6 +58,50 @@ def init_config(self):
self.case = (0, 5, 1)


class TestFloa16ArangeOp(TestArangeOp):
    """float16 variant of the arange op test.

    Inherits everything from TestArangeOp and overrides only the
    configuration and the output check.
    """

    def init_config(self):
        """Select float16 and the paddle.arange API for this case."""
        # presumably (start, end, step), consumed by the parent class -
        # verify against TestArangeOp.
        self.case = (0, 5, 1)
        self.python_api = paddle.arange
        self.dtype = np.float16

    def test_check_output(self):
        """Run the inherited output check with its default arguments."""
        self.check_output()


@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
    "core is not compiled with CUDA or does not support bfloat16",
)
class TestBFloat16ArangeOp(OpTest):
    """bfloat16 case for the ``range`` op, checked on CUDAPlace(0).

    The test is skipped when Paddle is not built with CUDA or when
    CUDAPlace(0) reports no bfloat16 support.
    """

    def setUp(self):
        """Build the op inputs and the expected output."""
        self.op_type = "range"
        # init_config must run first: it defines self.start / end / step.
        self.init_config()
        # Inputs and expected output all go through convert_float_to_uint16,
        # presumably encoding bfloat16 bit patterns as uint16 - confirm
        # against eager_op_test.
        self.inputs = {
            'Start': convert_float_to_uint16(self.start),
            'End': convert_float_to_uint16(self.end),
            'Step': convert_float_to_uint16(self.step),
        }

        self.outputs = {
            'Out': convert_float_to_uint16(
                np.arange(self.start, self.end, self.step)
            )
        }

    def init_config(self):
        """Set the dtype, the Python API and the (start, end, step) case."""
        self.dtype = np.uint16
        self.python_api = arange_wrapper
        self.case = (0, 5, 1)
        # Each bound is a one-element float32 array built from self.case.
        self.start = np.array([self.case[0]]).astype(np.float32)
        self.end = np.array([self.case[1]]).astype(np.float32)
        self.step = np.array([self.case[2]]).astype(np.float32)

    def test_check_output(self):
        """Check the op output on the first CUDA device only."""
        place = core.CUDAPlace(0)
        self.check_output_with_place(place)


class TestInt32ArangeOp(TestArangeOp):
def init_config(self):
self.dtype = np.int32
Expand Down
2 changes: 1 addition & 1 deletion python/paddle/tensor/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
check_dtype(
dtype,
'dtype',
['float32', 'float64', 'int32', 'int64'],
['float32', 'float64', 'int32', 'int64', 'float16', 'uint16'],
'range/arange',
)
helper = LayerHelper('range', **locals())
Expand Down

0 comments on commit f344897

Please sign in to comment.