【Hackathon 6th No.10】Update paddle.searchsorted -part (#64331)
* update searchsorted

* fix impl

* fix impl dtype

* fix impl dtype

* fix impl dtype

* rerun ci

* update test

* fix test

* fix test

* update fp16 test

* fix impl

* fix impl

* fix impl

* fix impl

* fix impl
NKNaN authored May 23, 2024
1 parent 592ac86 commit 5ccfdff
Showing 4 changed files with 155 additions and 13 deletions.
4 changes: 3 additions & 1 deletion paddle/phi/kernels/gpu/searchsorted_kernel.cu
@@ -25,6 +25,8 @@ PD_REGISTER_KERNEL(searchsorted,
float,
double,
int,
int64_t) {
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
}
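
With the GPU kernel now registered for phi::dtype::float16 and phi::dtype::bfloat16, a half-precision call should dispatch directly on CUDA devices. A minimal usage sketch of what this enables, assuming a CUDA build of Paddle (the data mirrors the tests added below; expected lower-bound indices follow np.searchsorted):

import paddle

# Illustrative sketch, not part of the commit: run searchsorted on float16
# GPU tensors, which the kernel registration above now permits.
paddle.set_device("gpu")  # assumes a CUDA-enabled build
seq = paddle.to_tensor([1, 3, 5, 7, 9], dtype="float16")
vals = paddle.to_tensor([[3, 6, 9], [3, 6, 9]], dtype="float16")
out = paddle.searchsorted(seq, vals)
print(out)  # expected: [[1, 3, 4], [1, 3, 4]] (int64 lower-bound indices)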
69 changes: 62 additions & 7 deletions paddle/phi/kernels/impl/searchsorted_kernel_impl.h
@@ -17,7 +17,7 @@
#include <math.h>

#include "paddle/common/ddim.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/for_range.h"

namespace phi {
@@ -59,6 +59,54 @@ class GpuAndCpuSearchSortedCompute {
static HOSTDEVICE bool IsInf(int x UNUSED) { return false; }
static HOSTDEVICE bool IsInf(int64_t x UNUSED) { return false; }

HOSTDEVICE inline size_t LowerBound(const T1* x, size_t num, const T2& val) {
// @{ Group LowerBound
// The following code is from
// https://en.cppreference.com/w/cpp/algorithm/lower_bound
using MT1 = typename phi::dtype::MPTypeTrait<T1>::Type;
using MT2 = typename phi::dtype::MPTypeTrait<T2>::Type;
MT2 val_mt = static_cast<MT2>(val);

auto* first = x;
int64_t count = static_cast<int64_t>(num);
while (count > 0) {
int64_t step = (count >> 1);
auto* it = first + step;
MT1 it_mt = static_cast<MT1>(*it);
if (it_mt < val_mt) {
first = ++it;
count -= (step + 1);
} else {
count = step;
}
}
return static_cast<size_t>(first - x);
}

HOSTDEVICE inline size_t UpperBound(const T1* x, size_t num, const T2& val) {
// @{ Group UpperBound
// The following code is from
// https://en.cppreference.com/w/cpp/algorithm/upper_bound
using MT1 = typename phi::dtype::MPTypeTrait<T1>::Type;
using MT2 = typename phi::dtype::MPTypeTrait<T2>::Type;
MT2 val_mt = static_cast<MT2>(val);

auto* first = x;
int64_t count = static_cast<int64_t>(num);
while (count > 0) {
auto step = (count >> 1);
auto* it = first + step;
MT1 it_mt = static_cast<MT1>(*it);
if (val_mt < it_mt) {
count = step;
} else {
first = ++it;
count -= (step + 1);
}
}
return static_cast<size_t>(first - x);
}

HOSTDEVICE GpuAndCpuSearchSortedCompute(const T1* sequence_data,
const T2* value_data,
bool right,
@@ -74,19 +74,21 @@
seq_size_(seq_size),
out_data_(out_data) {}
HOSTDEVICE void operator()(int64_t idx) {
using MT2 = typename phi::dtype::MPTypeTrait<T2>::Type;
const T2* value_ptr = value_data_ + idx;
const MT2 value_mt = static_cast<MT2>(*value_ptr);
const T1* sequence_ptr = is_1d_boundaries_
? sequence_data_
: sequence_data_ + idx / val_size_ * seq_size_;
if (IsInf(*value_ptr) || IsNan(*value_ptr)) {
if (IsInf(value_mt) || IsNan(value_mt)) {
out_data_[idx] = seq_size_;
} else {
if (right_) {
out_data_[idx] = static_cast<OutType>(phi::funcs::UpperBound<T1, T2>(
sequence_ptr, seq_size_, *value_ptr));
out_data_[idx] = static_cast<OutType>(
UpperBound(sequence_ptr, seq_size_, *value_ptr));
} else {
out_data_[idx] = static_cast<OutType>(phi::funcs::LowerBound<T1, T2>(
sequence_ptr, seq_size_, *value_ptr));
out_data_[idx] = static_cast<OutType>(
LowerBound(sequence_ptr, seq_size_, *value_ptr));
}
}
}
@@ -166,11 +216,16 @@ void VisitDataTypeForSearchSorted(DataType type, Visitor visitor) {
visitor.template apply<int>();
} else if (type == DataType::INT64) {
visitor.template apply<int64_t>();
} else if (type == DataType::FLOAT16) {
visitor.template apply<phi::dtype::float16>();
} else if (type == DataType::BFLOAT16) {
visitor.template apply<phi::dtype::bfloat16>();
} else {
PADDLE_THROW(errors::InvalidArgument(
"The received values data type %s can not meet input requirements. "
"Because the given values data type of searchsorted operators must be "
"float32, float64, int32 or int64. Please input appropriate "
"bfloat16, float16, float32, float64, int32 or int64. Please input "
"appropriate "
"sorted_sequence again! ",
type));
}
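The new LowerBound/UpperBound helpers in this header replace the phi::funcs versions so that comparisons run in the "master" precision selected by MPTypeTrait (float32 for float16 and bfloat16), and the NaN/Inf check is likewise done on the widened value. A rough Python mirror of that loop structure, with an explicit upcast standing in for MPTypeTrait (illustration only, not the kernel code):

import numpy as np

def lower_bound(seq, val, compute_dtype=np.float32):
    # Compare in a wider type, as MPTypeTrait maps fp16/bf16 to float32.
    val = compute_dtype(val)
    first, count = 0, len(seq)
    while count > 0:
        step = count // 2
        it = first + step
        if compute_dtype(seq[it]) < val:
            first = it + 1
            count -= step + 1
        else:
            count = step
    return first  # index of the first element not less than val

def upper_bound(seq, val, compute_dtype=np.float32):
    val = compute_dtype(val)
    first, count = 0, len(seq)
    while count > 0:
        step = count // 2
        it = first + step
        if val < compute_dtype(seq[it]):
            count = step
        else:
            first = it + 1
            count -= step + 1
    return first  # index of the first element greater than val

seq = np.array([1, 3, 5, 7, 9], dtype=np.float16)
print(lower_bound(seq, np.float16(6)))  # 3
print(upper_bound(seq, np.float16(3)))  # 2
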
8 changes: 4 additions & 4 deletions python/paddle/tensor/search.py
@@ -1125,8 +1125,8 @@ def searchsorted(
Find the index of the corresponding `sorted_sequence` in the innermost dimension based on the given `values`.
Args:
sorted_sequence (Tensor): An input N-D or 1-D tensor with type int32, int64, float32, float64. The value of the tensor monotonically increases in the innermost dimension.
values (Tensor): An input N-D tensor value with type int32, int64, float32, float64.
sorted_sequence (Tensor): An input N-D or 1-D tensor with type int32, int64, float16, float32, float64, bfloat16. The value of the tensor monotonically increases in the innermost dimension.
values (Tensor): An input N-D tensor value with type int32, int64, float16, float32, float64, bfloat16.
out_int32 (bool, optional): Data type of the output tensor which can be int32, int64. The default value is False, and it indicates that the output data type is int64.
right (bool, optional): Find the upper or lower bounds of the sorted_sequence range in the innermost dimension based on the given `values`. If the value of the sorted_sequence is nan or inf, return the size of the innermost dimension.
The default value is False and it shows the lower bounds.
@@ -1168,13 +1168,13 @@ def searchsorted(
check_variable_and_dtype(
sorted_sequence,
'SortedSequence',
['float32', 'float64', 'int32', 'int64'],
['uint16', 'float16', 'float32', 'float64', 'int32', 'int64'],
'paddle.searchsorted',
)
check_variable_and_dtype(
values,
'Values',
['float32', 'float64', 'int32', 'int64'],
['uint16', 'float16', 'float32', 'float64', 'int32', 'int64'],
'paddle.searchsorted',
)

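On the Python side, the dtype whitelist now accepts 'float16' and 'uint16' (the name Paddle's dtype checker uses for bfloat16 storage). A short usage sketch of the documented flags, assuming a CUDA device with float16 support (expected indices match np.searchsorted):

import paddle

paddle.set_device("gpu")  # the half-precision kernels in this commit are registered for GPU
seq = paddle.to_tensor([2, 4, 6, 8, 10], dtype="float16")
vals = paddle.to_tensor([3, 6, 9], dtype="float16")
left = paddle.searchsorted(seq, vals)                   # lower bounds: [1, 2, 4]
right = paddle.searchsorted(seq, vals, right=True)      # upper bounds: [1, 3, 4]
small = paddle.searchsorted(seq, vals, out_int32=True)  # same indices, int32 output
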
87 changes: 86 additions & 1 deletion test/legacy_test/test_searchsorted_op.py
@@ -15,7 +15,7 @@
import unittest

import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16

import paddle
from paddle.base import core
@@ -92,6 +92,91 @@ def init_test_case(self):
self.side = "right"


@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_float16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA and not support the float16",
)
class TestSearchSortedFP16OP(TestSearchSorted):
def setUp(self):
self.python_api = paddle.searchsorted
self.op_type = "searchsorted"
self.dtype = np.float16
self.init_test_case()

self.inputs = {
'SortedSequence': self.sorted_sequence.astype(self.dtype),
'Values': self.values.astype(self.dtype),
}
self.attrs = {"out_int32": False, "right": False}
self.attrs["right"] = True if self.side == 'right' else False
self.outputs = {
'Out': np.searchsorted(
self.sorted_sequence, self.values, side=self.side
)
}

def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place, check_pir=True)

def init_test_case(self):
self.sorted_sequence = np.array([1, 3, 5, 7, 9])
self.values = np.array([[3, 6, 9], [3, 6, 9]])
self.side = "left"


class TestSearchSortedFP16OP_2(TestSearchSortedFP16OP):
def init_test_case(self):
self.sorted_sequence = np.array([1, 3, 5, 7, 9])
self.values = np.array([[3, 6, 9], [3, 6, 9]])
self.side = "right"


@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA and not support the bfloat16",
)
class TestSearchSortedBF16(TestSearchSorted):
def setUp(self):
self.python_api = paddle.searchsorted
self.public_python_api = paddle.searchsorted
self.op_type = "searchsorted"
self.python_out_sig = ["Out"]
self.dtype = np.uint16
self.np_dtype = np.float32
self.init_test_case()

self.inputs = {
'SortedSequence': convert_float_to_uint16(self.sorted_sequence),
'Values': convert_float_to_uint16(self.values),
}
self.attrs = {"out_int32": False, "right": False}
self.attrs["right"] = True if self.side == 'right' else False
self.outputs = {
'Out': np.searchsorted(
self.sorted_sequence, self.values, side=self.side
)
}

def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place, check_pir=True)

def init_test_case(self):
self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype(self.np_dtype)
self.values = np.array([[3, 6, 9], [3, 6, 9]]).astype(self.np_dtype)
self.side = "left"


class TestSearchSortedBF16_2(TestSearchSortedBF16):
def init_test_case(self):
self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype(self.np_dtype)
self.values = np.array([[3, 6, 9], [3, 6, 9]]).astype(self.np_dtype)
self.side = "right"


class TestSearchSortedAPI(unittest.TestCase):
def init_test_case(self):
self.sorted_sequence = np.array([2, 4, 6, 8, 10]).astype("float64")
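The bfloat16 test follows the usual OpTest recipe: build float32 data, convert it to uint16 storage with convert_float_to_uint16, and compare against np.searchsorted on the original float32 arrays. A standalone analogue, assuming a CUDA device with bfloat16 support (paddle.cast stands in for the test-only convert_float_to_uint16 helper; these small integers are exactly representable, so the indices should agree):

import numpy as np
import paddle

paddle.set_device("gpu")  # the bfloat16 kernel in this commit is registered for GPU
seq_np = np.array([1, 3, 5, 7, 9], dtype=np.float32)
vals_np = np.array([[3, 6, 9], [3, 6, 9]], dtype=np.float32)
ref = np.searchsorted(seq_np, vals_np, side="left")  # reference indices

seq_bf16 = paddle.cast(paddle.to_tensor(seq_np), "bfloat16")
vals_bf16 = paddle.cast(paddle.to_tensor(vals_np), "bfloat16")
out = paddle.searchsorted(seq_bf16, vals_bf16)
np.testing.assert_array_equal(out.numpy(), ref)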
