Generalized null support in user defined functions #8213
**New file: `transform/jit/masked_udf_kernel.cu`** (`@@ -0,0 +1,89 @@`)

```cuda
/*
 * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Include Jitify's cstddef header first
#include <cstddef>
```

> **Review:** Why? The convention in cudf is to include from "near" to "far". So, you include …
>
> **Reply:** I think the problem here is that, technically, when this file is runtime compiled later, …
```cuda
#include <cuda/std/climits>
#include <cuda/std/cstddef>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

#include <transform/jit/operation-udf.hpp>

#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/wrappers/timestamps.hpp>

#include <cuda/std/tuple>
#include <tuple>

namespace cudf {
namespace transformation {
namespace jit {

template <typename T>
struct Masked {
  T value;
  bool valid;
};

template <typename TypeIn, typename MaskType, typename OffsetType>
__device__ auto make_args(cudf::size_type id, TypeIn in_ptr, MaskType in_mask, OffsetType in_offset)
{
  bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
  return cuda::std::make_tuple(in_ptr[id], valid);
}

template <typename InType, typename MaskType, typename OffsetType, typename... Arguments>
__device__ auto make_args(cudf::size_type id,
                          InType in_ptr,
                          MaskType in_mask,      // in practice, always cudf::bitmask_type const*
                          OffsetType in_offset,  // in practice, always cudf::size_type
                          Arguments... args)
{
  bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
  return cuda::std::tuple_cat(cuda::std::make_tuple(in_ptr[id], valid), make_args(id, args...));
}

template <typename TypeOut, typename... Arguments>
__global__ void generic_udf_kernel(cudf::size_type size,
                                   TypeOut* out_data,
                                   bool* out_mask,
                                   Arguments... args)
{
  int const tid    = threadIdx.x;
  int const blkid  = blockIdx.x;
  int const blksz  = blockDim.x;
  int const gridsz = gridDim.x;
  int const start  = tid + blkid * blksz;
  int const step   = blksz * gridsz;

  Masked<TypeOut> output;
  for (cudf::size_type i = start; i < size; i += step) {
    auto func_args = cuda::std::tuple_cat(
      cuda::std::make_tuple(&output.value),
      make_args(i, args...)  // passed int64*, bool*, int64, int64*, bool*, int64
    );
    cuda::std::apply(GENERIC_OP, func_args);

    out_data[i] = output.value;
    out_mask[i] = output.valid;
  }
}

}  // namespace jit
}  // namespace transformation
}  // namespace cudf
```
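For intuition about the `make_args` recursion above, here is a minimal host-side sketch: plain C++ with `std::` in place of `cuda::std::`, a toy `bit_is_set`, and illustrative names throughout. It shows how each `(data pointer, mask pointer, offset)` triple is flattened into a `(value, valid)` pair, producing one flat tuple that `apply` can hand to the UDF:

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>

// Toy stand-ins for cudf::bitmask_type / cudf::bit_is_set (illustrative only).
using bitmask_type = uint32_t;
bool bit_is_set(bitmask_type const* mask, int bit) { return mask[bit / 32] & (1u << (bit % 32)); }

// Base case: one (pointer, mask, offset) triple -> (value, valid).
template <typename T>
auto make_args(int id, T const* in_ptr, bitmask_type const* in_mask, int in_offset)
{
  bool valid = in_mask ? bit_is_set(in_mask, in_offset + id) : true;
  return std::make_tuple(in_ptr[id], valid);
}

// Recursive case: peel off one triple, then recurse on the remaining columns.
// Overload resolution prefers the base case for the final triple.
template <typename T, typename... Rest>
auto make_args(int id, T const* in_ptr, bitmask_type const* in_mask, int in_offset, Rest... rest)
{
  bool valid = in_mask ? bit_is_set(in_mask, in_offset + id) : true;
  return std::tuple_cat(std::make_tuple(in_ptr[id], valid), make_args(id, rest...));
}

int main()
{
  int64_t a[]       = {10, 20};
  double b[]        = {0.5, 1.5};
  bitmask_type mask = 0b01;  // row 0 valid, row 1 null

  // Two columns: `a` carries a null mask, `b` is all-valid (nullptr mask).
  auto args = make_args(1, a, &mask, 0, b, static_cast<bitmask_type const*>(nullptr), 0);
  // args == (20, false, 1.5, true)
  std::cout << std::get<0>(args) << " " << std::get<1>(args) << "\n";  // prints "20 0"
}
```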
**`transform.cpp`**

`@@ -15,6 +15,7 @@`

```cpp
 */

#include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
#include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>

#include <jit/cache.hpp>
#include <jit/parser.hpp>
```

> **Review:** I believe that jit headers should be included after cudf headers.
>
> **Reply:** 👍
`@@ -25,6 +26,7 @@`

```cpp
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/transform.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
```
`@@ -63,6 +65,81 @@ void unary_operation(mutable_column_view output,`

```cpp
    cudf::jit::get_data_ptr(input));
}

std::vector<std::string> make_template_types(column_view outcol_view, table_view data_view)
{
  std::string mskptr_type =
    cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::bitmask_type>())) + "*";
  std::string offset_type =
    cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));

  std::vector<std::string> template_types;
  template_types.reserve(data_view.num_columns() + 1);

  template_types.push_back(cudf::jit::get_type_name(outcol_view.type()));
  for (auto const& col : data_view) {
    template_types.push_back(cudf::jit::get_type_name(col.type()) + "*");
    template_types.push_back(mskptr_type);
    template_types.push_back(offset_type);
  }

  return template_types;
}
```

> **Review:** Wait, I see that you call …
>
> **Reply:** Nice catch - this was unsafe. Fixed.
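For concreteness, a hypothetical example of the resulting list for an `INT64` output column and two input columns of types `INT64` and `FLOAT64`. The exact strings depend on `cudf::jit::get_type_name`, so these are illustrative only:

```cpp
// Illustrative only: assumes get_type_name maps INT64 -> "long", FLOAT64 -> "double",
// bitmask_type -> "unsigned int", and offset_type -> "int".
std::vector<std::string> template_types = {
  "long",           // TypeOut
  "long*",          // column 0 data
  "unsigned int*",  // column 0 null mask
  "int",            // column 0 offset
  "double*",        // column 1 data
  "unsigned int*",  // column 1 null mask
  "int",            // column 1 offset
};
```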
```cpp
void generalized_operation(table_view data_view,
                           std::string const& udf,
                           data_type output_type,
                           mutable_column_view outcol_view,
                           mutable_column_view outmsk_view,
                           rmm::mr::device_memory_resource* mr)
{
  std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
```

> **Review:** One more thing I want to note is that, you can use …
>
> **Reply:** Fixed.
```cpp
  std::string generic_kernel_name =
    jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
      .instantiate(template_types);

  std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
    udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});

  // {size, out_ptr, out_mask_ptr, col0_ptr, col0_mask_ptr, col0_offset, col1_ptr...}
  std::vector<void*> kernel_args;
  kernel_args.reserve((data_view.num_columns() * 3) + 3);

  cudf::size_type size   = outcol_view.size();
  const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view);
  const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view);
  kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr});

  std::vector<const void*> data_ptrs;
  std::vector<cudf::bitmask_type const*> mask_ptrs;
  std::vector<cudf::offset_type> offsets;

  data_ptrs.reserve(data_view.num_columns());
  mask_ptrs.reserve(data_view.num_columns());
  offsets.reserve(data_view.num_columns());

  column_view col;
  for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
    col = data_view.column(col_idx);

    data_ptrs.push_back(cudf::jit::get_data_ptr(col));
    mask_ptrs.push_back(col.null_mask());
    offsets.push_back(col.offset());

    kernel_args.push_back(&data_ptrs[col_idx]);
    kernel_args.push_back(&mask_ptrs[col_idx]);
    kernel_args.push_back(&offsets[col_idx]);
  }
```

> **Review:** Can we use some type of …
>
> **Reply:** This is difficult due to the 1->3 transform going on here. I kept trying to do the same, but couldn't get anything that was cleaner.
>
> **Review:** How about using …
>
> **Reply:** I managed to use …
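One detail worth noting (a property of CUDA driver-style launches in general, not something specific to this diff): `kernel_args` holds the *addresses* of the launch arguments, not their values, so `size`, `outcol_ptr`, `outmsk_ptr`, and the three per-column vectors must stay alive and unmoved until the launch below. This is also why the `reserve` calls matter: a reallocation of `data_ptrs`, `mask_ptrs`, or `offsets` during the loop would invalidate the `&data_ptrs[col_idx]`-style pointers already stored in `kernel_args`.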
```cpp
  rmm::cuda_stream_view generic_stream;
  cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
    .get_kernel(generic_kernel_name,
                {},
                {{"transform/jit/operation-udf.hpp", generic_cuda_source}},
                {"-arch=sm_."})                                    //
    ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())  //
    ->launch(kernel_args.data());
}
```

> **Review:** Why …
>
> **Reply:** This should be fixed.

```cpp
}  // namespace jit
}  // namespace transformation
```
`@@ -89,6 +166,24 @@ std::unique_ptr<column> transform(column_view const& input,`

```cpp
  return output;
}

std::unique_ptr<column> generalized_masked_op(table_view data_view,
                                              std::string const& udf,
                                              data_type output_type,
                                              rmm::mr::device_memory_resource* mr)
{
  rmm::cuda_stream_view stream    = rmm::cuda_stream_default;
  std::unique_ptr<column> output  = make_fixed_width_column(output_type, data_view.num_rows());
  std::unique_ptr<column> output_mask =
    make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows());

  transformation::jit::generalized_operation(
    data_view, udf, output_type, *output, *output_mask, mr);

  auto final_output_mask = cudf::bools_to_mask(*output_mask);
  output.get()->set_null_mask(std::move(*(final_output_mask.first)));

  return output;
}
```
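The null-mask flow here: the kernel writes per-row validity into a temporary `BOOL8` column, `cudf::bools_to_mask` then packs those booleans into a bitmask buffer, and `set_null_mask` attaches that buffer to the output column. If I read the API correctly, `bools_to_mask` returns a (mask buffer, null count) pair, of which only the buffer is used here.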
```cpp
}  // namespace detail

std::unique_ptr<column> transform(column_view const& input,
```

`@@ -101,4 +196,12 @@ std::unique_ptr<column> transform(column_view const& input,`

```cpp
  return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr);
}

std::unique_ptr<column> generalized_masked_op(table_view data_view,
                                              std::string const& udf,
                                              data_type output_type,
                                              rmm::mr::device_memory_resource* mr)
{
  return detail::generalized_masked_op(data_view, udf, output_type, mr);
}

}  // namespace cudf
```

> **Review (on passing `table_view` by value):** Typically we pass in `table_view const&`, as copying it may involve recursively copying its children `column_view`s, which is more expensive.
>
> **Review:** This may need to be modified to use `table_view const&` (not just this, but in other places too).
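To tie the pieces together, a hedged usage sketch of the new entry point. Only the `generalized_masked_op` signature is taken from this diff; the header paths and the `apply_udf` wrapper are assumptions on my part, and in practice `udf_ptx` would be PTX compiled from a user's function (e.g. by Numba), which `parse_single_function_ptx` renames to `GENERIC_OP`:

```cpp
#include <memory>
#include <string>

#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>  // assumed home of the new declaration

#include <rmm/mr/device/per_device_resource.hpp>

// Hypothetical wrapper: run a masked UDF over all columns of `input`,
// producing an INT64 column whose null mask comes from the UDF's `valid` outputs.
std::unique_ptr<cudf::column> apply_udf(cudf::table_view const& input,
                                        std::string const& udf_ptx)
{
  return cudf::generalized_masked_op(input,
                                     udf_ptx,
                                     cudf::data_type{cudf::type_id::INT64},
                                     rmm::mr::get_current_device_resource());
}
```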
**New file (Python package `__init__.py`)** (`@@ -0,0 +1 @@`)

```python
from . import typing, lowering
```
**New file (Python operator lists)** (`@@ -0,0 +1,20 @@`)

```python
import operator

arith_ops = [
    operator.add,
    operator.sub,
    operator.mul,
    operator.truediv,
    operator.floordiv,
    operator.mod,
    operator.pow,
]

comparison_ops = [
    operator.eq,
    operator.ne,
    operator.lt,
    operator.le,
    operator.gt,
    operator.ge,
]
```

> **Review (on `arith_ops`):** I think there are more ops that can be added here, but my expectation / understanding is that this list is fine for this PR, and that we might add more in future PRs (along with tests for them) - does that match your thoughts / plans?
>
> **Reply:** I think we should add everything we can, provided that it will just work if we add it to this list as well as the tests - which ones am I missing?
>
> **Review:** From looking at Numba's implementation of the … Bitwise operations (for integer types):
>
> ```python
> bit_ops = (
>     operator.lshift,
>     operator.rshift,
>     operator.and_,
>     operator.or_,
>     operator.xor,
> )
> ```
>
> Inplace operations: … Most of the unary operations: … Maybe some instances of … Some of the unary ops: …
>
> I think it's fairly straightforward to add everything, but needs some thought / care to make sure that we only provide typing for things that make sense - e.g. not providing bitwise operations on floats. I do still think it would be better to get this over the line and then add more in another PR than to add to the workload of this PR (and I'm happy to help with the addition of more when I get a chance).
>
> **Reply:** Great summary. I'll make sure to file some issues for these once this merges. When we don't support something, should we just not provide typing for those cases, and let the typing machinery raise the standard numba error? Or would it be better to provide typing that detects the scenario and then errors with something more specific (I assume this would bubble up through numba's normal traceback system anyways)? Just thinking forward for disabling cases like …
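Presumably these lists are consumed by the `typing` and `lowering` submodules imported in the package `__init__.py` above, to register masked-aware overloads for each operator with Numba; that registration code is outside this excerpt.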
**New file (the Python `Masked` class)** (`@@ -0,0 +1,4 @@`)

```python
class Masked:
    def __init__(self, value, valid):
        self.value = value
        self.valid = valid
```

> **Review:** This isn't generally used in Python code outside of testing the implementation, but exists to give Numba something concrete to refer to for type inference, and to illustrate to the reader what a masked type looks like - is it worth adding a docstring explaining this here?
>
> **Reply:** Added a docstring here.
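For orientation: this Python `Masked` appears to mirror the device-side `Masked<T>` struct in `masked_udf_kernel.cu` above (a value paired with a validity flag), presumably the per-row shape that Numba-compiled UDFs exchange with the JIT kernel.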