Skip to content

Commit

Permalink
Implement cudf::label_bins() (#7554)
Browse files Browse the repository at this point in the history
This PR resolves #7517, implementing a binning feature in `libcudf` to support [pandas.cut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html) in `cudf`.

Authors:
  - Vyas Ramasubramani (@vyasr)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - David (@davidwendt)
  - Jake Hemstad (@jrhemstad)
  - Nghia Truong (@ttnghia)

URL: #7554
  • Loading branch information
vyasr authored Mar 24, 2021
1 parent b271c06 commit 2aa9f5b
Show file tree
Hide file tree
Showing 8 changed files with 834 additions and 0 deletions.
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ test:
- test -f $PREFIX/include/cudf/ast/linearizer.hpp
- test -f $PREFIX/include/cudf/ast/operators.hpp
- test -f $PREFIX/include/cudf/binaryop.hpp
- test -f $PREFIX/include/cudf/labeling/label_bins.hpp
- test -f $PREFIX/include/cudf/column/column_factories.hpp
- test -f $PREFIX/include/cudf/column/column.hpp
- test -f $PREFIX/include/cudf/column/column_view.hpp
Expand All @@ -66,6 +67,7 @@ test:
- test -f $PREFIX/include/cudf/datetime.hpp
- test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp
- test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp
- test -f $PREFIX/include/cudf/detail/label_bins.hpp
- test -f $PREFIX/include/cudf/detail/binaryop.hpp
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ add_library(cudf
src/binaryop/jit/code/kernel.cpp
src/binaryop/jit/code/operation.cpp
src/binaryop/jit/code/traits.cpp
src/labeling/label_bins.cu
src/bitmask/null_mask.cu
src/column/column.cu
src/column/column_device_view.cu
Expand Down
58 changes: 58 additions & 0 deletions cpp/include/cudf/detail/label_bins.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/labeling/label_bins.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

namespace cudf {

namespace detail {

/**
* @addtogroup label_bins
* @{
* @file
* @brief Internal APIs for labeling values by bin.
*/

/**
* @copydoc cudf::label_bins(column_view const& input, column_view const& left_edges, inclusive
* left_inclusive, column_view const& right_edges, inclusive right_inclusive,
* rmm::mr::device_memory_resource* mr)
*
* @param stream Stream view on which to allocate resources and queue execution. Defaults to
* `rmm::cuda_stream_default`.
*/
std::unique_ptr<column> label_bins(
column_view const& input,
column_view const& left_edges,
inclusive left_inclusive,
column_view const& right_edges,
inclusive right_inclusive,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace detail
} // namespace cudf
81 changes: 81 additions & 0 deletions cpp/include/cudf/labeling/label_bins.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

namespace cudf {

/**
* @addtogroup label_bins
* @{
* @file
* @brief APIs for labeling values by bin.
*/

/**
* @brief Specifies whether the boundary point of a bin edge is itself part of the bin.
*/
enum class inclusive {
  YES,  ///< The boundary point is included in the bin.
  NO    ///< The boundary point is excluded from the bin.
};

/**
* @brief Labels elements based on membership in the specified bins.
*
* A bin `i` is defined by `left_edges[i], right_edges[i]`. Whether the edges are inclusive or
* not is determined by `left_inclusive` and `right_inclusive`, respectively.
*
* A value `input[j]` belongs to bin `i` if `input[j]` is contained in the range `left_edges[i],
* right_edges[i]` (with the specified inclusiveness), in which case `label[j] == i`. If
* `input[j]` does not belong to any bin, then `label[j]` is NULL.
*
* Notes:
* - If an empty set of edges is provided, all elements in `input` are labeled NULL.
* - NULL elements in `input` belong to no bin and their corresponding label is NULL.
* - NaN elements in `input` belong to no bin and their corresponding label is NULL.
* - Bins must be provided in monotonically increasing order, otherwise behavior is undefined.
* - If two or more bins overlap, behavior is undefined.
*
* @throws cudf::logic_error if `input.type() == left_edges.type() == right_edges.type()` is
* violated.
* @throws cudf::logic_error if `left_edges.size() != right_edges.size()`
* @throws cudf::logic_error if `left_edges.has_nulls()` or `right_edges.has_nulls()`
*
* @param input The input elements to label according to the specified bins.
* @param left_edges Values of the left edge of each bin.
* @param left_inclusive Whether or not the left edge is inclusive.
* @param right_edges Values of the right edge of each bin.
* @param right_inclusive Whether or not the right edge is inclusive.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return The integer labels of the elements in `input` according to the specified bins.
*/
std::unique_ptr<column> label_bins(
column_view const& input,
column_view const& left_edges,
inclusive left_inclusive,
column_view const& right_edges,
inclusive right_inclusive,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
5 changes: 5 additions & 0 deletions cpp/include/doxygen_groups.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
* @defgroup lists_gather Gathering
* @defgroup lists_elements Counting
* @defgroup lists_drop_duplicates Filtering
* @defgroup lists_sort Sorting
* @}
* @defgroup nvtext_apis NVText
* @{
Expand All @@ -164,4 +165,8 @@
* @defgroup utility_bitmask Bitmask
* @defgroup utility_error Exception
* @}
* @defgroup labeling_apis Labeling
* @{
* @defgroup label_bins Bin Labeling
* @}
*/
Loading

0 comments on commit 2aa9f5b

Please sign in to comment.