-
Notifications
You must be signed in to change notification settings - Fork 908
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement cudf::label_bins() (#7554)
This PR resolves #7517, implementing a binning feature in `libcudf` to support [pandas.cut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html) in `cudf`. Authors: - Vyas Ramasubramani (@vyasr) Approvers: - AJ Schmidt (@ajschmidt8) - David (@davidwendt) - Jake Hemstad (@jrhemstad) - Nghia Truong (@ttnghia) URL: #7554
- Loading branch information
Showing
8 changed files
with
834 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/labeling/label_bins.hpp> | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/column/column_view.hpp> | ||
#include <cudf/types.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
#include <rmm/mr/device/device_memory_resource.hpp> | ||
#include <rmm/mr/device/per_device_resource.hpp> | ||
|
||
namespace cudf { | ||
|
||
namespace detail { | ||
|
||
/** | ||
* @addtogroup label_bins | ||
* @{ | ||
* @file | ||
* @brief Internal APIs for labeling values by bin. | ||
*/ | ||
|
||
/** | ||
* @copydoc cudf::label_bins(column_view const& input, column_view const& left_edges, inclusive | ||
* left_inclusive, column_view const& right_edges, inclusive right_inclusive, | ||
* rmm::mr::device_memory_resource* mr) | ||
* | ||
* @param stream Stream view on which to allocate resources and queue execution. | ||
*/ | ||
std::unique_ptr<column> label_bins( | ||
column_view const& input, | ||
column_view const& left_edges, | ||
inclusive left_inclusive, | ||
column_view const& right_edges, | ||
inclusive right_inclusive, | ||
rmm::cuda_stream_view stream = rmm::cuda_stream_default, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** @} */ // end of group | ||
} // namespace detail | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
/* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/column/column_view.hpp> | ||
#include <cudf/types.hpp> | ||
|
||
#include <rmm/mr/device/device_memory_resource.hpp> | ||
#include <rmm/mr/device/per_device_resource.hpp> | ||
|
||
namespace cudf { | ||
|
||
/** | ||
* @addtogroup label_bins | ||
* @{ | ||
* @file | ||
* @brief APIs for labeling values by bin. | ||
*/ | ||
|
||
/** | ||
* @brief Enum used to define whether or not bins include their boundary points. | ||
* | ||
* Passed as the `left_inclusive` / `right_inclusive` arguments of `label_bins`: `YES` means the | ||
* corresponding edge value is part of the bin, `NO` means it is excluded. | ||
*/ | ||
enum class inclusive { YES, NO }; | ||
|
||
/** | ||
* @brief Labels elements based on membership in the specified bins. | ||
* | ||
* A bin `i` is defined by `left_edges[i], right_edges[i]`. Whether the edges are inclusive or | ||
* not is determined by `left_inclusive` and `right_inclusive`, respectively. | ||
* | ||
* A value `input[j]` belongs to bin `i` if `input[j]` is contained in the range `left_edges[i], | ||
* right_edges[i]` (with the specified inclusiveness), in which case `label[j] == i`. If | ||
* `input[j]` does not belong to any bin, then `label[j]` is NULL. | ||
* | ||
* Notes: | ||
* - If an empty set of edges is provided, all elements in `input` are labeled NULL. | ||
* - NULL elements in `input` belong to no bin and their corresponding label is NULL. | ||
* - NaN elements in `input` belong to no bin and their corresponding label is NULL. | ||
* - Bins must be provided in monotonically increasing order, otherwise behavior is undefined. | ||
* - If two or more bins overlap, behavior is undefined. | ||
* | ||
* @throws cudf::logic_error if `input.type() == left_edges.type() == right_edges.type()` is | ||
* violated. | ||
* @throws cudf::logic_error if `left_edges.size() != right_edges.size()` | ||
* @throws cudf::logic_error if `left_edges.has_nulls()` or `right_edges.has_nulls()` | ||
* | ||
* @param input The input elements to label according to the specified bins. | ||
* @param left_edges Values of the left edge of each bin. | ||
* @param left_inclusive Whether or not the left edge is inclusive. | ||
* @param right_edges Values of the right edge of each bin. | ||
* @param right_inclusive Whether or not the right edge is inclusive. | ||
* @param mr Device memory resource used to allocate the returned column's device memory. | ||
* @return The integer labels of the elements in `input` according to the specified bins. | ||
*/ | ||
std::unique_ptr<column> label_bins( | ||
column_view const& input, | ||
column_view const& left_edges, | ||
inclusive left_inclusive, | ||
column_view const& right_edges, | ||
inclusive right_inclusive, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** @} */ // end of group | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.