Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor string conversion check #7599

Merged
78 changes: 1 addition & 77 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -146,82 +146,6 @@ std::unique_ptr<column> filter_characters_of_type(
string_character_types types_to_keep = string_character_types::ALL_TYPES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to integers.
*
* The output row entry will be set to `true` only if the corresponding string
* element has all of its characters in [-+0-9]. An empty string produces `false`,
* as shown in the example below.
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7']
* b = s.is_integer(s)
* b is [true, true, false, false, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns `true` if all strings contain
* characters that are valid for conversion to integers.
*
* This function will return `true` only if every string element
* has all of its characters in [-+0-9].
*
* Any null entry or empty string will cause this function to return `false`.
*
* @param strings Strings instance for this operation.
* @return true if all strings are valid
*/
bool all_integer(strings_column_view const& strings);

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to floats.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9eE.].
*
* NOTE(review): the summary above says *all* characters must be valid, while the
* sentence above says *at least one*; confirm the exact rule against the
* implementation before relying on either wording.
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7', '8.9', '3.7e+5']
* b = s.is_float(s)
* b is [true, true, false, false, true, true, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_float(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns `true` if all strings contain
* characters that are valid for conversion to floats.
*
* This function will return `true` if every string element
* has at least one character in [-+0-9eE.].
*
* Any null entry or empty string will cause this function to return `false`.
*
* @param strings Strings instance for this operation.
* @return true if all strings are valid
*/
bool all_float(strings_column_view const& strings);

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
26 changes: 25 additions & 1 deletion cpp/include/cudf/strings/convert/convert_floats.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -68,6 +68,30 @@ std::unique_ptr<column> from_floats(
column_view const& floats,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to floats.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9eE.].
*
* NOTE(review): the summary above says *all* characters must be valid, while the
* sentence above says *at least one*; confirm the exact rule against the
* implementation before relying on either wording.
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7', '8.9', '3.7e+5']
* b = s.is_float(s)
* b is [true, true, false, false, true, true, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_float(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
26 changes: 25 additions & 1 deletion cpp/include/cudf/strings/convert/convert_integers.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -73,6 +73,30 @@ std::unique_ptr<column> from_integers(
column_view const& integers,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to integers.
*
* The output row entry will be set to `true` only if the corresponding string
* element has all of its characters in [-+0-9]. An empty string produces `false`,
* as shown in the example below.
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7']
* b = s.is_integer(s)
* b is [true, true, false, false, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a new integer numeric column parsing hexadecimal values from the
* provided strings column.
Expand Down
113 changes: 1 addition & 112 deletions cpp/src/strings/char_types/char_types.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -186,91 +186,6 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
mr);
}

/**
 * @brief Builds a BOOL8 column whose rows hold the result of `string::is_integer`
 * for the matching row of `strings`.
 *
 * The output's null mask is a copy of the input's null mask and its null count is
 * set to the input's null count. Null input rows have `false` written to their
 * data slot.
 *
 * @param strings Strings column to inspect.
 * @param stream CUDA stream used for the device view, the bitmask copy and the transform.
 * @param mr Device memory resource used to allocate the returned column.
 * @return New BOOL8 column with one entry per input row.
 */
std::unique_ptr<column> is_integer(
  strings_column_view const& strings,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto input_dv = column_device_view::create(strings.parent(), stream);
  auto d_input  = *input_dv;

  // Allocate the boolean output, inheriting the input's null mask.
  auto output = make_numeric_column(data_type{type_id::BOOL8},
                                    strings.size(),
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    strings.null_count(),
                                    stream,
                                    mr);
  auto d_flags = output->mutable_view().data<bool>();

  // Per-row predicate: null rows are never reported as integers.
  auto const row_is_integer = [d_input] __device__(size_type row) -> bool {
    return !d_input.is_null(row) && string::is_integer(d_input.element<string_view>(row));
  };

  auto const first_row = thrust::make_counting_iterator<size_type>(0);
  thrust::transform(
    rmm::exec_policy(stream), first_row, first_row + strings.size(), d_flags, row_is_integer);

  output->set_null_count(strings.null_count());
  return output;
}

/**
 * @brief Reports whether every row of `strings` passes `string::is_integer`.
 *
 * A null row counts as a failure. The reduction is performed with
 * `thrust::all_of` over a lazily evaluated per-row predicate.
 *
 * @param strings Strings column to inspect.
 * @param stream CUDA stream used for the device view and the reduction.
 * @return `true` when no row fails the check, otherwise `false`.
 */
bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream)
{
  auto input_dv = column_device_view::create(strings.parent(), stream);
  auto d_input  = *input_dv;

  // Per-row predicate: null rows are never reported as integers.
  auto const row_is_integer = [d_input] __device__(size_type row) -> bool {
    return !d_input.is_null(row) && string::is_integer(d_input.element<string_view>(row));
  };

  auto const first =
    thrust::make_transform_iterator(thrust::make_counting_iterator<size_type>(0), row_is_integer);
  auto const last = first + strings.size();

  return thrust::all_of(rmm::exec_policy(stream), first, last, thrust::identity<bool>());
}

/**
 * @brief Builds a BOOL8 column whose rows hold the result of `string::is_float`
 * for the matching row of `strings`.
 *
 * The output's null mask is a copy of the input's null mask and its null count is
 * set to the input's null count. Null input rows have `false` written to their
 * data slot.
 *
 * @param strings Strings column to inspect.
 * @param stream CUDA stream used for the device view, the bitmask copy and the transform.
 * @param mr Device memory resource used to allocate the returned column.
 * @return New BOOL8 column with one entry per input row.
 */
std::unique_ptr<column> is_float(
  strings_column_view const& strings,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto input_dv = column_device_view::create(strings.parent(), stream);
  auto d_input  = *input_dv;

  // Allocate the boolean output, inheriting the input's null mask.
  auto output = make_numeric_column(data_type{type_id::BOOL8},
                                    strings.size(),
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    strings.null_count(),
                                    stream,
                                    mr);
  auto d_flags = output->mutable_view().data<bool>();

  // Per-row predicate: null rows are never reported as floats.
  auto const row_is_float = [d_input] __device__(size_type row) -> bool {
    return !d_input.is_null(row) && string::is_float(d_input.element<string_view>(row));
  };

  auto const first_row = thrust::make_counting_iterator<size_type>(0);
  thrust::transform(
    rmm::exec_policy(stream), first_row, first_row + strings.size(), d_flags, row_is_float);

  output->set_null_count(strings.null_count());
  return output;
}

/**
 * @brief Reports whether every row of `strings` passes `string::is_float`.
 *
 * A null row counts as a failure. The reduction is performed with
 * `thrust::all_of` over a lazily evaluated per-row predicate.
 *
 * @param strings Strings column to inspect.
 * @param stream CUDA stream used for the device view and the reduction.
 * @return `true` when no row fails the check, otherwise `false`.
 */
bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream)
{
  auto input_dv = column_device_view::create(strings.parent(), stream);
  auto d_input  = *input_dv;

  // Per-row predicate: null rows are never reported as floats.
  auto const row_is_float = [d_input] __device__(size_type row) -> bool {
    return !d_input.is_null(row) && string::is_float(d_input.element<string_view>(row));
  };

  auto const first =
    thrust::make_transform_iterator(thrust::make_counting_iterator<size_type>(0), row_is_float);
  auto const last = first + strings.size();

  return thrust::all_of(rmm::exec_policy(stream), first, last, thrust::identity<bool>());
}

} // namespace detail

// external API
Expand All @@ -295,31 +210,5 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr);
}

// Public entry point for the per-row integer check: invokes the CUDF_FUNC_RANGE()
// macro and forwards to detail::is_integer on rmm::cuda_stream_default with the
// caller-supplied memory resource.
std::unique_ptr<column> is_integer(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::is_integer(strings, rmm::cuda_stream_default, mr);
}

// Public entry point for the per-row float check: invokes the CUDF_FUNC_RANGE()
// macro and forwards to detail::is_float on rmm::cuda_stream_default with the
// caller-supplied memory resource.
std::unique_ptr<column> is_float(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::is_float(strings, rmm::cuda_stream_default, mr);
}

// Public entry point for the whole-column integer check: invokes the
// CUDF_FUNC_RANGE() macro and forwards to detail::all_integer on
// rmm::cuda_stream_default.
bool all_integer(strings_column_view const& strings)
{
CUDF_FUNC_RANGE();
return detail::all_integer(strings, rmm::cuda_stream_default);
}

// Public entry point for the whole-column float check: invokes the
// CUDF_FUNC_RANGE() macro and forwards to detail::all_float on
// rmm::cuda_stream_default.
bool all_float(strings_column_view const& strings)
{
CUDF_FUNC_RANGE();
return detail::all_float(strings, rmm::cuda_stream_default);
}

} // namespace strings
} // namespace cudf
41 changes: 40 additions & 1 deletion cpp/src/strings/convert/convert_floats.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/strings/convert/convert_floats.hpp>
#include <cudf/strings/detail/converters.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/traits.hpp>
Expand Down Expand Up @@ -536,12 +537,50 @@ std::unique_ptr<column> from_floats(column_view const& floats,
} // namespace detail

// external API

// Public entry point: invokes the CUDF_FUNC_RANGE() macro and forwards to
// detail::from_floats on rmm::cuda_stream_default with the caller-supplied
// memory resource.
std::unique_ptr<column> from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::from_floats(floats, rmm::cuda_stream_default, mr);
}

namespace detail {
/**
 * @brief Builds a BOOL8 column whose rows hold the result of `string::is_float`
 * for the matching row of `strings`.
 *
 * The output's null mask is a copy of the input's null mask and its null count is
 * set to the input's null count. Null input rows have `false` written to their
 * data slot.
 *
 * @param strings Strings column to inspect.
 * @param stream CUDA stream used for the device view, the bitmask copy and the transform.
 * @param mr Device memory resource used to allocate the returned column.
 * @return New BOOL8 column with one entry per input row.
 */
std::unique_ptr<column> is_float(
  strings_column_view const& strings,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto input_dv = column_device_view::create(strings.parent(), stream);
  auto d_input  = *input_dv;

  // Allocate the boolean output, inheriting the input's null mask.
  auto output = make_numeric_column(data_type{type_id::BOOL8},
                                    strings.size(),
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    strings.null_count(),
                                    stream,
                                    mr);
  auto d_flags = output->mutable_view().data<bool>();

  // Per-row predicate: null rows are never reported as floats.
  auto const row_is_float = [d_input] __device__(size_type row) -> bool {
    return !d_input.is_null(row) && string::is_float(d_input.element<string_view>(row));
  };

  auto const first_row = thrust::make_counting_iterator<size_type>(0);
  thrust::transform(
    rmm::exec_policy(stream), first_row, first_row + strings.size(), d_flags, row_is_float);

  output->set_null_count(strings.null_count());
  return output;
}

}  // namespace detail

// external API
// Public entry point for the per-row float check: invokes the CUDF_FUNC_RANGE()
// macro and forwards to detail::is_float on rmm::cuda_stream_default with the
// caller-supplied memory resource.
std::unique_ptr<column> is_float(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::is_float(strings, rmm::cuda_stream_default, mr);
}

} // namespace strings
} // namespace cudf
Loading