Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cudf strings is_title API #9380

Merged
merged 14 commits into from
Oct 12, 2021
27 changes: 27 additions & 0 deletions cpp/include/cudf/strings/capitalize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,33 @@ std::unique_ptr<column> title(
string_character_types sequence_type = string_character_types::ALPHA,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Checks if the strings in the input column are title formatted.
 *
 * The first character of each word should be upper-case while all other
 * characters should be lower-case. A word is a sequence of alphabetic characters.
 * Numerics, punctuation, and spaces delimit words.
 *
 * This function returns a column of booleans indicating true if the string in
 * the input row is in title format and false if not.
 * An empty string is never considered title formatted and yields false.
 *
 * @code{.pseudo}
 * Example:
 * input = [" Test1", "A Test", " Another test ", "N2Vidia Corp", "!Abc"];
 * output = is_title(input)
 * output is [true, true, false, true, true]
 * @endcode
 *
 * In the example, " Another test " is false because the word "test" starts
 * with a lower-case letter. "N2Vidia Corp" is true because the digit ends the
 * word "N", so "Vidia" is a new word that correctly starts upper-case.
 *
 * Any null string entries result in corresponding null output column entries.
 *
 * @param input String column.
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Column of type BOOL8.
 */
std::unique_ptr<column> is_title(
strings_column_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
79 changes: 69 additions & 10 deletions cpp/src/strings/capitalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ namespace strings {
namespace detail {
namespace {

using char_info = thrust::pair<uint32_t, detail::character_flags_table_type>;

/**
 * @brief Looks up the code point and character-flags entry for a UTF-8 character.
 *
 * The flags table is only consulted for code points up to 0xFFFF; any larger
 * code point is reported with a flags value of 0.
 *
 * @param d_flags Character flags table, indexed by code point
 * @param chr UTF-8 encoded character to look up
 * @return Pair holding the code point (first) and its flags (second)
 */
__device__ char_info get_char_info(character_flags_table_type const* d_flags, char_utf8 chr)
{
  auto const code_point = detail::utf8_to_codepoint(chr);
  // Code points beyond 0xFFFF are never read from the table.
  if (code_point > 0x00FFFF) { return char_info{code_point, character_flags_table_type{0}}; }
  return char_info{code_point, d_flags[code_point]};
}

/**
* @brief Base class for capitalize and title functors.
*
Expand All @@ -60,15 +72,6 @@ struct base_fn {
{
}

using char_info = thrust::pair<uint32_t, detail::character_flags_table_type>;

__device__ char_info get_char_info(char_utf8 chr) const
{
auto const code_point = detail::utf8_to_codepoint(chr);
auto const flag = code_point <= 0x00FFFF ? d_flags[code_point] : character_flags_table_type{0};
return char_info{code_point, flag};
}

__device__ int32_t convert_char(char_info const& info, char* d_buffer) const
{
auto const code_point = info.first;
Expand Down Expand Up @@ -111,7 +114,7 @@ struct base_fn {
auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
bool capitalize = true;
for (auto const chr : d_str) {
auto const info = get_char_info(chr);
auto const info = get_char_info(d_flags, chr);
auto const flag = info.second;
auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag);

Expand Down Expand Up @@ -178,6 +181,35 @@ struct title_fn : base_fn<title_fn> {
};
};

/**
 * @brief Functor for determining title format for each string in a column.
 *
 * Non-alphabetic (IS_ALPHA) characters delimit words. The first letter of
 * each word should be upper-case (IS_UPPER). All other letters should be
 * lower-case (IS_LOWER).
 *
 * A string must contain at least one cased (upper- or lower-case) character
 * to be title formatted. Empty strings and strings made up only of digits,
 * punctuation or spaces therefore evaluate to false, which matches the
 * behavior of Python's `str.istitle()`.
 */
struct is_title_fn {
  character_flags_table_type const* d_flags;  ///< character flags table indexed by code point
  column_device_view const d_column;          ///< strings column being inspected

  /**
   * @brief Evaluates the title format of a single row.
   *
   * @param idx Row index within `d_column`
   * @return true if the string at `idx` is title formatted; false otherwise,
   *         including for null rows
   */
  __device__ bool operator()(size_type idx)
  {
    // Null rows yield false here; the caller is expected to carry the
    // input's null mask over to the output column.
    if (d_column.is_null(idx)) { return false; }
    auto const d_str = d_column.element<string_view>(idx);

    bool has_cased_char = false;  // at least one upper/lower-case character was seen
    bool at_word_start  = true;   // the next letter begins a new word
    for (auto const chr : d_str) {
      auto const flag     = get_char_info(d_flags, chr).second;
      auto const is_alpha = IS_ALPHA(flag) != 0;
      if (is_alpha) {
        if (at_word_start && IS_LOWER(flag)) { return false; }
        if (!at_word_start && IS_UPPER(flag)) { return false; }
        if (IS_UPPER(flag) || IS_LOWER(flag)) { has_cased_char = true; }
      }
      // Any non-alphabetic character ends the current word.
      at_word_start = !is_alpha;
    }
    // An empty string never enters the loop, so it also returns false.
    return has_cased_char;
  }
};

/**
* @brief Common utility function for title() and capitalize().
*
Expand Down Expand Up @@ -226,6 +258,26 @@ std::unique_ptr<column> title(strings_column_view const& input,
return capitalizer(title_fn{*d_column, sequence_type}, input, stream, mr);
}

/**
 * @brief Builds a BOOL8 column flagging which input strings are title formatted.
 *
 * The output shares the input's row count, null mask and null count. Each
 * row's value is computed by `is_title_fn`.
 *
 * @param input Strings column to check
 * @param stream CUDA stream used for allocations and the transform
 * @param mr Device memory resource used to allocate the returned column
 * @return BOOL8 column; an empty column when `input` has no rows
 */
std::unique_ptr<column> is_title(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  if (input.is_empty()) { return make_empty_column(data_type{type_id::BOOL8}); }

  auto const row_count = input.size();
  // Output nulls mirror the input nulls exactly.
  auto output = make_numeric_column(data_type{type_id::BOOL8},
                                    row_count,
                                    cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                    input.null_count(),
                                    stream,
                                    mr);

  auto d_strings   = column_device_view::create(input.parent(), stream);
  auto const first = thrust::make_counting_iterator<size_type>(0);
  auto const last  = thrust::make_counting_iterator<size_type>(row_count);
  auto d_output    = output->mutable_view().data<bool>();

  // One functor invocation per row index.
  thrust::transform(rmm::exec_policy(stream),
                    first,
                    last,
                    d_output,
                    is_title_fn{get_character_flags_table(), *d_strings});
  return output;
}

} // namespace detail

std::unique_ptr<column> capitalize(strings_column_view const& input,
Expand All @@ -244,5 +296,12 @@ std::unique_ptr<column> title(strings_column_view const& input,
return detail::title(input, sequence_type, rmm::cuda_stream_default, mr);
}

// Public entry point for is_title().
// Forwards to detail::is_title() using rmm::cuda_stream_default as the stream.
std::unique_ptr<column> is_title(strings_column_view const& input,
rmm::mr::device_memory_resource* mr)
{
// NOTE(review): presumably opens a profiling range for this function -- confirm against the macro.
CUDF_FUNC_RANGE();
return detail::is_title(input, rmm::cuda_stream_default, mr);
}

} // namespace strings
} // namespace cudf
24 changes: 24 additions & 0 deletions cpp/tests/strings/case_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,30 @@ TEST_F(StringsCaseTest, Title)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

// Verifies is_title() over ASCII and non-ASCII strings, a null row and an empty string.
TEST_F(StringsCaseTest, IsTitle)
{
// Row 3 is null (validity 0) and row 6 is a valid empty string.
cudf::test::strings_column_wrapper input({"Sⱥⱥnich",
"Examples Abc",
"Thesé Strings",
"",
"Are The",
"Tést strings",
"",
"N2Vidia Corp",
"SNAKE",
"!Abc",
" Eagle",
"A Test",
"one More"},
{1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});

auto results = cudf::strings::is_title(cudf::strings_column_view(input));

// Expected false rows: 5 (second word starts lower-case), 6 (empty string),
// 8 (upper-case letters after the first), 12 (first word starts lower-case).
// Row 3 stays null in the output.
cudf::test::fixed_width_column_wrapper<bool> expected({1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0},
{1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringsCaseTest, MultiCharUpper)
{
cudf::test::strings_column_wrapper strings{"\u1f52 \u1f83", "\u1e98 \ufb05", "\u0149"};
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/capitalize.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.column.column cimport column
Expand All @@ -11,3 +11,6 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] title(
const column_view & strings) except +

# Declares cudf::strings::is_title for use from Cython (single-argument form only).
# `except +` makes Cython translate C++ exceptions into Python exceptions.
cdef unique_ptr[column] is_title(
const column_view & strings) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
count_bytes,
count_characters,
)
from cudf._lib.strings.capitalize import capitalize, title
from cudf._lib.strings.capitalize import capitalize, title, is_title
from cudf._lib.strings.case import swapcase, to_lower, to_upper
from cudf._lib.strings.char_types import (
filter_alphanum,
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/strings/capitalize.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
Expand All @@ -8,6 +8,7 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.strings.capitalize cimport (
capitalize as cpp_capitalize,
is_title as cpp_is_title,
title as cpp_title,
)

Expand All @@ -30,3 +31,13 @@ def title(Column source_strings):
c_result = move(cpp_title(source_view))

return Column.from_unique_ptr(move(c_result))


def is_title(Column source_strings):
    # Calls the libcudf is_title function on the column's view and wraps the
    # resulting C++ column in a Python Column.
    cdef column_view strings_view = source_strings.view()
    cdef unique_ptr[column] c_output

    # The GIL is released for the duration of the libcudf call.
    with nogil:
        c_output = move(cpp_is_title(strings_view))

    return Column.from_unique_ptr(move(c_output))
return Column.from_unique_ptr(move(c_result))
25 changes: 25 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1865,6 +1865,31 @@ def title(self) -> SeriesOrIndex:
"""
return self._return_or_inplace(libstrings.title(self._column))

def istitle(self) -> SeriesOrIndex:
    """
    Check whether each string is title formatted.

    The first letter of each word must be uppercase and the remaining
    letters of the word lowercase. Any non-alphabetic character
    (digit, punctuation, space) separates words. An empty string is
    not title formatted.

    Equivalent to `str.istitle()
    <https://docs.python.org/3/library/stdtypes.html#str.istitle>`_.

    Returns
    -------
    Series or Index of bool

    Examples
    --------
    >>> import cudf
    >>> data = ['leopard', 'Golden Eagle', 'SNAKE', '']
    >>> s = cudf.Series(data)
    >>> s.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    return self._return_or_inplace(libstrings.is_title(self._column))

def filter_alphanum(
self, repl: str = None, keep: bool = True
) -> SeriesOrIndex:
Expand Down
17 changes: 17 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,23 @@ def test_string_char_case(case_op, data):
assert_eq(gs.str.isempty(), ps == "")


def test_string_is_title():
    """cudf's ``str.istitle`` must agree with pandas on every sample string."""
    samples = [
        "leopard",
        "Golden Eagle",
        "SNAKE",
        "",
        "!A",
        "hello World",
        "A B C",
        "Art of War",
    ]
    actual = cudf.Series(samples).str.istitle()
    expected = pd.Series(samples).str.istitle()

    assert_eq(actual, expected)


@pytest.mark.parametrize(
"data",
[
Expand Down