Skip to content

Commit

Permalink
Add cudf strings is_title API (#9380)
Browse files Browse the repository at this point in the history
Closes #5265 

This PR adds the libcudf `cudf::strings::is_title()` function and the cuDF Python `istitle()` method for string columns/Series. It also includes the corresponding gtest and pytest coverage for this feature.

As mentioned in #5265, the behavior of this function is the same in Pandas and PySpark, both of which follow the logic referenced here: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.istitle.html

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)

URL: #9380
  • Loading branch information
davidwendt authored Oct 12, 2021
1 parent 56eb91a commit 5e46c7e
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 13 deletions.
27 changes: 27 additions & 0 deletions cpp/include/cudf/strings/capitalize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,33 @@ std::unique_ptr<column> title(
string_character_types sequence_type = string_character_types::ALPHA,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Checks if the strings in the input column are title formatted.
*
* The first character of each word should be upper-case while all other
* characters should be lower-case. A word is a sequence of upper-case
* and lower-case characters; any other character (digit, space,
* punctuation, ...) ends the current word.
*
* This function returns a column of booleans indicating true if the string in
* the input row is in title format and false if not. A string that contains
* no upper-case or lower-case characters at all (including an empty string)
* is not in title format.
*
* @code{.pseudo}
* Example:
* input = [" Test1", "A Test", " Another test ", "N2Vidia Corp", "!Abc"];
* output = is_title(input)
* output is [true, true, false, true, true]
* @endcode
*
* Any null string entries result in corresponding null output column entries.
*
* @param input String column.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Column of type BOOL8.
*/
std::unique_ptr<column> is_title(
strings_column_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
80 changes: 70 additions & 10 deletions cpp/src/strings/capitalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ namespace strings {
namespace detail {
namespace {

using char_info = thrust::pair<uint32_t, detail::character_flags_table_type>;

/**
 * @brief Looks up the code point and character-type flags of a UTF-8 character.
 *
 * @param d_flags Character flags table, indexed by code point
 * @param chr UTF-8 character to look up
 * @return Pair of the code point (first) and its flags (second); the flags are zero
 *         for code points above 0xFFFF, which are never read from the table
 */
__device__ char_info get_char_info(character_flags_table_type const* d_flags, char_utf8 chr)
{
  auto const cp = detail::utf8_to_codepoint(chr);
  // Code points above 0xFFFF are not looked up in the table and carry no flags.
  if (cp > 0x00FFFF) { return char_info{cp, character_flags_table_type{0}}; }
  return char_info{cp, d_flags[cp]};
}

/**
* @brief Base class for capitalize and title functors.
*
Expand All @@ -60,15 +72,6 @@ struct base_fn {
{
}

using char_info = thrust::pair<uint32_t, detail::character_flags_table_type>;

// Returns the code point of `chr` paired with its entry in `d_flags`
// (presumably a member of the enclosing struct -- its declaration is not visible here).
// Code points above 0xFFFF are not looked up and get a zero flag value.
__device__ char_info get_char_info(char_utf8 chr) const
{
auto const code_point = detail::utf8_to_codepoint(chr);
auto const flag = code_point <= 0x00FFFF ? d_flags[code_point] : character_flags_table_type{0};
return char_info{code_point, flag};
}

__device__ int32_t convert_char(char_info const& info, char* d_buffer) const
{
auto const code_point = info.first;
Expand Down Expand Up @@ -111,7 +114,7 @@ struct base_fn {
auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
bool capitalize = true;
for (auto const chr : d_str) {
auto const info = get_char_info(chr);
auto const info = get_char_info(d_flags, chr);
auto const flag = info.second;
auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag);

Expand Down Expand Up @@ -178,6 +181,36 @@ struct title_fn : base_fn<title_fn> {
};
};

/**
 * @brief Functor that reports whether each string in a column is title formatted.
 *
 * Cased characters (IS_UPPER_OR_LOWER) make up words; every other character
 * delimits words. The first cased character of a word must be upper-case
 * (IS_UPPER) and every following cased character of that word must not be.
 * A string without any cased character is not title formatted.
 */
struct is_title_fn {
  character_flags_table_type const* d_flags;
  column_device_view const d_column;

  /**
   * @brief Evaluates the string at row `idx`.
   *
   * @param idx Row index into `d_column`
   * @return true if the row is title formatted; false otherwise, including for null rows
   */
  __device__ bool operator()(size_type idx)
  {
    if (d_column.is_null(idx)) { return false; }

    auto const d_str = d_column.element<string_view>(idx);
    bool found_cased = false;  // set once any upper- or lower-case character is seen
    bool word_start  = true;   // the next cased character must be upper-case
    for (auto const chr : d_str) {
      auto const flag = get_char_info(d_flags, chr).second;
      if (!IS_UPPER_OR_LOWER(flag)) {
        // Uncased characters delimit words, so the next cased character starts a new word.
        word_start = true;
        continue;
      }
      bool const is_upper = static_cast<bool>(IS_UPPER(flag));
      // Upper-case is required at the start of a word and forbidden inside one.
      if (is_upper != word_start) { return false; }
      found_cased = true;
      word_start  = false;
    }
    return found_cased;
  }
};

/**
* @brief Common utility function for title() and capitalize().
*
Expand Down Expand Up @@ -226,6 +259,26 @@ std::unique_ptr<column> title(strings_column_view const& input,
return capitalizer(title_fn{*d_column, sequence_type}, input, stream, mr);
}

/**
 * @brief Builds a BOOL8 column indicating which strings in `input` are title formatted.
 *
 * The output has one row per input row and a copy of the input's null mask, so
 * null input rows are null in the output.
 *
 * @param input Strings column to check
 * @param stream CUDA stream used for the mask copy, the device view and the transform
 * @param mr Device memory resource used to allocate the returned column
 * @return BOOL8 column; empty if `input` is empty
 */
std::unique_ptr<column> is_title(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  if (input.is_empty()) return make_empty_column(data_type{type_id::BOOL8});
  auto results  = make_numeric_column(data_type{type_id::BOOL8},
                                     input.size(),
                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                     input.null_count(),
                                     stream,
                                     mr);
  auto d_column = column_device_view::create(input.parent(), stream);
  thrust::transform(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<size_type>(0),
                    thrust::make_counting_iterator<size_type>(input.size()),
                    results->mutable_view().data<bool>(),
                    is_title_fn{get_character_flags_table(), *d_column});
  // Requesting a mutable view invalidates the column's cached null count (see the
  // libcudf column::mutable_view() documentation). The null mask is an unmodified copy
  // of the input's, so restore the known count instead of leaving it to be recomputed.
  results->set_null_count(input.null_count());
  return results;
}

} // namespace detail

std::unique_ptr<column> capitalize(strings_column_view const& input,
Expand All @@ -244,5 +297,12 @@ std::unique_ptr<column> title(strings_column_view const& input,
return detail::title(input, sequence_type, rmm::cuda_stream_default, mr);
}

// Public entry point: forwards to detail::is_title on the default stream.
std::unique_ptr<column> is_title(strings_column_view const& input,
                                 rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::is_title(input, stream, mr);
}

} // namespace strings
} // namespace cudf
26 changes: 26 additions & 0 deletions cpp/tests/strings/case_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,32 @@ TEST_F(StringsCaseTest, Title)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

// Verifies is_title() row by row, including a null row, an empty string and
// strings that contain characters which are neither upper- nor lower-case.
TEST_F(StringsCaseTest, IsTitle)
{
  cudf::test::strings_column_wrapper input(
    {
      "Sⱥⱥnich",                       // expect true
      "Examples Abc",                  // expect true
      "Thesé Strings",                 // expect true
      "",                              // null row (validity 0)
      "Are The",                       // expect true
      "Tést strings",                  // expect false
      "",                              // valid empty string: expect false
      "N2Vidia Corp",                  // expect true
      "SNAKE",                         // expect false
      "!Abc",                          // expect true
      " Eagle",                        // expect true
      "A Test",                        // expect true
      "12345",                         // expect false
      "Alpha Not Upper Or Lower: ƻC",  // expect true
      "one More"                       // expect false
    },
    {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});

  auto const input_view = cudf::strings_column_view(input);
  auto const results    = cudf::strings::is_title(input_view);

  // Only row 3 is null in both the input and the expected output.
  cudf::test::fixed_width_column_wrapper<bool> expected(
    {1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0},
    {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringsCaseTest, MultiCharUpper)
{
cudf::test::strings_column_wrapper strings{"\u1f52 \u1f83", "\u1e98 \ufb05", "\u0149"};
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/capitalize.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.column.column cimport column
Expand All @@ -11,3 +11,6 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] title(
const column_view & strings) except +

# Binding for the C++ is_title() function in this extern block. Only the strings
# column argument is declared here; `except +` makes Cython translate C++
# exceptions into Python exceptions.
cdef unique_ptr[column] is_title(
const column_view & strings) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
count_bytes,
count_characters,
)
from cudf._lib.strings.capitalize import capitalize, title
from cudf._lib.strings.capitalize import capitalize, title, is_title
from cudf._lib.strings.case import swapcase, to_lower, to_upper
from cudf._lib.strings.char_types import (
filter_alphanum,
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/_lib/strings/capitalize.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
Expand All @@ -8,6 +8,7 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.strings.capitalize cimport (
capitalize as cpp_capitalize,
is_title as cpp_is_title,
title as cpp_title,
)

Expand All @@ -30,3 +31,13 @@ def title(Column source_strings):
c_result = move(cpp_title(source_view))

return Column.from_unique_ptr(move(c_result))


def is_title(Column source_strings):
    # Calls the C++ is_title() (imported as cpp_is_title) on `source_strings`
    # and returns the result wrapped in a new Column.
    cdef unique_ptr[column] c_result
    cdef column_view source_view = source_strings.view()

    # Release the GIL while the C++ call runs.
    with nogil:
        c_result = move(cpp_is_title(source_view))

    # Hand ownership of the C++ result over to a Python-level Column.
    return Column.from_unique_ptr(move(c_result))
24 changes: 24 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1865,6 +1865,30 @@ def title(self) -> SeriesOrIndex:
"""
return self._return_or_inplace(libstrings.title(self._column))

def istitle(self) -> SeriesOrIndex:
    """
    Check whether each string is title formatted.

    The first letter of each word should be uppercase and the rest
    should be lowercase.

    Equivalent to :meth:`str.istitle`.

    Returns
    -------
    Series or Index of bool

    Examples
    --------
    >>> import cudf
    >>> data = ['leopard', 'Golden Eagle', 'SNAKE', '']
    >>> s = cudf.Series(data)
    >>> s.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    return self._return_or_inplace(libstrings.is_title(self._column))

def filter_alphanum(
self, repl: str = None, keep: bool = True
) -> SeriesOrIndex:
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,26 @@ def test_string_char_case(case_op, data):
assert_eq(gs.str.isempty(), ps == "")


def test_string_is_title():
    """Check that cudf ``str.istitle`` gives the same result as pandas."""
    samples = [
        "leopard",
        "Golden Eagle",
        "SNAKE",
        "",
        "!A",
        "hello World",
        "A B C",
        "#",
        "AƻB",
        "Ⓑⓖ",
        "Art of War",
    ]
    actual = cudf.Series(samples).str.istitle()
    expected = pd.Series(samples).str.istitle()

    assert_eq(actual, expected)


@pytest.mark.parametrize(
"data",
[
Expand Down

0 comments on commit 5e46c7e

Please sign in to comment.