diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp index 604756b5d09..dbf8ef54e3e 100644 --- a/cpp/include/cudf/strings/capitalize.hpp +++ b/cpp/include/cudf/strings/capitalize.hpp @@ -91,6 +91,33 @@ std::unique_ptr title( string_character_types sequence_type = string_character_types::ALPHA, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Checks if the strings in the input column are title formatted. + * + * The first character of each word should be upper-case while all other + * characters should be lower-case. A word is a sequence of upper-case + * and lower-case characters; any other character delimits words. + * + * This function returns a column of booleans indicating true if the string in + * the input row is in title format and false if not (or if it has no cased characters). + * + * @code{.pseudo} + * Example: + * input = [" Test1", "A Test", " Another test ", "N2Vidia Corp", "!Abc"]; + * output = is_title(input) + * output is [true, true, false, true, true] + * @endcode + * + * Any null string entries result in corresponding null output column entries. + * + * @param input String column. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of type BOOL8. + */ +std::unique_ptr is_title( + strings_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 023e82dfe24..d5d0eb3a955 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -35,6 +35,18 @@ namespace strings { namespace detail { namespace { +using char_info = thrust::pair; + +/** + * @brief Returns the code-point and flags pair for the given UTF-8 character. 
+ */ +__device__ char_info get_char_info(character_flags_table_type const* d_flags, char_utf8 chr) +{ + auto const code_point = detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? d_flags[code_point] : character_flags_table_type{0}; + return char_info{code_point, flag}; +} + /** * @brief Base class for capitalize and title functors. * @@ -60,15 +72,6 @@ struct base_fn { { } - using char_info = thrust::pair; - - __device__ char_info get_char_info(char_utf8 chr) const - { - auto const code_point = detail::utf8_to_codepoint(chr); - auto const flag = code_point <= 0x00FFFF ? d_flags[code_point] : character_flags_table_type{0}; - return char_info{code_point, flag}; - } - __device__ int32_t convert_char(char_info const& info, char* d_buffer) const { auto const code_point = info.first; @@ -111,7 +114,7 @@ struct base_fn { auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; bool capitalize = true; for (auto const chr : d_str) { - auto const info = get_char_info(chr); + auto const info = get_char_info(d_flags, chr); auto const flag = info.second; auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag); @@ -178,6 +181,36 @@ struct title_fn : base_fn { }; }; +/** + * @brief Functor for determining title format for each string in a column. + * + * The first letter of each word should be upper-case (IS_UPPER). + * All other characters of the word should be lower-case (IS_LOWER). + * Characters that are neither upper- nor lower-case (!IS_UPPER_OR_LOWER) delimit words. 
+ */ +struct is_title_fn { + character_flags_table_type const* d_flags; + column_device_view const d_column; + + __device__ bool operator()(size_type idx) + { + if (d_column.is_null(idx)) { return false; } + auto const d_str = d_column.element(idx); + + bool at_least_one_valid = false; // requires one or more cased characters + bool should_be_capitalized = true; // next cased character must be upper-case + for (auto const chr : d_str) { + auto const flag = get_char_info(d_flags, chr).second; + if (IS_UPPER_OR_LOWER(flag)) { + if (should_be_capitalized == !IS_UPPER(flag)) return false; + at_least_one_valid = true; + } + should_be_capitalized = !IS_UPPER_OR_LOWER(flag); + } + return at_least_one_valid; + } +}; + /** * @brief Common utility function for title() and capitalize(). * @@ -226,6 +259,26 @@ std::unique_ptr title(strings_column_view const& input, return capitalizer(title_fn{*d_column, sequence_type}, input, stream, mr); } +std::unique_ptr is_title(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return make_empty_column(data_type{type_id::BOOL8}); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + auto d_column = column_device_view::create(input.parent(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + is_title_fn{get_character_flags_table(), *d_column}); + return results; +} + } // namespace detail std::unique_ptr capitalize(strings_column_view const& input, @@ -244,5 +297,12 @@ std::unique_ptr title(strings_column_view const& input, return detail::title(input, sequence_type, rmm::cuda_stream_default, mr); } +std::unique_ptr is_title(strings_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + 
+ CUDF_FUNC_RANGE(); + return detail::is_title(input, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index da55e967266..a9d4c9c76b5 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -149,6 +149,32 @@ TEST_F(StringsCaseTest, Title) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } +TEST_F(StringsCaseTest, IsTitle) +{ + cudf::test::strings_column_wrapper input({"Sⱥⱥnich", + "Examples Abc", + "Thesé Strings", + "", + "Are The", + "Tést strings", + "", + "N2Vidia Corp", + "SNAKE", + "!Abc", + " Eagle", + "A Test", + "12345", + "Alpha Not Upper Or Lower: ƻC", + "one More"}, + {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto results = cudf::strings::is_title(cudf::strings_column_view(input)); + + cudf::test::fixed_width_column_wrapper expected( + {1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsCaseTest, MultiCharUpper) { cudf::test::strings_column_wrapper strings{"\u1f52 \u1f83", "\u1e98 \ufb05", "\u0149"}; diff --git a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd index 02a4469f495..d193a8265b1 100644 --- a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column @@ -11,3 +11,6 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] title( const column_view & strings) except + + + cdef unique_ptr[column] is_title( + const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 598ac804dd6..fbc1538cc74 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -26,7 +26,7 @@ count_bytes, count_characters, ) -from cudf._lib.strings.capitalize import capitalize, title +from cudf._lib.strings.capitalize import capitalize, title, is_title from cudf._lib.strings.case import swapcase, to_lower, to_upper from cudf._lib.strings.char_types import ( filter_alphanum, diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index bb1bf25ef7b..0bbdfa462e2 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -8,6 +8,7 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.strings.capitalize cimport ( capitalize as cpp_capitalize, + is_title as cpp_is_title, title as cpp_title, ) @@ -30,3 +31,13 @@ def title(Column source_strings): c_result = move(cpp_title(source_view)) return Column.from_unique_ptr(move(c_result)) + + +def is_title(Column source_strings): + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_title(source_view)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 476709a76f8..3f6a18c2ea0 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1865,6 +1865,30 @@ def title(self) -> SeriesOrIndex: """ return self._return_or_inplace(libstrings.title(self._column)) + def istitle(self) -> SeriesOrIndex: + """ + Check whether each string is title formatted. + The first letter of each word should be uppercase and the rest + should be lowercase. + + Equivalent to :meth:`str.istitle`. 
+ + Returns : Series or Index of bool + + Examples + -------- + >>> import cudf + >>> data = ['leopard', 'Golden Eagle', 'SNAKE', ''] + >>> s = cudf.Series(data) + >>> s.str.istitle() + 0 False + 1 True + 2 False + 3 False + dtype: bool + """ + return self._return_or_inplace(libstrings.is_title(self._column)) + def filter_alphanum( self, repl: str = None, keep: bool = True ) -> SeriesOrIndex: diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index b254a6ba02c..dad0e7581d7 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1386,6 +1386,26 @@ def test_string_char_case(case_op, data): assert_eq(gs.str.isempty(), ps == "") +def test_string_is_title(): + data = [ + "leopard", + "Golden Eagle", + "SNAKE", + "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + ] + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(gs.str.istitle(), ps.str.istitle()) + + @pytest.mark.parametrize( "data", [