Skip to content

Commit

Permalink
Push DeviceScalar to cython-only (#6800)
Browse files Browse the repository at this point in the history
* deal with strings

* handle timedelta

* handle text

* handle numerical issues

* cleanup

* changelog

* style

* Apply suggestions from code review

Co-authored-by: GALI PREM SAGAR <[email protected]>

* move runtime import

* updates

* cleanup docs

* fix string tests

* Update python/cudf/cudf/_lib/nvtext/tokenize.pyx

* factor out instance checks to python

Co-authored-by: GALI PREM SAGAR <[email protected]>
  • Loading branch information
brandon-b-miller and galipremsagar authored Dec 1, 2020
1 parent 1c81827 commit bd537b6
Show file tree
Hide file tree
Showing 19 changed files with 229 additions and 173 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
- PR #6780 Move `cudf::cast` tests to separate test file
- PR #6789 Rename `unary_op` to `unary_operator`
- PR #6770 Support building decimal columns with Table.TestBuilder
- PR #6800 Push DeviceScalar to cython-only
- PR #6822 Split out `cudf::distinct_count` from `drop_duplicates.cu`
- PR #6813 Enable `expand=False` in `.str.split` and `.str.rsplit`
- PR #6829 Enable workaround to write categorical columns in csv
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar


def generate_ngrams(Column strings, int ngrams, DeviceScalar separator):
def generate_ngrams(Column strings, int ngrams, object py_separator):

cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef const string_scalar* c_separator = <const string_scalar*>separator\
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ from cudf._lib.scalar cimport DeviceScalar
def ngrams_tokenize(
Column strings,
int ngrams,
DeviceScalar delimiter,
DeviceScalar separator
object py_delimiter,
object py_separator
):

cdef DeviceScalar delimiter = py_delimiter.device_value
cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef const string_scalar* c_separator = <const string_scalar*>separator\
Expand Down
15 changes: 10 additions & 5 deletions python/cudf/cudf/_lib/nvtext/replace.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@ from cudf._lib.scalar cimport DeviceScalar
def replace_tokens(Column strings,
Column targets,
Column replacements,
DeviceScalar delimiter):
object py_delimiter):
"""
The `targets` tokens are searched for within each `strings`
in the Column and replaced with the corresponding `replacements`
if found. Tokens are identified by the `delimiter` character
if found. Tokens are identified by the `py_delimiter` character
provided.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef column_view c_targets = targets.view()
cdef column_view c_replacements = replacements.view()
Expand All @@ -49,15 +51,18 @@ def replace_tokens(Column strings,

def filter_tokens(Column strings,
size_type min_token_length,
DeviceScalar replacement,
DeviceScalar delimiter):
object py_replacement,
object py_delimiter):
"""
Tokens smaller than `min_token_length` are removed from `strings`
in the Column and optionally replaced with the corresponding
`replacement` string. Tokens are identified by the `delimiter`
`py_replacement` string. Tokens are identified by the `py_delimiter`
character provided.
"""

cdef DeviceScalar replacement = py_replacement.device_value
cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef const string_scalar* c_repl = <const string_scalar*>replacement\
.get_raw_ptr()
Expand Down
36 changes: 8 additions & 28 deletions python/cudf/cudf/_lib/nvtext/tokenize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,9 @@ from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar


def tokenize(Column strings, object delimiter):
if isinstance(delimiter, DeviceScalar):
return _tokenize_scalar(strings, delimiter)
def _tokenize_scalar(Column strings, object py_delimiter):

if isinstance(delimiter, Column):
return _tokenize_column(strings, delimiter)

raise TypeError(
"Expected a DeviceScalar or Column for delimiters, but got {}".format(
type(delimiter)
)
)


def _tokenize_scalar(Column strings, DeviceScalar delimiter):
cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
Expand Down Expand Up @@ -65,21 +53,10 @@ def _tokenize_column(Column strings, Column delimiters):
return Column.from_unique_ptr(move(c_result))


def count_tokens(Column strings, object delimiter):
if isinstance(delimiter, DeviceScalar):
return _count_tokens_scalar(strings, delimiter)
def _count_tokens_scalar(Column strings, object py_delimiter):

if isinstance(delimiter, Column):
return _count_tokens_column(strings, delimiter)
cdef DeviceScalar delimiter = py_delimiter.device_value

raise TypeError(
"Expected a DeviceScalar or Column for delimiters, but got {}".format(
type(delimiter)
)
)


def _count_tokens_scalar(Column strings, DeviceScalar delimiter):
cdef column_view c_strings = strings.view()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
Expand Down Expand Up @@ -123,7 +100,10 @@ def character_tokenize(Column strings):
return Column.from_unique_ptr(move(c_result))


def detokenize(Column strings, Column indices, DeviceScalar separator):
def detokenize(Column strings, Column indices, object py_separator):

cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef column_view c_indices = indices.view()
cdef const string_scalar* c_separator = <const string_scalar*>separator\
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/strings/char_types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ from cudf._lib.cpp.strings.char_types cimport (
)


def filter_alphanum(Column source_strings, DeviceScalar repl, bool keep=True):
def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
"""
Returns a Column of strings keeping only alphanumeric character types.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_repl = <const string_scalar*>(
Expand Down
23 changes: 15 additions & 8 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@ from cudf._lib.cpp.strings.combine cimport (


def concatenate(Table source_strings,
DeviceScalar separator,
DeviceScalar narep):
object py_separator,
object py_narep):
"""
Returns a Column by concatenating strings column-wise in `source_strings`
with the specified `separator` between each column and
`na`/`None` values are replaced by `narep`
with the specified `py_separator` between each column and
`na`/`None` values are replaced by `py_narep`
"""
cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value

cdef unique_ptr[column] c_result
cdef table_view source_view = source_strings.data_view()

Expand All @@ -47,13 +50,17 @@ def concatenate(Table source_strings,


def join(Column source_strings,
DeviceScalar separator,
DeviceScalar narep):
object py_separator,
object py_narep):
"""
Returns a Column by concatenating strings row-wise in `source_strings`
with the specified `separator` between each column and
`na`/`None` values are replaced by `narep`
with the specified `py_separator` between each string and
`na`/`None` values are replaced by `py_narep`
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand Down
34 changes: 24 additions & 10 deletions python/cudf/cudf/_lib/strings/find.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ from cudf._lib.cpp.strings.find cimport (
)


def contains(Column source_strings, DeviceScalar target):
def contains(Column source_strings, object py_target):
"""
Returns a Column of boolean values with True for `source_strings`
that contain the pattern given in `target`.
that contain the pattern given in `py_target`.
"""
cdef DeviceScalar target = py_target.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand Down Expand Up @@ -57,11 +59,14 @@ def contains_multiple(Column source_strings, Column target_strings):
return Column.from_unique_ptr(move(c_result))


def endswith(Column source_strings, DeviceScalar target):
def endswith(Column source_strings, object py_target):
"""
Returns a Column of boolean values with True for `source_strings`
that contain strings that end with the pattern given in `target`.
that contain strings that end with the pattern given in `py_target`.
"""

cdef DeviceScalar target = py_target.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand Down Expand Up @@ -97,11 +102,14 @@ def endswith_multiple(Column source_strings, Column target_strings):
return Column.from_unique_ptr(move(c_result))


def startswith(Column source_strings, DeviceScalar target):
def startswith(Column source_strings, object py_target):
"""
Returns a Column of boolean values with True for `source_strings`
that contain strings that start with the pattern given in `target`.
that contain strings that start with the pattern given in `py_target`.
"""

cdef DeviceScalar target = py_target.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand Down Expand Up @@ -138,15 +146,18 @@ def startswith_multiple(Column source_strings, Column target_strings):


def find(Column source_strings,
DeviceScalar target,
object py_target,
size_type start,
size_type end):
"""
Returns a Column containing lowest indexes in each string of
`source_strings` that fully contain `target` string.
`source_strings` that fully contain `py_target` string.
Scan portion of strings in `source_strings` can be
controlled by setting `start` and `end` values.
"""

cdef DeviceScalar target = py_target.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand All @@ -166,15 +177,18 @@ def find(Column source_strings,


def rfind(Column source_strings,
DeviceScalar target,
object py_target,
size_type start,
size_type end):
"""
Returns a Column containing highest indexes in each string of
`source_strings` that fully contain `target` string.
`source_strings` that fully contain `py_target` string.
Scan portion of strings in `source_strings` can be
controlled by setting `start` and `end` values.
"""

cdef DeviceScalar target = py_target.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand Down
21 changes: 14 additions & 7 deletions python/cudf/cudf/_lib/strings/replace.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ from cudf._lib.cpp.strings.substring cimport (
def slice_replace(Column source_strings,
size_type start,
size_type stop,
DeviceScalar repl):
object py_repl):
"""
Returns a Column by replacing specified section
of each string with `repl`. Positions can be
of each string with `py_repl`. Positions can be
specified with `start` and `stop` params.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand All @@ -51,11 +53,14 @@ def slice_replace(Column source_strings,

def insert(Column source_strings,
size_type start,
DeviceScalar repl):
object py_repl):
"""
Returns a Column by inserting a specified
string `repl` at a specific position in all strings.
string `py_repl` at a specific position in all strings.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand All @@ -75,14 +80,16 @@ def insert(Column source_strings,


def replace(Column source_strings,
DeviceScalar target,
DeviceScalar repl,
object py_target,
object py_repl,
int32_t maxrepl):
"""
Returns a Column after replacing occurrences of
patterns `target` with `repl` in `source_strings`.
patterns `py_target` with `py_repl` in `source_strings`.
`maxrepl` indicates number of replacements to make from start.
"""
cdef DeviceScalar target = py_target.device_value
cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/strings/replace_re.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@ from libcpp.string cimport string

def replace_re(Column source_strings,
object pattern,
DeviceScalar repl,
object py_repl,
size_type n):
"""
Returns a Column after replacing occurrences regular
expressions `pattern` with `repl` in `source_strings`.
expressions `pattern` with `py_repl` in `source_strings`.
`n` indicates the number of replacements to be made from
start. (-1 indicates all)
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

Expand Down
14 changes: 10 additions & 4 deletions python/cudf/cudf/_lib/strings/split/partition.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ from cudf._lib.cpp.strings.split.partition cimport (


def partition(Column source_strings,
DeviceScalar delimiter):
object py_delimiter):
"""
Returns a Table by splitting the `source_strings`
column at the first occurrence of the specified `delimiter`.
column at the first occurrence of the specified `py_delimiter`.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
Expand All @@ -45,11 +48,14 @@ def partition(Column source_strings,


def rpartition(Column source_strings,
DeviceScalar delimiter):
object py_delimiter):
"""
Returns a Table by splitting the `source_strings`
column at the last occurrence of the specified `delimiter`.
column at the last occurrence of the specified `py_delimiter`.
"""

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef unique_ptr[table] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_str = <const string_scalar*>(
Expand Down
Loading

0 comments on commit bd537b6

Please sign in to comment.