Push DeviceScalar to cython-only (#6800)

* deal with strings
* handle timedelta
* handle text
* handle numerical issues
* cleanup
* changelog
* style
* Apply suggestions from code review

Co-authored-by: GALI PREM SAGAR <[email protected]>

* move runtime import
* updates
* cleanup docs
* fix string tests
* Update python/cudf/cudf/_lib/nvtext/tokenize.pyx
* factor out instance checks to python

Co-authored-by: GALI PREM SAGAR <[email protected]>
rapidsai · Dec 1, 2020 · bd537b6 · bd537b6
1 parent 1c81827
commit bd537b6
Show file tree

Hide file tree

Showing 19 changed files with 229 additions and 173 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -108,6 +108,7 @@
 - PR #6780 Move `cudf::cast` tests to separate test file
 - PR #6789 Rename `unary_op` to `unary_operator`
 - PR #6770 Support building decimal columns with Table.TestBuilder
+- PR #6800 Push DeviceScalar to cython-only
 - PR #6822 Split out `cudf::distinct_count` from `drop_duplicates.cu`
 - PR #6813 Enable `expand=False` in `.str.split` and `.str.rsplit`
 - PR #6829 Enable workaround to write categorical columns in csv

diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
@@ -15,7 +15,10 @@ from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
-def generate_ngrams(Column strings, int ngrams, DeviceScalar separator):
+def generate_ngrams(Column strings, int ngrams, object py_separator):
+
+    cdef DeviceScalar separator = py_separator.device_value
+
     cdef column_view c_strings = strings.view()
     cdef size_type c_ngrams = ngrams
     cdef const string_scalar* c_separator = <const string_scalar*>separator\

diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
@@ -17,9 +17,13 @@ from cudf._lib.scalar cimport DeviceScalar
 def ngrams_tokenize(
     Column strings,
     int ngrams,
-    DeviceScalar delimiter,
-    DeviceScalar separator
+    object py_delimiter,
+    object py_separator
 ):
+
+    cdef DeviceScalar delimiter = py_delimiter.device_value
+    cdef DeviceScalar separator = py_separator.device_value
+
     cdef column_view c_strings = strings.view()
     cdef size_type c_ngrams = ngrams
     cdef const string_scalar* c_separator = <const string_scalar*>separator\

diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx
@@ -18,14 +18,16 @@ from cudf._lib.scalar cimport DeviceScalar
 def replace_tokens(Column strings,
                    Column targets,
                    Column replacements,
-                   DeviceScalar delimiter):
+                   object py_delimiter):
     """
     The `targets` tokens are searched for within each `strings`
     in the Column and replaced with the corresponding `replacements`
-    if found. Tokens are identified by the `delimiter` character
+    if found. Tokens are identified by the `py_delimiter` character
     provided.
     """
 
+    cdef DeviceScalar delimiter = py_delimiter.device_value
+
     cdef column_view c_strings = strings.view()
     cdef column_view c_targets = targets.view()
     cdef column_view c_replacements = replacements.view()
@@ -49,15 +51,18 @@ def replace_tokens(Column strings,
 
 def filter_tokens(Column strings,
                   size_type min_token_length,
-                  DeviceScalar replacement,
-                  DeviceScalar delimiter):
+                  object py_replacement,
+                  object py_delimiter):
     """
     Tokens smaller than `min_token_length` are removed from `strings`
     in the Column and optionally replaced with the corresponding
-    `replacement` string. Tokens are identified by the `delimiter`
+    `py_replacement` string. Tokens are identified by the `py_delimiter`
     character provided.
     """
 
+    cdef DeviceScalar replacement = py_replacement.device_value
+    cdef DeviceScalar delimiter = py_delimiter.device_value
+
     cdef column_view c_strings = strings.view()
     cdef const string_scalar* c_repl = <const string_scalar*>replacement\
         .get_raw_ptr()

diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -17,21 +17,9 @@ from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
-def tokenize(Column strings, object delimiter):
-    if isinstance(delimiter, DeviceScalar):
-        return _tokenize_scalar(strings, delimiter)
+def _tokenize_scalar(Column strings, object py_delimiter):
 
-    if isinstance(delimiter, Column):
-        return _tokenize_column(strings, delimiter)
-
-    raise TypeError(
-        "Expected a DeviceScalar or Column for delimiters, but got {}".format(
-            type(delimiter)
-        )
-    )
-
-
-def _tokenize_scalar(Column strings, DeviceScalar delimiter):
+    cdef DeviceScalar delimiter = py_delimiter.device_value
 
     cdef column_view c_strings = strings.view()
     cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
@@ -65,21 +53,10 @@ def _tokenize_column(Column strings, Column delimiters):
     return Column.from_unique_ptr(move(c_result))
 
 
-def count_tokens(Column strings, object delimiter):
-    if isinstance(delimiter, DeviceScalar):
-        return _count_tokens_scalar(strings, delimiter)
+def _count_tokens_scalar(Column strings, object py_delimiter):
 
-    if isinstance(delimiter, Column):
-        return _count_tokens_column(strings, delimiter)
+    cdef DeviceScalar delimiter = py_delimiter.device_value
 
-    raise TypeError(
-        "Expected a DeviceScalar or Column for delimiters, but got {}".format(
-            type(delimiter)
-        )
-    )
-
-
-def _count_tokens_scalar(Column strings, DeviceScalar delimiter):
     cdef column_view c_strings = strings.view()
     cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
         .get_raw_ptr()
@@ -123,7 +100,10 @@ def character_tokenize(Column strings):
     return Column.from_unique_ptr(move(c_result))
 
 
-def detokenize(Column strings, Column indices, DeviceScalar separator):
+def detokenize(Column strings, Column indices, object py_separator):
+
+    cdef DeviceScalar separator = py_separator.device_value
+
     cdef column_view c_strings = strings.view()
     cdef column_view c_indices = indices.view()
     cdef const string_scalar* c_separator = <const string_scalar*>separator\

diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -19,10 +19,13 @@ from cudf._lib.cpp.strings.char_types cimport (
 )
 
 
-def filter_alphanum(Column source_strings, DeviceScalar repl, bool keep=True):
+def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
     """
     Returns a Column of strings keeping only alphanumeric character types.
     """
+
+    cdef DeviceScalar repl = py_repl.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
     cdef const string_scalar* scalar_repl = <const string_scalar*>(

diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx
@@ -20,13 +20,16 @@ from cudf._lib.cpp.strings.combine cimport (
 
 
 def concatenate(Table source_strings,
-                DeviceScalar separator,
-                DeviceScalar narep):
+                object py_separator,
+                object py_narep):
     """
     Returns a Column by concatenating strings column-wise in `source_strings`
-    with the specified `separator` between each column and
-    `na`/`None` values are replaced by `narep`
+    with the specified `py_separator` between each column and
+    `na`/`None` values are replaced by `py_narep`
     """
+    cdef DeviceScalar separator = py_separator.device_value
+    cdef DeviceScalar narep = py_narep.device_value
+
     cdef unique_ptr[column] c_result
     cdef table_view source_view = source_strings.data_view()
 
@@ -47,13 +50,17 @@ def concatenate(Table source_strings,
 
 
 def join(Column source_strings,
-         DeviceScalar separator,
-         DeviceScalar narep):
+         object py_separator,
+         object py_narep):
     """
     Returns a Column by concatenating strings row-wise in `source_strings`
-    with the specified `separator` between each column and
-    `na`/`None` values are replaced by `narep`
+    with the specified `py_separator` between each column and
+    `na`/`None` values are replaced by `py_narep`
     """
+
+    cdef DeviceScalar separator = py_separator.device_value
+    cdef DeviceScalar narep = py_narep.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 

diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx
@@ -18,11 +18,13 @@ from cudf._lib.cpp.strings.find cimport (
 )
 
 
-def contains(Column source_strings, DeviceScalar target):
+def contains(Column source_strings, object py_target):
     """
     Returns a Column of boolean values with True for `source_strings`
-    that contain the pattern given in `target`.
+    that contain the pattern given in `py_target`.
     """
+    cdef DeviceScalar target = py_target.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 
@@ -57,11 +59,14 @@ def contains_multiple(Column source_strings, Column target_strings):
     return Column.from_unique_ptr(move(c_result))
 
 
-def endswith(Column source_strings, DeviceScalar target):
+def endswith(Column source_strings, object py_target):
     """
     Returns a Column of boolean values with True for `source_strings`
-    that contain strings that end with the pattern given in `target`.
+    that contain strings that end with the pattern given in `py_target`.
     """
+
+    cdef DeviceScalar target = py_target.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 
@@ -97,11 +102,14 @@ def endswith_multiple(Column source_strings, Column target_strings):
     return Column.from_unique_ptr(move(c_result))
 
 
-def startswith(Column source_strings, DeviceScalar target):
+def startswith(Column source_strings, object py_target):
     """
     Returns a Column of boolean values with True for `source_strings`
-    that contain strings that start with the pattern given in `target`.
+    that contain strings that start with the pattern given in `py_target`.
     """
+
+    cdef DeviceScalar target = py_target.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 
@@ -138,15 +146,18 @@ def startswith_multiple(Column source_strings, Column target_strings):
 
 
 def find(Column source_strings,
-         DeviceScalar target,
+         object py_target,
          size_type start,
          size_type end):
     """
     Returns a Column containing lowest indexes in each string of
-    `source_strings` that fully contain `target` string.
+    `source_strings` that fully contain `py_target` string.
     Scan portion of strings in `source_strings` can be
     controlled by setting `start` and `end` values.
     """
+
+    cdef DeviceScalar target = py_target.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 
@@ -166,15 +177,18 @@ def find(Column source_strings,
 
 
 def rfind(Column source_strings,
-          DeviceScalar target,
+          object py_target,
           size_type start,
           size_type end):
     """
     Returns a Column containing highest indexes in each string of
-    `source_strings` that fully contain `target` string.
+    `source_strings` that fully contain `py_target` string.
     Scan portion of strings in `source_strings` can be
     controlled by setting `start` and `end` values.
     """
+
+    cdef DeviceScalar target = py_target.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 

diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx
@@ -24,13 +24,15 @@ from cudf._lib.cpp.strings.substring cimport (
 def slice_replace(Column source_strings,
                   size_type start,
                   size_type stop,
-                  DeviceScalar repl):
+                  object py_repl):
     """
     Returns a Column by replacing specified section
-    of each string with `repl`. Positions can be
+    of each string with `py_repl`. Positions can be
     specified with `start` and `stop` params.
     """
 
+    cdef DeviceScalar repl = py_repl.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 
@@ -51,11 +53,14 @@ def slice_replace(Column source_strings,
 
 def insert(Column source_strings,
            size_type start,
-           DeviceScalar repl):
+           object py_repl):
     """
     Returns a Column by inserting a specified
-    string `repl` at a specific position in all strings.
+    string `py_repl` at a specific position in all strings.
     """
+
+    cdef DeviceScalar repl = py_repl.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 
@@ -75,14 +80,16 @@ def insert(Column source_strings,
 
 
 def replace(Column source_strings,
-            DeviceScalar target,
-            DeviceScalar repl,
+            object py_target,
+            object py_repl,
             int32_t maxrepl):
     """
     Returns a Column after replacing occurrences of
-    patterns `target` with `repl` in `source_strings`.
+    patterns `py_target` with `py_repl` in `source_strings`.
     `maxrepl` indicates number of replacements to make from start.
     """
+    cdef DeviceScalar target = py_target.device_value
+    cdef DeviceScalar repl = py_repl.device_value
 
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()

diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx
@@ -20,15 +20,17 @@ from libcpp.string cimport string
 
 def replace_re(Column source_strings,
                object pattern,
-               DeviceScalar repl,
+               object py_repl,
                size_type n):
     """
     Returns a Column after replacing occurrences regular
-    expressions `pattern` with `repl` in `source_strings`.
+    expressions `pattern` with `py_repl` in `source_strings`.
     `n` indicates the number of replacements to be made from
     start. (-1 indicates all)
     """
 
+    cdef DeviceScalar repl = py_repl.device_value
+
     cdef unique_ptr[column] c_result
     cdef column_view source_view = source_strings.view()
 

diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
@@ -21,11 +21,14 @@ from cudf._lib.cpp.strings.split.partition cimport (
 
 
 def partition(Column source_strings,
-              DeviceScalar delimiter):
+              object py_delimiter):
     """
     Returns a Table by splitting the `source_strings`
-    column at the first occurrence of the specified `delimiter`.
+    column at the first occurrence of the specified `py_delimiter`.
     """
+
+    cdef DeviceScalar delimiter = py_delimiter.device_value
+
     cdef unique_ptr[table] c_result
     cdef column_view source_view = source_strings.view()
     cdef const string_scalar* scalar_str = <const string_scalar*>(
@@ -45,11 +48,14 @@ def partition(Column source_strings,
 
 
 def rpartition(Column source_strings,
-               DeviceScalar delimiter):
+               object py_delimiter):
     """
     Returns a Column by splitting the `source_strings`
-    column at the last occurrence of the specified `delimiter`.
+    column at the last occurrence of the specified `py_delimiter`.
     """
+
+    cdef DeviceScalar delimiter = py_delimiter.device_value
+
     cdef unique_ptr[table] c_result
     cdef column_view source_view = source_strings.view()
     cdef const string_scalar* scalar_str = <const string_scalar*>(