From e73fff0196d5b38b539068ea5e5bd8d6a2336afa Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Wed, 24 Mar 2021 14:01:25 -0400
Subject: [PATCH] Misc Python/Cython optimizations (#7686)

This PR introduces various small optimizations that should generally improve various common Python overhead. See https://github.com/rapidsai/cudf/pull/7454#issuecomment-804483021 for the motivation behind these optimizations and some benchmarks.

Merge after: #7660

Summary:

* Adds a way to initialize a ColumnAccessor (_init_unsafe) without validating its input. This is useful when converting a `cudf::table` to a `Frame`, where we're guaranteed the columns are well formed
* Improved (faster) `is_numerical_dtype`
* Prioritize check for numeric dtypes in `astype()` and `build_column()`. Numeric types are presumably more common, and we can avoid expensive checks for other dtypes this way.

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7686
---
 python/cudf/cudf/_lib/table.pyx          | 32 +++++++++++++++---------
 python/cudf/cudf/core/buffer.py          |  4 +++
 python/cudf/cudf/core/column/column.py   | 24 ++++++++++--------
 python/cudf/cudf/core/column_accessor.py | 23 ++++++++++++-----
 python/cudf/cudf/core/frame.py           |  4 ++-
 python/cudf/cudf/utils/dtypes.py         | 15 +++++------
 6 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index f97b45d8abf..93d79ba6843 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -99,22 +99,30 @@ cdef class Table:
         cdef vector[unique_ptr[column]].iterator it = columns.begin()
 
         # First construct the index, if any
+        cdef int i
+
         index = None
         if index_names is not None:
-            index_columns = []
-            for _ in index_names:
-                index_columns.append(Column.from_unique_ptr(
-                    move(dereference(it))
-                ))
-                it += 1
-            index = Table(dict(zip(index_names, index_columns)))
+            index_data = ColumnAccessor._create_unsafe(
+                {
+                    name: Column.from_unique_ptr(
+                        move(dereference(it + i))
+                    )
+                    for i, name in enumerate(index_names)
+                }
+            )
+            index = Table(data=index_data)
 
         # Construct the data dict
-        data_columns = []
-        for _ in column_names:
-            data_columns.append(Column.from_unique_ptr(move(dereference(it))))
-            it += 1
-        data = dict(zip(column_names, data_columns))
+        cdef int n_index_columns = len(index_names) if index_names else 0
+        data = ColumnAccessor._create_unsafe(
+            {
+                name: Column.from_unique_ptr(
+                    move(dereference(it + i + n_index_columns))
+                )
+                for i, name in enumerate(column_names)
+            }
+        )
 
         return Table(data=data, index=index)
 
diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py
index 350346a87f9..9fc5570e35a 100644
--- a/python/cudf/cudf/core/buffer.py
+++ b/python/cudf/cudf/core/buffer.py
@@ -42,6 +42,10 @@ def __init__(
             self.ptr = data.ptr
             self.size = data.size
             self._owner = owner or data._owner
+        elif isinstance(data, rmm.DeviceBuffer):
+            self.ptr = data.ptr
+            self.size = data.size
+            self._owner = data
         elif hasattr(data, "__array_interface__") or hasattr(
             data, "__cuda_array_interface__"
         ):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b2b2874eeb4..dd06d97d105 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1017,7 +1017,9 @@ def distinct_count(
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
     def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
-        if is_categorical_dtype(dtype):
+        if is_numerical_dtype(dtype):
+            return self.as_numerical_column(dtype)
+        elif is_categorical_dtype(dtype):
             return self.as_categorical_column(dtype, **kwargs)
         elif pd.api.types.pandas_dtype(dtype).type in {
             np.str_,
@@ -1548,6 +1550,16 @@ def build_column(
     """
     dtype = pd.api.types.pandas_dtype(dtype)
 
+    if is_numerical_dtype(dtype):
+        assert data is not None
+        return cudf.core.column.NumericalColumn(
+            data=data,
+            dtype=dtype,
+            mask=mask,
+            size=size,
+            offset=offset,
+            null_count=null_count,
+        )
     if is_categorical_dtype(dtype):
         if not len(children) == 1:
             raise ValueError(
@@ -1634,15 +1646,7 @@ def build_column(
             children=children,
         )
     else:
-        assert data is not None
-        return cudf.core.column.NumericalColumn(
-            data=data,
-            dtype=dtype,
-            mask=mask,
-            size=size,
-            offset=offset,
-            null_count=null_count,
-        )
+        raise TypeError(f"Unrecognized dtype: {dtype}")
 
 
 def build_categorical_column(
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 0c580132290..33bae5c1328 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -19,11 +19,7 @@
 
 import cudf
 from cudf.core import column
-from cudf.utils.utils import (
-    cached_property,
-    to_flat_dict,
-    to_nested_dict,
-)
+from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict
 
 if TYPE_CHECKING:
     from cudf.core.column import ColumnBase
@@ -84,6 +80,21 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
 
+    @classmethod
+    def _create_unsafe(
+        cls,
+        data: Dict[Any, ColumnBase],
+        multiindex: bool = False,
+        level_names=None,
+    ) -> ColumnAccessor:
+        # create a ColumnAccessor without verifying column
+        # type or size
+        obj = cls()
+        obj._data = data
+        obj.multiindex = multiindex
+        obj._level_names = level_names
+        return obj
+
     def __iter__(self):
         return self._data.__iter__()
 
@@ -167,7 +178,7 @@ def _column_length(self):
             return 0
 
     def _clear_cache(self):
-        cached_properties = "columns", "names", "_grouped_data"
+        cached_properties = ("columns", "names", "_grouped_data")
         for attr in cached_properties:
             try:
                 self.__delattr__(attr)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index e6898b8c606..ecff3dee573 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2408,7 +2408,9 @@ def _copy_type_metadata(
         for name, col, other_col in zip(
             self._data.keys(), self._data.values(), other._data.values()
         ):
-            self._data[name] = other_col._copy_type_metadata(col)
+            self._data.set_by_label(
+                name, other_col._copy_type_metadata(col), validate=False
+            )
 
         if include_index:
             if self._index is not None and other._index is not None:
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 1438421bb12..8875a36dba8 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -144,16 +144,13 @@ def numeric_normalize_types(*args):
 
 
 def is_numerical_dtype(obj):
-    if is_categorical_dtype(obj):
+    # TODO: we should handle objects with a `.dtype` attribute,
+    # e.g., arrays, here.
+    try:
+        dtype = np.dtype(obj)
+    except TypeError:
         return False
-    if is_list_dtype(obj):
-        return False
-    return (
-        np.issubdtype(obj, np.bool_)
-        or np.issubdtype(obj, np.floating)
-        or np.issubdtype(obj, np.signedinteger)
-        or np.issubdtype(obj, np.unsignedinteger)
-    )
+    return dtype.kind in "biuf"
 
 
 def is_string_dtype(obj):