From e73fff0196d5b38b539068ea5e5bd8d6a2336afa Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 24 Mar 2021 14:01:25 -0400 Subject: [PATCH] Misc Python/Cython optimizations (#7686) This PR introduces various small optimizations that should generally improve various common Python overhead. See https://github.com/rapidsai/cudf/pull/7454#issuecomment-804483021 for the motivation behind these optimizations and some benchmarks. Merge after: #7660 Summary: * Adds a way to initialize a ColumnAccessor (_init_unsafe) without validating its input. This is useful when converting a `cudf::table` to a `Frame`, where we're guaranteed the columns are well formed * Improved (faster) `is_numerical_dtype` * Prioritize check for numeric dtypes in `astype()` and `build_column()`. Numeric types are presumably more common, and we can avoid expensive checks for other dtypes this way. Authors: - Ashwin Srinath (@shwina) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7686 --- python/cudf/cudf/_lib/table.pyx | 32 +++++++++++++++--------- python/cudf/cudf/core/buffer.py | 4 +++ python/cudf/cudf/core/column/column.py | 24 ++++++++++-------- python/cudf/cudf/core/column_accessor.py | 23 ++++++++++++----- python/cudf/cudf/core/frame.py | 4 ++- python/cudf/cudf/utils/dtypes.py | 15 +++++------ 6 files changed, 64 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index f97b45d8abf..93d79ba6843 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -99,22 +99,30 @@ cdef class Table: cdef vector[unique_ptr[column]].iterator it = columns.begin() # First construct the index, if any + cdef int i + index = None if index_names is not None: - index_columns = [] - for _ in index_names: - index_columns.append(Column.from_unique_ptr( - move(dereference(it)) - )) - it += 1 - index = Table(dict(zip(index_names, index_columns))) + index_data = ColumnAccessor._create_unsafe( + { + name: Column.from_unique_ptr( + move(dereference(it + i)) + ) + for i, name in enumerate(index_names) + } + ) + index = Table(data=index_data) # Construct the data dict - data_columns = [] - for _ in column_names: - data_columns.append(Column.from_unique_ptr(move(dereference(it)))) - it += 1 - data = dict(zip(column_names, data_columns)) + cdef int n_index_columns = len(index_names) if index_names else 0 + data = ColumnAccessor._create_unsafe( + { + name: Column.from_unique_ptr( + move(dereference(it + i + n_index_columns)) + ) + for i, name in enumerate(column_names) + } + ) return Table(data=data, index=index) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 350346a87f9..9fc5570e35a 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -42,6 +42,10 @@ def __init__( self.ptr = data.ptr self.size = data.size self._owner = owner or data._owner + elif isinstance(data, rmm.DeviceBuffer): + self.ptr = data.ptr + self.size = data.size + self._owner = data elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2b2874eeb4..dd06d97d105 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1017,7 +1017,9 @@ def distinct_count( return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: - if is_categorical_dtype(dtype): + if is_numerical_dtype(dtype): + return self.as_numerical_column(dtype) + elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { np.str_, @@ -1548,6 +1550,16 @@ def build_column( """ dtype = pd.api.types.pandas_dtype(dtype) + if is_numerical_dtype(dtype): + assert data is not None + return cudf.core.column.NumericalColumn( + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + ) if is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( @@ -1634,15 +1646,7 @@ def build_column( children=children, ) else: - assert data is not None - return cudf.core.column.NumericalColumn( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) + raise TypeError(f"Unrecognized dtype: {dtype}") def build_categorical_column( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 0c580132290..33bae5c1328 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -19,11 +19,7 @@ import cudf from cudf.core import column -from cudf.utils.utils import ( - cached_property, - to_flat_dict, - to_nested_dict, -) +from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -84,6 +80,21 @@ def __init__( self.multiindex = multiindex self._level_names = level_names + @classmethod + def _create_unsafe( + cls, + data: Dict[Any, ColumnBase], + multiindex: bool = False, + level_names=None, + ) -> ColumnAccessor: + # create a ColumnAccessor without verifying column + # type or size + obj = cls() + obj._data = data + obj.multiindex = multiindex + obj._level_names = level_names + return obj + def __iter__(self): return self._data.__iter__() @@ -167,7 +178,7 @@ def _column_length(self): return 0 def _clear_cache(self): - cached_properties = "columns", "names", "_grouped_data" + cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: try: self.__delattr__(attr) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e6898b8c606..ecff3dee573 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2408,7 +2408,9 @@ def _copy_type_metadata( for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): - self._data[name] = other_col._copy_type_metadata(col) + self._data.set_by_label( + name, other_col._copy_type_metadata(col), validate=False + ) if include_index: if self._index is not None and other._index is not None: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..8875a36dba8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,13 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): + # TODO: we should handle objects with a `.dtype` attribute, + # e.g., arrays, here. + try: + dtype = np.dtype(obj) + except TypeError: return False - if is_list_dtype(obj): - return False - return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) - ) + return dtype.kind in "biuf" def is_string_dtype(obj):