Skip to content

Commit

Permalink
Misc Python/Cython optimizations (#7686)
Browse files Browse the repository at this point in the history
This PR introduces various small optimizations that should generally reduce common Python overhead. See #7454 (comment) for the motivation behind these optimizations and some benchmarks.

Merge after: #7660 

Summary:

* Adds a way to initialize a ColumnAccessor (`_create_unsafe`) without validating its input. This is useful when converting a `cudf::table` to a `Frame`, where we're guaranteed the columns are well-formed.
* Improved (faster) `is_numerical_dtype`
* Prioritize check for numeric dtypes in `astype()` and `build_column()`. Numeric types are presumably more common, and we can avoid expensive checks for other dtypes this way.

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #7686
  • Loading branch information
shwina authored Mar 24, 2021
1 parent 8a7af11 commit e73fff0
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 38 deletions.
32 changes: 20 additions & 12 deletions python/cudf/cudf/_lib/table.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,22 +99,30 @@ cdef class Table:
cdef vector[unique_ptr[column]].iterator it = columns.begin()

# First construct the index, if any
cdef int i

index = None
if index_names is not None:
index_columns = []
for _ in index_names:
index_columns.append(Column.from_unique_ptr(
move(dereference(it))
))
it += 1
index = Table(dict(zip(index_names, index_columns)))
index_data = ColumnAccessor._create_unsafe(
{
name: Column.from_unique_ptr(
move(dereference(it + i))
)
for i, name in enumerate(index_names)
}
)
index = Table(data=index_data)

# Construct the data dict
data_columns = []
for _ in column_names:
data_columns.append(Column.from_unique_ptr(move(dereference(it))))
it += 1
data = dict(zip(column_names, data_columns))
cdef int n_index_columns = len(index_names) if index_names else 0
data = ColumnAccessor._create_unsafe(
{
name: Column.from_unique_ptr(
move(dereference(it + i + n_index_columns))
)
for i, name in enumerate(column_names)
}
)

return Table(data=data, index=index)

Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def __init__(
self.ptr = data.ptr
self.size = data.size
self._owner = owner or data._owner
elif isinstance(data, rmm.DeviceBuffer):
self.ptr = data.ptr
self.size = data.size
self._owner = data
elif hasattr(data, "__array_interface__") or hasattr(
data, "__cuda_array_interface__"
):
Expand Down
24 changes: 14 additions & 10 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,9 @@ def distinct_count(
return cpp_distinct_count(self, ignore_nulls=dropna)

def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
if is_categorical_dtype(dtype):
if is_numerical_dtype(dtype):
return self.as_numerical_column(dtype)
elif is_categorical_dtype(dtype):
return self.as_categorical_column(dtype, **kwargs)
elif pd.api.types.pandas_dtype(dtype).type in {
np.str_,
Expand Down Expand Up @@ -1548,6 +1550,16 @@ def build_column(
"""
dtype = pd.api.types.pandas_dtype(dtype)

if is_numerical_dtype(dtype):
assert data is not None
return cudf.core.column.NumericalColumn(
data=data,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
if is_categorical_dtype(dtype):
if not len(children) == 1:
raise ValueError(
Expand Down Expand Up @@ -1634,15 +1646,7 @@ def build_column(
children=children,
)
else:
assert data is not None
return cudf.core.column.NumericalColumn(
data=data,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
)
raise TypeError(f"Unrecognized dtype: {dtype}")


def build_categorical_column(
Expand Down
23 changes: 17 additions & 6 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@

import cudf
from cudf.core import column
from cudf.utils.utils import (
cached_property,
to_flat_dict,
to_nested_dict,
)
from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict

if TYPE_CHECKING:
from cudf.core.column import ColumnBase
Expand Down Expand Up @@ -84,6 +80,21 @@ def __init__(
self.multiindex = multiindex
self._level_names = level_names

@classmethod
def _create_unsafe(
cls,
data: Dict[Any, ColumnBase],
multiindex: bool = False,
level_names=None,
) -> ColumnAccessor:
# Alternate constructor that creates a ColumnAccessor without
# verifying column type or size: an instance is first built with
# ``cls()`` and then ``data``, ``multiindex`` and ``level_names``
# are assigned onto it directly, exactly as they were passed in.
# Callers are responsible for supplying well-formed columns.
obj = cls()
obj._data = data
obj.multiindex = multiindex
obj._level_names = level_names
return obj

def __iter__(self):
return self._data.__iter__()

Expand Down Expand Up @@ -167,7 +178,7 @@ def _column_length(self):
return 0

def _clear_cache(self):
cached_properties = "columns", "names", "_grouped_data"
cached_properties = ("columns", "names", "_grouped_data")
for attr in cached_properties:
try:
self.__delattr__(attr)
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2408,7 +2408,9 @@ def _copy_type_metadata(
for name, col, other_col in zip(
self._data.keys(), self._data.values(), other._data.values()
):
self._data[name] = other_col._copy_type_metadata(col)
self._data.set_by_label(
name, other_col._copy_type_metadata(col), validate=False
)

if include_index:
if self._index is not None and other._index is not None:
Expand Down
15 changes: 6 additions & 9 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,13 @@ def numeric_normalize_types(*args):


def is_numerical_dtype(obj):
if is_categorical_dtype(obj):
# TODO: we should handle objects with a `.dtype` attribute,
# e.g., arrays, here.
try:
dtype = np.dtype(obj)
except TypeError:
return False
if is_list_dtype(obj):
return False
return (
np.issubdtype(obj, np.bool_)
or np.issubdtype(obj, np.floating)
or np.issubdtype(obj, np.signedinteger)
or np.issubdtype(obj, np.unsignedinteger)
)
return dtype.kind in "biuf"


def is_string_dtype(obj):
Expand Down

0 comments on commit e73fff0

Please sign in to comment.