From c007b1c17295af4fe009c6616f9fe3bf0d6d4c44 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 11 Aug 2021 13:56:57 -0700 Subject: [PATCH] Remove _copy_construct factory (#8999) This PR removes the `_copy_construct` factory method for the `Frame` types that were still exposing it, replacing its usage with `_from_data`. The current implementation of `_from_data` is slightly faster, and more importantly this change leaves us with a single fast path for building `Frame` objects (bypassing the slow constructors) so that we only have to maintain and optimize one going forward. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Marlene (https://github.com/marlenezw) URL: https://github.com/rapidsai/cudf/pull/8999 --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/frame.py | 23 ++++------ python/cudf/cudf/core/index.py | 56 +++++++---------------- python/cudf/cudf/core/indexing.py | 5 +- python/cudf/cudf/core/multiindex.py | 3 +- python/cudf/cudf/core/series.py | 51 +++++++++++---------- python/cudf/cudf/core/window/rolling.py | 2 +- 8 files changed, 60 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 87dc1d8e01f..176d91ad478 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -378,6 +378,6 @@ def where( if isinstance(frame, Index): result = Index(result, name=frame.name) else: - result = frame._copy_construct(data=result) + result = frame._from_data({frame.name: result}, frame._index) return frame._mimic_inplace(result, inplace=inplace) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bb6b54a490a..0aafae0a85b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Mapping, Optional, TypeVar +from typing import Any, MutableMapping, Optional, TypeVar import cupy import numpy as np @@ -459,7 +459,7 @@ def _init_from_dict_like(self, data, index=None, columns=None): @classmethod def _from_data( cls, - data: Mapping, + data: MutableMapping, index: Optional[BaseIndex] = None, columns: Any = None, ) -> DataFrame: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f06bd9f9024..3c6bc057af1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import abc -from typing import Any, Dict, Mapping, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -65,7 +65,9 @@ def __init_subclass__(cls): @classmethod def _from_data( - cls, data: Mapping, index: Optional[cudf.core.index.BaseIndex] = None, + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, ): obj = cls.__new__(cls) libcudf.table.Table.__init__(obj, data, index) @@ -4229,7 +4231,7 @@ def _reduce( @classmethod def _from_data( cls, - data: Mapping, + data: MutableMapping, index: Optional[cudf.core.index.BaseIndex] = None, name: Any = None, ): @@ -4519,16 +4521,6 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - @property - def _copy_construct_defaults(self): - """A default dictionary of kwargs to be used for copy construction.""" - raise NotImplementedError - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - return self.__class__(**{**self._copy_construct_defaults, **kwargs}) - def _binaryop( self, other: T, @@ -4587,8 +4579,9 @@ def _binaryop( result_name: (self._column, other, reflect, fill_value) } - return self._copy_construct( - data=type(self)._colwise_binop(operands, fn)[result_name], + return self._from_data( + data=type(self)._colwise_binop(operands, fn), + index=self._index, name=result_name, ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9ed756547bb..d53fc4dd3c3 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,16 @@ import pickle from numbers import Number -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import ( + Any, + Dict, + List, + MutableMapping, + Optional, + Tuple, + Type, + Union, +) import cupy import numpy as np @@ -661,39 +670,6 @@ def difference(self, other, sort=None): return difference - def _copy_construct(self, **kwargs): - # Need to override the parent behavior because pandas allows operations - # on unsigned types to return signed values, forcing us to choose the - # right index type here. - data = kwargs.get("data") - cls = self.__class__ - - if data is not None: - if self.dtype != data.dtype: - # TODO: This logic is largely copied from `as_index`. The two - # should be unified via a centralized type dispatching scheme. - if isinstance(data, NumericalColumn): - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - elif isinstance(data, StringColumn): - cls = StringIndex - elif isinstance(data, DatetimeColumn): - cls = DatetimeIndex - elif isinstance(data, TimeDeltaColumn): - cls = TimedeltaIndex - elif isinstance(data, CategoricalColumn): - cls = CategoricalIndex - elif cls is RangeIndex: - # RangeIndex must convert to other numerical types for ops - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - - return cls(**{**self._copy_construct_defaults, **kwargs}) - def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices @@ -1299,12 +1275,14 @@ def from_pandas(cls, index, nan_as_null=None): ind.name = index.name return ind - @property - def _copy_construct_defaults(self): - return {"data": self._column, "name": self.name} - @classmethod - def _from_data(cls, data, index=None): + def _from_data( + cls, + data: MutableMapping, + index: Optional[BaseIndex] = None, + name: Any = None, + ) -> BaseIndex: + assert index is None if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): data = cudf.core.column_accessor.ColumnAccessor(data) if len(data) == 0: diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index a4a69a4e084..09cfc6e144a 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -98,8 +98,9 @@ def __getitem__(self, arg): or _is_null_host_scalar(data) ): return data - index = self._sr.index.take(arg) - return self._sr._copy_construct(data=data, index=index) + return self._sr._from_data( + {self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg)) + ) def __setitem__(self, key, value): from cudf.core.column import column diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index cdc80b6ef32..af6ac5f3dae 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -830,8 +830,7 @@ def _compute_levels_and_codes(self): for name in self._source_data.columns: code, cats = self._source_data[name].factorize() codes[name] = code.astype(np.int64) - cats.name = None - cats = cudf.Series(cats)._copy_construct(name=None) + cats = cudf.Series(cats, name=None) levels.append(cats) self._levels = levels diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d33d624b266..177208fa921 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Mapping, Optional +from typing import Any, MutableMapping, Optional from uuid import uuid4 import cupy @@ -270,7 +270,7 @@ def __init__( @classmethod def _from_data( cls, - data: Mapping, + data: MutableMapping, index: Optional[BaseIndex] = None, name: Any = None, ) -> Series: @@ -383,10 +383,6 @@ def deserialize(cls, header, frames): return Series(column, index=index, name=name) - @property - def _copy_construct_defaults(self): - return {"data": self._column, "index": self._index, "name": self.name} - def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -699,7 +695,7 @@ def reset_index(self, drop=False, inplace=False): if inplace is True: self._index = RangeIndex(len(self)) else: - return self._copy_construct(index=RangeIndex(len(self))) + return self._from_data(self._data, index=RangeIndex(len(self))) def set_index(self, index): """Returns a new Series with a different index. @@ -734,7 +730,7 @@ def set_index(self, index): dtype: int64 """ index = index if isinstance(index, BaseIndex) else as_index(index) - return self._copy_construct(index=index) + return self._from_data(self._data, index, self.name) def as_index(self): """Returns a new Series with a RangeIndex. @@ -851,8 +847,9 @@ def set_mask(self, mask, null_count=None): "in the future.", DeprecationWarning, ) - col = self._column.set_mask(mask) - return self._copy_construct(data=col) + return self._from_data( + {self.name: self._column.set_mask(mask)}, self._index + ) def __sizeof__(self): return self._column.__sizeof__() + self._index.__sizeof__() @@ -1093,8 +1090,9 @@ def take(self, indices, keep_index=True): return self.iloc[indices] else: col_inds = as_column(indices) - data = self._column.take(col_inds, keep_index=False) - return self._copy_construct(data=data, index=None) + return self._from_data( + {self.name: self._column.take(col_inds, keep_index=False)} + ) def head(self, n=5): """ @@ -2723,8 +2721,9 @@ def nans_to_nulls(self): 4 10.0 dtype: float64 """ - result_col = self._column.nans_to_nulls() - return self._copy_construct(data=result_col) + return self._from_data( + {self.name: self._column.nans_to_nulls()}, self._index + ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): @@ -3011,8 +3010,9 @@ def astype(self, dtype, copy=False, errors="raise"): try: data = self._column.astype(dtype) - return self._copy_construct( - data=data.copy(deep=True) if copy else data, index=self.index + return self._from_data( + {self.name: (data.copy(deep=True) if copy else data)}, + index=self._index, ) except Exception as e: @@ -3326,8 +3326,8 @@ def _sort(self, ascending=True, na_position="last"): col_keys, col_inds = self._column.sort_by_values( ascending=ascending, na_position=na_position ) - sr_keys = self._copy_construct(data=col_keys) - sr_inds = self._copy_construct(data=col_inds) + sr_keys = self._from_data({self.name: col_keys}, self._index) + sr_inds = self._from_data({self.name: col_inds}, self._index) return sr_keys, sr_inds def replace( @@ -3630,9 +3630,9 @@ def reverse(self): dtype: int64 """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - col = self._column[rinds] - index = self.index._values[rinds] - return self._copy_construct(data=col, index=index) + return self._from_data( + {self.name: self._column[rinds]}, self.index._values[rinds] + ) def one_hot_encoding(self, cats, dtype="float64"): """Perform one-hot-encoding @@ -3786,7 +3786,9 @@ def _return_sentinel_series(): codes = codes.merge(value, on="value", how="left") codes = codes.sort_values("order")["code"].fillna(na_sentinel) - return codes._copy_construct(name=None, index=self.index) + codes.name = None + codes.index = self._index + return codes # UDF related @@ -3900,7 +3902,7 @@ def applymap(self, udf, out_dtype=None): """ if not callable(udf): raise ValueError("Input UDF must be a callable object.") - return self._copy_construct(data=self._unaryop(udf)) + return self._from_data({self.name: self._unaryop(udf)}, self._index) # # Stats @@ -4721,7 +4723,8 @@ def scale(self): vmin = self.min() vmax = self.max() scaled = (self - vmin) / (vmax - vmin) - return self._copy_construct(data=scaled) + scaled._index = self._index.copy(deep=False) + return scaled # Absolute def abs(self): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d2f120a7bb9..e3ed15ba2a6 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -215,7 +215,7 @@ def _apply_agg_series(self, sr, agg_name): self.center, agg_name, ) - return sr._copy_construct(data=result_col) + return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): result_df = cudf.DataFrame({})