Skip to content

Commit

Permalink
Remove _copy_construct factory (#8999)
Browse files Browse the repository at this point in the history
This PR removes the `_copy_construct` factory method for the `Frame` types that were still exposing it, replacing its usage with `_from_data`. The current implementation of `_from_data` is slightly faster, and more importantly this change leaves us with a single fast path for building `Frame` objects (bypassing the slow constructors) so that we only have to maintain and optimize one going forward.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Marlene  (https://github.com/marlenezw)

URL: #8999
  • Loading branch information
vyasr authored Aug 11, 2021
1 parent f6ce668 commit c007b1c
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 86 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_internals/where.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,6 @@ def where(
if isinstance(frame, Index):
result = Index(result, name=frame.name)
else:
result = frame._copy_construct(data=result)
result = frame._from_data({frame.name: result}, frame._index)

return frame._mimic_inplace(result, inplace=inplace)
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import warnings
from collections import defaultdict
from collections.abc import Iterable, Sequence
from typing import Any, Mapping, Optional, TypeVar
from typing import Any, MutableMapping, Optional, TypeVar

import cupy
import numpy as np
Expand Down Expand Up @@ -459,7 +459,7 @@ def _init_from_dict_like(self, data, index=None, columns=None):
@classmethod
def _from_data(
cls,
data: Mapping,
data: MutableMapping,
index: Optional[BaseIndex] = None,
columns: Any = None,
) -> DataFrame:
Expand Down
23 changes: 8 additions & 15 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import functools
import warnings
from collections import abc
from typing import Any, Dict, Mapping, Optional, Tuple, TypeVar, Union
from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union

import cupy
import numpy as np
Expand Down Expand Up @@ -65,7 +65,9 @@ def __init_subclass__(cls):

@classmethod
def _from_data(
cls, data: Mapping, index: Optional[cudf.core.index.BaseIndex] = None,
cls,
data: MutableMapping,
index: Optional[cudf.core.index.BaseIndex] = None,
):
obj = cls.__new__(cls)
libcudf.table.Table.__init__(obj, data, index)
Expand Down Expand Up @@ -4229,7 +4231,7 @@ def _reduce(
@classmethod
def _from_data(
cls,
data: Mapping,
data: MutableMapping,
index: Optional[cudf.core.index.BaseIndex] = None,
name: Any = None,
):
Expand Down Expand Up @@ -4519,16 +4521,6 @@ def factorize(self, na_sentinel=-1):
"""
return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)

@property
def _copy_construct_defaults(self):
"""A default dictionary of kwargs to be used for copy construction."""
raise NotImplementedError

def _copy_construct(self, **kwargs):
"""Shallow copy this object by replacing certain ctor args.
"""
return self.__class__(**{**self._copy_construct_defaults, **kwargs})

def _binaryop(
self,
other: T,
Expand Down Expand Up @@ -4587,8 +4579,9 @@ def _binaryop(
result_name: (self._column, other, reflect, fill_value)
}

return self._copy_construct(
data=type(self)._colwise_binop(operands, fn)[result_name],
return self._from_data(
data=type(self)._colwise_binop(operands, fn),
index=self._index,
name=result_name,
)

Expand Down
56 changes: 17 additions & 39 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,16 @@

import pickle
from numbers import Number
from typing import Any, Dict, List, Optional, Tuple, Type, Union
from typing import (
Any,
Dict,
List,
MutableMapping,
Optional,
Tuple,
Type,
Union,
)

import cupy
import numpy as np
Expand Down Expand Up @@ -661,39 +670,6 @@ def difference(self, other, sort=None):

return difference

def _copy_construct(self, **kwargs):
# Need to override the parent behavior because pandas allows operations
# on unsigned types to return signed values, forcing us to choose the
# right index type here.
data = kwargs.get("data")
cls = self.__class__

if data is not None:
if self.dtype != data.dtype:
# TODO: This logic is largely copied from `as_index`. The two
# should be unified via a centralized type dispatching scheme.
if isinstance(data, NumericalColumn):
try:
cls = _dtype_to_index[data.dtype.type]
except KeyError:
cls = GenericIndex
elif isinstance(data, StringColumn):
cls = StringIndex
elif isinstance(data, DatetimeColumn):
cls = DatetimeIndex
elif isinstance(data, TimeDeltaColumn):
cls = TimedeltaIndex
elif isinstance(data, CategoricalColumn):
cls = CategoricalIndex
elif cls is RangeIndex:
# RangeIndex must convert to other numerical types for ops
try:
cls = _dtype_to_index[data.dtype.type]
except KeyError:
cls = GenericIndex

return cls(**{**self._copy_construct_defaults, **kwargs})

def sort_values(self, return_indexer=False, ascending=True, key=None):
"""
Return a sorted copy of the index, and optionally return the indices
Expand Down Expand Up @@ -1299,12 +1275,14 @@ def from_pandas(cls, index, nan_as_null=None):
ind.name = index.name
return ind

@property
def _copy_construct_defaults(self):
return {"data": self._column, "name": self.name}

@classmethod
def _from_data(cls, data, index=None):
def _from_data(
cls,
data: MutableMapping,
index: Optional[BaseIndex] = None,
name: Any = None,
) -> BaseIndex:
assert index is None
if not isinstance(data, cudf.core.column_accessor.ColumnAccessor):
data = cudf.core.column_accessor.ColumnAccessor(data)
if len(data) == 0:
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ def __getitem__(self, arg):
or _is_null_host_scalar(data)
):
return data
index = self._sr.index.take(arg)
return self._sr._copy_construct(data=data, index=index)
return self._sr._from_data(
{self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg))
)

def __setitem__(self, key, value):
from cudf.core.column import column
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,8 +830,7 @@ def _compute_levels_and_codes(self):
for name in self._source_data.columns:
code, cats = self._source_data[name].factorize()
codes[name] = code.astype(np.int64)
cats.name = None
cats = cudf.Series(cats)._copy_construct(name=None)
cats = cudf.Series(cats, name=None)
levels.append(cats)

self._levels = levels
Expand Down
51 changes: 27 additions & 24 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import abc as abc
from numbers import Number
from shutil import get_terminal_size
from typing import Any, Mapping, Optional
from typing import Any, MutableMapping, Optional
from uuid import uuid4

import cupy
Expand Down Expand Up @@ -270,7 +270,7 @@ def __init__(
@classmethod
def _from_data(
cls,
data: Mapping,
data: MutableMapping,
index: Optional[BaseIndex] = None,
name: Any = None,
) -> Series:
Expand Down Expand Up @@ -383,10 +383,6 @@ def deserialize(cls, header, frames):

return Series(column, index=index, name=name)

@property
def _copy_construct_defaults(self):
return {"data": self._column, "index": self._index, "name": self.name}

def _get_columns_by_label(self, labels, downcast=False):
"""Return the column specified by `labels`
Expand Down Expand Up @@ -699,7 +695,7 @@ def reset_index(self, drop=False, inplace=False):
if inplace is True:
self._index = RangeIndex(len(self))
else:
return self._copy_construct(index=RangeIndex(len(self)))
return self._from_data(self._data, index=RangeIndex(len(self)))

def set_index(self, index):
"""Returns a new Series with a different index.
Expand Down Expand Up @@ -734,7 +730,7 @@ def set_index(self, index):
dtype: int64
"""
index = index if isinstance(index, BaseIndex) else as_index(index)
return self._copy_construct(index=index)
return self._from_data(self._data, index, self.name)

def as_index(self):
"""Returns a new Series with a RangeIndex.
Expand Down Expand Up @@ -851,8 +847,9 @@ def set_mask(self, mask, null_count=None):
"in the future.",
DeprecationWarning,
)
col = self._column.set_mask(mask)
return self._copy_construct(data=col)
return self._from_data(
{self.name: self._column.set_mask(mask)}, self._index
)

def __sizeof__(self):
return self._column.__sizeof__() + self._index.__sizeof__()
Expand Down Expand Up @@ -1093,8 +1090,9 @@ def take(self, indices, keep_index=True):
return self.iloc[indices]
else:
col_inds = as_column(indices)
data = self._column.take(col_inds, keep_index=False)
return self._copy_construct(data=data, index=None)
return self._from_data(
{self.name: self._column.take(col_inds, keep_index=False)}
)

def head(self, n=5):
"""
Expand Down Expand Up @@ -2723,8 +2721,9 @@ def nans_to_nulls(self):
4 10.0
dtype: float64
"""
result_col = self._column.nans_to_nulls()
return self._copy_construct(data=result_col)
return self._from_data(
{self.name: self._column.nans_to_nulls()}, self._index
)

def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
if bool_only not in (None, True):
Expand Down Expand Up @@ -3011,8 +3010,9 @@ def astype(self, dtype, copy=False, errors="raise"):
try:
data = self._column.astype(dtype)

return self._copy_construct(
data=data.copy(deep=True) if copy else data, index=self.index
return self._from_data(
{self.name: (data.copy(deep=True) if copy else data)},
index=self._index,
)

except Exception as e:
Expand Down Expand Up @@ -3326,8 +3326,8 @@ def _sort(self, ascending=True, na_position="last"):
col_keys, col_inds = self._column.sort_by_values(
ascending=ascending, na_position=na_position
)
sr_keys = self._copy_construct(data=col_keys)
sr_inds = self._copy_construct(data=col_inds)
sr_keys = self._from_data({self.name: col_keys}, self._index)
sr_inds = self._from_data({self.name: col_inds}, self._index)
return sr_keys, sr_inds

def replace(
Expand Down Expand Up @@ -3630,9 +3630,9 @@ def reverse(self):
dtype: int64
"""
rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32)
col = self._column[rinds]
index = self.index._values[rinds]
return self._copy_construct(data=col, index=index)
return self._from_data(
{self.name: self._column[rinds]}, self.index._values[rinds]
)

def one_hot_encoding(self, cats, dtype="float64"):
"""Perform one-hot-encoding
Expand Down Expand Up @@ -3786,7 +3786,9 @@ def _return_sentinel_series():
codes = codes.merge(value, on="value", how="left")
codes = codes.sort_values("order")["code"].fillna(na_sentinel)

return codes._copy_construct(name=None, index=self.index)
codes.name = None
codes.index = self._index
return codes

# UDF related

Expand Down Expand Up @@ -3900,7 +3902,7 @@ def applymap(self, udf, out_dtype=None):
"""
if not callable(udf):
raise ValueError("Input UDF must be a callable object.")
return self._copy_construct(data=self._unaryop(udf))
return self._from_data({self.name: self._unaryop(udf)}, self._index)

#
# Stats
Expand Down Expand Up @@ -4721,7 +4723,8 @@ def scale(self):
vmin = self.min()
vmax = self.max()
scaled = (self - vmin) / (vmax - vmin)
return self._copy_construct(data=scaled)
scaled._index = self._index.copy(deep=False)
return scaled

# Absolute
def abs(self):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def _apply_agg_series(self, sr, agg_name):
self.center,
agg_name,
)
return sr._copy_construct(data=result_col)
return sr._from_data({sr.name: result_col}, sr._index)

def _apply_agg_dataframe(self, df, agg_name):
result_df = cudf.DataFrame({})
Expand Down

0 comments on commit c007b1c

Please sign in to comment.