From c007b1c17295af4fe009c6616f9fe3bf0d6d4c44 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 11 Aug 2021 13:56:57 -0700
Subject: [PATCH] Remove _copy_construct factory (#8999)

This PR removes the `_copy_construct` factory method for the `Frame` types that were still exposing it, replacing its usage with `_from_data`. The current implementation of `_from_data` is slightly faster, and more importantly this change leaves us with a single fast path for building `Frame` objects (bypassing the slow constructors) so that we only have to maintain and optimize one going forward.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Marlene  (https://github.com/marlenezw)

URL: https://github.com/rapidsai/cudf/pull/8999
---
 python/cudf/cudf/core/_internals/where.py |  2 +-
 python/cudf/cudf/core/dataframe.py        |  4 +-
 python/cudf/cudf/core/frame.py            | 23 ++++------
 python/cudf/cudf/core/index.py            | 56 +++++++----------------
 python/cudf/cudf/core/indexing.py         |  5 +-
 python/cudf/cudf/core/multiindex.py       |  3 +-
 python/cudf/cudf/core/series.py           | 51 +++++++++++----------
 python/cudf/cudf/core/window/rolling.py   |  2 +-
 8 files changed, 60 insertions(+), 86 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 87dc1d8e01f..176d91ad478 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -378,6 +378,6 @@ def where(
         if isinstance(frame, Index):
             result = Index(result, name=frame.name)
         else:
-            result = frame._copy_construct(data=result)
+            result = frame._from_data({frame.name: result}, frame._index)
 
         return frame._mimic_inplace(result, inplace=inplace)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bb6b54a490a..0aafae0a85b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -10,7 +10,7 @@
 import warnings
 from collections import defaultdict
 from collections.abc import Iterable, Sequence
-from typing import Any, Mapping, Optional, TypeVar
+from typing import Any, MutableMapping, Optional, TypeVar
 
 import cupy
 import numpy as np
@@ -459,7 +459,7 @@ def _init_from_dict_like(self, data, index=None, columns=None):
     @classmethod
     def _from_data(
         cls,
-        data: Mapping,
+        data: MutableMapping,
         index: Optional[BaseIndex] = None,
         columns: Any = None,
     ) -> DataFrame:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index f06bd9f9024..3c6bc057af1 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import functools
 import warnings
 from collections import abc
-from typing import Any, Dict, Mapping, Optional, Tuple, TypeVar, Union
+from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union
 
 import cupy
 import numpy as np
@@ -65,7 +65,9 @@ def __init_subclass__(cls):
 
     @classmethod
     def _from_data(
-        cls, data: Mapping, index: Optional[cudf.core.index.BaseIndex] = None,
+        cls,
+        data: MutableMapping,
+        index: Optional[cudf.core.index.BaseIndex] = None,
     ):
         obj = cls.__new__(cls)
         libcudf.table.Table.__init__(obj, data, index)
@@ -4229,7 +4231,7 @@ def _reduce(
     @classmethod
     def _from_data(
         cls,
-        data: Mapping,
+        data: MutableMapping,
         index: Optional[cudf.core.index.BaseIndex] = None,
         name: Any = None,
     ):
@@ -4519,16 +4521,6 @@ def factorize(self, na_sentinel=-1):
         """
         return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)
 
-    @property
-    def _copy_construct_defaults(self):
-        """A default dictionary of kwargs to be used for copy construction."""
-        raise NotImplementedError
-
-    def _copy_construct(self, **kwargs):
-        """Shallow copy this object by replacing certain ctor args.
-        """
-        return self.__class__(**{**self._copy_construct_defaults, **kwargs})
-
     def _binaryop(
         self,
         other: T,
@@ -4587,8 +4579,9 @@ def _binaryop(
             result_name: (self._column, other, reflect, fill_value)
         }
 
-        return self._copy_construct(
-            data=type(self)._colwise_binop(operands, fn)[result_name],
+        return self._from_data(
+            data=type(self)._colwise_binop(operands, fn),
+            index=self._index,
             name=result_name,
         )
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 9ed756547bb..d53fc4dd3c3 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -4,7 +4,16 @@
 
 import pickle
 from numbers import Number
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
 
 import cupy
 import numpy as np
@@ -661,39 +670,6 @@ def difference(self, other, sort=None):
 
         return difference
 
-    def _copy_construct(self, **kwargs):
-        # Need to override the parent behavior because pandas allows operations
-        # on unsigned types to return signed values, forcing us to choose the
-        # right index type here.
-        data = kwargs.get("data")
-        cls = self.__class__
-
-        if data is not None:
-            if self.dtype != data.dtype:
-                # TODO: This logic is largely copied from `as_index`. The two
-                # should be unified via a centralized type dispatching scheme.
-                if isinstance(data, NumericalColumn):
-                    try:
-                        cls = _dtype_to_index[data.dtype.type]
-                    except KeyError:
-                        cls = GenericIndex
-                elif isinstance(data, StringColumn):
-                    cls = StringIndex
-                elif isinstance(data, DatetimeColumn):
-                    cls = DatetimeIndex
-                elif isinstance(data, TimeDeltaColumn):
-                    cls = TimedeltaIndex
-                elif isinstance(data, CategoricalColumn):
-                    cls = CategoricalIndex
-            elif cls is RangeIndex:
-                # RangeIndex must convert to other numerical types for ops
-                try:
-                    cls = _dtype_to_index[data.dtype.type]
-                except KeyError:
-                    cls = GenericIndex
-
-        return cls(**{**self._copy_construct_defaults, **kwargs})
-
     def sort_values(self, return_indexer=False, ascending=True, key=None):
         """
         Return a sorted copy of the index, and optionally return the indices
@@ -1299,12 +1275,14 @@ def from_pandas(cls, index, nan_as_null=None):
         ind.name = index.name
         return ind
 
-    @property
-    def _copy_construct_defaults(self):
-        return {"data": self._column, "name": self.name}
-
     @classmethod
-    def _from_data(cls, data, index=None):
+    def _from_data(
+        cls,
+        data: MutableMapping,
+        index: Optional[BaseIndex] = None,
+        name: Any = None,
+    ) -> BaseIndex:
+        assert index is None
         if not isinstance(data, cudf.core.column_accessor.ColumnAccessor):
             data = cudf.core.column_accessor.ColumnAccessor(data)
         if len(data) == 0:
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index a4a69a4e084..09cfc6e144a 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -98,8 +98,9 @@ def __getitem__(self, arg):
             or _is_null_host_scalar(data)
         ):
             return data
-        index = self._sr.index.take(arg)
-        return self._sr._copy_construct(data=data, index=index)
+        return self._sr._from_data(
+            {self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg))
+        )
 
     def __setitem__(self, key, value):
         from cudf.core.column import column
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index cdc80b6ef32..af6ac5f3dae 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -830,8 +830,7 @@ def _compute_levels_and_codes(self):
         for name in self._source_data.columns:
             code, cats = self._source_data[name].factorize()
             codes[name] = code.astype(np.int64)
-            cats.name = None
-            cats = cudf.Series(cats)._copy_construct(name=None)
+            cats = cudf.Series(cats, name=None)
             levels.append(cats)
 
         self._levels = levels
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index d33d624b266..177208fa921 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -7,7 +7,7 @@
 from collections import abc as abc
 from numbers import Number
 from shutil import get_terminal_size
-from typing import Any, Mapping, Optional
+from typing import Any, MutableMapping, Optional
 from uuid import uuid4
 
 import cupy
@@ -270,7 +270,7 @@ def __init__(
     @classmethod
     def _from_data(
         cls,
-        data: Mapping,
+        data: MutableMapping,
         index: Optional[BaseIndex] = None,
         name: Any = None,
     ) -> Series:
@@ -383,10 +383,6 @@ def deserialize(cls, header, frames):
 
         return Series(column, index=index, name=name)
 
-    @property
-    def _copy_construct_defaults(self):
-        return {"data": self._column, "index": self._index, "name": self.name}
-
     def _get_columns_by_label(self, labels, downcast=False):
         """Return the column specified by `labels`
 
@@ -699,7 +695,7 @@ def reset_index(self, drop=False, inplace=False):
             if inplace is True:
                 self._index = RangeIndex(len(self))
             else:
-                return self._copy_construct(index=RangeIndex(len(self)))
+                return self._from_data(self._data, index=RangeIndex(len(self)))
 
     def set_index(self, index):
         """Returns a new Series with a different index.
@@ -734,7 +730,7 @@ def set_index(self, index):
         dtype: int64
         """
         index = index if isinstance(index, BaseIndex) else as_index(index)
-        return self._copy_construct(index=index)
+        return self._from_data(self._data, index, self.name)
 
     def as_index(self):
         """Returns a new Series with a RangeIndex.
@@ -851,8 +847,9 @@ def set_mask(self, mask, null_count=None):
             "in the future.",
             DeprecationWarning,
         )
-        col = self._column.set_mask(mask)
-        return self._copy_construct(data=col)
+        return self._from_data(
+            {self.name: self._column.set_mask(mask)}, self._index
+        )
 
     def __sizeof__(self):
         return self._column.__sizeof__() + self._index.__sizeof__()
@@ -1093,8 +1090,9 @@ def take(self, indices, keep_index=True):
             return self.iloc[indices]
         else:
             col_inds = as_column(indices)
-            data = self._column.take(col_inds, keep_index=False)
-            return self._copy_construct(data=data, index=None)
+            return self._from_data(
+                {self.name: self._column.take(col_inds, keep_index=False)}
+            )
 
     def head(self, n=5):
         """
@@ -2723,8 +2721,9 @@ def nans_to_nulls(self):
         4    10.0
         dtype: float64
         """
-        result_col = self._column.nans_to_nulls()
-        return self._copy_construct(data=result_col)
+        return self._from_data(
+            {self.name: self._column.nans_to_nulls()}, self._index
+        )
 
     def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         if bool_only not in (None, True):
@@ -3011,8 +3010,9 @@ def astype(self, dtype, copy=False, errors="raise"):
         try:
             data = self._column.astype(dtype)
 
-            return self._copy_construct(
-                data=data.copy(deep=True) if copy else data, index=self.index
+            return self._from_data(
+                {self.name: (data.copy(deep=True) if copy else data)},
+                index=self._index,
             )
 
         except Exception as e:
@@ -3326,8 +3326,8 @@ def _sort(self, ascending=True, na_position="last"):
         col_keys, col_inds = self._column.sort_by_values(
             ascending=ascending, na_position=na_position
         )
-        sr_keys = self._copy_construct(data=col_keys)
-        sr_inds = self._copy_construct(data=col_inds)
+        sr_keys = self._from_data({self.name: col_keys}, self._index)
+        sr_inds = self._from_data({self.name: col_inds}, self._index)
         return sr_keys, sr_inds
 
     def replace(
@@ -3630,9 +3630,9 @@ def reverse(self):
         dtype: int64
         """
         rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32)
-        col = self._column[rinds]
-        index = self.index._values[rinds]
-        return self._copy_construct(data=col, index=index)
+        return self._from_data(
+            {self.name: self._column[rinds]}, self.index._values[rinds]
+        )
 
     def one_hot_encoding(self, cats, dtype="float64"):
         """Perform one-hot-encoding
@@ -3786,7 +3786,9 @@ def _return_sentinel_series():
         codes = codes.merge(value, on="value", how="left")
         codes = codes.sort_values("order")["code"].fillna(na_sentinel)
 
-        return codes._copy_construct(name=None, index=self.index)
+        codes.name = None
+        codes.index = self._index
+        return codes
 
     # UDF related
 
@@ -3900,7 +3902,7 @@ def applymap(self, udf, out_dtype=None):
         """
         if not callable(udf):
             raise ValueError("Input UDF must be a callable object.")
-        return self._copy_construct(data=self._unaryop(udf))
+        return self._from_data({self.name: self._unaryop(udf)}, self._index)
 
     #
     # Stats
@@ -4721,7 +4723,8 @@ def scale(self):
         vmin = self.min()
         vmax = self.max()
         scaled = (self - vmin) / (vmax - vmin)
-        return self._copy_construct(data=scaled)
+        scaled._index = self._index.copy(deep=False)
+        return scaled
 
     # Absolute
     def abs(self):
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index d2f120a7bb9..e3ed15ba2a6 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -215,7 +215,7 @@ def _apply_agg_series(self, sr, agg_name):
                 self.center,
                 agg_name,
             )
-        return sr._copy_construct(data=result_col)
+        return sr._from_data({sr.name: result_col}, sr._index)
 
     def _apply_agg_dataframe(self, df, agg_name):
         result_df = cudf.DataFrame({})