rapidsai · rapids-bot · Jun 15, 2021 · May 26, 2021 · May 26, 2021 · May 26, 2021
@@ -7,6 +7,8 @@
     as_column,
     build_categorical_column,
     build_column,
+    build_list_column,
+    build_struct_column,
     column_empty,
     column_empty_like,
     column_empty_like_same_mask,

@@ -1506,27 +1506,23 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
             offset=codes_col.offset,
         )
 
-    def _copy_type_metadata(
-        self: CategoricalColumn, other: ColumnBase
-    ) -> ColumnBase:
-        """Copies type metadata from self onto other, returning a new column.
-
-        In addition to the default behavior, if `other` is not a
-        CategoricalColumn, we assume other is a column of codes, and return a
-        CategoricalColumn composed of `other`  and the categories of `self`.
-        """
-        if not isinstance(other, cudf.core.column.CategoricalColumn):
-            other = column.build_categorical_column(
-                categories=self.categories,
-                codes=column.as_column(other.base_data, dtype=other.dtype),
-                mask=other.base_mask,
-                ordered=self.ordered,
-                size=other.size,
-                offset=other.offset,
-                null_count=other.null_count,
+    def _with_type_metadata(
+        self: CategoricalColumn, dtype: Dtype
+    ) -> CategoricalColumn:
+        """
+        Return a column carrying the type metadata of ``dtype``.
+
+        If ``dtype`` is a ``CategoricalDtype``, a new categorical column is
+        built that reuses the codes of ``self`` (base data, mask, size,
+        offset and null count) and takes its categories and ``ordered``
+        flag from ``dtype``. For any other ``dtype``, ``self`` is returned
+        unchanged.
+        """
+        if isinstance(dtype, CategoricalDtype):
+            return column.build_categorical_column(
+                categories=dtype.categories._values,
+                codes=column.as_column(
+                    self.codes.base_data, dtype=self.codes.dtype
+                ),
+                mask=self.codes.base_mask,
+                ordered=dtype.ordered,
+                size=self.codes.size,
+                offset=self.codes.offset,
+                null_count=self.codes.null_count,
             )
-        # Have to ignore typing here because it misdiagnoses super().
-        return super()._copy_type_metadata(other)  # type: ignore
+
+        return self
 
 
 def _create_empty_categorical_column(

@@ -48,6 +48,7 @@
 from cudf.utils import ioutils, utils
 from cudf.utils.dtypes import (
     check_cast_unsupported_dtype,
+    cudf_dtype_from_pa_type,
     get_time_unit,
     is_categorical_dtype,
     is_decimal_dtype,
@@ -295,7 +296,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
             "None"
         ]
 
-        result = _copy_type_metadata_from_arrow(array, result)
+        result = result._with_type_metadata(
+            cudf_dtype_from_pa_type(array.type)
+        )
         return result
 
     def _get_mask_as_column(self) -> ColumnBase:
@@ -408,7 +411,7 @@ def copy(self: T, deep: bool = True) -> T:
         """
         if deep:
             result = libcudf.copying.copy_column(self)
-            return cast(T, self._copy_type_metadata(result))
+            return cast(T, result._with_type_metadata(self.dtype))
         else:
             return cast(
                 T,
@@ -1267,28 +1270,14 @@ def scatter_to_table(
             }
         )
 
-    def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
+    def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
         """
-        Copies type metadata from self onto other, returning a new column.
+        Return a column with the type metadata of ``dtype`` applied.
 
-        * when `self` and `other` are nested columns of the same type,
-          recursively apply this function on the children of `self` to the
-          and the children of `other`.
-        * if none of the above, return `other` without any changes
+        This base implementation applies nothing and returns ``self``;
+        column types with richer metadata define their own versions.
         """
-        # TODO: This logic should probably be moved to a common nested column
-        # class.
-        if isinstance(other, type(self)):
-            if self.base_children and other.base_children:
-                base_children = tuple(
-                    self.base_children[i]._copy_type_metadata(
-                        other.base_children[i]
-                    )
-                    for i in range(len(self.base_children))
-                )
-                other.set_base_children(base_children)
-
-        return other
+        return self
 
 
 def column_empty_like(
@@ -1603,6 +1592,84 @@ def build_interval_column(
     )
 
 
+def build_list_column(
+    indices: ColumnBase,
+    elements: ColumnBase,
+    mask: Buffer = None,
+    size: int = None,
+    offset: int = 0,
+    null_count: int = None,
+) -> "cudf.core.column.ListColumn":
+    """
+    Build a ListColumn
+
+    The column's ``ListDtype`` is derived from the dtype of ``elements``.
+
+    Parameters
+    ----------
+    indices : ColumnBase
+        Column of list indices
+    elements : ColumnBase
+        Column of list elements
+    mask: Buffer
+        Null mask
+    size: int, optional
+    offset: int, optional
+    null_count: int, optional
+
+    Returns
+    -------
+    ListColumn
+        The result of ``build_column`` called with no data buffer and with
+        ``(indices, elements)`` as children.
+    """
+    # The list dtype is not passed in; it is inferred from the elements.
+    dtype = ListDtype(element_type=elements.dtype)
+
+    result = build_column(
+        data=None,
+        dtype=dtype,
+        mask=mask,
+        size=size,
+        offset=offset,
+        null_count=null_count,
+        children=(indices, elements),
+    )
+
+    # build_column is annotated with the base type; narrow it for callers.
+    return cast("cudf.core.column.ListColumn", result)
+
+
+def build_struct_column(
+    names: Sequence[str],
+    children: Tuple[ColumnBase, ...],
+    dtype: Optional[Dtype] = None,
+    mask: Buffer = None,
+    size: int = None,
+    offset: int = 0,
+    null_count: int = None,
+) -> "cudf.core.column.StructColumn":
+    """
+    Build a StructColumn
+
+    Parameters
+    ----------
+    names : list-like
+        Field names to map to children dtypes. Only used when ``dtype`` is
+        not given.
+    children : tuple
+        Child columns, one per struct field, paired positionally with
+        ``names``.
+    dtype : Dtype, optional
+        Dtype of the resulting column. When omitted, a ``StructDtype`` is
+        built from ``names`` and the dtypes of ``children``.
+    mask: Buffer
+        Null mask
+    size: int, optional
+    offset: int, optional
+    null_count: int, optional
+
+    Returns
+    -------
+    StructColumn
+        The result of ``build_column`` called with no data buffer and with
+        ``children`` as children.
+    """
+    if dtype is None:
+        # NOTE(review): zip() stops at the shorter of names/children, so a
+        # length mismatch is not reported here - confirm callers guarantee
+        # equal lengths.
+        dtype = StructDtype(
+            fields={name: col.dtype for name, col in zip(names, children)}
+        )
+
+    result = build_column(
+        data=None,
+        dtype=dtype,
+        mask=mask,
+        size=size,
+        offset=offset,
+        null_count=null_count,
+        children=children,
+    )
+
+    # build_column is annotated with the base type; narrow it for callers.
+    return cast("cudf.core.column.StructColumn", result)
+
+
 def as_column(
     arbitrary: Any,
     nan_as_null: bool = None,
@@ -2200,60 +2267,6 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
     return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size)
 
 
-def _copy_type_metadata_from_arrow(
-    arrow_array: pa.array, cudf_column: ColumnBase
-) -> ColumnBase:
-    """
-    Similar to `Column._copy_type_metadata`, except copies type metadata
-    from arrow array into a cudf column. Recursive for every level.
-    * When `arrow_array` is struct type and `cudf_column` is StructDtype, copy
-    field names.
-    * When `arrow_array` is decimal type and `cudf_column` is
-    Decimal64Dtype, copy precisions.
-    """
-    if pa.types.is_decimal(arrow_array.type) and isinstance(
-        cudf_column, cudf.core.column.DecimalColumn
-    ):
-        cudf_column.dtype.precision = arrow_array.type.precision
-    elif pa.types.is_struct(arrow_array.type) and isinstance(
-        cudf_column, cudf.core.column.StructColumn
-    ):
-        base_children = tuple(
-            _copy_type_metadata_from_arrow(arrow_array.field(i), col_child)
-            for i, col_child in enumerate(cudf_column.base_children)
-        )
-        cudf_column.set_base_children(base_children)
-        return cudf.core.column.StructColumn(
-            data=None,
-            size=cudf_column.base_size,
-            dtype=StructDtype.from_arrow(arrow_array.type),
-            mask=cudf_column.base_mask,
-            offset=cudf_column.offset,
-            null_count=cudf_column.null_count,
-            children=base_children,
-        )
-    elif pa.types.is_list(arrow_array.type) and isinstance(
-        cudf_column, cudf.core.column.ListColumn
-    ):
-        if arrow_array.values and cudf_column.base_children:
-            base_children = (
-                cudf_column.base_children[0],
-                _copy_type_metadata_from_arrow(
-                    arrow_array.values, cudf_column.base_children[1]
-                ),
-            )
-            return cudf.core.column.ListColumn(
-                size=cudf_column.base_size,
-                dtype=ListDtype.from_arrow(arrow_array.type),
-                mask=cudf_column.base_mask,
-                offset=cudf_column.offset,
-                null_count=cudf_column.null_count,
-                children=base_children,
-            )
-
-    return cudf_column
-
-
 def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:

@@ -141,7 +141,7 @@ def _decimal_quantile(
             self, quant, interpolation, sorted_indices, exact
         )
 
-        return self._copy_type_metadata(result)
+        return result._with_type_metadata(self.dtype)
 
     def as_decimal_column(
         self, dtype: Dtype, **kwargs
@@ -189,7 +189,7 @@ def fillna(
         result = libcudf.replace.replace_nulls(
             input_col=self, replacement=value, method=method, dtype=dtype
         )
-        return self._copy_type_metadata(result)
+        return result._with_type_metadata(self.dtype)
 
     def serialize(self) -> Tuple[dict, list]:
         header, frames = super().serialize()
@@ -209,16 +209,13 @@ def __cuda_array_interface__(self):
             "Decimals are not yet supported via `__cuda_array_interface__`"
         )
 
-    def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
-        """Copies type metadata from self onto other, returning a new column.
+    def _with_type_metadata(
+        self: "cudf.core.column.DecimalColumn", dtype: Dtype
+    ) -> "cudf.core.column.DecimalColumn":
+        """
+        Copy the precision of ``dtype`` onto this column and return it.
+
+        When ``dtype`` is a ``Decimal64Dtype`` its precision is assigned to
+        ``self.dtype.precision``; the dtype of ``self`` is modified in
+        place and no new column is created. For any other ``dtype``,
+        ``self`` is returned unchanged.
+        """
+        if isinstance(dtype, Decimal64Dtype):
+            self.dtype.precision = dtype.precision
 
-        In addition to the default behavior, if `other` is also a decimal
-        column the precision is copied over.
-        """
-        if isinstance(other, DecimalColumn):
-            other.dtype.precision = self.dtype.precision  # type: ignore
-        # Have to ignore typing here because it misdiagnoses super().
-        return super()._copy_type_metadata(other)  # type: ignore
+        return self
 
 
 def _binop_scale(l_dtype, r_dtype, op):

@@ -16,7 +16,7 @@
     sort_lists,
 )
 from cudf._lib.table import Table
-from cudf._typing import BinaryOperand
+from cudf._typing import BinaryOperand, Dtype
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
@@ -76,7 +76,10 @@ def __sizeof__(self):
 
     @property
     def base_size(self):
-        return len(self.base_children[0]) - 1
+        """Length of the first base child minus one, clamped at zero."""
+        # in some cases, libcudf will return an empty ListColumn with no
+        # indices; in these cases, we must manually set the base_size to 0 to
+        # avoid it being negative
+        return max(0, len(self.base_children[0]) - 1)
 
     def binary_operator(
         self, binop: str, other: BinaryOperand, reflect: bool = False
@@ -233,6 +236,23 @@ def __cuda_array_interface__(self):
             "Lists are not yet supported via `__cuda_array_interface__`"
         )
 
+    def _with_type_metadata(
+        self: "cudf.core.column.ListColumn", dtype: Dtype
+    ) -> "cudf.core.column.ListColumn":
+        """
+        Return a column carrying the type metadata of ``dtype``.
+
+        If ``dtype`` is a ``ListDtype``, a new list column is built from the
+        base children of ``self``: the first child is reused as-is and the
+        second (the elements) has ``dtype.element_type`` applied to it
+        recursively. Mask, base size, offset and null count are taken from
+        ``self``. For any other ``dtype``, ``self`` is returned unchanged.
+        """
+        if isinstance(dtype, ListDtype):
+            return column.build_list_column(
+                indices=self.base_children[0],
+                elements=self.base_children[1]._with_type_metadata(
+                    dtype.element_type
+                ),
+                mask=self.base_mask,
+                size=self.base_size,
+                offset=self.offset,
+                null_count=self.null_count,
+            )
+
+        return self
+
 
 class ListMethods(ColumnMethodsMixin):
     """

@@ -21,7 +21,7 @@
     column,
     string,
 )
-from cudf.core.dtypes import Decimal64Dtype
+from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
 from cudf.utils import cudautils, utils
 from cudf.utils.dtypes import (
     NUMERIC_TYPES,
@@ -544,6 +544,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
         return False
 
+    def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
+        """
+        Return a column carrying the type metadata of ``dtype``.
+
+        If ``dtype`` is a ``CategoricalDtype``, this column is treated as a
+        column of codes: a categorical column is built whose codes reuse
+        the base data, mask, size, offset and null count of ``self`` and
+        whose categories and ``ordered`` flag come from ``dtype``. For any
+        other ``dtype``, ``self`` is returned unchanged.
+        """
+        if isinstance(dtype, CategoricalDtype):
+            return column.build_categorical_column(
+                categories=dtype.categories._values,
+                codes=as_column(self.base_data, dtype=self.dtype),
+                mask=self.base_mask,
+                ordered=dtype.ordered,
+                size=self.size,
+                offset=self.offset,
+                null_count=self.null_count,
+            )
+
+        return self
+
     def to_pandas(
         self, index: pd.Index = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":

@@ -199,4 +199,6 @@ def round(
         return libcudf.round.round(self, decimal_places=decimals, how=how)
 
     def _apply_scan_op(self, op: str) -> ColumnBase:
-        return self._copy_type_metadata(libcudf.reduce.scan(op, self, True))
+        """Run the libcudf scan ``op`` over this column.
+
+        The type metadata of ``self.dtype`` is applied to the scan result
+        before it is returned.
+        """
+        # NOTE(review): the meaning of the positional ``True`` passed to
+        # ``scan`` is not visible here - confirm against libcudf.reduce.scan.
+        return libcudf.reduce.scan(op, self, True)._with_type_metadata(
+            self.dtype
+        )