Skip to content

Commit

Permalink
Add functionality to apply Dtype metadata to ColumnBase (#8373)
Browse files Browse the repository at this point in the history
Based on discussion on #8333:

- adds `_with_type_metadata()` to `ColumnBase` to return a new column with the metadata of `dtype` applied
- removes `_copy_type_metadata[_from_arrow]()` and uses this function in their place

These changes would be helpful for #8153, as we want to be able to copy metadata from one column to another using only the dtype object.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Michael Wang (https://github.com/isVoid)

URL: #8373
  • Loading branch information
charlesbluca authored Jun 15, 2021
1 parent 2606b71 commit 884f98f
Show file tree
Hide file tree
Showing 12 changed files with 202 additions and 125 deletions.
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
as_column,
build_categorical_column,
build_column,
build_list_column,
build_struct_column,
column_empty,
column_empty_like,
column_empty_like_same_mask,
Expand Down
36 changes: 16 additions & 20 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1513,27 +1513,23 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
offset=codes_col.offset,
)

def _copy_type_metadata(
self: CategoricalColumn, other: ColumnBase
) -> ColumnBase:
"""Copies type metadata from self onto other, returning a new column.
In addition to the default behavior, if `other` is not a
CategoricalColumn, we assume other is a column of codes, and return a
CategoricalColumn composed of `other` and the categories of `self`.
"""
if not isinstance(other, cudf.core.column.CategoricalColumn):
other = column.build_categorical_column(
categories=self.categories,
codes=column.as_column(other.base_data, dtype=other.dtype),
mask=other.base_mask,
ordered=self.ordered,
size=other.size,
offset=other.offset,
null_count=other.null_count,
def _with_type_metadata(
self: CategoricalColumn, dtype: Dtype
) -> CategoricalColumn:
if isinstance(dtype, CategoricalDtype):
return column.build_categorical_column(
categories=dtype.categories._values,
codes=column.as_column(
self.codes.base_data, dtype=self.codes.dtype
),
mask=self.codes.base_mask,
ordered=dtype.ordered,
size=self.codes.size,
offset=self.codes.offset,
null_count=self.codes.null_count,
)
# Have to ignore typing here because it misdiagnoses super().
return super()._copy_type_metadata(other) # type: ignore

return self


def _create_empty_categorical_column(
Expand Down
161 changes: 87 additions & 74 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from cudf.utils import ioutils, utils
from cudf.utils.dtypes import (
check_cast_unsupported_dtype,
cudf_dtype_from_pa_type,
get_time_unit,
is_categorical_dtype,
is_decimal_dtype,
Expand Down Expand Up @@ -295,7 +296,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
"None"
]

result = _copy_type_metadata_from_arrow(array, result)
result = result._with_type_metadata(
cudf_dtype_from_pa_type(array.type)
)
return result

def _get_mask_as_column(self) -> ColumnBase:
Expand Down Expand Up @@ -408,7 +411,7 @@ def copy(self: T, deep: bool = True) -> T:
"""
if deep:
result = libcudf.copying.copy_column(self)
return cast(T, self._copy_type_metadata(result))
return cast(T, result._with_type_metadata(self.dtype))
else:
return cast(
T,
Expand Down Expand Up @@ -1267,28 +1270,14 @@ def scatter_to_table(
}
)

def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
"""
Copies type metadata from self onto other, returning a new column.
* when `self` and `other` are nested columns of the same type,
recursively apply this function on the children of `self` to the
and the children of `other`.
* if none of the above, return `other` without any changes
When ``self`` is a nested column, recursively apply this function on
the children of ``self``.
"""
# TODO: This logic should probably be moved to a common nested column
# class.
if isinstance(other, type(self)):
if self.base_children and other.base_children:
base_children = tuple(
self.base_children[i]._copy_type_metadata(
other.base_children[i]
)
for i in range(len(self.base_children))
)
other.set_base_children(base_children)

return other
return self


def column_empty_like(
Expand Down Expand Up @@ -1603,6 +1592,84 @@ def build_interval_column(
)


def build_list_column(
    indices: ColumnBase,
    elements: ColumnBase,
    mask: Buffer = None,
    size: int = None,
    offset: int = 0,
    null_count: int = None,
) -> "cudf.core.column.ListColumn":
    """
    Build a ListColumn from an indices column and an elements column.

    The dtype of the result is a ``ListDtype`` whose element type is taken
    from ``elements.dtype``. All remaining arguments are forwarded unchanged
    to ``build_column``.

    Parameters
    ----------
    indices : ColumnBase
        Column of list indices; passed as the first child
    elements : ColumnBase
        Column of list elements; passed as the second child
    mask : Buffer, optional
        Null mask
    size : int, optional
    offset : int, optional
    null_count : int, optional

    Returns
    -------
    cudf.core.column.ListColumn
        The column produced by ``build_column``, cast for the type checker.
    """
    # A list column carries no data buffer of its own; everything lives in
    # the two children.
    list_children = (indices, elements)

    return cast(
        "cudf.core.column.ListColumn",
        build_column(
            data=None,
            dtype=ListDtype(element_type=elements.dtype),
            mask=mask,
            size=size,
            offset=offset,
            null_count=null_count,
            children=list_children,
        ),
    )


def build_struct_column(
    names: Sequence[str],
    children: Tuple[ColumnBase, ...],
    dtype: Optional[Dtype] = None,
    mask: Buffer = None,
    size: int = None,
    offset: int = 0,
    null_count: int = None,
) -> "cudf.core.column.StructColumn":
    """
    Build a StructColumn from its child columns.

    Parameters
    ----------
    names : list-like
        Field names to map to children dtypes. Only consulted when
        ``dtype`` is None.
    children : tuple
        Child columns, one per field
    dtype : Dtype, optional
        Dtype to give the result. When None, a ``StructDtype`` is built by
        pairing ``names`` with the dtypes of ``children`` (pairing stops at
        the shorter of the two).
    mask : Buffer, optional
        Null mask
    size : int, optional
    offset : int, optional
    null_count : int, optional

    Returns
    -------
    cudf.core.column.StructColumn
        The column produced by ``build_column``, cast for the type checker.
    """
    struct_dtype = dtype
    if struct_dtype is None:
        # No explicit dtype: derive the field mapping from the children.
        field_dtypes = {}
        for field_name, child in zip(names, children):
            field_dtypes[field_name] = child.dtype
        struct_dtype = StructDtype(fields=field_dtypes)

    return cast(
        "cudf.core.column.StructColumn",
        build_column(
            data=None,
            dtype=struct_dtype,
            mask=mask,
            size=size,
            offset=offset,
            null_count=null_count,
            children=children,
        ),
    )


def as_column(
arbitrary: Any,
nan_as_null: bool = None,
Expand Down Expand Up @@ -2200,60 +2267,6 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size)


def _copy_type_metadata_from_arrow(
    arrow_array: pa.array, cudf_column: ColumnBase
) -> ColumnBase:
    """
    Copy type metadata from an arrow array onto a cudf column.

    Similar to `Column._copy_type_metadata`, except the metadata is read
    from ``arrow_array`` rather than from another column. Applied
    recursively to struct children and list elements.

    * Decimal arrow type with a DecimalColumn: the arrow precision is
      written onto ``cudf_column.dtype`` in place and ``cudf_column`` itself
      is returned.
    * Struct arrow type with a StructColumn: each base child is processed
      against the arrow field at the same position, and a new StructColumn
      with dtype ``StructDtype.from_arrow(arrow_array.type)`` is returned.
    * List arrow type with a ListColumn: the second base child is processed
      against ``arrow_array.values``, and a new ListColumn with dtype
      ``ListDtype.from_arrow(arrow_array.type)`` is returned.
    * Anything else: ``cudf_column`` is returned unchanged.
    """
    arrow_type = arrow_array.type

    if pa.types.is_decimal(arrow_type) and isinstance(
        cudf_column, cudf.core.column.DecimalColumn
    ):
        cudf_column.dtype.precision = arrow_type.precision
        return cudf_column

    if pa.types.is_struct(arrow_type) and isinstance(
        cudf_column, cudf.core.column.StructColumn
    ):
        converted_children = tuple(
            _copy_type_metadata_from_arrow(arrow_array.field(idx), child)
            for idx, child in enumerate(cudf_column.base_children)
        )
        # The input column is updated as well before the new column is built
        # from its size/mask/offset.
        cudf_column.set_base_children(converted_children)
        return cudf.core.column.StructColumn(
            data=None,
            size=cudf_column.base_size,
            dtype=StructDtype.from_arrow(arrow_type),
            mask=cudf_column.base_mask,
            offset=cudf_column.offset,
            null_count=cudf_column.null_count,
            children=converted_children,
        )

    if pa.types.is_list(arrow_type) and isinstance(
        cudf_column, cudf.core.column.ListColumn
    ):
        # NOTE(review): the extracted source lost its indentation; the
        # ListColumn construction is assumed to sit inside this `if`, since
        # the children tuple is only defined here -- confirm against the
        # original file.
        if arrow_array.values and cudf_column.base_children:
            list_children = (
                cudf_column.base_children[0],
                _copy_type_metadata_from_arrow(
                    arrow_array.values, cudf_column.base_children[1]
                ),
            )
            return cudf.core.column.ListColumn(
                size=cudf_column.base_size,
                dtype=ListDtype.from_arrow(arrow_type),
                mask=cudf_column.base_mask,
                offset=cudf_column.offset,
                null_count=cudf_column.null_count,
                children=list_children,
            )

    return cudf_column


def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
"""Concatenate a sequence of columns."""
if len(objs) == 0:
Expand Down
19 changes: 8 additions & 11 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _decimal_quantile(
self, quant, interpolation, sorted_indices, exact
)

return self._copy_type_metadata(result)
return result._with_type_metadata(self.dtype)

def as_decimal_column(
self, dtype: Dtype, **kwargs
Expand Down Expand Up @@ -189,7 +189,7 @@ def fillna(
result = libcudf.replace.replace_nulls(
input_col=self, replacement=value, method=method, dtype=dtype
)
return self._copy_type_metadata(result)
return result._with_type_metadata(self.dtype)

def serialize(self) -> Tuple[dict, list]:
header, frames = super().serialize()
Expand All @@ -209,16 +209,13 @@ def __cuda_array_interface__(self):
"Decimals are not yet supported via `__cuda_array_interface__`"
)

def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
"""Copies type metadata from self onto other, returning a new column.
def _with_type_metadata(
self: "cudf.core.column.DecimalColumn", dtype: Dtype
) -> "cudf.core.column.DecimalColumn":
if isinstance(dtype, Decimal64Dtype):
self.dtype.precision = dtype.precision

In addition to the default behavior, if `other` is also a decimal
column the precision is copied over.
"""
if isinstance(other, DecimalColumn):
other.dtype.precision = self.dtype.precision # type: ignore
# Have to ignore typing here because it misdiagnoses super().
return super()._copy_type_metadata(other) # type: ignore
return self


def _binop_scale(l_dtype, r_dtype, op):
Expand Down
24 changes: 22 additions & 2 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
sort_lists,
)
from cudf._lib.table import Table
from cudf._typing import BinaryOperand
from cudf._typing import BinaryOperand, Dtype
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -76,7 +76,10 @@ def __sizeof__(self):

@property
def base_size(self):
return len(self.base_children[0]) - 1
# in some cases, libcudf will return an empty ListColumn with no
# indices; in these cases, we must manually set the base_size to 0 to
# avoid it being negative
return max(0, len(self.base_children[0]) - 1)

def binary_operator(
self, binop: str, other: BinaryOperand, reflect: bool = False
Expand Down Expand Up @@ -233,6 +236,23 @@ def __cuda_array_interface__(self):
"Lists are not yet supported via `__cuda_array_interface__`"
)

def _with_type_metadata(
    self: "cudf.core.column.ListColumn", dtype: Dtype
) -> "cudf.core.column.ListColumn":
    """
    Return this column with the metadata of ``dtype`` applied.

    When ``dtype`` is a ``ListDtype``, a new list column is built that keeps
    this column's indices child, mask, base size, offset and null count,
    and whose elements child has ``dtype.element_type`` applied to it
    through its own ``_with_type_metadata``. For any other ``dtype`` the
    column is returned unchanged.
    """
    if not isinstance(dtype, ListDtype):
        return self

    return column.build_list_column(
        indices=self.base_children[0],
        # Push the element dtype down into the elements child.
        elements=self.base_children[1]._with_type_metadata(
            dtype.element_type
        ),
        mask=self.base_mask,
        size=self.base_size,
        offset=self.offset,
        null_count=self.null_count,
    )


class ListMethods(ColumnMethodsMixin):
"""
Expand Down
16 changes: 15 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
column,
string,
)
from cudf.core.dtypes import Decimal64Dtype
from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
from cudf.utils import cudautils, utils
from cudf.utils.dtypes import (
NUMERIC_TYPES,
Expand Down Expand Up @@ -544,6 +544,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:

return False

def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
    """
    Return this column with the metadata of ``dtype`` applied.

    When ``dtype`` is a ``CategoricalDtype``, this column is used as the
    codes of a new categorical column that takes its categories and
    ``ordered`` flag from ``dtype`` and its mask, size, offset and null
    count from ``self``. For any other ``dtype`` the column is returned
    unchanged.
    """
    if not isinstance(dtype, CategoricalDtype):
        return self

    categories = dtype.categories._values
    # Re-wrap the underlying data buffer as the codes column.
    codes = as_column(self.base_data, dtype=self.dtype)
    return column.build_categorical_column(
        categories=categories,
        codes=codes,
        mask=self.base_mask,
        ordered=dtype.ordered,
        size=self.size,
        offset=self.offset,
        null_count=self.null_count,
    )

def to_pandas(
self, index: pd.Index = None, nullable: bool = False, **kwargs
) -> "pd.Series":
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,6 @@ def round(
return libcudf.round.round(self, decimal_places=decimals, how=how)

def _apply_scan_op(self, op: str) -> ColumnBase:
return self._copy_type_metadata(libcudf.reduce.scan(op, self, True))
return libcudf.reduce.scan(op, self, True)._with_type_metadata(
self.dtype
)
Loading

0 comments on commit 884f98f

Please sign in to comment.