diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 32cb557548f..76d38e00790 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -7,6 +7,8 @@ as_column, build_categorical_column, build_column, + build_list_column, + build_struct_column, column_empty, column_empty_like, column_empty_like_same_mask, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 30c07c8e8bb..f51f1af9e3f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1506,27 +1506,23 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: offset=codes_col.offset, ) - def _copy_type_metadata( - self: CategoricalColumn, other: ColumnBase - ) -> ColumnBase: - """Copies type metadata from self onto other, returning a new column. - - In addition to the default behavior, if `other` is not a - CategoricalColumn, we assume other is a column of codes, and return a - CategoricalColumn composed of `other` and the categories of `self`. - """ - if not isinstance(other, cudf.core.column.CategoricalColumn): - other = column.build_categorical_column( - categories=self.categories, - codes=column.as_column(other.base_data, dtype=other.dtype), - mask=other.base_mask, - ordered=self.ordered, - size=other.size, - offset=other.offset, - null_count=other.null_count, + def _with_type_metadata( + self: CategoricalColumn, dtype: Dtype + ) -> CategoricalColumn: + if isinstance(dtype, CategoricalDtype): + return column.build_categorical_column( + categories=dtype.categories._values, + codes=column.as_column( + self.codes.base_data, dtype=self.codes.dtype + ), + mask=self.codes.base_mask, + ordered=dtype.ordered, + size=self.codes.size, + offset=self.codes.offset, + null_count=self.codes.null_count, ) - # Have to ignore typing here because it misdiagnoses super(). 
- return super()._copy_type_metadata(other) # type: ignore + + return self def _create_empty_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 87580dd3755..dd863b13282 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -48,6 +48,7 @@ from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( check_cast_unsupported_dtype, + cudf_dtype_from_pa_type, get_time_unit, is_categorical_dtype, is_decimal_dtype, @@ -295,7 +296,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "None" ] - result = _copy_type_metadata_from_arrow(array, result) + result = result._with_type_metadata( + cudf_dtype_from_pa_type(array.type) + ) return result def _get_mask_as_column(self) -> ColumnBase: @@ -408,7 +411,7 @@ def copy(self: T, deep: bool = True) -> T: """ if deep: result = libcudf.copying.copy_column(self) - return cast(T, self._copy_type_metadata(result)) + return cast(T, result._with_type_metadata(self.dtype)) else: return cast( T, @@ -1267,28 +1270,14 @@ def scatter_to_table( } ) - def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase: + def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: """ - Copies type metadata from self onto other, returning a new column. + Apply the type metadata of ``dtype`` to ``self``, returning a column. - * when `self` and `other` are nested columns of the same type, - recursively apply this function on the children of `self` to the - and the children of `other`. - * if none of the above, return `other` without any changes + When ``self`` is a nested column, recursively apply this function on + the children of ``self``. """ - # TODO: This logic should probably be moved to a common nested column - # class. 
- if isinstance(other, type(self)): - if self.base_children and other.base_children: - base_children = tuple( - self.base_children[i]._copy_type_metadata( - other.base_children[i] - ) - for i in range(len(self.base_children)) - ) - other.set_base_children(base_children) - - return other + return self def column_empty_like( @@ -1603,6 +1592,84 @@ def build_interval_column( ) +def build_list_column( + indices: ColumnBase, + elements: ColumnBase, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, +) -> "cudf.core.column.ListColumn": + """ + Build a ListColumn + + Parameters + ---------- + indices : ColumnBase + Column of list indices + elements : ColumnBase + Column of list elements + mask: Buffer + Null mask + size: int, optional + offset: int, optional + """ + dtype = ListDtype(element_type=elements.dtype) + + result = build_column( + data=None, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=(indices, elements), + ) + + return cast("cudf.core.column.ListColumn", result) + + +def build_struct_column( + names: Sequence[str], + children: Tuple[ColumnBase, ...], + dtype: Optional[Dtype] = None, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, +) -> "cudf.core.column.StructColumn": + """ + Build a StructColumn + + Parameters + ---------- + names : list-like + Field names to map to children dtypes + children : tuple + Child columns, one per field name + mask: Buffer + Null mask + size: int, optional + offset: int, optional + """ + if dtype is None: + dtype = StructDtype( + fields={name: col.dtype for name, col in zip(names, children)} + ) + + result = build_column( + data=None, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=children, + ) + + return cast("cudf.core.column.StructColumn", result) + + def as_column( arbitrary: Any, nan_as_null: bool = None, @@ -2200,60 +2267,6 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = 
None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) -def _copy_type_metadata_from_arrow( - arrow_array: pa.array, cudf_column: ColumnBase -) -> ColumnBase: - """ - Similar to `Column._copy_type_metadata`, except copies type metadata - from arrow array into a cudf column. Recursive for every level. - * When `arrow_array` is struct type and `cudf_column` is StructDtype, copy - field names. - * When `arrow_array` is decimal type and `cudf_column` is - Decimal64Dtype, copy precisions. - """ - if pa.types.is_decimal(arrow_array.type) and isinstance( - cudf_column, cudf.core.column.DecimalColumn - ): - cudf_column.dtype.precision = arrow_array.type.precision - elif pa.types.is_struct(arrow_array.type) and isinstance( - cudf_column, cudf.core.column.StructColumn - ): - base_children = tuple( - _copy_type_metadata_from_arrow(arrow_array.field(i), col_child) - for i, col_child in enumerate(cudf_column.base_children) - ) - cudf_column.set_base_children(base_children) - return cudf.core.column.StructColumn( - data=None, - size=cudf_column.base_size, - dtype=StructDtype.from_arrow(arrow_array.type), - mask=cudf_column.base_mask, - offset=cudf_column.offset, - null_count=cudf_column.null_count, - children=base_children, - ) - elif pa.types.is_list(arrow_array.type) and isinstance( - cudf_column, cudf.core.column.ListColumn - ): - if arrow_array.values and cudf_column.base_children: - base_children = ( - cudf_column.base_children[0], - _copy_type_metadata_from_arrow( - arrow_array.values, cudf_column.base_children[1] - ), - ) - return cudf.core.column.ListColumn( - size=cudf_column.base_size, - dtype=ListDtype.from_arrow(arrow_array.type), - mask=cudf_column.base_mask, - offset=cudf_column.offset, - null_count=cudf_column.null_count, - children=base_children, - ) - - return cudf_column - - def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: diff --git 
a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 459cfae6fdb..86f976f2105 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -141,7 +141,7 @@ def _decimal_quantile( self, quant, interpolation, sorted_indices, exact ) - return self._copy_type_metadata(result) + return result._with_type_metadata(self.dtype) def as_decimal_column( self, dtype: Dtype, **kwargs @@ -189,7 +189,7 @@ def fillna( result = libcudf.replace.replace_nulls( input_col=self, replacement=value, method=method, dtype=dtype ) - return self._copy_type_metadata(result) + return result._with_type_metadata(self.dtype) def serialize(self) -> Tuple[dict, list]: header, frames = super().serialize() @@ -209,16 +209,13 @@ def __cuda_array_interface__(self): "Decimals are not yet supported via `__cuda_array_interface__`" ) - def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase: - """Copies type metadata from self onto other, returning a new column. + def _with_type_metadata( + self: "cudf.core.column.DecimalColumn", dtype: Dtype + ) -> "cudf.core.column.DecimalColumn": + if isinstance(dtype, Decimal64Dtype): + self.dtype.precision = dtype.precision - In addition to the default behavior, if `other` is also a decimal - column the precision is copied over. - """ - if isinstance(other, DecimalColumn): - other.dtype.precision = self.dtype.precision # type: ignore - # Have to ignore typing here because it misdiagnoses super(). 
- return super()._copy_type_metadata(other) # type: ignore + return self def _binop_scale(l_dtype, r_dtype, op): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 7ea02c0e878..f13c1a3e114 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -16,7 +16,7 @@ sort_lists, ) from cudf._lib.table import Table -from cudf._typing import BinaryOperand +from cudf._typing import BinaryOperand, Dtype from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin @@ -76,7 +76,10 @@ def __sizeof__(self): @property def base_size(self): - return len(self.base_children[0]) - 1 + # in some cases, libcudf will return an empty ListColumn with no + # indices; in these cases, we must manually set the base_size to 0 to + # avoid it being negative + return max(0, len(self.base_children[0]) - 1) def binary_operator( self, binop: str, other: BinaryOperand, reflect: bool = False @@ -233,6 +236,23 @@ def __cuda_array_interface__(self): "Lists are not yet supported via `__cuda_array_interface__`" ) + def _with_type_metadata( + self: "cudf.core.column.ListColumn", dtype: Dtype + ) -> "cudf.core.column.ListColumn": + if isinstance(dtype, ListDtype): + return column.build_list_column( + indices=self.base_children[0], + elements=self.base_children[1]._with_type_metadata( + dtype.element_type + ), + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + null_count=self.null_count, + ) + + return self + class ListMethods(ColumnMethodsMixin): """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index e35cc744434..e60281c1dfc 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -21,7 +21,7 @@ column, string, ) -from cudf.core.dtypes import Decimal64Dtype +from cudf.core.dtypes import CategoricalDtype, 
Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( NUMERIC_TYPES, @@ -544,6 +544,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: return False + def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: + if isinstance(dtype, CategoricalDtype): + return column.build_categorical_column( + categories=dtype.categories._values, + codes=as_column(self.base_data, dtype=self.dtype), + mask=self.base_mask, + ordered=dtype.ordered, + size=self.size, + offset=self.offset, + null_count=self.null_count, + ) + + return self + def to_pandas( self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 4c935e18ba8..388cdb0ca79 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -199,4 +199,6 @@ def round( return libcudf.round.round(self, decimal_places=decimals, how=how) def _apply_scan_op(self, op: str) -> ColumnBase: - return self._copy_type_metadata(libcudf.reduce.scan(op, self, True)) + return libcudf.reduce.scan(op, self, True)._with_type_metadata( + self.dtype + ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 3c47f30dd15..8b6dbbf1f3c 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -4,8 +4,10 @@ import pyarrow as pa import cudf -from cudf.core.column import ColumnBase +from cudf._typing import Dtype +from cudf.core.column import ColumnBase, build_struct_column from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.dtypes import StructDtype from cudf.utils.dtypes import is_struct_dtype @@ -111,18 +113,21 @@ def __cuda_array_interface__(self): "Structs are not yet supported via `__cuda_array_interface__`" ) - def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase: - """Copies type 
metadata from self onto other, returning a new column. - - In addition to the default behavior, if `other` is a StructColumns we - rename the fields of `other` to the field names of `self`. - """ - if isinstance(other, cudf.core.column.StructColumn): - other = other._rename_fields( - self.dtype.fields.keys() # type: ignore + def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn: + if isinstance(dtype, StructDtype): + return build_struct_column( + names=dtype.fields.keys(), + children=tuple( + self.base_children[i]._with_type_metadata(dtype.fields[f]) + for i, f in enumerate(dtype.fields.keys()) + ), + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + null_count=self.null_count, ) - # Have to ignore typing here because it misdiagnoses super(). - return super()._copy_type_metadata(other) # type: ignore + + return self class StructMethods(ColumnMethodsMixin): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ea40997ca3f..b186225fedf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -505,7 +505,7 @@ def _concat( # Reassign precision for any decimal cols for name, col in out._data.items(): if isinstance(col, cudf.core.column.DecimalColumn): - col = tables[0]._data[name]._copy_type_metadata(col) + col = col._with_type_metadata(tables[0]._data[name].dtype) # Reassign index and column names if isinstance(objs[0].columns, pd.MultiIndex): @@ -2240,13 +2240,13 @@ def _copy_type_metadata( """ Copy type metadata from each column of `other` to the corresponding column of `self`. - See `ColumnBase._copy_type_metadata` for more information. + See `ColumnBase._with_type_metadata` for more information. 
""" for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): self._data.set_by_label( - name, other_col._copy_type_metadata(col), validate=False + name, col._with_type_metadata(other_col.dtype), validate=False ) if include_index: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cf08b16b7d6..cf12c8b6fdd 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2407,7 +2407,7 @@ def _concat(cls, objs, axis=0, index=True): col = _concat_columns([o._column for o in objs]) if isinstance(col, cudf.core.column.DecimalColumn): - col = objs[0]._column._copy_type_metadata(col) + col = col._with_type_metadata(objs[0]._column.dtype) return cls(data=col, index=index, name=name) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index efdeb36755f..3ac6cc0bb44 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -367,6 +367,32 @@ def test_as_column_buffer(data, expected): assert_eq(cudf.Series(actual_column), cudf.Series(expected)) +@pytest.mark.parametrize( + "data,expected", + [ + ( + pa.array([100, 200, 300], type=pa.decimal128(3)), + cudf.core.column.as_column( + [100, 200, 300], dtype=cudf.core.dtypes.Decimal64Dtype(3, 0) + ), + ), + ( + pa.array([{"a": 1, "b": 3}, {"c": 2, "d": 4}]), + cudf.core.column.as_column([{"a": 1, "b": 3}, {"c": 2, "d": 4}]), + ), + ( + pa.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), + cudf.core.column.as_column( + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]] + ), + ), + ], +) +def test_as_column_arrow_array(data, expected): + actual_column = cudf.core.column.as_column(data) + assert_eq(cudf.Series(actual_column), cudf.Series(expected)) + + @pytest.mark.parametrize( "pd_dtype,expect_dtype", [ diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 0b59116f8e6..251c6339fe0 100644 --- 
a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -340,6 +340,8 @@ def cudf_dtype_from_pa_type(typ): return cudf.core.dtypes.ListDtype.from_arrow(typ) elif pa.types.is_struct(typ): return cudf.core.dtypes.StructDtype.from_arrow(typ) + elif pa.types.is_decimal(typ): + return cudf.core.dtypes.Decimal64Dtype.from_arrow(typ) else: return pd.api.types.pandas_dtype(typ.to_pandas_dtype())