diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 6654892287..c6867c1a33 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -131,7 +131,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: if len(index_columns) != 1: raise ValueError("only method 'linear' supports multi-index") xvalues = block.index_columns[0] - if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: + if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise ValueError("Can only interpolate on numeric index.") for column in original_columns: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 3163aa5b09..779d11b371 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1063,7 +1063,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.AggregateOp]: stats: list[agg_ops.AggregateOp] = [agg_ops.count_op] if dtype not in bigframes.dtypes.UNORDERED_DTYPES: stats += [agg_ops.min_op, agg_ops.max_op] - if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: # Notable exclusions: # prod op tends to cause overflows # Also, var_op is redundant as can be derived from std diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d6183228d1..199c8db785 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast(ibis_types.Value, self._column_names[key]) + return typing.cast( + ibis_types.Value, + bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]), + ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -1177,7 +1180,14 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + bigframes.dtypes.ibis_value_to_canonical_type( + column.resolve(self._table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ) + for column in columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 3ee46ef675..66ba901649 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -359,7 +359,8 @@ def _convert_index(self, dataframe: df.DataFrame): def _raise_on_non_numeric(self, op: str): if not all( - dtype in dtypes.NUMERIC_BIGFRAMES_TYPES for dtype in self._block.dtypes + dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + for dtype in self._block.dtypes ): raise NotImplementedError( f"'{op}' does not support non-numeric columns. 
" @@ -371,7 +372,9 @@ def _raise_on_non_numeric(self, op: str): def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]: valid_agg_cols: list[str] = [] for col_id in self._selected_cols: - is_numeric = self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES + is_numeric = ( + self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) if is_numeric or not numeric_only: valid_agg_cols.append(col_id) return valid_agg_cols diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 98aa8f1185..423c2bcaac 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1800,7 +1800,7 @@ def agg( ) -> DataFrame | bigframes.series.Series: if utils.is_list_like(func): if any( - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self.dtypes ): raise NotImplementedError( @@ -1867,7 +1867,7 @@ def melt( ) def describe(self) -> DataFrame: - df_numeric = self._drop_non_numeric(keep_bool=False) + df_numeric = self._drop_non_numeric(permissive=False) if len(df_numeric.columns) == 0: raise NotImplementedError( f"df.describe() currently only supports numeric values. {constants.FEEDBACK_LINK}" @@ -2005,10 +2005,12 @@ def unstack(self, level: LevelsType = -1): ) return DataFrame(pivot_block) - def _drop_non_numeric(self, keep_bool=True) -> DataFrame: - types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) - if not keep_bool: - types_to_keep -= set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES) + def _drop_non_numeric(self, permissive=True) -> DataFrame: + types_to_keep = ( + set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) + if permissive + else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) + ) non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) @@ -2026,7 +2028,7 @@ def _drop_non_bool(self) -> DataFrame: def _raise_on_non_numeric(self, op: str): if not all( - dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self._block.dtypes ): raise NotImplementedError( @@ -2301,7 +2303,7 @@ def notna(self) -> DataFrame: def cumsum(self): is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): @@ -2313,7 +2315,7 @@ def cumsum(self): def cumprod(self) -> DataFrame: is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 891c372a10..b754acea2e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -15,6 +15,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" import datetime +import decimal import textwrap import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -30,6 +31,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -40,9 +42,6 @@ pd.ArrowDtype, ] -# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) 
-NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()]
-
 # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
 UNORDERED_DTYPES = [gpd.array.GeometryDtype()]
 
@@ -57,6 +56,9 @@
     "timestamp[us][pyarrow]",
     "date32[day][pyarrow]",
     "time64[us][pyarrow]",
+    "decimal128(38, 9)[pyarrow]",
+    "decimal256(76, 38)[pyarrow]",
+    "binary[pyarrow]",
 ]
 
 # Type hints for Ibis data types supported by BigQuery DataFrame
@@ -72,8 +74,17 @@
 
 BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()]
 
-# Several operations are restricted to these types.
-NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()]
+# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
+# Pandas is inconsistent, so two definitions are provided, each used in different contexts
+NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
+    pd.Float64Dtype(),
+    pd.Int64Dtype(),
+]
+NUMERIC_BIGFRAMES_TYPES_PERMISSIVE = NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + [
+    pd.BooleanDtype(),
+    pd.ArrowDtype(pa.decimal128(38, 9)),
+    pd.ArrowDtype(pa.decimal256(76, 38)),
+]
 
 # Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame
 ReadOnlyIbisDtype = Union[
@@ -97,6 +108,15 @@
         ibis_dtypes.Timestamp(timezone="UTC"),
         pd.ArrowDtype(pa.timestamp("us", tz="UTC")),
     ),
+    (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())),
+    (
+        ibis_dtypes.Decimal(precision=38, scale=9, nullable=True),
+        pd.ArrowDtype(pa.decimal128(38, 9)),
+    ),
+    (
+        ibis_dtypes.Decimal(precision=76, scale=38, nullable=True),
+        pd.ArrowDtype(pa.decimal256(76, 38)),
+    ),
 )
 
 BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = {
@@ -112,6 +132,9 @@
     ibis_dtypes.time: pa.time64("us"),
     ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"),
     ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"),
+    ibis_dtypes.binary: pa.binary(),
+    ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9),
+    ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38),
 }
 
 ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()}
@@ -125,10 +148,6 @@
 )
 IBIS_TO_BIGFRAMES.update(
     {
-        ibis_dtypes.binary: np.dtype("O"),
-        ibis_dtypes.json: np.dtype("O"),
-        ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"),
-        ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"),
         ibis_dtypes.GeoSpatial(
             geotype="geography", srid=4326, nullable=True
         ): gpd.array.GeometryDtype(),
@@ -178,7 +197,7 @@ def ibis_dtype_to_bigframes_dtype(
     # our IO returns them as objects. Eventually, we should support them as
     # ArrowDType (and update the IO accordingly)
     if isinstance(ibis_dtype, ibis_dtypes.Array):
-        return np.dtype("O")
+        return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
 
     if isinstance(ibis_dtype, ibis_dtypes.Struct):
         return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype))
@@ -200,7 +219,9 @@ def ibis_dtype_to_bigframes_dtype(
 
 def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType:
     if isinstance(ibis_dtype, ibis_dtypes.Array):
-        return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type))
+        return pa.list_(
+            ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True))
+        )
 
     if isinstance(ibis_dtype, ibis_dtypes.Struct):
         return pa.struct(
@@ -224,21 +245,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
 
     This is useful in cases where multiple types correspond to the same BigFrames dtype.
     """
     ibis_type = value.type()
+    name = value.get_name()
+    if ibis_type.is_json():
+        value = vendored_ibis_ops.ToJsonString(value).to_expr()
+        return value.name(name)
     # Allow REQUIRED fields to be joined with NULLABLE fields.
     nullable_type = ibis_type.copy(nullable=True)
-    return value.cast(nullable_type).name(value.get_name())
-
-
-def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table:
-    """Converts an Ibis table expression to canonical types.
-
-    This is useful in cases where multiple types correspond to the same BigFrames dtype.
-    """
-    casted_columns = []
-    for column_name in table.columns:
-        column = typing.cast(ibis_types.Value, table[column_name])
-        casted_columns.append(ibis_value_to_canonical_type(column))
-    return table.select(*casted_columns)
+    return value.cast(nullable_type).name(name)
 
 
 def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
@@ -386,15 +399,35 @@ def cast_ibis_value(
             ibis_dtypes.bool,
             ibis_dtypes.float64,
             ibis_dtypes.string,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.float64: (
+            ibis_dtypes.string,
+            ibis_dtypes.int64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.string: (
+            ibis_dtypes.int64,
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+            ibis_dtypes.Decimal(precision=76, scale=38),
+            ibis_dtypes.binary,
         ),
-        ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64),
-        ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64),
         ibis_dtypes.date: (ibis_dtypes.string,),
-        ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,),
-        ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,),
+        ibis_dtypes.Decimal(precision=38, scale=9): (
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=76, scale=38),
+        ),
+        ibis_dtypes.Decimal(precision=76, scale=38): (
+            ibis_dtypes.float64,
+            ibis_dtypes.Decimal(precision=38, scale=9),
+        ),
         ibis_dtypes.time: (),
         ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
         ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
+        ibis_dtypes.binary: (ibis_dtypes.string,),
     }
 
     value = ibis_value_to_canonical_type(value)
@@ -458,30 +491,61 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
     return False
 
 
 def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
     """Determine whether a scalar's type matches a given pyarrow type."""
     if pa_type == pa.time64("us"):
         return isinstance(scalar, datetime.time)
-    if pa_type == pa.timestamp("us"):
+    elif pa_type == pa.timestamp("us"):
         if isinstance(scalar, datetime.datetime):
             return not scalar.tzinfo
         if isinstance(scalar, pd.Timestamp):
             return not scalar.tzinfo
-    if pa_type == pa.timestamp("us", tz="UTC"):
+    elif pa_type == pa.timestamp("us", tz="UTC"):
        if isinstance(scalar, datetime.datetime):
            return scalar.tzinfo == datetime.timezone.utc
        if isinstance(scalar, pd.Timestamp):
            return scalar.tzinfo == datetime.timezone.utc
-    if pa_type == pa.date32():
+    elif pa_type == pa.date32():
         return isinstance(scalar, datetime.date)
+    elif pa_type == pa.binary():
+        return isinstance(scalar, bytes)
+    elif pa_type == pa.decimal128(38, 9):
+        # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
+        return isinstance(scalar, decimal.Decimal)
+    elif pa_type == pa.decimal256(76, 38):
+        # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks
+        return isinstance(scalar, decimal.Decimal)
     return False
 
 
-def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
-    """Whether scalar can be compare to items of dtype (though maybe requiring coercion)"""
+def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
+    """Whether scalar can be compared to items of dtype (though maybe requiring coercion). Returns the datatype that must be used for the comparison."""
     if is_dtype(scalar, dtype):
-        return True
+        return dtype
     elif pd.api.types.is_numeric_dtype(dtype):
-        return pd.api.types.is_number(scalar)
-    else:
-        return False
+        # Implicit conversion currently only supported for numeric types
+        if pd.api.types.is_bool(scalar):
+            return lcd_type(pd.BooleanDtype(), dtype)
+        if pd.api.types.is_float(scalar):
+            return lcd_type(pd.Float64Dtype(), dtype)
+        if pd.api.types.is_integer(scalar):
+            return lcd_type(pd.Int64Dtype(), dtype)
+        if isinstance(scalar, decimal.Decimal):
+            # TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC
+            return lcd_type(pd.ArrowDtype(pa.decimal256(76, 38)), dtype)
+    return None
+
+
+def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]:
+    # Implicit conversion currently only supported for numeric types
+    hierarchy: list[Dtype] = [
+        pd.BooleanDtype(),
+        pd.Int64Dtype(),
+        pd.Float64Dtype(),
+        pd.ArrowDtype(pa.decimal128(38, 9)),
+        pd.ArrowDtype(pa.decimal256(76, 38)),
+    ]
+    if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
+        return None
+    lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
+    return hierarchy[lcd_index]
diff --git a/bigframes/series.py b/bigframes/series.py
index 6837c1c7f8..eefd2b755d 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import functools
 import itertools
 import numbers
 import textwrap
@@ -455,7 +456,7 @@ def replace(
         else:  # Scalar
             replace_list = [to_replace]
         replace_list = [
-            i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
+            i for i in replace_list if bigframes.dtypes.is_compatible(i, self.dtype)
         ]
         return self._simple_replace(replace_list, value) if replace_list else self
 
@@ -472,11 +473,15 @@ def _regex_replace(self, to_replace: str, value: str):
         return Series(block.select_column(result_col))
 
     def _simple_replace(self, to_replace_list: typing.Sequence, value):
-        if not bigframes.dtypes.is_dtype(value, self.dtype):
+        result_type = bigframes.dtypes.is_compatible(value, self.dtype)
+        if not result_type:
             raise NotImplementedError(
                 f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
 
+        if result_type != self.dtype:
+            return self.astype(result_type)._simple_replace(to_replace_list, value)
+
         block, cond = self._block.apply_unary_op(
             self._value_column, ops.IsInOp(to_replace_list)
         )
@@ -490,15 +495,26 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value):
 
     def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
         tuples = []
+        lcd_types: list[typing.Optional[bigframes.dtypes.Dtype]] = []
         for key, value in mapping.items():
-            if not bigframes.dtypes.is_comparable(key, self.dtype):
+            lcd_type = bigframes.dtypes.is_compatible(key, self.dtype)
+            if not lcd_type:
                 continue
             if not bigframes.dtypes.is_dtype(value, self.dtype):
                 raise NotImplementedError(
                     f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. 
{constants.FEEDBACK_LINK}" ) tuples.append((key, value)) + lcd_types.append(lcd_type) + result_dtype = functools.reduce( + lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) if (t1 and t2) else None, + lcd_types, + ) + if not result_dtype: + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible mapping {mapping} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" + ) block, result = self._block.apply_unary_op( self._value_column, ops.MapOp(tuple(tuples)) ) @@ -782,7 +798,7 @@ def _central_moment(self, n: int) -> float: def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: if _is_list_like(func): - if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise NotImplementedError( f"Multiple aggregations only supported on numeric series. {constants.FEEDBACK_LINK}" ) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0c280e5d02..6ea4f72489 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -72,7 +72,7 @@ def test_columntransformer_standalone_fit_and_transform( expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) def test_columntransformer_standalone_fit_transform(new_penguins_df): @@ -123,4 +123,4 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index 3b30d7eb1d..df387e6ee1 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -184,4 +184,5 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): expected, check_exact=False, rtol=0.1, + check_dtype=False, ) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index eece5ef21d..f39815aec2 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -292,11 +292,12 @@ def test_model_predict_with_unnamed_index( def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): - predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]}, index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + dtype=pd.ArrowDtype(pa.list_(pa.float64())), ) + predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() pd.testing.assert_frame_equal( predictions[["predicted_body_mass_g"]].sort_index(), expected, diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index 9008e85a0b..8ffd9924e9 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -51,6 +51,7 @@ def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) @@ -90,6 +91,7 @@ def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) diff --git 
a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 267a2ed9c1..fd1b803eea 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import pytest from bigframes.ml import llm @@ -202,8 +201,7 @@ def test_embedding_generator_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -215,8 +213,7 @@ def test_embedding_generator_multilingual_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -228,5 +225,4 @@ def test_embedding_generator_predict_series_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 45548acca3..c3bd7f3b87 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -15,6 +15,7 @@ import math import pandas as pd +import pyarrow as pa import bigframes.ml.preprocessing @@ -453,6 +454,9 @@ def test_one_hot_encoder_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -482,6 +486,9 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -507,6 +514,9 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -537,6 +547,9 @@ def test_one_hot_encoder_params(new_penguins_df): [{"index": 0, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -567,6 +580,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py index e8eb1c85e8..5036cdadfc 100644 --- a/tests/system/small/ml/test_remote.py +++ b/tests/system/small/ml/test_remote.py @@ -29,5 +29,6 @@ def test_remote_linear_vertex_model_predict( predictions[["predicted_body_mass_g"]].sort_index(), expected, check_exact=False, + check_dtype=False, rtol=0.1, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ed78e73e5d..86b8cfbe66 100644 --- 
a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -19,7 +19,6 @@ from typing import Tuple import geopandas as gpd # type: ignore -import numpy as np import pandas as pd import pandas.testing import pyarrow as pa # type: ignore @@ -29,7 +28,11 @@ import bigframes._config.display_options as display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_df_construct_copy(scalars_dfs): @@ -273,19 +276,19 @@ def test_df_info(scalars_dfs): " # Column Non-Null Count Dtype\n" "--- ------------- ---------------- ------------------------------\n" " 0 bool_col 8 non-null boolean\n" - " 1 bytes_col 6 non-null object\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" " 2 date_col 7 non-null date32[day][pyarrow]\n" " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" " 4 geography_col 4 non-null geometry\n" " 5 int64_col 8 non-null Int64\n" " 6 int64_too 9 non-null Int64\n" - " 7 numeric_col 6 non-null object\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" " 8 float64_col 7 non-null Float64\n" " 9 rowindex_2 9 non-null Int64\n" " 10 string_col 8 non-null string\n" " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" "memory usage: 945 bytes\n" ) @@ -362,6 +365,7 @@ def test_drop_bigframes_index_with_na(scalars_dfs): pd.testing.assert_frame_equal(pd_result, bf_result) +@skip_legacy_pandas def test_drop_bigframes_multiindex(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() @@ -841,13 +845,11 @@ def test_df_fillna(scalars_dfs): def test_df_replace_scalar_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!") + bf_result = scalars_df.replace(555.555, 3).to_pandas() + pd_result = scalars_pandas_df.replace(555.555, 3) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_df_replace_regex_scalar(scalars_dfs): @@ -863,12 +865,14 @@ def test_df_replace_regex_scalar(scalars_dfs): def test_df_replace_list_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!") + bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() + pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) + # pandas has narrower result types as they are determined dynamically pd.testing.assert_frame_equal( pd_result, bf_result, + check_dtype=False, ) @@ -1198,13 +1202,13 @@ def test_get_dtypes(scalars_df_default_index): pd.Series( { "bool_col": pd.BooleanDtype(), - 
"bytes_col": np.dtype("O"), + "bytes_col": pd.ArrowDtype(pa.binary()), "date_col": pd.ArrowDtype(pa.date32()), "datetime_col": pd.ArrowDtype(pa.timestamp("us")), "geography_col": gpd.array.GeometryDtype(), "int64_col": pd.Int64Dtype(), "int64_too": pd.Int64Dtype(), - "numeric_col": np.dtype("O"), + "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), "float64_col": pd.Float64Dtype(), "rowindex": pd.Int64Dtype(), "rowindex_2": pd.Int64Dtype(), @@ -1232,7 +1236,7 @@ def test_get_dtypes_array_struct(session): dtypes, pd.Series( { - "array_column": np.dtype("O"), + "array_column": pd.ArrowDtype(pa.list_(pa.int64())), "struct_column": pd.ArrowDtype( pa.struct( [ @@ -2138,6 +2142,7 @@ def test_dataframe_agg_multi_string(scalars_dfs): ).all() +@skip_legacy_pandas def test_df_describe(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs # pyarrows time columns fail in pandas diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index fb9fb7bb89..59864e483e 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -56,7 +56,9 @@ def test_to_pandas_array_struct_correct_result(session): result = df.to_pandas() expected = pd.DataFrame( { - "array_column": [[1, 3, 2]], + "array_column": pd.Series( + [[1, 3, 2]], dtype=pd.ArrowDtype(pa.list_(pa.int64())) + ), "struct_column": pd.Series( [{"string_field": "a", "float_field": 1.2}], dtype=pd.ArrowDtype( @@ -91,7 +93,8 @@ def test_load_json(session): expected = pd.DataFrame( { "json_column": ['{"bar":true,"foo":10}'], - } + }, + dtype=pd.StringDtype(storage="pyarrow"), ) expected.index = expected.index.astype("Int64") pd.testing.assert_series_equal(result.dtypes, expected.dtypes) @@ -137,6 +140,8 @@ def test_to_csv_index( dtype = scalars_df.reset_index().dtypes.to_dict() dtype.pop("geography_col") dtype.pop("rowindex") + # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string + dtype.pop("bytes_col") gcs_df = pd.read_csv( path, dtype=dtype, @@ -148,7 +153,6 @@ def test_to_csv_index( scalars_pandas_df = scalars_pandas_df.copy() scalars_pandas_df.index = scalars_pandas_df.index.astype("int64") - # Ordering should be maintained for tables smaller than 1 GB. pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) @@ -174,6 +178,8 @@ def test_to_csv_tabs( dtype = scalars_df.reset_index().dtypes.to_dict() dtype.pop("geography_col") dtype.pop("rowindex") + # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string + dtype.pop("bytes_col") gcs_df = pd.read_csv( path, sep="\t", @@ -216,6 +222,8 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index): df_out = df_out.sort_values("rowindex_2").reset_index(drop=True) convert_pandas_dtypes(df_out, bytes_col=False) + # pd.read_gbq interpets bytes_col as object, reconvert to pyarrow binary + df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary())) expected = scalars_pandas_df.copy() expected.index.name = index_col pd.testing.assert_frame_equal(df_out, expected, check_index_type=False) @@ -377,7 +385,9 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index): scalars_pandas_df.index = scalars_pandas_df.index.astype("Int64") # Ordering should be maintained for tables smaller than 1 GB. 
- pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) + pd.testing.assert_frame_equal( + gcs_df.drop("bytes_col", axis=1), scalars_pandas_df.drop("bytes_col", axis=1) + ) def test_to_sql_query_unnamed_index_included( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 1708735f4c..2d4e1f0204 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -394,14 +394,17 @@ def test_multi_index_dataframe_groupby_level_aggregate( def test_multi_index_dataframe_groupby_level_analytic( scalars_df_index, scalars_pandas_df_index, level, as_index ): + # Drop "numeric_col" as pandas doesn't support numerics for grouped window function bf_result = ( - scalars_df_index.set_index(["int64_too", "bool_col"]) + scalars_df_index.drop("numeric_col", axis=1) + .set_index(["int64_too", "bool_col"]) .groupby(level=level, as_index=as_index, dropna=False) .cumsum(numeric_only=True) .to_pandas() ) pd_result = ( - scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) + scalars_pandas_df_index.drop("numeric_col", axis=1) + .set_index(["int64_too", "bool_col"]) .groupby(level=level, as_index=as_index, dropna=False) .cumsum(numeric_only=True) ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 623da74aa4..6f919f740f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,7 +24,11 @@ import bigframes.pandas import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_series_construct_copy(scalars_dfs): @@ -81,14 +85,14 @@ def test_series_construct_from_list_escaped_strings(): [ ("bool_col", pd.BooleanDtype()), # TODO(swast): Use a more efficient type. - ("bytes_col", numpy.dtype("object")), + ("bytes_col", pd.ArrowDtype(pa.binary())), ("date_col", pd.ArrowDtype(pa.date32())), ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))), ("float64_col", pd.Float64Dtype()), ("geography_col", gpd.array.GeometryDtype()), ("int64_col", pd.Int64Dtype()), # TODO(swast): Use a more efficient type. 
- ("numeric_col", numpy.dtype("object")), + ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), ("int64_too", pd.Int64Dtype()), ("string_col", pd.StringDtype(storage="pyarrow")), ("time_col", pd.ArrowDtype(pa.time64("us"))), @@ -2519,8 +2523,12 @@ def test_mask_custom_value(scalars_dfs): ("int64_col", pd.Float64Dtype()), ("int64_col", "string[pyarrow]"), ("int64_col", "boolean"), + ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), + ("string_col", "binary[pyarrow]"), + ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and # raises a deprecation warning to use tz_localize/tz_convert instead, # but BigQuery always stores values as UTC and doesn't have to deal @@ -2538,6 +2546,7 @@ def test_mask_custom_value(scalars_dfs): # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions ], ) +@skip_legacy_pandas def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e6eb40a5fa..8ce442376a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -30,6 +30,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model +from tests.system.utils import skip_legacy_pandas FIRST_FILE = "000000000000" @@ -385,6 +386,7 @@ def test_read_pandas_tokyo( pd.testing.assert_frame_equal(result, expected) +@skip_legacy_pandas def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs if scalars_df.index.name is not None: @@ -441,6 +443,7 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): pytest.param("\t", id="custom_sep"), ], ) +@skip_legacy_pandas def test_read_csv_local_default_engine(session, scalars_dfs, sep): scalars_df, scalars_pandas_df = scalars_dfs with tempfile.TemporaryDirectory() as dir: diff --git a/tests/system/utils.py b/tests/system/utils.py index f49b5ece31..a4647b4f51 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -14,11 +14,23 @@ import base64 import decimal +import functools import geopandas as gpd # type: ignore import numpy as np import pandas as pd import pyarrow as pa # type: ignore +import pytest + + +def skip_legacy_pandas(test): + @functools.wraps(test) + def wrapper(*args, **kwds): + if pd.__version__.startswith("1."): + pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") + return test(*args, **kwds) + + return wrapper def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): @@ -133,16 +145,28 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): df["geography_col"].replace({np.nan: None}) ) - # Convert bytes types column. - if bytes_col: + if bytes_col and not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): df["bytes_col"] = df["bytes_col"].apply( lambda value: base64.b64decode(value) if not pd.isnull(value) else value ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["bytes_col"]), + schema=pa.schema([("bytes_col", pa.binary())]), + ) + df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] - # Convert numeric types column. 
- df["numeric_col"] = df["numeric_col"].apply( - lambda value: decimal.Decimal(str(value)) if value else None # type: ignore - ) + if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): + # Convert numeric types column. + df["numeric_col"] = df["numeric_col"].apply( + lambda value: decimal.Decimal(str(value)) if value else None # type: ignore + ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["numeric_col"]), + schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), + ) + df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "numeric_col" + ] def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 6ceaaf911b..e648fd28cc 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -31,11 +31,11 @@ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types pytest.param( ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal256(76, 38)), id="bignumeric", ), pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), - pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.binary, pd.ArrowDtype(pa.binary()), id="bytes"), pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), pytest.param( ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" @@ -49,10 +49,9 @@ pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), # TODO(tswast): custom dtype (or at least string dtype) for JSON objects - pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), pytest.param( ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal128(38, 9)), id="numeric", ), pytest.param(