From 620533e6df031becee7f9cf2225d534c825234a9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 18 Dec 2023 21:59:20 +0000 Subject: [PATCH] feat: new bytes, json, decimal type mappings --- bigframes/core/compile/compiled.py | 14 +++++++-- bigframes/dtypes.py | 41 ++++++++++++++----------- tests/system/small/test_dataframe_io.py | 3 +- tests/system/utils.py | 24 +++++++++++---- tests/unit/test_dtypes.py | 7 ++--- 5 files changed, 58 insertions(+), 31 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d6183228d1..199c8db785 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast(ibis_types.Value, self._column_names[key]) + return typing.cast( + ibis_types.Value, + bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]), + ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -1177,7 +1180,14 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + bigframes.dtypes.ibis_value_to_canonical_type( + column.resolve(self._table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ) + for column in columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 774eb74d06..b0f27ef2f4 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -29,6 +29,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -96,6 +97,15 @@ ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), ), + (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())), + ( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + pd.ArrowDtype(pa.decimal128(38, 9)), + ), + ( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + pd.ArrowDtype(pa.decimal256(76, 38)), + ), ) BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = { @@ -111,6 +121,13 @@ ibis_dtypes.time: pa.time64("us"), ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), + ibis_dtypes.binary: pd.ArrowDtype(pa.binary()), + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pd.ArrowDtype( + pa.decimal128(38, 9) + ), + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pd.ArrowDtype( + pa.decimal256(76, 38) + ), } ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} @@ -124,10 +141,6 @@ ) IBIS_TO_BIGFRAMES.update( { - ibis_dtypes.binary: np.dtype("O"), - ibis_dtypes.json: np.dtype("O"), - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"), - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"), ibis_dtypes.GeoSpatial( geotype="geography", srid=4326, nullable=True ): gpd.array.GeometryDtype(), @@ -177,7 +190,7 @@ def ibis_dtype_to_bigframes_dtype( # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) if isinstance(ibis_dtype, ibis_dtypes.Array): - return np.dtype("O") + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) @@ -223,21 +236,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: This is useful in cases where multiple types correspond to the same BigFrames dtype. """ ibis_type = value.type() + name = value.get_name() + if ibis_type.is_json(): + value = vendored_ibis_ops.ToJsonString(value).to_expr() + return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) - return value.cast(nullable_type).name(value.get_name()) - - -def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: - """Converts an Ibis table expression to canonical types. - - This is useful in cases where multiple types correspond to the same BigFrames dtype. - """ - casted_columns = [] - for column_name in table.columns: - column = typing.cast(ibis_types.Value, table[column_name]) - casted_columns.append(ibis_value_to_canonical_type(column)) - return table.select(*casted_columns) + return value.cast(nullable_type).name(name) def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index fb9fb7bb89..b803a5de3d 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -91,7 +91,8 @@ def test_load_json(session): expected = pd.DataFrame( { "json_column": ['{"bar":true,"foo":10}'], - } + }, + dtype=pd.StringDtype(storage="pyarrow"), ) expected.index = expected.index.astype("Int64") pd.testing.assert_series_equal(result.dtypes, expected.dtypes) diff --git a/tests/system/utils.py b/tests/system/utils.py index f49b5ece31..94c1aabb26 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -133,16 +133,28 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): df["geography_col"].replace({np.nan: None}) ) - # Convert bytes types column. - if bytes_col: + if not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): df["bytes_col"] = df["bytes_col"].apply( lambda value: base64.b64decode(value) if not pd.isnull(value) else value ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["bytes_col"]), + schema=pa.schema([("bytes_col", pa.binary())]), + ) + df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] - # Convert numeric types column. - df["numeric_col"] = df["numeric_col"].apply( - lambda value: decimal.Decimal(str(value)) if value else None # type: ignore - ) + if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): + # Convert numeric types column. + df["numeric_col"] = df["numeric_col"].apply( + lambda value: decimal.Decimal(str(value)) if value else None # type: ignore + ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["numeric_col"]), + schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), + ) + df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "numeric_col" + ] def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 6ceaaf911b..e648fd28cc 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -31,11 +31,11 @@ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types pytest.param( ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal256(76, 38)), id="bignumeric", ), pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), - pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.binary, pd.ArrowDtype(pa.binary()), id="bytes"), pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), pytest.param( ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" @@ -49,10 +49,9 @@ pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), # TODO(tswast): custom dtype (or at least string dtype) for JSON objects - pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), pytest.param( ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal128(38, 9)), id="numeric", ), pytest.param(