From 91ea332b2775ba2042a36c7423ecf35bd5204205 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Thu, 28 Mar 2024 18:51:17 +0100
Subject: [PATCH] refactor(formats): remove unnecessary schema argument from schema inference (#8814)

---
 ibis/backends/dask/__init__.py   | 15 ++++++++-------
 ibis/backends/pandas/__init__.py | 12 +++++-------
 ibis/expr/datatypes/core.py      |  9 ---------
 ibis/expr/schema.py              | 26 +++++++-------------------
 ibis/expr/tests/test_schema.py   | 11 -----------
 ibis/formats/__init__.py         |  4 +---
 ibis/formats/pandas.py           | 17 ++++++-----------
 7 files changed, 27 insertions(+), 67 deletions(-)

diff --git a/ibis/backends/dask/__init__.py b/ibis/backends/dask/__init__.py
index e7c3d8a958d0..b8e217affccb 100644
--- a/ibis/backends/dask/__init__.py
+++ b/ibis/backends/dask/__init__.py
@@ -10,8 +10,6 @@
 
 # import the pandas execution module to register dispatched implementations of
 # execute_node that the dask backend will later override
-import ibis.expr.operations as ops
-import ibis.expr.schema as sch
 import ibis.expr.types as ir
 from ibis import util
 from ibis.backends import NoUrl
@@ -167,11 +165,14 @@ def read_parquet(
         self.dictionary[table_name] = df
         return self.table(table_name)
 
-    def table(self, name: str, schema: sch.Schema | None = None):
-        df = self.dictionary[name]
-        schema = schema or self.schemas.get(name, None)
-        schema = PandasData.infer_table(df.head(1), schema=schema)
-        return ops.DatabaseTable(name, schema, self).to_expr()
+    def get_schema(self, table_name, *, database=None):
+        try:
+            schema = self.schemas[table_name]
+        except KeyError:
+            df = self.dictionary[table_name]
+            self.schemas[table_name] = schema = PandasData.infer_table(df.head(1))
+
+        return schema
 
     def _convert_object(self, obj) -> dd.DataFrame:
         if isinstance(obj, dd.DataFrame):
diff --git a/ibis/backends/pandas/__init__.py b/ibis/backends/pandas/__init__.py
index 199dc65f214b..c29bb0b38b4e 100644
--- a/ibis/backends/pandas/__init__.py
+++ b/ibis/backends/pandas/__init__.py
@@ -165,18 +165,16 @@ def list_tables(self, like=None, database=None):
         return self._filter_with_like(list(self.dictionary.keys()), like)
 
     def table(self, name: str, schema: sch.Schema | None = None):
-        df = self.dictionary[name]
-        schema = schema or self.schemas.get(name, None)
-        schema = PandasData.infer_table(df, schema=schema)
-        return ops.DatabaseTable(name, schema, self).to_expr()
+        inferred_schema = self.get_schema(name)
+        overridden_schema = {**inferred_schema, **(schema or {})}
+        return ops.DatabaseTable(name, overridden_schema, self).to_expr()
 
     def get_schema(self, table_name, *, database=None):
-        schemas = self.schemas
         try:
-            schema = schemas[table_name]
+            schema = self.schemas[table_name]
         except KeyError:
             df = self.dictionary[table_name]
-            schemas[table_name] = schema = PandasData.infer_table(df)
+            self.schemas[table_name] = schema = PandasData.infer_table(df)
 
         return schema
 
diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py
index 25565bfe4036..c640190fa9b6 100644
--- a/ibis/expr/datatypes/core.py
+++ b/ibis/expr/datatypes/core.py
@@ -247,11 +247,6 @@ def from_polars(cls, polars_type, nullable=True) -> Self:
 
         return PolarsType.to_ibis(polars_type, nullable=nullable)
 
-    @classmethod
-    def from_dask(cls, dask_type, nullable=True) -> Self:
-        """Return the equivalent ibis datatype."""
-        return cls.from_pandas(dask_type, nullable=nullable)
-
     def to_numpy(self):
         """Return the equivalent numpy datatype."""
        from ibis.formats.numpy import NumpyType
@@ -276,10 +271,6 @@ def to_polars(self):
 
         return PolarsType.from_ibis(self)
 
-    def to_dask(self):
-        """Return the equivalent dask datatype."""
-        return self.to_pandas()
-
     def is_array(self) -> bool:
         """Return True if an instance of an Array type."""
         return isinstance(self, Array)
diff --git a/ibis/expr/schema.py b/ibis/expr/schema.py
index 55b9f385f2fb..a5802072bf9e 100644
--- a/ibis/expr/schema.py
+++ b/ibis/expr/schema.py
@@ -162,11 +162,6 @@ def from_polars(cls, polars_schema):
 
         return PolarsSchema.to_ibis(polars_schema)
 
-    @classmethod
-    def from_dask(cls, dask_schema):
-        """Return the equivalent ibis schema."""
-        return cls.from_pandas(dask_schema)
-
     def to_numpy(self):
         """Return the equivalent numpy dtypes."""
         from ibis.formats.numpy import NumpySchema
@@ -191,10 +186,6 @@ def to_polars(self):
 
         return PolarsSchema.from_ibis(self)
 
-    def to_dask(self):
-        """Return the equivalent dask dtypes."""
-        return self.to_pandas()
-
     def as_struct(self) -> dt.Struct:
         return dt.Struct(self)
 
@@ -238,7 +229,7 @@ def schema(value: Any) -> Schema:
 
 
 @lazy_singledispatch
-def infer(value: Any, schema=None) -> Schema:
+def infer(value: Any) -> Schema:
     """Infer the corresponding ibis schema for a python object."""
     raise InputTypeError(value)
 
@@ -278,28 +269,25 @@ def from_pyarrow_schema(schema):
 
 
 @infer.register("pandas.DataFrame")
-def infer_pandas_dataframe(df, schema=None):
+def infer_pandas_dataframe(df):
     from ibis.formats.pandas import PandasData
 
-    return PandasData.infer_table(df, schema)
+    return PandasData.infer_table(df)
 
 
-# TODO(kszucs): do we really need the schema kwarg?
 @infer.register("pyarrow.Table")
-def infer_pyarrow_table(table, schema=None):
+def infer_pyarrow_table(table):
     from ibis.formats.pyarrow import PyArrowSchema
 
-    schema = schema if schema is not None else table.schema
-    return PyArrowSchema.to_ibis(schema)
+    return PyArrowSchema.to_ibis(table.schema)
 
 
 @infer.register("polars.DataFrame")
 @infer.register("polars.LazyFrame")
-def infer_polars_dataframe(df, schema=None):
+def infer_polars_dataframe(df):
     from ibis.formats.polars import PolarsSchema
 
-    schema = schema if schema is not None else df.schema
-    return PolarsSchema.to_ibis(schema)
+    return PolarsSchema.to_ibis(df.schema)
 
 
 # lock the dispatchers to avoid adding new implementations
diff --git a/ibis/expr/tests/test_schema.py b/ibis/expr/tests/test_schema.py
index 8acde0bd67c2..05467619de17 100644
--- a/ibis/expr/tests/test_schema.py
+++ b/ibis/expr/tests/test_schema.py
@@ -20,12 +20,6 @@
 
     has_pandas = True
 
-has_dask = False
-with contextlib.suppress(ImportError):
-    import dask.dataframe as dd  # noqa: F401
-
-    has_dask = True
-
 
 def test_whole_schema():
     schema = {
@@ -437,11 +431,6 @@ def test_schema_from_to_numpy_dtypes():
 @pytest.mark.parametrize(
     ("from_method", "to_method"),
     [
-        pytest.param(
-            "from_dask",
-            "to_dask",
-            marks=pytest.mark.skipif(not has_dask, reason="dask not installed"),
-        ),
         pytest.param(
             "from_pandas",
             "to_pandas",
diff --git a/ibis/formats/__init__.py b/ibis/formats/__init__.py
index d8c3b24669c7..30dc8ab29cde 100644
--- a/ibis/formats/__init__.py
+++ b/ibis/formats/__init__.py
@@ -168,15 +168,13 @@ def convert_column(cls, obj: C, dtype: DataType) -> C:
         raise NotImplementedError
 
     @classmethod
-    def convert_table(cls, obj: T, schema: Schema) -> T:
+    def convert_table(cls, obj: T) -> T:
         """Convert a format-specific table to the given ibis schema.
 
         Parameters
         ----------
         obj
             The format-specific table-like object to convert.
-        schema
-            The Ibis schema to convert to.
 
         Returns
         -------
diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py
index e6619030b3be..597b4efb22ea 100644
--- a/ibis/formats/pandas.py
+++ b/ibis/formats/pandas.py
@@ -94,9 +94,7 @@ def infer_column(cls, s):
         return PyArrowData.infer_column(s)
 
     @classmethod
-    def infer_table(cls, df, schema=None):
-        schema = schema if schema is not None else {}
-
+    def infer_table(cls, df):
         pairs = []
         for column_name in df.dtypes.keys():
             if not isinstance(column_name, str):
@@ -104,15 +102,12 @@ def infer_table(cls, df, schema=None):
                 "Column names must be strings to use the pandas backend"
             )
 
-            if column_name in schema:
-                ibis_dtype = schema[column_name]
+            pandas_column = df[column_name]
+            pandas_dtype = pandas_column.dtype
+            if pandas_dtype == np.object_:
+                ibis_dtype = cls.infer_column(pandas_column)
             else:
-                pandas_column = df[column_name]
-                pandas_dtype = pandas_column.dtype
-                if pandas_dtype == np.object_:
-                    ibis_dtype = cls.infer_column(pandas_column)
-                else:
-                    ibis_dtype = PandasType.to_ibis(pandas_dtype)
+                ibis_dtype = PandasType.to_ibis(pandas_dtype)
 
             pairs.append((column_name, ibis_dtype))
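
Usage sketch (illustrative, not part of the patch): after this change the inference entry points take only the data object, and caller-supplied overrides are merged in the backend's table() method rather than threaded through infer_table. The sample DataFrame, table name, and override below are hypothetical; the calls themselves follow the signatures introduced above.

    import pandas as pd

    import ibis
    from ibis.formats.pandas import PandasData

    # Hypothetical sample data, for illustration only.
    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    # Inference no longer accepts a schema argument; it only inspects the data.
    inferred = PandasData.infer_table(df)

    # Overrides are applied in Backend.table(): the passed schema is merged
    # over the inferred one instead of being pushed into inference.
    con = ibis.pandas.connect({"t": df})
    t = con.table("t", schema=ibis.schema({"a": "float64"}))

Because get_schema caches the inferred schema in self.schemas, inference runs once per table rather than on every table() call.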