From eb223cac0fefa3852a9a0b0a5ce1b9d1bf1a6ca8 Mon Sep 17 00:00:00 2001 From: cosmicBboy Date: Mon, 8 Nov 2021 23:39:19 -0500 Subject: [PATCH 1/3] implement dataframe types - added submodules in pandera.typing module for dask, modin, koalas - new documentation for mypy integration, other dataframe library support - update copy on existing documentation - expand scope --- README.md | 26 +++-- docs/source/conf.py | 5 +- docs/source/dask.rst | 134 +++++++++++++++++++++ docs/source/dataframe_schemas.rst | 4 + docs/source/dtypes.rst | 4 +- docs/source/{scaling.rst => fugue.rst} | 39 ++++--- docs/source/index.rst | 34 ++++-- docs/source/integrations.rst | 78 ++++++++++++- docs/source/koalas.rst | 111 ++++++++++++++++++ docs/source/modin.rst | 113 ++++++++++++++++++ docs/source/schema_models.rst | 64 ++++++++++- docs/source/supported_libraries.rst | 50 ++++++++ docs/source/third_party_schema.rst | 4 +- pandera/external_config.py | 10 +- pandera/io.py | 1 - pandera/model.py | 38 +++--- pandera/typing/__init__.py | 58 ++++++++++ pandera/{typing.py => typing/common.py} | 147 +++++------------------- pandera/typing/dask.py | 67 +++++++++++ pandera/typing/koalas.py | 54 +++++++++ pandera/typing/modin.py | 46 ++++++++ pandera/typing/pandas.py | 101 ++++++++++++++++ tests/core/static/pandas_dataframe.py | 44 +++++-- tests/core/test_model.py | 12 +- tests/core/test_static_type_checking.py | 15 +-- tests/core/test_typing.py | 32 +++++- tests/dask/test_dask.py | 51 +++++++- tests/koalas/test_schemas_on_koalas.py | 53 +++++++-- tests/modin/test_schemas_on_modin.py | 51 ++++++-- 29 files changed, 1220 insertions(+), 226 deletions(-) create mode 100644 docs/source/dask.rst rename docs/source/{scaling.rst => fugue.rst} (91%) create mode 100644 docs/source/koalas.rst create mode 100644 docs/source/modin.rst create mode 100644 docs/source/supported_libraries.rst create mode 100644 pandera/typing/__init__.py rename pandera/{typing.py => typing/common.py} (59%) create mode 100644 pandera/typing/dask.py create mode 100644 pandera/typing/koalas.py create mode 100644 pandera/typing/modin.py create mode 100644 pandera/typing/pandas.py diff --git a/README.md b/README.md index 1e5b3cd57..f86c68ce0 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@
-*A data validation library for scientists, engineers, and analysts seeking +*A dataframe validation library for scientists, engineers, and analysts seeking correctness.*
@@ -22,10 +22,18 @@ correctness.* [![Downloads](https://pepy.tech/badge/pandera/month)](https://pepy.tech/project/pandera) [![Downloads](https://pepy.tech/badge/pandera)](https://pepy.tech/project/pandera) -`pandas` data structures contain information that `pandera` explicitly -validates at runtime. This is useful in production-critical or reproducible -research settings. With `pandera`, you can: +`pandera` provides a flexible and expressive API for performing data +validation on dataframes to make data processing pipelines more readable and +robust. +Dataframes contain information that `pandera` explicitly validates at runtime. +This is useful in production-critical or reproducible research settings. With +`pandera`, you can: + +1. Define a schema once and use it to validate + [different dataframe types](https://pandera.readthedocs.io/en/stable/supported_libraries.html) + including [pandas](http://pandas.pydata.org), [dask](https://dask.org), + [modin](https://modin.readthedocs.io/), and [koalas](https://koalas.readthedocs.io). 1. [Check](https://pandera.readthedocs.io/en/stable/checks.html) the types and properties of columns in a `DataFrame` or values in a `Series`. 1. Perform more complex statistical validation like @@ -37,11 +45,11 @@ research settings. With `pandera`, you can: with pydantic-style syntax and validate dataframes using the typing syntax. 1. [Synthesize data](https://pandera.readthedocs.io/en/stable/data_synthesis_strategies.html#data-synthesis-strategies) from schema objects for property-based testing with pandas data structures. - -`pandera` provides a flexible and expressive API for performing data validation -on tidy (long-form) and wide data to make data processing pipelines more -readable and robust. - +1. [Lazily Validate](https://pandera.readthedocs.io/en/stable/lazy_validation.html) + dataframes so that all validation checks are executed before raising an error. +1. [Integrate](https://pandera.readthedocs.io/en/stable/integrations.html) with + a rich ecosystem of python tools like [pydantic](https://pydantic-docs.helpmanual.io) + and [mypy](http://mypy-lang.org/). ## Documentation diff --git a/docs/source/conf.py b/docs/source/conf.py index 9abb7ebce..77c9b06f3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -175,7 +175,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "numpy": ("https://docs.scipy.org/doc/numpy/", None), - "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "dask": ("https://docs.dask.org/en/latest/", None), + "koalas": ("https://koalas.readthedocs.io/en/latest/", None), + "modin": ("https://modin.readthedocs.io/en/latest/", None), } # strip prompts diff --git a/docs/source/dask.rst b/docs/source/dask.rst new file mode 100644 index 000000000..93676a255 --- /dev/null +++ b/docs/source/dask.rst @@ -0,0 +1,134 @@ +.. currentmodule:: pandera + +.. _scaling_dask: + +Data Validation with Dask +========================= + +*new in 0.8.0* + +`Dask `__ is a distributed +compute framework that offers a pandas-like dataframe API. +You can use pandera to validate :py:func:`~dask.dataframe.DataFrame` +and :py:func:`~dask.dataframe.Series` objects directly. First, install +``pandera`` with the ``dask`` extra: + +.. code:: bash + + pip install pandera[dask] + + +Then you can use pandera schemas to validate dask dataframes. In the example +below we'll use the :ref:`class-based API ` to define a +:py:class:`SchemaModel` for validation. 
+ +.. testcode:: scaling_dask + + import dask.dataframe as dd + import pandas as pd + import pandera as pa + + from pandera.typing.dask import DataFrame, Series + + + class Schema(pa.SchemaModel): + state: Series[str] + city: Series[str] + price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20}) + + + ddf = dd.from_pandas( + pd.DataFrame( + { + 'state': ['FL','FL','FL','CA','CA','CA'], + 'city': [ + 'Orlando', + 'Miami', + 'Tampa', + 'San Francisco', + 'Los Angeles', + 'San Diego', + ], + 'price': [8, 12, 10, 16, 20, 18], + } + ), + npartitions=2 + ) + pandera_ddf = Schema(ddf) + + print(pandera_ddf) + + +.. testoutput:: scaling_dask + + Dask DataFrame Structure: + state city price + npartitions=2 + 0 object object int64 + 3 ... ... ... + 5 ... ... ... + Dask Name: validate, 4 tasks + + +As you can see, passing the dask dataframe into ``Schema`` will produce +another dask dataframe which hasn't been evaluated yet. What this means is +that pandera will only validate when the dask graph is evaluated. + +.. testcode:: scaling_dask + + print(pandera_ddf.compute()) + + +.. testoutput:: scaling_dask + + state city price + 0 FL Orlando 8 + 1 FL Miami 12 + 2 FL Tampa 10 + 3 CA San Francisco 16 + 4 CA Los Angeles 20 + 5 CA San Diego 18 + + +You can also use the :py:func:`~pandera.check_types` decorator to validate +dask dataframes at runtime: + +.. testcode:: scaling_dask + + @pa.check_types + def function(ddf: DataFrame[Schema]) -> DataFrame[Schema]: + return ddf[ddf["state"] == "CA"] + + print(function(ddf).compute()) + + +.. testoutput:: scaling_dask + + state city price + 3 CA San Francisco 16 + 4 CA Los Angeles 20 + 5 CA San Diego 18 + + +And of course, you can use the object-based API to validate dask dataframes: + + +.. testcode:: scaling_dask + + schema = pa.DataFrameSchema({ + "state": pa.Column(str), + "city": pa.Column(str), + "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20)) + }) + print(schema(ddf).compute()) + + +.. testoutput:: scaling_dask + + state city price + 0 FL Orlando 8 + 1 FL Miami 12 + 2 FL Tampa 10 + 3 CA San Francisco 16 + 4 CA Los Angeles 20 + 5 CA San Diego 18 diff --git a/docs/source/dataframe_schemas.rst b/docs/source/dataframe_schemas.rst index 9fcda7818..61462a2e9 100644 --- a/docs/source/dataframe_schemas.rst +++ b/docs/source/dataframe_schemas.rst @@ -39,6 +39,10 @@ The :class:`~pandera.schemas.DataFrameSchema` object consists of |column|_\s and coerce=True, ) +You can refer to :ref:`schema_models` to see how to define dataframe schemas +using the alternative pydantic/dataclass-style syntax. + + .. _column: Column Validation diff --git a/docs/source/dtypes.rst b/docs/source/dtypes.rst index b687d0d19..148caa42b 100644 --- a/docs/source/dtypes.rst +++ b/docs/source/dtypes.rst @@ -4,8 +4,8 @@ .. _dtypes: -Pandera Data Types (new) -======================== +Pandera Data Types +================== *new in 0.7.0* diff --git a/docs/source/scaling.rst b/docs/source/fugue.rst similarity index 91% rename from docs/source/scaling.rst rename to docs/source/fugue.rst index 221424b01..5abb3d175 100644 --- a/docs/source/scaling.rst +++ b/docs/source/fugue.rst @@ -1,9 +1,9 @@ .. currentmodule:: pandera -.. _scaling: +.. _scaling_fugue: -Scaling Pandera to Big Data -================================= +Data Validation with Fugue +========================== Validation on big data comes in two forms. The first is performing one set of validations on data that doesn't fit in memory. 
The second happens when a large dataset @@ -17,8 +17,8 @@ code can be used on top of ``Spark`` or ``Dask`` engines with to be performed in a distributed setting. ``Fugue`` is an open source abstraction layer that ports ``Python``, ``pandas``, and ``SQL`` code to ``Spark`` and ``Dask``. -Fugue ------ +What is Fugue? +-------------- ``Fugue`` serves as an interface to distributed computing. Because of its non-invasive design, existing ``Python`` code can be scaled to a distributed setting without significant changes. @@ -40,17 +40,22 @@ In this example, a pandas ``DataFrame`` is created with ``state``, ``city`` and columns. ``Pandera`` will be used to validate that the ``price`` column values are within a certain range. -.. testcode:: scaling_pandera +.. testcode:: scaling_fugue import pandas as pd - data = pd.DataFrame({'state': ['FL','FL','FL','CA','CA','CA'], - 'city': ['Orlando', 'Miami', 'Tampa', - 'San Francisco', 'Los Angeles', 'San Diego'], - 'price': [8, 12, 10, 16, 20, 18]}) + data = pd.DataFrame( + { + 'state': ['FL','FL','FL','CA','CA','CA'], + 'city': [ + 'Orlando', 'Miami', 'Tampa', 'San Francisco', 'Los Angeles', 'San Diego' + ], + 'price': [8, 12, 10, 16, 20, 18], + } + ) print(data) -.. testoutput:: scaling_pandera +.. testoutput:: scaling_fugue state city price 0 FL Orlando 8 @@ -64,7 +69,7 @@ a certain range. Validation is then applied using pandera. A ``price_validation`` function is created that runs the validation. None of this will be new. -.. testcode:: scaling_pandera +.. testcode:: scaling_fugue from pandera import Column, DataFrameSchema, Check @@ -85,7 +90,7 @@ to run the code on top of ``Spark``. ``Fugue`` also has a ``DaskExecutionEngine` the default pandas-based ``ExecutionEngine``. Because the ``SparkExecutionEngine`` is used, the result becomes a ``Spark DataFrame``. -.. testcode:: scaling_pandera +.. testcode:: scaling_fugue :skipif: SKIP_SCALING from fugue import transform @@ -94,7 +99,7 @@ becomes a ``Spark DataFrame``. spark_df = transform(data, price_validation, schema="*", engine=SparkExecutionEngine) spark_df.show() -.. testoutput:: scaling_pandera +.. testoutput:: scaling_fugue :skipif: SKIP_SCALING +-----+-------------+-----+ @@ -118,7 +123,7 @@ price range for the records with ``state`` FL is lower than the range for the `` Two :class:`~pandera.schemas.DataFrameSchema` will be created to reflect this. Notice their ranges for the :class:`~pandera.checks.Check` differ. -.. testcode:: scaling_pandera +.. testcode:: scaling_fugue price_check_FL = DataFrameSchema({ "price": Column(int, Check.in_range(min_value=7,max_value=13)), @@ -139,7 +144,7 @@ To partition our data by ``state``, all we need to do is pass it into the ``tran through the ``partition`` argument. This splits up the data across different workers before they each run the ``price_validation`` function. Again, this is like a groupby-validation. -.. testcode:: scaling_pandera +.. testcode:: scaling_fugue :skipif: SKIP_SCALING def price_validation(df:pd.DataFrame) -> pd.DataFrame: @@ -156,7 +161,7 @@ each run the ``price_validation`` function. Again, this is like a groupby-valida spark_df.show() -.. testoutput:: scaling_pandera +.. testoutput:: scaling_fugue :skipif: SKIP_SCALING SparkDataFrame diff --git a/docs/source/index.rst b/docs/source/index.rst index 93afabd8e..c303439b5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,9 +1,9 @@ .. 
pandera documentation master file -Statistical Data Validation for Pandas -====================================== +A Statistical DataFrame Testing Toolkit +======================================= -*A data validation library for scientists, engineers, and analysts seeking +*A dataframe validation library for scientists, engineers, and analysts seeking correctness.* @@ -62,14 +62,16 @@ correctness.* | ``pandera`` provides a flexible and expressive API for performing data -validation on tidy (long-form) and wide data to make data processing pipelines -more readable and robust. +validation on dataframes to make data processing pipelines more readable and +robust. -`pandas `_ data structures contain information that -``pandera`` explicitly validates at runtime. This is useful in -production-critical data pipelines or reproducible research settings. With -``pandera``, you can: +Dataframes contain information that ``pandera`` explicitly validates at runtime. +This is useful in production-critical data pipelines or reproducible research +settings. With ``pandera``, you can: +#. Define a schema once and use it to validate :ref:`different dataframe types ` + including `pandas `_, `dask `_, + `modin `_, and `koalas `_. #. :ref:`Check` the types and properties of columns in a ``pd.DataFrame`` or values in a ``pd.Series``. #. Perform more complex statistical validation like @@ -80,6 +82,11 @@ production-critical data pipelines or reproducible research settings. With pydantic-style syntax and validate dataframes using the typing syntax. #. :ref:`Synthesize data` from schema objects for property-based testing with pandas data structures. +#. :ref:`Lazily Validate` dataframes so that all validation + rules are executed before raising an error. +#. :ref:`Integrate ` with a rich ecosystem of python tools like + `pydantic `_ and + `mypy `_. .. _installation: @@ -101,6 +108,11 @@ Installing optional functionality: pip install pandera[hypotheses] # hypothesis checks pip install pandera[io] # yaml/script schema io utilities pip install pandera[strategies] # data synthesis strategies + pip install pandera[dask] # validate dask dataframes + pip install pandera[koalas] # validate koalas dataframes + pip install pandera[modin] # validate modin dataframes + pip install pandera[modin-ray] # validate modin dataframes with ray + pip install pandera[modin-dask] # validate modin dataframes with dask pip install pandera[all] # all packages @@ -303,18 +315,18 @@ Submit issues, feature requests or bugfixes on :hidden: dataframe_schemas + schema_models series_schemas checks hypothesis dtypes decorators schema_inference - schema_models lazy_validation data_synthesis_strategies extensions third_party_schema - scaling + supported_libraries integrations .. toctree:: diff --git a/docs/source/integrations.rst b/docs/source/integrations.rst index 94dfe9d00..79c475f80 100644 --- a/docs/source/integrations.rst +++ b/docs/source/integrations.rst @@ -5,10 +5,11 @@ Integrations ============ - Pydantic -------- +*new in 0.8.0* + :class:`~pandera.model.SchemaModel` is fully compatible with `pydantic `_. @@ -53,3 +54,78 @@ Other pandera components are also compatible with pydantic: - :class:`~pandera.schema_components.MultiIndex` - :class:`~pandera.schema_components.Column` - :class:`~pandera.schema_components.Index` + + +Mypy +---- + +*new in 0.8.0* + +Pandera integrates with mypy out of the box to provide static type-linting of +dataframes, relying on `pandas-stubs `__ +for typing information. + +.. 
note::
+
+    Mypy static type-linting is only supported for pandas dataframes.
+
+In the example below, we define a few schemas to see how type-linting with
+pandera works.
+
+.. literalinclude:: ../../tests/core/static/pandas_dataframe.py
+   :lines: 8-27
+
+The mypy linter will complain if the output type of the function body doesn't
+match the function's return signature.
+
+.. literalinclude:: ../../tests/core/static/pandas_dataframe.py
+   :lines: 30-43
+
+It'll also complain if the input type doesn't match the expected input type.
+Note that we're using the :py:class:`pandera.typing.pandas.DataFrame` generic
+type to define dataframes that are validated against the
+:py:class:`~pandera.model.SchemaModel` type variable on initialization.
+
+.. literalinclude:: ../../tests/core/static/pandas_dataframe.py
+   :lines: 47-60
+
+
+To make mypy happy with respect to the return type, you can either initialize
+a dataframe of the expected type:
+
+.. literalinclude:: ../../tests/core/static/pandas_dataframe.py
+   :lines: 63-64
+
+.. note::
+    If you use the approach above with the :py:func:`~pandera.check_types`
+    decorator, pandera will do its best not to validate the dataframe twice
+    if it's already been initialized with the
+    ``DataFrame[Schema](**data)`` syntax.
+
+Or use :py:func:`typing.cast` to indicate to mypy that the return value of
+the function is of the correct type.
+
+.. literalinclude:: ../../tests/core/static/pandas_dataframe.py
+   :lines: 67-68
+
+
+Limitations
+^^^^^^^^^^^
+
+An important caveat to static type-linting with pandera dataframe types is
+that, since pandas dataframes are mutable objects, there's no way for
+``mypy`` to know whether a mutated instance of a
+:py:class:`~pandera.model.SchemaModel`-typed dataframe has the correct
+contents. Fortunately, we can simply rely on the :py:func:`~pandera.check_types`
+decorator to verify that the output dataframe is valid.
+
+Consider the examples below:
+
+.. literalinclude:: ../../tests/core/static/pandas_dataframe.py
+   :lines: 63-72
+
+Even though the outputs of these functions are incorrect, mypy doesn't catch
+the error during static type-linting but pandera will raise a
+:py:class:`~pandera.errors.SchemaError` or :py:class:`~pandera.errors.SchemaErrors`
+exception at runtime, depending on whether you're doing
+:ref:`lazy validation` or not.
diff --git a/docs/source/koalas.rst b/docs/source/koalas.rst
new file mode 100644
index 000000000..5c9e6787b
--- /dev/null
+++ b/docs/source/koalas.rst
@@ -0,0 +1,111 @@
+.. currentmodule:: pandera
+
+.. _scaling_koalas:
+
+Data Validation with Koalas
+===========================
+
+*new in 0.8.0*
+
+`Koalas `__ is a distributed
+compute framework that offers a pandas drop-in replacement dataframe
+implementation. You can use pandera to validate :py:func:`~databricks.koalas.DataFrame`
+and :py:func:`~databricks.koalas.Series` objects directly. First, install
+``pandera`` with the ``koalas`` extra:
+
+.. code:: bash
+
+    pip install pandera[koalas]
+
+
+Then you can use pandera schemas to validate koalas dataframes. In the example
+below we'll use the :ref:`class-based API ` to define a
+:py:class:`SchemaModel` for validation.
+
+.. 
testcode:: scaling_koalas
+
+    import databricks.koalas as ks
+    import pandas as pd
+    import pandera as pa
+
+    from pandera.typing.koalas import DataFrame, Series
+
+
+    class Schema(pa.SchemaModel):
+        state: Series[str]
+        city: Series[str]
+        price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})
+
+
+    # create a koalas dataframe that's validated on object initialization
+    df = DataFrame[Schema](
+        {
+            'state': ['FL','FL','FL','CA','CA','CA'],
+            'city': [
+                'Orlando',
+                'Miami',
+                'Tampa',
+                'San Francisco',
+                'Los Angeles',
+                'San Diego',
+            ],
+            'price': [8, 12, 10, 16, 20, 18],
+        }
+    )
+    print(df)
+
+
+.. testoutput:: scaling_koalas
+
+      state           city  price
+    0    FL        Orlando      8
+    1    FL          Miami     12
+    2    FL          Tampa     10
+    3    CA  San Francisco     16
+    4    CA    Los Angeles     20
+    5    CA      San Diego     18
+
+
+You can also use the :py:func:`~pandera.check_types` decorator to validate
+koalas dataframes at runtime:
+
+
+.. testcode:: scaling_koalas
+
+    @pa.check_types
+    def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
+        return df[df["state"] == "CA"]
+
+    print(function(df))
+
+
+.. testoutput:: scaling_koalas
+
+      state           city  price
+    3    CA  San Francisco     16
+    4    CA    Los Angeles     20
+    5    CA      San Diego     18
+
+
+And of course, you can use the object-based API to validate koalas dataframes:
+
+
+.. testcode:: scaling_koalas
+
+    schema = pa.DataFrameSchema({
+        "state": pa.Column(str),
+        "city": pa.Column(str),
+        "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20))
+    })
+    print(schema(df))
+
+
+.. testoutput:: scaling_koalas
+
+      state           city  price
+    0    FL        Orlando      8
+    1    FL          Miami     12
+    2    FL          Tampa     10
+    3    CA  San Francisco     16
+    4    CA    Los Angeles     20
+    5    CA      San Diego     18
diff --git a/docs/source/modin.rst b/docs/source/modin.rst
new file mode 100644
index 000000000..3d5b1bb1b
--- /dev/null
+++ b/docs/source/modin.rst
@@ -0,0 +1,113 @@
+.. currentmodule:: pandera
+
+.. _scaling_modin:
+
+Data Validation with Modin
+==========================
+
+*new in 0.8.0*
+
+`Modin `__ is a distributed
+compute framework that offers a pandas drop-in replacement dataframe
+implementation. You can use pandera to validate :py:func:`~modin.pandas.DataFrame`
+and :py:func:`~modin.pandas.Series` objects directly. First, install
+``pandera`` with the ``modin`` extra:
+
+.. code:: bash
+
+    pip install pandera[modin]  # installs both ray and dask backends
+    pip install pandera[modin-ray]  # only ray backend
+    pip install pandera[modin-dask]  # only dask backend
+
+
+Then you can use pandera schemas to validate modin dataframes. In the example
+below we'll use the :ref:`class-based API ` to define a
+:py:class:`SchemaModel` for validation.
+
+.. testcode:: scaling_modin
+
+    import modin.pandas as pd
+    import pandera as pa
+
+    from pandera.typing.modin import DataFrame, Series
+
+
+    class Schema(pa.SchemaModel):
+        state: Series[str]
+        city: Series[str]
+        price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})
+
+
+    # create a modin dataframe that's validated on object initialization
+    df = DataFrame[Schema](
+        {
+            'state': ['FL','FL','FL','CA','CA','CA'],
+            'city': [
+                'Orlando',
+                'Miami',
+                'Tampa',
+                'San Francisco',
+                'Los Angeles',
+                'San Diego',
+            ],
+            'price': [8, 12, 10, 16, 20, 18],
+        }
+    )
+    print(df)
+
+
+.. testoutput:: scaling_modin
+
+      state           city  price
+    0    FL        Orlando      8
+    1    FL          Miami     12
+    2    FL          Tampa     10
+    3    CA  San Francisco     16
+    4    CA    Los Angeles     20
+    5    CA      San Diego     18
+
+
+You can also use the :py:func:`~pandera.check_types` decorator to validate
+modin dataframes at runtime:
+
+
+.. 
testcode:: scaling_modin
+
+    @pa.check_types
+    def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
+        return df[df["state"] == "CA"]
+
+    print(function(df))
+
+
+.. testoutput:: scaling_modin
+
+      state           city  price
+    3    CA  San Francisco     16
+    4    CA    Los Angeles     20
+    5    CA      San Diego     18
+
+
+And of course, you can use the object-based API to validate modin dataframes:
+
+
+.. testcode:: scaling_modin
+
+    schema = pa.DataFrameSchema({
+        "state": pa.Column(str),
+        "city": pa.Column(str),
+        "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20))
+    })
+    print(schema(df))
+
+
+.. testoutput:: scaling_modin
+
+      state           city  price
+    0    FL        Orlando      8
+    1    FL          Miami     12
+    2    FL          Tampa     10
+    3    CA  San Francisco     16
+    4    CA    Los Angeles     20
+    5    CA      San Diego     18
diff --git a/docs/source/schema_models.rst b/docs/source/schema_models.rst
index fcbc56e30..5ad259a30 100644
--- a/docs/source/schema_models.rst
+++ b/docs/source/schema_models.rst
@@ -107,6 +107,52 @@ In the example above, this will simply be the string `"year"`.
        2  2003  365
 
 
+Validate on Initialization
+--------------------------
+
+*new in 0.8.0*
+
+Pandera provides an interface for validating dataframes on initialization.
+This API uses the :py:class:`pandera.typing.pandas.DataFrame` generic type
+to validate dataframes against the :py:class:`~pandera.model.SchemaModel`
+type variable on initialization:
+
+.. testcode:: validate_on_init
+
+    import pandas as pd
+    import pandera as pa
+
+    from pandera.typing import DataFrame, Series
+
+
+    class Schema(pa.SchemaModel):
+        state: Series[str]
+        city: Series[str]
+        price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})
+
+    df = DataFrame[Schema](
+        {
+            'state': ['NY','FL','GA','CA'],
+            'city': ['New York', 'Miami', 'Atlanta', 'San Francisco'],
+            'price': [8, 12, 10, 16],
+        }
+    )
+    print(df)
+
+
+.. testoutput:: validate_on_init
+
+      state           city  price
+    0    NY       New York      8
+    1    FL          Miami     12
+    2    GA        Atlanta     10
+    3    CA  San Francisco     16
+
+
+Refer to :ref:`supported-dataframe-libraries` to see how this syntax applies
+to other supported dataframe types.
+
+
 Converting to DataFrameSchema
 -----------------------------
 
@@ -134,7 +180,8 @@ You can easily convert a :class:`~pandera.model.SchemaModel` class into a
     ordered=False
 )>
 
-Or use the :meth:`~pandera.model.SchemaModel.validate` method to validate dataframes:
+You can also use the :meth:`~pandera.model.SchemaModel.validate` method to
+validate dataframes:
 
 .. testcode:: dataframe_schema_model
 
@@ -147,6 +194,21 @@ Or use the :meth:`~pandera.model.SchemaModel.validate` method to validate datafr
     1  2002      6  156
     2  2003     12  365
 
+Or you can use the :class:`~pandera.model.SchemaModel` class directly to
+validate dataframes, which is syntactic sugar that simply delegates to the
+:meth:`~pandera.model.SchemaModel.validate` method.
+
+.. testcode:: dataframe_schema_model
+
+    print(InputSchema(df))
+
+.. testoutput:: dataframe_schema_model
+
+       year  month  day
+    0  2001      3  200
+    1  2002      6  156
+    2  2003     12  365
+
 
 Excluded attributes
 -------------------
diff --git a/docs/source/supported_libraries.rst b/docs/source/supported_libraries.rst
new file mode 100644
index 000000000..60ae94f4c
--- /dev/null
+++ b/docs/source/supported_libraries.rst
@@ -0,0 +1,50 @@
+.. currentmodule:: pandera
+
+.. _supported-dataframe-libraries:
+
+Supported DataFrame Libraries (New)
+===================================
+
+Pandera started out as a pandas-specific dataframe validation library, and
+moving forward its core functionality will continue to support pandas. 
However,
+pandera's adoption has resulted in the realization that it can be a much more
+powerful tool by supporting other dataframe-like formats.
+
+Scaling Up Data Validation
+--------------------------
+
+Pandera provides multiple ways of scaling up data validation to dataframes
+that don't fit into memory. Fortunately, pandera doesn't have to re-invent
+the wheel. Standing on the shoulders of giants, it integrates with the
+existing ecosystem of libraries that allow you to perform validations on
+out-of-memory dataframes.
+
+.. list-table::
+   :widths: 25 75
+
+   * - :ref:`Dask `
+     - Apply pandera schemas to Dask dataframe partitions.
+   * - :ref:`Fugue `
+     - Apply pandera schemas to distributed dataframe partitions with Fugue.
+   * - :ref:`Koalas `
+     - A pandas drop-in replacement, distributed using a Spark backend.
+   * - :ref:`Modin `
+     - A pandas drop-in replacement, distributed using a Ray or Dask backend.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Introduction
+   :hidden:
+
+   Dask 
+   Fugue 
+   Koalas 
+   Modin 
+
+.. note::
+
+   Don't see a library that you want supported? Check out the
+   `github issues `__ to see if
+   that library is in the roadmap. If it isn't, open up a
+   `new issue `__
+   to add support for it!
diff --git a/docs/source/third_party_schema.rst b/docs/source/third_party_schema.rst
index 233604435..e6d54b0ec 100644
--- a/docs/source/third_party_schema.rst
+++ b/docs/source/third_party_schema.rst
@@ -4,8 +4,8 @@
 
 .. _third_party_schema:
 
-Reading Third-Party Schema (new)
-================================
+Reading Third-Party Schema
+==========================
 
 *new in 0.7.0*
 
diff --git a/pandera/external_config.py b/pandera/external_config.py
index b28aa45ca..699b0c9fb 100644
--- a/pandera/external_config.py
+++ b/pandera/external_config.py
@@ -9,9 +9,13 @@
     # Series and DataFrames to support type hinting:
     # https://koalas.readthedocs.io/en/latest/user_guide/typehints.html#type-hinting-with-names
     # pylint: disable=unused-import
-    import databricks.koalas as ks
-
     if os.getenv("SPARK_LOCAL_IP") is None:
         os.environ["SPARK_LOCAL_IP"] = "127.0.0.1"
+    if os.getenv("PYARROW_IGNORE_TIMEZONE") is None:
+        # This can be overridden by the user
+        os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
+
+    import databricks.koalas as ks
 except ImportError:
-    pass
+    os.environ.pop("SPARK_LOCAL_IP", None)
+    os.environ.pop("PYARROW_IGNORE_TIMEZONE", None)
diff --git a/pandera/io.py b/pandera/io.py
index 15d9c3fde..07ade9546 100644
--- a/pandera/io.py
+++ b/pandera/io.py
@@ -252,7 +252,6 @@ def _deserialize_schema(serialized_schema):
         index = MultiIndex(
             indexes=[Index(**index_properties) for index_properties in index]
         )
-
     return DataFrameSchema(
         columns=columns,
         checks=checks,
diff --git a/pandera/model.py b/pandera/model.py
index b937c9d7c..1816f0214 100644
--- a/pandera/model.py
+++ b/pandera/model.py
@@ -34,7 +34,8 @@
     FieldInfo,
 )
 from .schemas import DataFrameSchema
-from .typing import AnnotationInfo, DataFrame, Index, Series
+from .typing import INDEX_TYPES, SERIES_TYPES, AnnotationInfo
+from .typing.common import DataFrameBase
 
 if sys.version_info[:2] < (3, 9):
     from typing_extensions import get_type_hints
@@ -173,8 +174,11 @@ class SchemaModel(metaclass=_MetaSchema):
     __checks__: Dict[str, List[Check]] = {}
     __dataframe_checks__: List[Check] = []
 
-    def __new__(cls, *args, **kwargs):
-        raise TypeError(f"{cls.__name__} may not be instantiated.")
+    # This is syntactic sugar that delegates to the validate method
+    @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)
+    def __new__(cls, *args, **kwargs) -> DataFrameBase[TSchemaModel]:  # 
type: ignore [misc] + """%(validate_doc)s""" + return cast(DataFrameBase[TSchemaModel], cls.validate(*args, **kwargs)) def __init_subclass__(cls, **kwargs): """Ensure :class:`~pandera.model_components.FieldInfo` instances.""" @@ -250,10 +254,13 @@ def validate( random_state: Optional[int] = None, lazy: bool = False, inplace: bool = False, - ) -> DataFrame[TSchemaModel]: + ) -> DataFrameBase[TSchemaModel]: """%(validate_doc)s""" - return cls.to_schema().validate( - check_obj, head, tail, sample, random_state, lazy, inplace + return cast( + DataFrameBase[TSchemaModel], + cls.to_schema().validate( + check_obj, head, tail, sample, random_state, lazy, inplace + ), ) @classmethod @@ -261,7 +268,7 @@ def validate( @st.strategy_import_error def strategy( cls: Type[TSchemaModel], *, size: Optional[int] = None - ) -> DataFrame[TSchemaModel]: + ) -> DataFrameBase[TSchemaModel]: """%(strategy_doc)s""" return cls.to_schema().strategy(size=size) @@ -270,9 +277,11 @@ def strategy( @st.strategy_import_error def example( cls: Type[TSchemaModel], *, size: Optional[int] = None - ) -> DataFrame[TSchemaModel]: + ) -> DataFrameBase[TSchemaModel]: """%(example_doc)s""" - return cls.to_schema().example(size=size) + return cast( + DataFrameBase[TSchemaModel], cls.to_schema().example(size=size) + ) @classmethod def _build_columns_index( # pylint:disable=too-many-locals @@ -285,7 +294,8 @@ def _build_columns_index( # pylint:disable=too-many-locals Optional[Union[schema_components.Index, schema_components.MultiIndex]], ]: index_count = sum( - annotation.origin is Index for annotation, _ in fields.values() + annotation.origin in INDEX_TYPES + for annotation, _ in fields.values() ) columns: Dict[str, schema_components.Column] = {} @@ -310,8 +320,8 @@ def _build_columns_index( # pylint:disable=too-many-locals dtype = None if dtype is Any else dtype if ( - annotation.origin is Series - or annotation.raw_annotation is Series + annotation.origin in SERIES_TYPES + or annotation.raw_annotation in SERIES_TYPES ): col_constructor = ( field.to_column if field else schema_components.Column @@ -329,8 +339,8 @@ def _build_columns_index( # pylint:disable=too-many-locals name=field_name, ) elif ( - annotation.origin is Index - or annotation.raw_annotation is Index + annotation.origin in INDEX_TYPES + or annotation.raw_annotation in INDEX_TYPES ): if annotation.optional: raise SchemaInitError( diff --git a/pandera/typing/__init__.py b/pandera/typing/__init__.py new file mode 100644 index 000000000..1090ee02a --- /dev/null +++ b/pandera/typing/__init__.py @@ -0,0 +1,58 @@ +"""Typing module. + +For backwards compatibility, pandas types are exposed to the top-level scope of +the typing module. +""" + +from . 
import dask, koalas, modin +from .common import ( + INT8, + INT16, + INT32, + INT64, + STRING, + UINT8, + UINT16, + UINT32, + UINT64, + AnnotationInfo, + Bool, + Category, + DateTime, + Float, + Float16, + Float32, + Float64, + Int, + Int8, + Int16, + Int32, + Int64, + Object, + String, + Timedelta, + UInt8, + UInt16, + UInt32, + UInt64, +) +from .pandas import DataFrame, Index, Series + +DATAFRAME_TYPES = {DataFrame} +SERIES_TYPES = {Series} +INDEX_TYPES = {Index} + +if dask.DASK_INSTALLED: + DATAFRAME_TYPES.update({dask.DataFrame}) + SERIES_TYPES.update({dask.Series}) + INDEX_TYPES.update({dask.Index}) + +if modin.MODIN_INSTALLED: + DATAFRAME_TYPES.update({modin.DataFrame}) + SERIES_TYPES.update({modin.Series}) + INDEX_TYPES.update({modin.Index}) + +if koalas.KOALAS_INSTALLED: + DATAFRAME_TYPES.update({koalas.DataFrame}) + SERIES_TYPES.update({koalas.Series}) + INDEX_TYPES.update({koalas.Index}) diff --git a/pandera/typing.py b/pandera/typing/common.py similarity index 59% rename from pandera/typing.py rename to pandera/typing/common.py index 72da6031b..99861ddb0 100644 --- a/pandera/typing.py +++ b/pandera/typing/common.py @@ -1,32 +1,14 @@ -"""Typing definitions and helpers.""" +"""Common typing functionality.""" # pylint:disable=abstract-method,disable=too-many-ancestors + import inspect from typing import TYPE_CHECKING, Any, Generic, Type, TypeVar import pandas as pd import typing_inspect -from . import dtypes -from .engines import numpy_engine, pandas_engine -from .errors import SchemaError, SchemaInitError - -try: - from typing import _GenericAlias # type: ignore[attr-defined] -except ImportError: # pragma: no cover - _GenericAlias = None - - -try: - from pydantic.fields import ModelField -except ImportError: - ModelField = Any # type: ignore - -try: - import dask.dataframe as dd - - _DASK_INSTALLED = True -except ImportError: - _DASK_INSTALLED = False +from .. import dtypes +from ..engines import numpy_engine, pandas_engine Bool = dtypes.Bool #: ``"bool"`` numpy dtype DateTime = dtypes.DateTime #: ``"datetime64[ns]"`` numpy dtype @@ -99,35 +81,6 @@ Schema = TypeVar("Schema", bound="SchemaModel") # type: ignore -# pylint:disable=too-few-public-methods -class Index(pd.Index, Generic[GenericDtype]): - """Representation of pandas.Index, only used for type annotation. - - *new in 0.5.0* - """ - - -# pylint:disable=too-few-public-methods -class Series(pd.Series, Generic[GenericDtype]): # type: ignore - """Representation of pandas.Series, only used for type annotation. - - *new in 0.5.0* - """ - - if hasattr(pd.Series, "__class_getitem__") and _GenericAlias: - - def __class_getitem__(cls, item): - """Define this to override the patch that koalas performs on pandas. - https://github.com/databricks/koalas/blob/master/databricks/koalas/__init__.py#L207-L223 - """ - return _GenericAlias(cls, item) - - def __get__( - self, instance: object, owner: Type - ) -> str: # pragma: no cover - raise AttributeError("Series should resolve to Field-s") - - # pylint:disable=invalid-name if TYPE_CHECKING: T = TypeVar("T") # pragma: no cover @@ -135,16 +88,19 @@ def __get__( T = Schema -class DataFrameBase(pd.DataFrame): +class DataFrameBase(Generic[T]): + # pylint: disable=too-few-public-methods """ - Pandera pandas.Dataframe base class for validating dataframes on + Pandera Dataframe base class for validating dataframes on initialization. 
""" def __setattr__(self, name: str, value: Any) -> None: + # pylint: disable=no-member object.__setattr__(self, name, value) if name == "__orig_class__": - class_args = getattr(self.__orig_class__, "__args__", None) + orig_class = getattr(self, "__orig_class__") + class_args = getattr(orig_class, "__args__", None) if any( x.__name__ == "SchemaModel" for x in inspect.getmro(class_args[0]) @@ -153,73 +109,32 @@ def __setattr__(self, name: str, value: Any) -> None: # prevent the double validation problem by preventing checks for # dataframes with a defined pandera.schema + pandera = getattr(self, "pandera") if ( - self.pandera.schema is None - or self.pandera.schema != schema_model.to_schema() + pandera.schema is None + or pandera.schema != schema_model.to_schema() ): # pylint: disable=self-cls-assignment self = schema_model.validate(self) - self.pandera.add_schema(schema_model.to_schema()) + pandera.add_schema(schema_model.to_schema()) # pylint:disable=too-few-public-methods -class DataFrame(Generic[T], DataFrameBase): - """ - Representation of pandas.DataFrame, only used for type annotation. - - *new in 0.5.0* - """ +class SeriesBase(Generic[GenericDtype]): + """Pandera Series base class to use for all pandas-like APIs.""" - if hasattr(pd.Series, "__class_getitem__") and _GenericAlias: - - def __class_getitem__(cls, item): - """Define this to override the patch that koalas performs on pandas. - https://github.com/databricks/koalas/blob/master/databricks/koalas/__init__.py#L207-L223 - """ - return _GenericAlias(cls, item) - - @classmethod - def __get_validators__(cls): - yield cls._pydantic_validate - - @classmethod - def _pydantic_validate( - cls, df: pd.DataFrame, field: ModelField - ) -> pd.DataFrame: - """Verify that the input is a pandas dataframe that meets all - schema requirements.""" - if not isinstance(df, pd.DataFrame): - raise TypeError("Expected a pandas DataFrame") - - if not field.sub_fields: - raise TypeError( - "Expected a typed pandera.typing.DataFrame," - " e.g. DataFrame[Schema]" - ) - schema_model = field.sub_fields[0].type_ - try: - schema = schema_model.to_schema() - except SchemaInitError as exc: - raise ValueError( - f"Cannot use {cls.__name__} as a pydantic type as its " - "SchemaModel cannot be converted to a DataFrameSchema.\n" - f"Please revisit the model to address the following errors:" - f"\n{exc}" - ) from exc + def __get__( + self, instance: object, owner: Type + ) -> str: # pragma: no cover + raise AttributeError("Series should resolve to Field-s") - try: - return schema.validate(df) - except SchemaError as exc: - raise ValueError(str(exc)) from exc +# pylint:disable=too-few-public-methods +class IndexBase(Generic[GenericDtype]): + """Representation of pandas.Index, only used for type annotation. -if _DASK_INSTALLED: - # pylint:disable=too-few-public-methods - class DaskDataFrame(dd.DataFrame, Generic[T]): - """ - Representation of dask.dataframe.DataFrame, only used for type - annotation. - """ + *new in 0.5.0* + """ class AnnotationInfo: # pylint:disable=too-few-public-methods @@ -227,7 +142,8 @@ class AnnotationInfo: # pylint:disable=too-few-public-methods Attributes: origin: The non-parameterized generic class. - arg: The first generic type (SchemaModel does not support more than 1 argument). + arg: The first generic type (SchemaModel does not support more than + 1 argument). literal: Whether the annotation is a literal. optional: Whether the annotation is optional. raw_annotation: The raw annotation. 

@@ -239,16 +155,11 @@ def __init__(self, raw_annotation: Type) -> None:
 
     @property
     def is_generic_df(self) -> bool:
-        """True if the annotation is a pandera.typing.DataFrame or
-        pandera.typing.DaskDataFrame.
-        """
+        """True if the annotation is a DataFrameBase subclass."""
         try:
             if self.origin is None:
                 return False
-            if _DASK_INSTALLED:
-                return issubclass(self.origin, (DataFrame, DaskDataFrame))
-            else:
-                return issubclass(self.origin, DataFrame)
+            return issubclass(self.origin, DataFrameBase)
         except TypeError:
             return False
diff --git a/pandera/typing/dask.py b/pandera/typing/dask.py
new file mode 100644
index 000000000..ae807c036
--- /dev/null
+++ b/pandera/typing/dask.py
@@ -0,0 +1,67 @@
+"""Pandera type annotations for Dask."""
+
+import inspect
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
+
+from .common import DataFrameBase, IndexBase, SeriesBase
+from .pandas import GenericDtype, Schema
+
+try:
+    import dask.dataframe as dd
+
+    DASK_INSTALLED = True
+except ImportError:
+    DASK_INSTALLED = False
+
+
+# pylint:disable=invalid-name
+if TYPE_CHECKING:
+    T = TypeVar("T")  # pragma: no cover
+else:
+    T = Schema
+
+
+if DASK_INSTALLED:
+
+    # pylint: disable=too-few-public-methods
+    class DataFrame(DataFrameBase, dd.DataFrame, Generic[T]):
+        """
+        Representation of dask.dataframe.DataFrame, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
+
+        def __setattr__(self, name: str, value: Any) -> None:
+            object.__setattr__(self, name, value)
+            if name == "__orig_class__":
+                class_args = getattr(self.__orig_class__, "__args__", None)
+                if any(
+                    x.__name__ == "SchemaModel"
+                    for x in inspect.getmro(class_args[0])
+                ):
+                    schema_model = value.__args__[0]
+
+                    # prevent the double validation problem by preventing
+                    # checks for dataframes with a defined pandera.schema
+                    if (
+                        self.pandera.schema is None
+                        or self.pandera.schema != schema_model.to_schema()
+                    ):
+                        # pylint: disable=self-cls-assignment
+                        self.__dict__ = schema_model.validate(self).__dict__
+                        self.pandera.add_schema(schema_model.to_schema())
+
+    # pylint:disable=too-few-public-methods
+    class Series(SeriesBase, dd.Series, Generic[GenericDtype]):  # type: ignore
+        """Representation of dask.dataframe.Series, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
+
+    # pylint:disable=too-few-public-methods
+    class Index(IndexBase, dd.Index, Generic[GenericDtype]):
+        """Representation of dask.dataframe.Index, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
diff --git a/pandera/typing/koalas.py b/pandera/typing/koalas.py
new file mode 100644
index 000000000..052338f7f
--- /dev/null
+++ b/pandera/typing/koalas.py
@@ -0,0 +1,54 @@
+"""Pandera type annotations for Koalas."""
+
+from typing import TYPE_CHECKING, Generic, TypeVar
+
+from .common import DataFrameBase, IndexBase, SeriesBase
+from .pandas import GenericDtype, Schema, _GenericAlias
+
+try:
+    import databricks.koalas as ks
+
+    KOALAS_INSTALLED = True
+except ImportError:
+    KOALAS_INSTALLED = False
+
+
+# pylint:disable=invalid-name
+if TYPE_CHECKING:
+    T = TypeVar("T")  # pragma: no cover
+else:
+    T = Schema
+
+
+if KOALAS_INSTALLED:
+
+    # pylint: disable=too-few-public-methods
+    class DataFrame(DataFrameBase, ks.DataFrame, Generic[T]):
+        """
+        Representation of databricks.koalas.DataFrame, only used for type
+        annotation. 

+
+        *new in 0.8.0*
+        """
+
+        def __class_getitem__(cls, item):
+            """Define this to override koalas' generic type."""
+            return _GenericAlias(cls, item)
+
+    # pylint:disable=too-few-public-methods
+    class Series(SeriesBase, ks.Series, Generic[GenericDtype]):
+        """Representation of databricks.koalas.Series, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
+
+        def __class_getitem__(cls, item):
+            """Define this to override koalas' generic type."""
+            return _GenericAlias(cls, item)
+
+    # pylint:disable=too-few-public-methods
+    class Index(IndexBase, ks.Index, Generic[GenericDtype]):
+        """Representation of databricks.koalas.Index, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
diff --git a/pandera/typing/modin.py b/pandera/typing/modin.py
new file mode 100644
index 000000000..c69782f21
--- /dev/null
+++ b/pandera/typing/modin.py
@@ -0,0 +1,46 @@
+"""Pandera type annotations for Modin."""
+
+from typing import TYPE_CHECKING, Generic, TypeVar
+
+from .common import DataFrameBase, IndexBase, SeriesBase
+from .pandas import GenericDtype, Schema
+
+try:
+    import modin.pandas as mpd
+
+    MODIN_INSTALLED = True
+except ImportError:
+    MODIN_INSTALLED = False
+
+
+# pylint:disable=invalid-name
+if TYPE_CHECKING:
+    T = TypeVar("T")  # pragma: no cover
+else:
+    T = Schema
+
+
+if MODIN_INSTALLED:
+
+    # pylint: disable=too-few-public-methods
+    class DataFrame(DataFrameBase, mpd.DataFrame, Generic[T]):
+        """
+        Representation of modin.pandas.DataFrame, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
+
+    # pylint:disable=too-few-public-methods
+    class Series(SeriesBase, mpd.Series, Generic[GenericDtype]):
+        """Representation of modin.pandas.Series, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
+
+    # pylint:disable=too-few-public-methods
+    class Index(IndexBase, mpd.Index, Generic[GenericDtype]):
+        """Representation of modin.pandas.Index, only used for type
+        annotation.
+
+        *new in 0.8.0*
+        """
diff --git a/pandera/typing/pandas.py b/pandera/typing/pandas.py
new file mode 100644
index 000000000..37c0824b5
--- /dev/null
+++ b/pandera/typing/pandas.py
@@ -0,0 +1,101 @@
+"""Typing definitions and helpers."""
+# pylint:disable=abstract-method,disable=too-many-ancestors
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
+
+import pandas as pd
+
+from ..errors import SchemaError, SchemaInitError
+from .common import DataFrameBase, GenericDtype, IndexBase, Schema, SeriesBase
+
+try:
+    from typing import _GenericAlias  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    _GenericAlias = None
+
+
+try:
+    from pydantic.fields import ModelField
+except ImportError:
+    ModelField = Any  # type: ignore
+
+
+# pylint:disable=too-few-public-methods
+class Index(IndexBase, pd.Index, Generic[GenericDtype]):
+    """Representation of pandas.Index, only used for type annotation.
+
+    *new in 0.5.0*
+    """
+
+
+# pylint:disable=too-few-public-methods
+class Series(SeriesBase, pd.Series, Generic[GenericDtype]):  # type: ignore
+    """Representation of pandas.Series, only used for type annotation.
+
+    *new in 0.5.0*
+    """
+
+    if hasattr(pd.Series, "__class_getitem__") and _GenericAlias:
+
+        def __class_getitem__(cls, item):
+            """Define this to override the patch that koalas performs on pandas. 
+ https://github.com/databricks/koalas/blob/master/databricks/koalas/__init__.py#L207-L223 + """ + return _GenericAlias(cls, item) + + +# pylint:disable=invalid-name +if TYPE_CHECKING: + T = TypeVar("T") # pragma: no cover +else: + T = Schema + + +# pylint:disable=too-few-public-methods +class DataFrame(DataFrameBase, pd.DataFrame, Generic[T]): + """ + Representation of pandas.DataFrame, only used for type annotation. + + *new in 0.5.0* + """ + + if hasattr(pd.DataFrame, "__class_getitem__") and _GenericAlias: + + def __class_getitem__(cls, item): + """Define this to override the patch that koalas performs on pandas. + https://github.com/databricks/koalas/blob/master/databricks/koalas/__init__.py#L207-L223 + """ + return _GenericAlias(cls, item) + + @classmethod + def __get_validators__(cls): + yield cls._pydantic_validate + + @classmethod + def _pydantic_validate( + cls, df: pd.DataFrame, field: ModelField + ) -> pd.DataFrame: + """Verify that the input is a pandas dataframe that meets all + schema requirements.""" + if not isinstance(df, pd.DataFrame): + raise TypeError("Expected a pandas DataFrame") + + if not field.sub_fields: + raise TypeError( + "Expected a typed pandera.typing.DataFrame," + " e.g. DataFrame[Schema]" + ) + schema_model = field.sub_fields[0].type_ + try: + schema = schema_model.to_schema() + except SchemaInitError as exc: + raise ValueError( + f"Cannot use {cls.__name__} as a pydantic type as its " + "SchemaModel cannot be converted to a DataFrameSchema.\n" + f"Please revisit the model to address the following errors:" + f"\n{exc}" + ) from exc + + try: + return schema.validate(df) + except SchemaError as exc: + raise ValueError(str(exc)) from exc diff --git a/tests/core/static/pandas_dataframe.py b/tests/core/static/pandas_dataframe.py index 58b718395..2d320851a 100644 --- a/tests/core/static/pandas_dataframe.py +++ b/tests/core/static/pandas_dataframe.py @@ -27,37 +27,61 @@ class AnotherSchema(pa.SchemaModel): first_name: Series[str] -pd_df = pd.DataFrame({"id": [1], "name": ["foo"]}) -valid_df = DataFrame[Schema]({"id": [1], "name": ["foo"]}) -another_df = DataFrame[AnotherSchema]({"id": [1], "first_name": ["foo"]}) - - def fn(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: - return df.assign(age=30).pipe(DataFrame[SchemaOut]) + return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay def fn_pipe_incorrect_type(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: return df.assign(age=30).pipe(DataFrame[AnotherSchema]) # mypy error + # error: Argument 1 to "pipe" of "NDFrame" has incompatible type "Type[DataFrame[Any]]"; # noqa + # expected "Union[Callable[..., DataFrame[SchemaOut]], Tuple[Callable[..., DataFrame[SchemaOut]], str]]" [arg-type] # noqa def fn_assign_copy(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: return df.assign(age=30) # mypy error + # error: Incompatible return value type (got "pandas.core.frame.DataFrame", + # expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value] + + +# Define a few dataframe objects +schema_df = DataFrame[Schema]({"id": [1], "name": ["foo"]}) +pandas_df = pd.DataFrame({"id": [1], "name": ["foo"]}) +another_df = DataFrame[AnotherSchema]({"id": [1], "first_name": ["foo"]}) + +fn(schema_df) # mypy okay + +fn(pandas_df) # mypy error +# error: Argument 1 to "fn" has incompatible type "pandas.core.frame.DataFrame"; # noqa +# expected "pandera.typing.pandas.DataFrame[Schema]" [arg-type] -fn(valid_df) -fn(pd_df) # mypy error fn(another_df) # mypy error +# error: Argument 1 to "fn" has incompatible type 
"DataFrame[AnotherSchema]"; +# expected "DataFrame[Schema]" [arg-type] + +def fn_pipe_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: + return df.assign(age=30).pipe(DataFrame[SchemaOut]) # mypy okay + +def fn_cast_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: + return cast(DataFrame[SchemaOut], df.assign(age=30)) # mypy okay + + +@pa.check_types def fn_mutate_inplace(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: out = df.assign(age=30).pipe(DataFrame[SchemaOut]) out.drop(["age"], axis=1, inplace=True) return out # okay for mypy, pandera raises error +@pa.check_types def fn_assign_and_get_index(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: return df.assign(foo=30).iloc[:3] # okay for mypy, pandera raises error -def fn_cast_dataframe(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: - return cast(DataFrame[SchemaOut], df) # okay for mypy +@pa.check_types +def fn_cast_dataframe_invalid(df: DataFrame[Schema]) -> DataFrame[SchemaOut]: + return cast( + DataFrame[SchemaOut], df + ) # okay for mypy, pandera raises error # noqa diff --git a/tests/core/test_model.py b/tests/core/test_model.py index d93303dd3..315aa9b37 100644 --- a/tests/core/test_model.py +++ b/tests/core/test_model.py @@ -12,8 +12,11 @@ from pandera.typing import DataFrame, Index, Series, String -def test_to_schema() -> None: - """Test that SchemaModel.to_schema() can produce the correct schema.""" +def test_to_schema_and_validate() -> None: + """ + Test that SchemaModel.to_schema() can produce the correct schema and + can validate dataframe objects. + """ class Schema(pa.SchemaModel): a: Series[int] @@ -27,8 +30,9 @@ class Schema(pa.SchemaModel): ) assert expected == Schema.to_schema() - with pytest.raises(TypeError): - Schema() + Schema(pd.DataFrame({"a": [1], "b": ["foo"], "c": [3.4]}, index=["1"])) + with pytest.raises(pa.errors.SchemaError): + Schema(pd.DataFrame({"a": [1]})) def test_empty_schema() -> None: diff --git a/tests/core/test_static_type_checking.py b/tests/core/test_static_type_checking.py index 28ef13dc2..20fe64226 100644 --- a/tests/core/test_static_type_checking.py +++ b/tests/core/test_static_type_checking.py @@ -51,7 +51,7 @@ def test_mypy_pandas_dataframe(capfd) -> None: ) errors = _get_mypy_errors(capfd.readouterr().out) # assert error messages on particular lines of code - assert errors[40] == { + assert errors[35] == { "msg": ( 'Argument 1 to "pipe" of "NDFrame" has incompatible type ' '"Type[DataFrame[Any]]"; expected ' @@ -60,23 +60,23 @@ def test_mypy_pandas_dataframe(capfd) -> None: ), "errcode": "arg-type", } - assert errors[44] == { + assert errors[41] == { "msg": ( "Incompatible return value type (got " '"pandas.core.frame.DataFrame", expected ' - '"pandera.typing.DataFrame[SchemaOut]")' + '"pandera.typing.pandas.DataFrame[SchemaOut]")' ), "errcode": "return-value", } - assert errors[48] == { + assert errors[54] == { "msg": ( 'Argument 1 to "fn" has incompatible type ' '"pandas.core.frame.DataFrame"; expected ' - '"pandera.typing.DataFrame[Schema]"' + '"pandera.typing.pandas.DataFrame[Schema]"' ), "errcode": "arg-type", } - assert errors[49] == { + assert errors[58] == { "msg": ( 'Argument 1 to "fn" has incompatible type ' '"DataFrame[AnotherSchema]"; expected "DataFrame[Schema]"' @@ -90,6 +90,7 @@ def test_mypy_pandas_dataframe(capfd) -> None: [ pandas_dataframe.fn_mutate_inplace, pandas_dataframe.fn_assign_and_get_index, + pandas_dataframe.fn_cast_dataframe_invalid, ], ) def test_pandera_runtime_errors(fn) -> None: @@ -97,6 +98,6 @@ def test_pandera_runtime_errors(fn) -> 
None:
 
     # both functions don't add a required column "age"
     try:
-        pa.check_types(fn)(pandas_dataframe.valid_df)
+        fn(pandas_dataframe.schema_df)
     except pa.errors.SchemaError as e:
         assert e.failure_cases["failure_case"].item() == "age"
diff --git a/tests/core/test_typing.py b/tests/core/test_typing.py
index 652871f4f..585be8cb1 100644
--- a/tests/core/test_typing.py
+++ b/tests/core/test_typing.py
@@ -9,7 +9,7 @@
 
 import pandera as pa
 from pandera.dtypes import DataType
-from pandera.typing import Series
+from pandera.typing import DataFrame, Index, Series
 
 try:  # python 3.9+
     from typing import Annotated  # type: ignore
@@ -456,3 +456,33 @@ def test_new_pandas_extension_dtype_class(
 ):
     """Test type annotations with the new nullable pandas dtypes."""
     _test_default_annotated_dtype(model, dtype, has_mandatory_args)
+
+
+class InitSchema(pa.SchemaModel):
+    col1: Series[int]
+    col2: Series[float]
+    col3: Series[str]
+    index: Index[int]
+
+
+def test_init_pandas_dataframe():
+    """Test initialization of pandera.typing.DataFrame with Schema."""
+    assert isinstance(
+        DataFrame[InitSchema]({"col1": [1], "col2": [1.0], "col3": ["1"]}),
+        DataFrame,
+    )
+
+
+@pytest.mark.parametrize(
+    "invalid_data",
+    [
+        {"col1": [1.0], "col2": [1.0], "col3": ["1"]},
+        {"col1": [1], "col2": [1], "col3": ["1"]},
+        {"col1": [1], "col2": [1.0], "col3": [1]},
+        {"col1": [1]},
+    ],
+)
+def test_init_pandas_dataframe_errors(invalid_data):
+    """Test errors from initializing a pandera.typing.DataFrame with Schema."""
+    with pytest.raises(pa.errors.SchemaError):
+        DataFrame[InitSchema](invalid_data)
diff --git a/tests/dask/test_dask.py b/tests/dask/test_dask.py
index 696376345..859834c3f 100644
--- a/tests/dask/test_dask.py
+++ b/tests/dask/test_dask.py
@@ -6,7 +6,7 @@
 import pytest
 
 import pandera as pa
-from pandera.typing import DaskDataFrame, Series
+from pandera.typing.dask import DataFrame, Index, Series
 
 
 class IntSchema(pa.SchemaModel):  # pylint: disable=missing-class-docstring
@@ -25,17 +25,17 @@ def test_model_validation() -> None:
     ddf = dd.from_pandas(df, npartitions=1)
 
     ddf = StrSchema.validate(ddf)
-    pd.testing.assert_frame_equal(df, ddf.compute())
+    pd.testing.assert_frame_equal(df, ddf.compute())  # type: ignore [attr-defined]
 
     ddf = IntSchema.validate(ddf)
 
     with pytest.raises(pa.errors.SchemaError):
-        ddf.compute()
+        ddf.compute()  # type: ignore [attr-defined]
 
     IntSchema.validate(ddf, inplace=True)
 
     with pytest.raises(pa.errors.SchemaError):
-        ddf.compute()
+        ddf.compute()  # type: ignore [attr-defined]
 
 
 def test_dataframe_schema() -> None:
@@ -91,11 +91,11 @@ def test_decorator() -> None:
     """Test that pandera check_types decorator works with Dask DataFrames."""
 
     @pa.check_types
-    def str_func(x: DaskDataFrame[StrSchema]) -> DaskDataFrame[StrSchema]:
+    def str_func(x: DataFrame[StrSchema]) -> DataFrame[StrSchema]:
         return x
 
     @pa.check_types
-    def int_func(x: DaskDataFrame[IntSchema]) -> DaskDataFrame[IntSchema]:
+    def int_func(x: DataFrame[IntSchema]) -> DataFrame[IntSchema]:
         return x
 
     df = pd.DataFrame({"col": ["1"]})
@@ -106,3 +106,42 @@ def int_func(x: DaskDataFrame[IntSchema]) -> DaskDataFrame[IntSchema]:
 
     with pytest.raises(pa.errors.SchemaError):
         print(result.compute())
+
+
+class InitSchema(pa.SchemaModel):
+    """Schema used to test dataframe initialization."""
+
+    col1: Series[int]
+    col2: Series[float]
+    col3: Series[str]
+    index: Index[int]
+
+
+def test_init_dask_dataframe():
+    """Test initialization of pandera.typing.dask.DataFrame with Schema."""
+    ddf = dd.from_pandas(
+        pd.DataFrame({"col1": [1], "col2": [1.0], 
"col3": ["1"]}), + npartitions=2, + ) + assert isinstance( + DataFrame[InitSchema](ddf.dask, ddf._name, ddf._meta, ddf.divisions), + DataFrame, + ) + + +@pytest.mark.parametrize( + "invalid_data", + [ + {"col1": [1.0], "col2": [1.0], "col3": ["1"]}, + {"col1": [1], "col2": [1], "col3": ["1"]}, + {"col1": [1], "col2": [1.0], "col3": [1]}, + {"col1": [1]}, + ], +) +def test_init_pandas_dataframe_errors(invalid_data): + """Test errors from initializing a pandas.typing.DataFrame with Schema.""" + ddf = dd.from_pandas(pd.DataFrame(invalid_data), npartitions=2) + with pytest.raises(pa.errors.SchemaError): + DataFrame[InitSchema]( + ddf.dask, ddf._name, ddf._meta, ddf.divisions + ).compute() diff --git a/tests/koalas/test_schemas_on_koalas.py b/tests/koalas/test_schemas_on_koalas.py index d6d15cd5d..5d529c0ee 100644 --- a/tests/koalas/test_schemas_on_koalas.py +++ b/tests/koalas/test_schemas_on_koalas.py @@ -10,6 +10,7 @@ import pandera as pa from pandera import dtypes, extensions, system from pandera.engines import numpy_engine, pandas_engine +from pandera.typing import DataFrame, Index, Series from tests.strategies.test_strategies import NULLABLE_DTYPES from tests.strategies.test_strategies import ( UNSUPPORTED_DTYPE_CLS as UNSUPPORTED_STRATEGY_DTYPE_CLS, @@ -430,9 +431,11 @@ def test_schema_model(): # pylint: disable=too-few-public-methods class Schema(pa.SchemaModel): - int_field: pa.typing.Series[int] = pa.Field(gt=0) - float_field: pa.typing.Series[float] = pa.Field(lt=0) - str_field: pa.typing.Series[str] = pa.Field(isin=["a", "b", "c"]) + int_field: pa.typing.koalas.Series[int] = pa.Field(gt=0) + float_field: pa.typing.koalas.Series[float] = pa.Field(lt=0) + str_field: pa.typing.koalas.Series[str] = pa.Field( + isin=["a", "b", "c"] + ) valid_df = ks.DataFrame( { @@ -495,10 +498,10 @@ def test_check_decorators(): # pylint: disable=too-few-public-methods class InSchema(pa.SchemaModel): - a: pa.typing.Series[int] + a: pa.typing.koalas.Series[int] class OutSchema(InSchema): - b: pa.typing.Series[int] + b: pa.typing.koalas.Series[int] @pa.check_input(in_schema) @pa.check_output(out_schema) @@ -522,15 +525,15 @@ def function_check_io_invalid(df: ks.DataFrame) -> ks.DataFrame: @pa.check_types def function_check_types( - df: pa.typing.DataFrame[InSchema], - ) -> pa.typing.DataFrame[OutSchema]: + df: pa.typing.koalas.DataFrame[InSchema], + ) -> pa.typing.koalas.DataFrame[OutSchema]: df["b"] = df["a"] + 1 return df @pa.check_types def function_check_types_invalid( - df: pa.typing.DataFrame[InSchema], - ) -> pa.typing.DataFrame[OutSchema]: + df: pa.typing.koalas.DataFrame[InSchema], + ) -> pa.typing.koalas.DataFrame[OutSchema]: return df valid_df = ks.DataFrame({"a": [1, 2, 3]}) @@ -558,3 +561,35 @@ def function_check_types_invalid( ): with pytest.raises(pa.errors.SchemaError): fn(valid_df) + + +class InitSchema(pa.SchemaModel): + """Schema used to test dataframe initialization.""" + + col1: Series[int] + col2: Series[float] + col3: Series[str] + index: Index[int] + + +def test_init_koalas_dataframe(): + """Test initialization of pandas.typing.dask.DataFrame with Schema.""" + assert isinstance( + DataFrame[InitSchema]({"col1": [1], "col2": [1.0], "col3": ["1"]}), + DataFrame, + ) + + +@pytest.mark.parametrize( + "invalid_data", + [ + {"col1": [1.0], "col2": [1.0], "col3": ["1"]}, + {"col1": [1], "col2": [1], "col3": ["1"]}, + {"col1": [1], "col2": [1.0], "col3": [1]}, + {"col1": [1]}, + ], +) +def test_init_koalas_dataframe_errors(invalid_data): + """Test errors from initializing a 
diff --git a/tests/koalas/test_schemas_on_koalas.py b/tests/koalas/test_schemas_on_koalas.py
index d6d15cd5d..5d529c0ee 100644
--- a/tests/koalas/test_schemas_on_koalas.py
+++ b/tests/koalas/test_schemas_on_koalas.py
@@ -10,6 +10,7 @@
 import pandera as pa
 from pandera import dtypes, extensions, system
 from pandera.engines import numpy_engine, pandas_engine
+from pandera.typing import DataFrame, Index, Series
 from tests.strategies.test_strategies import NULLABLE_DTYPES
 from tests.strategies.test_strategies import (
     UNSUPPORTED_DTYPE_CLS as UNSUPPORTED_STRATEGY_DTYPE_CLS,
@@ -430,9 +431,11 @@ def test_schema_model():
     # pylint: disable=too-few-public-methods
     class Schema(pa.SchemaModel):
-        int_field: pa.typing.Series[int] = pa.Field(gt=0)
-        float_field: pa.typing.Series[float] = pa.Field(lt=0)
-        str_field: pa.typing.Series[str] = pa.Field(isin=["a", "b", "c"])
+        int_field: pa.typing.koalas.Series[int] = pa.Field(gt=0)
+        float_field: pa.typing.koalas.Series[float] = pa.Field(lt=0)
+        str_field: pa.typing.koalas.Series[str] = pa.Field(
+            isin=["a", "b", "c"]
+        )
 
     valid_df = ks.DataFrame(
         {
@@ -495,10 +498,10 @@ def test_check_decorators():
     # pylint: disable=too-few-public-methods
     class InSchema(pa.SchemaModel):
-        a: pa.typing.Series[int]
+        a: pa.typing.koalas.Series[int]
 
     class OutSchema(InSchema):
-        b: pa.typing.Series[int]
+        b: pa.typing.koalas.Series[int]
 
     @pa.check_input(in_schema)
     @pa.check_output(out_schema)
@@ -522,15 +525,15 @@ def function_check_io_invalid(df: ks.DataFrame) -> ks.DataFrame:
 
     @pa.check_types
     def function_check_types(
-        df: pa.typing.DataFrame[InSchema],
-    ) -> pa.typing.DataFrame[OutSchema]:
+        df: pa.typing.koalas.DataFrame[InSchema],
+    ) -> pa.typing.koalas.DataFrame[OutSchema]:
         df["b"] = df["a"] + 1
         return df
 
     @pa.check_types
     def function_check_types_invalid(
-        df: pa.typing.DataFrame[InSchema],
-    ) -> pa.typing.DataFrame[OutSchema]:
+        df: pa.typing.koalas.DataFrame[InSchema],
+    ) -> pa.typing.koalas.DataFrame[OutSchema]:
         return df
 
     valid_df = ks.DataFrame({"a": [1, 2, 3]})
@@ -558,3 +561,35 @@ def function_check_types_invalid(
     ):
         with pytest.raises(pa.errors.SchemaError):
             fn(valid_df)
+
+
+class InitSchema(pa.SchemaModel):
+    """Schema used to test dataframe initialization."""
+
+    col1: Series[int]
+    col2: Series[float]
+    col3: Series[str]
+    index: Index[int]
+
+
+def test_init_koalas_dataframe():
+    """Test initialization of pandera.typing.DataFrame with Schema."""
+    assert isinstance(
+        DataFrame[InitSchema]({"col1": [1], "col2": [1.0], "col3": ["1"]}),
+        DataFrame,
+    )
+
+
+@pytest.mark.parametrize(
+    "invalid_data",
+    [
+        {"col1": [1.0], "col2": [1.0], "col3": ["1"]},
+        {"col1": [1], "col2": [1], "col3": ["1"]},
+        {"col1": [1], "col2": [1.0], "col3": [1]},
+        {"col1": [1]},
+    ],
+)
+def test_init_koalas_dataframe_errors(invalid_data):
+    """Test errors from initializing a pandera.typing.DataFrame with Schema."""
+    with pytest.raises(pa.errors.SchemaError):
+        DataFrame[InitSchema](invalid_data)
diff --git a/tests/modin/test_schemas_on_modin.py b/tests/modin/test_schemas_on_modin.py
index d8f39c197..b80d09d17 100644
--- a/tests/modin/test_schemas_on_modin.py
+++ b/tests/modin/test_schemas_on_modin.py
@@ -11,6 +11,7 @@
 import pandera as pa
 from pandera import extensions
 from pandera.engines import numpy_engine, pandas_engine
+from pandera.typing.modin import DataFrame, Index, Series
 from tests.strategies.test_strategies import NULLABLE_DTYPES
 from tests.strategies.test_strategies import (
     SUPPORTED_DTYPES as SUPPORTED_STRATEGY_DTYPES,
@@ -338,9 +339,9 @@ def test_schema_model():
     # pylint: disable=too-few-public-methods
     class Schema(pa.SchemaModel):
-        int_field: pa.typing.Series[int] = pa.Field(gt=0)
-        float_field: pa.typing.Series[float] = pa.Field(lt=0)
-        str_field: pa.typing.Series[str] = pa.Field(isin=["a", "b", "c"])
+        int_field: pa.typing.modin.Series[int] = pa.Field(gt=0)
+        float_field: pa.typing.modin.Series[float] = pa.Field(lt=0)
+        str_field: pa.typing.modin.Series[str] = pa.Field(isin=["a", "b", "c"])
 
     valid_df = mpd.DataFrame(
         {
@@ -403,10 +404,10 @@ def test_check_decorators():
     # pylint: disable=too-few-public-methods
     class InSchema(pa.SchemaModel):
-        a: pa.typing.Series[int]
+        a: pa.typing.modin.Series[int]
 
     class OutSchema(InSchema):
-        b: pa.typing.Series[int]
+        b: pa.typing.modin.Series[int]
 
     @pa.check_input(in_schema)
     @pa.check_output(out_schema)
@@ -432,15 +433,15 @@ def function_check_io_invalid(df: mpd.DataFrame) -> mpd.DataFrame:
 
     @pa.check_types
     def function_check_types(
-        df: pa.typing.DataFrame[InSchema],
-    ) -> pa.typing.DataFrame[OutSchema]:
+        df: pa.typing.modin.DataFrame[InSchema],
+    ) -> pa.typing.modin.DataFrame[OutSchema]:
         df["b"] = df["a"] + 1
         return df
 
     @pa.check_types
     def function_check_types_invalid(
-        df: pa.typing.DataFrame[InSchema],
-    ) -> pa.typing.DataFrame[OutSchema]:
+        df: pa.typing.modin.DataFrame[InSchema],
+    ) -> pa.typing.modin.DataFrame[OutSchema]:
         return df
 
     valid_df = mpd.DataFrame({"a": [1, 2, 3]})
@@ -468,3 +469,35 @@ def function_check_types_invalid(
     ):
         with pytest.raises(pa.errors.SchemaError):
             fn(valid_df)
+
+
+class InitSchema(pa.SchemaModel):
+    """Schema used for dataframe initialization."""
+
+    col1: Series[int]
+    col2: Series[float]
+    col3: Series[str]
+    index: Index[int]
+
+
+def test_init_modin_dataframe():
+    """Test initialization of pandera.typing.modin.DataFrame with Schema."""
+    assert isinstance(
+        DataFrame[InitSchema]({"col1": [1], "col2": [1.0], "col3": ["1"]}),
+        DataFrame,
+    )
+
+
+@pytest.mark.parametrize(
+    "invalid_data",
+    [
+        {"col1": [1.0], "col2": [1.0], "col3": ["1"]},
+        {"col1": [1], "col2": [1], "col3": ["1"]},
+        {"col1": [1], "col2": [1.0], "col3": [1]},
+        {"col1": [1]},
+    ],
+)
+def test_init_modin_dataframe_errors(invalid_data):
+    """Test errors from initializing a pandera.typing.modin.DataFrame with Schema."""
+    with pytest.raises(pa.errors.SchemaError):
+        DataFrame[InitSchema](invalid_data)
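The koalas and modin test changes above all follow from the new library-specific typing modules: annotating fields with, e.g., `pa.typing.koalas.Series` lets `@pa.check_types` validate koalas dataframes end to end. A condensed sketch of the pattern the koalas tests use, assuming `databricks.koalas` is installed:

```python
import databricks.koalas as ks

import pandera as pa


class InSchema(pa.SchemaModel):
    a: pa.typing.koalas.Series[int]


class OutSchema(InSchema):
    b: pa.typing.koalas.Series[int]


@pa.check_types
def add_b(
    df: pa.typing.koalas.DataFrame[InSchema],
) -> pa.typing.koalas.DataFrame[OutSchema]:
    # the output schema requires column "b", so check_types validates it
    df["b"] = df["a"] + 1
    return df


add_b(ks.DataFrame({"a": [1, 2, 3]}))
```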
From d368f1236f3c9d995357ad437f500ec101cfd64c Mon Sep 17 00:00:00 2001
From: cosmicBboy
Date: Tue, 9 Nov 2021 00:11:20 -0500
Subject: [PATCH 2/3] fix lint

---
 pandera/typing/dask.py  | 2 +-
 pandera/typing/modin.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandera/typing/dask.py b/pandera/typing/dask.py
index ae807c036..168c02037 100644
--- a/pandera/typing/dask.py
+++ b/pandera/typing/dask.py
@@ -23,7 +23,7 @@ if DASK_INSTALLED:
-    # pylint: disable=too-few-public-methods
+    # pylint: disable=too-few-public-methods,abstract-method
     class DataFrame(DataFrameBase, dd.DataFrame, Generic[T]):
         """
         Representation of dask.dataframe.DataFrame, only used for type
diff --git a/pandera/typing/modin.py b/pandera/typing/modin.py
index c69782f21..b497890d8 100644
--- a/pandera/typing/modin.py
+++ b/pandera/typing/modin.py
@@ -31,14 +31,14 @@ class DataFrame(DataFrameBase, mpd.DataFrame, Generic[T]):
         *new in 0.8.0*
         """
 
-    # pylint:disable=too-few-public-methods
+    # pylint:disable=too-few-public-methods,abstract-method
     class Series(SeriesBase, mpd.Series, Generic[GenericDtype]):
         """Representation of pandas.Series, only used for type annotation.
 
         *new in 0.8.0*
         """
 
-    # pylint:disable=too-few-public-methods
+    # pylint:disable=too-few-public-methods,abstract-method
     class Index(IndexBase, mpd.Index, Generic[GenericDtype]):
         """Representation of pandas.Index, only used for type annotation.

From e0fa3bba100b65ee067322f375f838703d5cc900 Mon Sep 17 00:00:00 2001
From: cosmicBboy
Date: Tue, 9 Nov 2021 08:57:25 -0500
Subject: [PATCH 3/3] fix lint, docs tests

---
 docs/source/conf.py      | 3 +++
 docs/source/modin.rst    | 6 ++++++
 pandera/typing/koalas.py | 4 ++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 77c9b06f3..8e47ab41c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -50,6 +50,7 @@
 ]
 
 doctest_global_setup = """
+import platform
 import sys
 import pandas as pd
 import numpy as np
@@ -76,6 +77,8 @@
 SKIP_PANDAS_LT_V1 = version.parse(pd.__version__).release < (1, 0) or PY36
 SKIP_SCALING = True
 SKIP_SCHEMA_MODEL = SKIP_PANDAS_LT_V1 or KOALAS_INSTALLED
+SKIP_MODIN = platform.system() == "Windows"
+
 """
 
 doctest_default_flags = (
diff --git a/docs/source/modin.rst b/docs/source/modin.rst
index 3d5b1bb1b..fc6c58fd2 100644
--- a/docs/source/modin.rst
+++ b/docs/source/modin.rst
@@ -25,6 +25,7 @@ below we'll use the :ref:`class-based API <schema_models>` to define a
 :py:class:`SchemaModel` for validation.
 
 .. testcode:: scaling_modin
+   :skipif: SKIP_MODIN
 
     import modin.pandas as pd
    import pandas as pd
@@ -58,6 +59,7 @@ below we'll use the :ref:`class-based API <schema_models>` to define a
 
 .. testoutput:: scaling_modin
+   :skipif: SKIP_MODIN
 
     state           city  price
     0     FL        Orlando      8
@@ -73,6 +75,7 @@ modin dataframes at runtime:
 
 .. testcode:: scaling_modin
+   :skipif: SKIP_MODIN
 
     @pa.check_types
     def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
@@ -82,6 +85,7 @@ modin dataframes at runtime:
 
 .. testoutput:: scaling_modin
+   :skipif: SKIP_MODIN
 
     state           city  price
     3     CA  San Francisco     16
@@ -93,6 +97,7 @@ And of course, you can use the object-based API to validate dask dataframes:
 
 .. testcode:: scaling_modin
+   :skipif: SKIP_MODIN
 
     schema = pa.DataFrameSchema({
         "state": pa.Column(str),
@@ -103,6 +108,7 @@ And of course, you can use the object-based API to validate dask dataframes:
 .. testoutput:: scaling_modin
+   :skipif: SKIP_MODIN
 
     state           city  price
     0     FL        Orlando      8
diff --git a/pandera/typing/koalas.py b/pandera/typing/koalas.py
index 052338f7f..940ce8f76 100644
--- a/pandera/typing/koalas.py
+++ b/pandera/typing/koalas.py
@@ -22,7 +22,7 @@ if KOALAS_INSTALLED:
-    # pylint: disable=too-few-public-methods
+    # pylint: disable=too-few-public-methods,arguments-renamed
     class DataFrame(DataFrameBase, ks.DataFrame, Generic[T]):
         """
         Representation of databricks.koalas.DataFrame, only used for type
@@ -35,7 +35,7 @@ def __class_getitem__(cls, item):
         """Define this to override koalas' generic type."""
         return _GenericAlias(cls, item)
 
-    # pylint:disable=too-few-public-methods
+    # pylint:disable=too-few-public-methods,arguments-renamed
     class Series(SeriesBase, ks.Series, Generic[GenericDtype]):
         """Representation of pandas.Series, only used for type annotation.