From 2d2a5a826d4e13d608fb8e07b1f982adc9cddad5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 21 Jun 2021 19:44:45 +0200 Subject: [PATCH 01/23] delete print statements --- pandera/engines/pandas_engine.py | 6 ------ tests/core/test_schemas.py | 5 ----- 2 files changed, 11 deletions(-) diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index 44a8f2fb2..c4657680a 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -197,7 +197,6 @@ def _register_numpy_numbers( equivalents.add("integer") numpy_data_type = getattr(numpy_engine, f"{pandera_name}{bit_width}") - print(f"EQUIVALENTS FOR {numpy_data_type}: {list(equivalents)}") Engine.register_dtype(numpy_data_type, equivalents=list(equivalents)) @@ -543,8 +542,3 @@ def from_parametrized_dtype(cls, pd_dtype: pd.IntervalDtype): """Convert a :class:`pandas.IntervalDtype` to a Pandera :class:`~pandera.engines.pandas_engine.Interval`.""" return cls(subtype=pd_dtype.subtype) # type: ignore - - -print("PANDAS ENGINE EQUIVALENTS") -for k, v in engine.Engine._registry[Engine].equivalents.items(): - print(f"{k}: equivalents={v}") diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py index cd3988681..f7ba37e38 100644 --- a/tests/core/test_schemas.py +++ b/tests/core/test_schemas.py @@ -191,10 +191,7 @@ def test_dataframe_dtype_coerce(): assert (df.dtypes == float_alias).all() # raises ValueError if _coerce_dtype is called when dtype is None - print("---") schema.dtype = None - print("----=-") - print(schema.dtype) with pytest.raises(ValueError): schema._coerce_dtype(df) @@ -316,7 +313,6 @@ def test_series_schema(): ) def f(series): - print(series) return series.isin(["foo", "bar", "baz"]) str_schema = SeriesSchema( @@ -1561,7 +1557,6 @@ def test_schema_str_repr(schema, fields): schema.__str__(), schema.__repr__(), ]: - print(x) assert x.startswith(f"") for field in fields: From 101b98acc8d8ef62df065584f53a7bd3f502dfe4 Mon Sep 17 00:00:00 
2001 From: Jean-Francois Zinque Date: Sun, 27 Jun 2021 17:45:47 +0200 Subject: [PATCH 02/23] pin furo --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index d15718413..a3ae8dcdc 100644 --- a/environment.yml +++ b/environment.yml @@ -50,7 +50,7 @@ dependencies: - pre_commit - pip: - - furo + - furo==2021.6.18b36 - types-click - types-pyyaml - types-pkg_resources diff --git a/requirements-dev.txt b/requirements-dev.txt index c95ac87a3..7d1f16941 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -31,7 +31,7 @@ recommonmark twine asv pre_commit -furo +furo==2021.6.18b36 types-click types-pyyaml types-pkg_resources \ No newline at end of file From 7f6aeb16f6fd199d7be6d0626c83e0124891b29d Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Sun, 27 Jun 2021 23:26:14 +0200 Subject: [PATCH 03/23] fix generated docs not removed by nox --- noxfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index bcd132984..235114902 100644 --- a/noxfile.py +++ b/noxfile.py @@ -332,8 +332,8 @@ def docs(session: Session) -> None: # build html docs if not CI_RUN and not session.posargs: - shutil.rmtree(os.path.join("_build"), ignore_errors=True) - shutil.rmtree(os.path.join("generated"), ignore_errors=True) + shutil.rmtree("_build", ignore_errors=True) + shutil.rmtree(os.path.join("source", "generated"), ignore_errors=True) session.run( "sphinx-build", "-W", From 9aa016f671d7f0d0e724c3bbd09e68cab2d6ab7c Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 28 Jun 2021 15:10:52 +0200 Subject: [PATCH 04/23] re-organize API section --- .gitignore | 2 +- docs/source/API_reference.rst | 167 -------------------- docs/source/conf.py | 2 +- docs/source/index.rst | 2 +- docs/source/reference/checks.rst | 10 ++ docs/source/reference/decorators.rst | 11 ++ docs/source/reference/errors.rst | 12 ++ 
docs/source/reference/extensions.rst | 9 ++ docs/source/reference/index.rst | 24 +++ docs/source/reference/io.rst | 10 ++ docs/source/reference/schema_components.rst | 12 ++ docs/source/reference/schema_inference.rst | 8 + docs/source/reference/schema_models.rst | 39 +++++ docs/source/reference/schemas.rst | 10 ++ docs/source/reference/strategies.rst | 9 ++ noxfile.py | 4 +- 16 files changed, 160 insertions(+), 171 deletions(-) delete mode 100644 docs/source/API_reference.rst create mode 100644 docs/source/reference/checks.rst create mode 100644 docs/source/reference/decorators.rst create mode 100644 docs/source/reference/errors.rst create mode 100644 docs/source/reference/extensions.rst create mode 100644 docs/source/reference/index.rst create mode 100644 docs/source/reference/io.rst create mode 100644 docs/source/reference/schema_components.rst create mode 100644 docs/source/reference/schema_inference.rst create mode 100644 docs/source/reference/schema_models.rst create mode 100644 docs/source/reference/schemas.rst create mode 100644 docs/source/reference/strategies.rst diff --git a/.gitignore b/.gitignore index 0d8362231..39e188cf5 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,7 @@ venv.bak/ /asv_bench/results/ # Docs -docs/source/generated +docs/source/reference/generated # Nox .nox diff --git a/docs/source/API_reference.rst b/docs/source/API_reference.rst deleted file mode 100644 index cca8a43d5..000000000 --- a/docs/source/API_reference.rst +++ /dev/null @@ -1,167 +0,0 @@ -.. pandera package index documentation toctree - -.. currentmodule:: pandera - -API -=== - -The ``io`` module and built-in ``Hypothesis`` checks require a pandera -installation with the corresponding extension, see the -:ref:`installation` instructions for more details. - -Schemas -------- - -.. 
autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.schemas.DataFrameSchema - pandera.schemas.SeriesSchema - - -Schema Components ------------------ - -.. autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.schema_components.Column - pandera.schema_components.Index - pandera.schema_components.MultiIndex - - -Schema Models -------------- - -.. autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.model.SchemaModel - -**Model Components** - -.. autosummary:: - :toctree: generated - :nosignatures: - - pandera.model_components.Field - pandera.model_components.check - pandera.model_components.dataframe_check - -**Typing** - -.. autosummary:: - :toctree: generated - :template: typing_module.rst - :nosignatures: - - pandera.typing - -**Config** - -.. autosummary:: - :toctree: generated - :template: model_component_class.rst - :nosignatures: - - pandera.model.BaseConfig - - -Checks ------- - -.. autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.checks.Check - pandera.hypotheses.Hypothesis - - -Pandas Data Types ------------------ - -.. autosummary:: - :toctree: generated - :template: pandas_dtype_class.rst - :nosignatures: - - pandera.dtypes.DataType - - -Decorators ----------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - pandera.decorators.check_input - pandera.decorators.check_output - pandera.decorators.check_io - pandera.decorators.check_types - - -Schema Inference ----------------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - pandera.schema_inference.infer_schema - - -IO Utils --------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - pandera.io.from_yaml - pandera.io.to_yaml - pandera.io.to_script - - -Data Synthesis Strategies -------------------------- - -.. 
autosummary:: - :toctree: generated - :template: strategies_module.rst - :nosignatures: - - pandera.strategies - - -Extensions ----------- - -.. autosummary:: - :toctree: generated - :template: module.rst - :nosignatures: - - pandera.extensions - - -Errors ------- - -.. autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.errors.SchemaError - pandera.errors.SchemaErrors - pandera.errors.SchemaInitError - pandera.errors.SchemaDefinitionError diff --git a/docs/source/conf.py b/docs/source/conf.py index f82b1237e..9c0446fb8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -162,7 +162,7 @@ .. role:: green """ -autosummary_generate = ["API_reference.rst"] +autosummary_generate = True autosummary_filename_map = { "pandera.Check": "pandera.Check", "pandera.check": "pandera.check_decorator", diff --git a/docs/source/index.rst b/docs/source/index.rst index 3396d18e6..0e5e72096 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -318,7 +318,7 @@ Submit issues, feature requests or bugfixes on :caption: Reference :hidden: - API_reference + reference/index .. toctree:: :maxdepth: 6 diff --git a/docs/source/reference/checks.rst b/docs/source/reference/checks.rst new file mode 100644 index 000000000..2cf236dfd --- /dev/null +++ b/docs/source/reference/checks.rst @@ -0,0 +1,10 @@ +Checks +====== + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.checks.Check + pandera.hypotheses.Hypothesis \ No newline at end of file diff --git a/docs/source/reference/decorators.rst b/docs/source/reference/decorators.rst new file mode 100644 index 000000000..7957deb0b --- /dev/null +++ b/docs/source/reference/decorators.rst @@ -0,0 +1,11 @@ +Decorators +========== + +.. 
autosummary:: + :toctree: generated + :nosignatures: + + pandera.decorators.check_input + pandera.decorators.check_output + pandera.decorators.check_io + pandera.decorators.check_types \ No newline at end of file diff --git a/docs/source/reference/errors.rst b/docs/source/reference/errors.rst new file mode 100644 index 000000000..59cb986b8 --- /dev/null +++ b/docs/source/reference/errors.rst @@ -0,0 +1,12 @@ +Errors +====== + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.errors.SchemaError + pandera.errors.SchemaErrors + pandera.errors.SchemaInitError + pandera.errors.SchemaDefinitionError diff --git a/docs/source/reference/extensions.rst b/docs/source/reference/extensions.rst new file mode 100644 index 000000000..49f73f452 --- /dev/null +++ b/docs/source/reference/extensions.rst @@ -0,0 +1,9 @@ +Extensions +========== + +.. autosummary:: + :toctree: generated + :template: module.rst + :nosignatures: + + pandera.extensions \ No newline at end of file diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 000000000..6ccb6ff52 --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,24 @@ +.. pandera package index documentation toctree + +.. currentmodule:: pandera + +API +=== + +The ``io`` module and built-in ``Hypothesis`` checks require a pandera +installation with the corresponding extension, see the +:ref:`installation` instructions for more details. + +.. toctree:: + :maxdepth: 2 + + schemas + schema_components + schema_models + checks + decorators + schema_inference + io + strategies + extensions + errors diff --git a/docs/source/reference/io.rst b/docs/source/reference/io.rst new file mode 100644 index 000000000..414df51a0 --- /dev/null +++ b/docs/source/reference/io.rst @@ -0,0 +1,10 @@ +IO Utils +======== + +.. 
autosummary:: + :toctree: generated + :nosignatures: + + pandera.io.from_yaml + pandera.io.to_yaml + pandera.io.to_script \ No newline at end of file diff --git a/docs/source/reference/schema_components.rst b/docs/source/reference/schema_components.rst new file mode 100644 index 000000000..113fc528a --- /dev/null +++ b/docs/source/reference/schema_components.rst @@ -0,0 +1,12 @@ + +Schema Components +----------------- + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.schema_components.Column + pandera.schema_components.Index + pandera.schema_components.MultiIndex diff --git a/docs/source/reference/schema_inference.rst b/docs/source/reference/schema_inference.rst new file mode 100644 index 000000000..d7748ae70 --- /dev/null +++ b/docs/source/reference/schema_inference.rst @@ -0,0 +1,8 @@ +Schema Inference +================ + +.. autosummary:: + :toctree: generated + :nosignatures: + + pandera.schema_inference.infer_schema diff --git a/docs/source/reference/schema_models.rst b/docs/source/reference/schema_models.rst new file mode 100644 index 000000000..89cea7f1e --- /dev/null +++ b/docs/source/reference/schema_models.rst @@ -0,0 +1,39 @@ +Schema Models +============= +.. currentmodule:: pandera + +Schema Model +~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated + :template: class.rst + + pandera.model.SchemaModel + +Model Components +~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated + + pandera.model_components.Field + pandera.model_components.check + pandera.model_components.dataframe_check + +Typing +~~~~~~ + +.. autosummary:: + :toctree: generated + :template: typing_module.rst + :nosignatures: + + pandera.typing + +Config +~~~~~~ +.. 
autosummary:: + :toctree: generated + :template: model_component_class.rst + :nosignatures: + + pandera.model.BaseConfig \ No newline at end of file diff --git a/docs/source/reference/schemas.rst b/docs/source/reference/schemas.rst new file mode 100644 index 000000000..a1b16e44c --- /dev/null +++ b/docs/source/reference/schemas.rst @@ -0,0 +1,10 @@ +Schemas +======= + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.schemas.DataFrameSchema + pandera.schemas.SeriesSchema \ No newline at end of file diff --git a/docs/source/reference/strategies.rst b/docs/source/reference/strategies.rst new file mode 100644 index 000000000..5e878102b --- /dev/null +++ b/docs/source/reference/strategies.rst @@ -0,0 +1,9 @@ +Data Synthesis Strategies +========================= + +.. autosummary:: + :toctree: generated + :template: strategies_module.rst + :nosignatures: + + pandera.strategies \ No newline at end of file diff --git a/noxfile.py b/noxfile.py index 235114902..e7c65009e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -333,7 +333,9 @@ def docs(session: Session) -> None: # build html docs if not CI_RUN and not session.posargs: shutil.rmtree("_build", ignore_errors=True) - shutil.rmtree(os.path.join("source", "generated"), ignore_errors=True) + shutil.rmtree( + os.path.join("source", "reference", "generated"), ignore_errors=True + ) session.run( "sphinx-build", "-W", From 93270d06026d7da088abd612926fa863bd808625 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 28 Jun 2021 15:12:44 +0200 Subject: [PATCH 05/23] replace aliased pandas_engine data types with their aliases --- pandera/engines/pandas_engine.py | 41 ++++++++------------------------ tests/core/test_dtypes.py | 16 ++++++------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index c4657680a..53e763f9d 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py 
@@ -139,12 +139,10 @@ def numpy_dtype(cls, pandera_dtype: dtypes.DataType) -> np.dtype: equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()], ) @immutable -class Bool(DataType, dtypes.Bool): +class BOOL(DataType, dtypes.Bool): type = pd.BooleanDtype() -BOOL = Bool - ############################################################################### # number ############################################################################### @@ -213,43 +211,32 @@ def _register_numpy_numbers( @Engine.register_dtype(equivalents=[pd.Int64Dtype, pd.Int64Dtype()]) @immutable -class Int64(DataType, dtypes.Int): +class INT64(DataType, dtypes.Int): type = pd.Int64Dtype() bit_width: int = 64 -INT64 = Int64 - - @Engine.register_dtype(equivalents=[pd.Int32Dtype, pd.Int32Dtype()]) @immutable -class Int32(Int64): +class INT32(INT64): type = pd.Int32Dtype() bit_width: int = 32 -INT32 = Int32 - - @Engine.register_dtype(equivalents=[pd.Int16Dtype, pd.Int16Dtype()]) @immutable -class Int16(Int32): +class INT16(INT32): type = pd.Int16Dtype() bit_width: int = 16 -INT16 = Int16 - - @Engine.register_dtype(equivalents=[pd.Int8Dtype, pd.Int8Dtype()]) @immutable -class Int8(Int16): +class INT8(INT16): type = pd.Int8Dtype() bit_width: int = 8 -INT8 = Int8 - ############################################################################### # unsigned integer ############################################################################### @@ -263,37 +250,32 @@ class Int8(Int16): @Engine.register_dtype(equivalents=[pd.UInt64Dtype, pd.UInt64Dtype()]) @immutable -class UInt64(DataType, dtypes.UInt): +class UINT64(DataType, dtypes.UInt): type = pd.UInt64Dtype() bit_width: int = 64 @Engine.register_dtype(equivalents=[pd.UInt32Dtype, pd.UInt32Dtype()]) @immutable -class UInt32(UInt64): +class UINT32(UINT64): type = pd.UInt32Dtype() bit_width: int = 32 @Engine.register_dtype(equivalents=[pd.UInt16Dtype, pd.UInt16Dtype()]) @immutable -class UInt16(UInt32): +class UINT16(UINT32): type = pd.UInt16Dtype() 
bit_width: int = 16 @Engine.register_dtype(equivalents=[pd.UInt8Dtype, pd.UInt8Dtype()]) @immutable -class UInt8(UInt16): +class UINT8(UINT16): type = pd.UInt8Dtype() bit_width: int = 8 -UINT64 = UInt64 -UINT32 = UInt32 -UINT16 = UInt16 -UINT8 = UInt8 - # ############################################################################### # # float # ############################################################################### @@ -358,13 +340,10 @@ def from_parametrized_dtype( equivalents=["string", pd.StringDtype, pd.StringDtype()] ) @immutable -class String(DataType, dtypes.String): +class STRING(DataType, dtypes.String): type = pd.StringDtype() -STRING = String - - @Engine.register_dtype( equivalents=["str", str, dtypes.String, dtypes.String(), np.str_] ) diff --git a/tests/core/test_dtypes.py b/tests/core/test_dtypes.py index 48e303c10..a509bdf58 100644 --- a/tests/core/test_dtypes.py +++ b/tests/core/test_dtypes.py @@ -39,10 +39,10 @@ nullable_int_dtypes = { - pandas_engine.Int8: "Int8", - pandas_engine.Int16: "Int16", - pandas_engine.Int32: "Int32", - pandas_engine.Int64: "Int64", + pandas_engine.INT8: "Int8", + pandas_engine.INT16: "Int16", + pandas_engine.INT32: "Int32", + pandas_engine.INT64: "Int64", } uint_dtypes = { @@ -58,10 +58,10 @@ } nullable_uint_dtypes = { - pandas_engine.UInt8: "UInt8", - pandas_engine.UInt16: "UInt16", - pandas_engine.UInt32: "UInt32", - pandas_engine.UInt64: "UInt64", + pandas_engine.UINT8: "UInt8", + pandas_engine.UINT16: "UInt16", + pandas_engine.UINT32: "UInt32", + pandas_engine.UINT64: "UInt64", } float_dtypes = { From d1fb38f1a39cde5c162367f515adc5d6a5bf83ad Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 28 Jun 2021 15:13:42 +0200 Subject: [PATCH 06/23] drop warning when calling Engine.register_dtype without arguments --- pandera/engines/engine.py | 6 ------ tests/core/test_engine.py | 8 -------- 2 files changed, 14 deletions(-) diff --git a/pandera/engines/engine.py b/pandera/engines/engine.py index 
bf01e0473..7fec60017 100644 --- a/pandera/engines/engine.py +++ b/pandera/engines/engine.py @@ -3,7 +3,6 @@ # pylint:disable=no-value-for-parameter import functools import inspect -import warnings from abc import ABCMeta from dataclasses import dataclass from typing import ( @@ -155,11 +154,6 @@ def _wrapper(pandera_dtype_cls: Union[DataType, Type[DataType]]): if "from_parametrized_dtype" in pandera_dtype_cls.__dict__: cls._register_from_parametrized_dtype(pandera_dtype_cls) - elif not equivalents: - warnings.warn( - f"register_dtype({pandera_dtype_cls}) on a class without a " - + "'from_parametrized_dtype' classmethod has no effect." - ) cls._registered_dtypes.add(pandera_dtype_cls) return pandera_dtype_cls diff --git a/tests/core/test_engine.py b/tests/core/test_engine.py index a4b8bfefb..6e38121c0 100644 --- a/tests/core/test_engine.py +++ b/tests/core/test_engine.py @@ -41,14 +41,6 @@ class FakeEngine( # pylint:disable=too-few-public-methods del FakeEngine -def test_register_bare_dtype(engine: Engine): - """Test that a dtype without equivalents nor 'from_parametrized_dtype' - classmethod can be registered. 
- """ - with pytest.warns(UserWarning): - engine.register_dtype(SimpleDtype) - - def test_register_equivalents(engine: Engine, equivalents: List[Any]): """Test that a dtype with equivalents can be registered.""" engine.register_dtype(SimpleDtype, equivalents=equivalents) From 5f78b77491cb24edbebdfcead7734b91de1f0ed9 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 28 Jun 2021 22:41:18 +0200 Subject: [PATCH 07/23] add data types to api reference doc --- docs/source/_templates/dtype.rst | 41 ++++++++++++ docs/source/reference/checks.rst | 2 +- docs/source/reference/decorators.rst | 2 +- docs/source/reference/dtypes.rst | 89 +++++++++++++++++++++++++ docs/source/reference/extensions.rst | 2 +- docs/source/reference/index.rst | 1 + docs/source/reference/io.rst | 2 +- docs/source/reference/schema_models.rst | 2 +- docs/source/reference/schemas.rst | 2 +- docs/source/reference/strategies.rst | 2 +- noxfile.py | 3 +- pandera/dtypes.py | 15 +++-- pandera/engines/engine.py | 20 +++--- pandera/engines/numpy_engine.py | 5 +- pandera/engines/pandas_engine.py | 36 ++++++++-- pandera/schemas.py | 5 +- 16 files changed, 196 insertions(+), 33 deletions(-) create mode 100644 docs/source/_templates/dtype.rst create mode 100644 docs/source/reference/dtypes.rst diff --git a/docs/source/_templates/dtype.rst b/docs/source/_templates/dtype.rst new file mode 100644 index 000000000..7625a0dfe --- /dev/null +++ b/docs/source/_templates/dtype.rst @@ -0,0 +1,41 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block attributes %} + {% if attributes %} + .. rubric:: Attributes + + .. autosummary:: + :nosignatures: + + {% for item in attributes %} + ~{{ name }}.{{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block methods %} + {% if methods %} + .. rubric:: Methods + + .. 
autosummary:: + :nosignatures: + :toctree: methods + + {# Ignore the DateTime alias to avoid `WARNING: document isn't included in any toctree`#} + {% if objname != "DateTime" %} + {% for item in methods %} + ~{{ name }}.{{ item }} + {%- endfor %} + + {%- if members and '__call__' in members %} + ~{{ name }}.__call__ + {%- endif %} + {%- endif %} + + {%- endif %} + {% endblock %} diff --git a/docs/source/reference/checks.rst b/docs/source/reference/checks.rst index 2cf236dfd..6085baa42 100644 --- a/docs/source/reference/checks.rst +++ b/docs/source/reference/checks.rst @@ -7,4 +7,4 @@ Checks :nosignatures: pandera.checks.Check - pandera.hypotheses.Hypothesis \ No newline at end of file + pandera.hypotheses.Hypothesis diff --git a/docs/source/reference/decorators.rst b/docs/source/reference/decorators.rst index 7957deb0b..5b265f1e4 100644 --- a/docs/source/reference/decorators.rst +++ b/docs/source/reference/decorators.rst @@ -8,4 +8,4 @@ Decorators pandera.decorators.check_input pandera.decorators.check_output pandera.decorators.check_io - pandera.decorators.check_types \ No newline at end of file + pandera.decorators.check_types diff --git a/docs/source/reference/dtypes.rst b/docs/source/reference/dtypes.rst new file mode 100644 index 000000000..f1c551c13 --- /dev/null +++ b/docs/source/reference/dtypes.rst @@ -0,0 +1,89 @@ +Pandera Data Types +================== + +Library-agnostic dtypes +~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: generated + :template: dtype.rst + :nosignatures: + + pandera.dtypes.DataType + pandera.dtypes.Bool + pandera.dtypes.Timestamp + pandera.dtypes.DateTime + pandera.dtypes.Timedelta + pandera.dtypes.Category + pandera.dtypes.Float + pandera.dtypes.Float16 + pandera.dtypes.Float32 + pandera.dtypes.Float64 + pandera.dtypes.Float128 + pandera.dtypes.Int + pandera.dtypes.Int8 + pandera.dtypes.Int16 + pandera.dtypes.Int32 + pandera.dtypes.Int64 + pandera.dtypes.UInt + pandera.dtypes.UInt8 + pandera.dtypes.UInt16 + pandera.dtypes.UInt32 + pandera.dtypes.UInt64 + pandera.dtypes.Complex + pandera.dtypes.Complex64 + pandera.dtypes.Complex128 + pandera.dtypes.Complex256 + pandera.dtypes.String + + +Pandas-specific Dtypes +~~~~~~~~~~~~~~~~~~~~~~ + +Listed here for compatibility with pandera versions < 0.7. +Passing native pandas dtypes to pandera components is preferred. + +.. autosummary:: + :toctree: generated + :template: dtype.rst + :nosignatures: + + pandera.engines.pandas_engine.BOOL + pandera.engines.pandas_engine.INT8 + pandera.engines.pandas_engine.INT16 + pandera.engines.pandas_engine.INT32 + pandera.engines.pandas_engine.INT64 + pandera.engines.pandas_engine.UINT8 + pandera.engines.pandas_engine.UINT16 + pandera.engines.pandas_engine.UINT32 + pandera.engines.pandas_engine.UINT64 + pandera.engines.pandas_engine.STRING + pandera.engines.numpy_engine.Object + +Utility functions +~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated + :nosignatures: + + pandera.dtypes.is_subdtype + pandera.dtypes.is_float + pandera.dtypes.is_int + pandera.dtypes.is_uint + pandera.dtypes.is_complex + pandera.dtypes.is_numeric + pandera.dtypes.is_bool + pandera.dtypes.is_string + pandera.dtypes.is_datetime + pandera.dtypes.is_timedelta + pandera.dtypes.immutable + +Engines +~~~~~~~ +.. 
autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.engines.engine.Engine + pandera.engines.numpy_engine.Engine + pandera.engines.pandas_engine.Engine diff --git a/docs/source/reference/extensions.rst b/docs/source/reference/extensions.rst index 49f73f452..58474a7ea 100644 --- a/docs/source/reference/extensions.rst +++ b/docs/source/reference/extensions.rst @@ -6,4 +6,4 @@ Extensions :template: module.rst :nosignatures: - pandera.extensions \ No newline at end of file + pandera.extensions diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 6ccb6ff52..707a0ed51 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -22,3 +22,4 @@ installation with the corresponding extension, see the strategies extensions errors + dtypes diff --git a/docs/source/reference/io.rst b/docs/source/reference/io.rst index 414df51a0..f7e334af4 100644 --- a/docs/source/reference/io.rst +++ b/docs/source/reference/io.rst @@ -7,4 +7,4 @@ IO Utils pandera.io.from_yaml pandera.io.to_yaml - pandera.io.to_script \ No newline at end of file + pandera.io.to_script diff --git a/docs/source/reference/schema_models.rst b/docs/source/reference/schema_models.rst index 89cea7f1e..0615fbbff 100644 --- a/docs/source/reference/schema_models.rst +++ b/docs/source/reference/schema_models.rst @@ -36,4 +36,4 @@ Config :template: model_component_class.rst :nosignatures: - pandera.model.BaseConfig \ No newline at end of file + pandera.model.BaseConfig diff --git a/docs/source/reference/schemas.rst b/docs/source/reference/schemas.rst index a1b16e44c..f9eff67aa 100644 --- a/docs/source/reference/schemas.rst +++ b/docs/source/reference/schemas.rst @@ -7,4 +7,4 @@ Schemas :nosignatures: pandera.schemas.DataFrameSchema - pandera.schemas.SeriesSchema \ No newline at end of file + pandera.schemas.SeriesSchema diff --git a/docs/source/reference/strategies.rst b/docs/source/reference/strategies.rst index 5e878102b..dc8f51a99 
100644 --- a/docs/source/reference/strategies.rst +++ b/docs/source/reference/strategies.rst @@ -6,4 +6,4 @@ Data Synthesis Strategies :template: strategies_module.rst :nosignatures: - pandera.strategies \ No newline at end of file + pandera.strategies diff --git a/noxfile.py b/noxfile.py index e7c65009e..065e7ec9d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -334,7 +334,8 @@ def docs(session: Session) -> None: if not CI_RUN and not session.posargs: shutil.rmtree("_build", ignore_errors=True) shutil.rmtree( - os.path.join("source", "reference", "generated"), ignore_errors=True + os.path.join("source", "reference", "generated"), + ignore_errors=True, ) session.run( "sphinx-build", diff --git a/pandera/dtypes.py b/pandera/dtypes.py index f0ccde032..a07d0b015 100644 --- a/pandera/dtypes.py +++ b/pandera/dtypes.py @@ -18,7 +18,8 @@ class DataType(ABC): """Base class of all Pandera data types.""" - continuous: bool = False + continuous: Optional[bool] = None + """Whether the number data type is continuous.""" def __init__(self): if self.__class__ is DataType: @@ -27,15 +28,16 @@ def __init__(self): ) def coerce(self, data_container: Any): - """Coerce data container to the dtype.""" + """Coerce data container to the data type.""" raise NotImplementedError() def __call__(self, data_container: Any): - """Coerce data container to the dtype.""" + """Coerce data container to the data type.""" return self.coerce(data_container) def check(self, pandera_dtype: "DataType") -> bool: - """Check that pandera :class:`DataType`s are equivalent.""" + """Check that pandera :class:`~pandera.dtypes.DataType` are + equivalent.""" return self == pandera_dtype def __repr__(self) -> str: @@ -64,7 +66,7 @@ def immutable( :param dtype: :class:`DataType` to decorate. :param dataclass_kwargs: Keywords arguments forwarded to :func:`dataclasses.dataclass`. 
- :returns: Immutable :class:`~pandera.dtypes.DataType` + :returns: Immutable :class:`DataType` """ kwargs = {"frozen": True, "init": False, "repr": False} kwargs.update(dataclass_kwargs) @@ -98,6 +100,7 @@ class _Number(DataType): """Semantic representation of a numeric data type.""" exact: Optional[bool] = None + """Whether the data type is an exact representation of a number.""" def check(self, pandera_dtype: "DataType") -> bool: if self.__class__ is _Number: @@ -109,6 +112,7 @@ def check(self, pandera_dtype: "DataType") -> bool: class _PhysicalNumber(_Number): bit_width: Optional[int] = None + """Number of bits used by the machine representation.""" _base_name: Optional[str] = dataclasses.field( default=None, init=False, repr=False ) @@ -149,6 +153,7 @@ class Int(_PhysicalNumber): # type: ignore exact = True bit_width = 64 signed: bool = dataclasses.field(default=True, init=False) + """Whether the integer data type is signed.""" def check(self, pandera_dtype: DataType) -> bool: return ( diff --git a/pandera/engines/engine.py b/pandera/engines/engine.py index 7fec60017..198c243e3 100644 --- a/pandera/engines/engine.py +++ b/pandera/engines/engine.py @@ -11,11 +11,11 @@ Callable, Dict, List, + Optional, Set, Tuple, Type, TypeVar, - Union, get_type_hints, ) @@ -117,9 +117,7 @@ def _method(*args, **kwargs): cls._registry[cls].dispatch.register(source_dtype, _method) def _register_equivalents( - cls, - pandera_dtype_cls: Type[DataType], - *source_dtypes: Any, + cls, pandera_dtype_cls: Type[DataType], *source_dtypes: Any ) -> None: pandera_dtype = pandera_dtype_cls() # type: ignore for source_dtype in source_dtypes: @@ -128,10 +126,10 @@ def _register_equivalents( def register_dtype( cls: _EngineType, - pandera_dtype_cls: Type[DataType] = None, + pandera_dtype_cls: Type[_DataType] = None, *, - equivalents: List[Any] = None, - ): + equivalents: Optional[List[Any]] = None, + ) -> Callable: """Register a Pandera :class:`DataType`. 
:param pandera_dtype: The DataType to register. @@ -142,7 +140,7 @@ def register_dtype( The classmethod ``from_parametrized_dtype`` will also be registered. """ - def _wrapper(pandera_dtype_cls: Union[DataType, Type[DataType]]): + def _wrapper(pandera_dtype_cls: Type[_DataType]) -> Type[_DataType]: if not inspect.isclass(pandera_dtype_cls): raise ValueError( f"{cls.__name__}.register_dtype can only decorate a class, " @@ -193,7 +191,9 @@ def dtype(cls: _EngineType, data_type: Any) -> _DataType: f"Data type '{data_type}' not understood by {cls.__name__}." ) from None - def get_registered_dtypes(cls) -> List[Type[DataType]]: - """Return :class:`pandera.dtypes.DataType`s registered + def get_registered_dtypes( # pylint:disable=W1401 + cls, + ) -> List[Type[DataType]]: + """Return the :class:`pandera.dtypes.DataType`\s registered with this engine.""" return list(cls._registered_dtypes) diff --git a/pandera/engines/numpy_engine.py b/pandera/engines/numpy_engine.py index c5fd62ff2..1a6982895 100644 --- a/pandera/engines/numpy_engine.py +++ b/pandera/engines/numpy_engine.py @@ -25,6 +25,7 @@ class DataType(dtypes.DataType): type: np.dtype = dataclasses.field( default=np.dtype("object"), repr=False, init=False ) + """Native numpy dtype boxed by the data type.""" def __init__(self, dtype: Any): super().__init__() @@ -58,7 +59,7 @@ class Engine( # pylint:disable=too-few-public-methods @classmethod def dtype(cls, data_type: Any) -> dtypes.DataType: """Convert input into a numpy-compatible - Pandera :class:`DataType` object.""" + Pandera :class:`~pandera.dtypes.DataType` object.""" try: return engine.Engine.dtype(cls, data_type) except TypeError: @@ -324,6 +325,8 @@ def check(self, pandera_dtype: "dtypes.DataType") -> bool: @Engine.register_dtype(equivalents=["object", "O", object, np.object_]) @immutable class Object(DataType): + """Semantic representation of a :class:`numpy.object_`.""" + type = np.dtype("object") diff --git a/pandera/engines/pandas_engine.py 
b/pandera/engines/pandas_engine.py index 53e763f9d..b8549537c 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -41,6 +41,7 @@ class DataType(dtypes.DataType): """Base `DataType` for boxing Pandas data types.""" type: Any = dataclasses.field(repr=False, init=False) + """Native pandas dtype boxed by the data type.""" def __init__(self, dtype: Any): super().__init__() @@ -82,7 +83,7 @@ class Engine( # pylint:disable=too-few-public-methods @classmethod def dtype(cls, data_type: Any) -> "DataType": """Convert input into a pandas-compatible - Pandera :class:`DataType` object.""" + Pandera :class:`~pandera.dtypes.DataType` object.""" try: return engine.Engine.dtype(cls, data_type) except TypeError: @@ -113,7 +114,8 @@ def dtype(cls, data_type: Any) -> "DataType": @classmethod def numpy_dtype(cls, pandera_dtype: dtypes.DataType) -> np.dtype: - """Convert a pandera data type to a numpy data type.""" + """Convert a Pandera :class:`~pandera.dtypes.DataType + to a :class:`numpy.dtype`.""" pandera_dtype = engine.Engine.dtype(cls, pandera_dtype) alias = str(pandera_dtype).lower() @@ -140,6 +142,8 @@ def numpy_dtype(cls, pandera_dtype: dtypes.DataType) -> np.dtype: ) @immutable class BOOL(DataType, dtypes.Bool): + """Semantic representation of a :class:`pandas.BooleanDtype`.""" + type = pd.BooleanDtype() @@ -212,6 +216,8 @@ def _register_numpy_numbers( @Engine.register_dtype(equivalents=[pd.Int64Dtype, pd.Int64Dtype()]) @immutable class INT64(DataType, dtypes.Int): + """Semantic representation of a :class:`pandas.Int64Dtype`.""" + type = pd.Int64Dtype() bit_width: int = 64 @@ -219,6 +225,8 @@ class INT64(DataType, dtypes.Int): @Engine.register_dtype(equivalents=[pd.Int32Dtype, pd.Int32Dtype()]) @immutable class INT32(INT64): + """Semantic representation of a :class:`pandas.Int32Dtype`.""" + type = pd.Int32Dtype() bit_width: int = 32 @@ -226,6 +234,8 @@ class INT32(INT64): @Engine.register_dtype(equivalents=[pd.Int16Dtype, pd.Int16Dtype()]) 
@immutable class INT16(INT32): + """Semantic representation of a :class:`pandas.Int16Dtype`.""" + type = pd.Int16Dtype() bit_width: int = 16 @@ -233,6 +243,8 @@ class INT16(INT32): @Engine.register_dtype(equivalents=[pd.Int8Dtype, pd.Int8Dtype()]) @immutable class INT8(INT16): + """Semantic representation of a :class:`pandas.Int8Dtype`.""" + type = pd.Int8Dtype() bit_width: int = 8 @@ -251,6 +263,8 @@ class INT8(INT16): @Engine.register_dtype(equivalents=[pd.UInt64Dtype, pd.UInt64Dtype()]) @immutable class UINT64(DataType, dtypes.UInt): + """Semantic representation of a :class:`pandas.UInt64Dtype`.""" + type = pd.UInt64Dtype() bit_width: int = 64 @@ -258,6 +272,8 @@ class UINT64(DataType, dtypes.UInt): @Engine.register_dtype(equivalents=[pd.UInt32Dtype, pd.UInt32Dtype()]) @immutable class UINT32(UINT64): + """Semantic representation of a :class:`pandas.UInt32Dtype`.""" + type = pd.UInt32Dtype() bit_width: int = 32 @@ -265,6 +281,8 @@ class UINT32(UINT64): @Engine.register_dtype(equivalents=[pd.UInt16Dtype, pd.UInt16Dtype()]) @immutable class UINT16(UINT32): + """Semantic representation of a :class:`pandas.UInt16Dtype`.""" + type = pd.UInt16Dtype() bit_width: int = 16 @@ -272,6 +290,8 @@ class UINT16(UINT32): @Engine.register_dtype(equivalents=[pd.UInt8Dtype, pd.UInt8Dtype()]) @immutable class UINT8(UINT16): + """Semantic representation of a :class:`pandas.UInt8Dtype`.""" + type = pd.UInt8Dtype() bit_width: int = 8 @@ -330,7 +350,7 @@ def from_parametrized_dtype( cls, cat: Union[dtypes.Category, pd.CategoricalDtype] ): """Convert a categorical to - a Pandera :class:`~pandera.dtypes.pandas_engine.Category`.""" + a Pandera :class:`pandera.dtypes.pandas_engine.Category`.""" return cls( # type: ignore categories=cat.categories, ordered=cat.ordered ) @@ -341,6 +361,8 @@ def from_parametrized_dtype( ) @immutable class STRING(DataType, dtypes.String): + """Semantic representation of a :class:`pandas.StringDtype`.""" + type = pd.StringDtype() @@ -432,7 +454,7 @@ def 
_to_datetime(col: pd.Series) -> pd.Series: @classmethod def from_parametrized_dtype(cls, pd_dtype: pd.DatetimeTZDtype): """Convert a :class:`pandas.DatetimeTZDtype` to - a Pandera :class:`~pandera.engines.pandas_engine.DateTime`.""" + a Pandera :class:`pandera.engines.pandas_engine.DateTime`.""" return cls(unit=pd_dtype.unit, tz=pd_dtype.tz) # type: ignore def __str__(self) -> str: @@ -469,7 +491,7 @@ def __post_init__(self): @classmethod def from_parametrized_dtype(cls, pd_dtype: pd.PeriodDtype): """Convert a :class:`pandas.PeriodDtype` to - a Pandera :class:`~pandera.engines.pandas_engine.Period`.""" + a Pandera :class:`pandera.engines.pandas_engine.Period`.""" return cls(freq=pd_dtype.freq) # type: ignore @@ -497,7 +519,7 @@ def __post_init__(self): @classmethod def from_parametrized_dtype(cls, pd_dtype: pd.SparseDtype): """Convert a :class:`pandas.SparseDtype` to - a Pandera :class:`~pandera.engines.pandas_engine.Sparse`.""" + a Pandera :class:`pandera.engines.pandas_engine.Sparse`.""" return cls( # type: ignore dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value ) @@ -519,5 +541,5 @@ def __post_init__(self): @classmethod def from_parametrized_dtype(cls, pd_dtype: pd.IntervalDtype): """Convert a :class:`pandas.IntervalDtype` to - a Pandera :class:`~pandera.engines.pandas_engine.Interval`.""" + a Pandera :class:`pandera.engines.pandas_engine.Interval`.""" return cls(subtype=pd_dtype.subtype) # type: ignore diff --git a/pandera/schemas.py b/pandera/schemas.py index d8f205345..0ffb10ca6 100644 --- a/pandera/schemas.py +++ b/pandera/schemas.py @@ -233,8 +233,9 @@ def _set_column_handler(column, column_name): @property def dtypes(self) -> Dict[str, DataType]: """ - A pandas style dtypes dict where the keys are column names and values - are pandas dtype for the column. Excludes columns where regex=True. + A dict where the keys are column names and values are + :class:`~pandera.dtypes.DataType` for the column. Excludes columns + where `regex=True`. 
:returns: dictionary of columns and their associated dtypes. """ From 6b0f034cff0d428c9559e37c317d8d01a6b0f062 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 29 Jun 2021 23:29:24 +0200 Subject: [PATCH 08/23] add document for DataType refactor --- docs/source/dataframe_schemas.rst | 30 +++-- docs/source/dtypes.rst | 183 ++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + pandera/dtypes.py | 1 - pandera/engines/engine.py | 24 +++- pandera/engines/pandas_engine.py | 2 +- pandera/schemas.py | 3 +- 7 files changed, 230 insertions(+), 14 deletions(-) create mode 100644 docs/source/dtypes.rst diff --git a/docs/source/dataframe_schemas.rst b/docs/source/dataframe_schemas.rst index ec8b0fd7f..1fe814eef 100644 --- a/docs/source/dataframe_schemas.rst +++ b/docs/source/dataframe_schemas.rst @@ -10,7 +10,7 @@ DataFrame Schemas The :class:`~pandera.schemas.DataFrameSchema` class enables the specification of a schema that verifies the columns and index of a pandas ``DataFrame`` object. -The ``DataFrameSchema`` object consists of |column|_\s and an |index|_. +The :class:`~pandera.schemas.DataFrameSchema` object consists of |column|_\s and an |index|_. .. |column| replace:: ``Column`` .. |index| replace:: ``Index`` @@ -44,12 +44,25 @@ The ``DataFrameSchema`` object consists of |column|_\s and an |index|_. Column Validation ----------------- -A :class:`~pandera.schema_components.Column` must specify the properties of a column in a dataframe -object. It can be optionally verified for its data type, `null values`_ or +A :class:`~pandera.schema_components.Column` must specify the properties of a +column in a dataframe object. It can be optionally verified for its data type, +`null values`_ or duplicate values. The column can be coerced_ into the specified type, and the required_ parameter allows control over whether or not the column is allowed to be missing. 
+Similarly to pandas, the data type can be specified as: + +* a string alias, as long as it is recognized by pandas. +* a python type: `int`, `float`, `double`, `bool`, `str` +* a `numpy data type <https://numpy.org/doc/stable/user/basics.types.html>`_ +* a `pandas extension type <https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#dtypes>`_: + it can be an instance (e.g `pd.CategoricalDtype(["a", "b"])`) or a + class (e.g `pandas.CategoricalDtype`) if it can be initialized with default + values. +* a pandera :class:`~pandera.dtypes.DataType`: it can also be an instance or a + class. + :ref:`Column checks` allow for the DataFrame's values to be checked against a user-provided function. ``Check`` objects also support :ref:`grouping` by a different column so that the user can make @@ -270,7 +283,7 @@ objects can also be used to validate columns in a dataframe on its own: validated_df = df.pipe(column1_schema).pipe(column2_schema) -For multi-column use cases, the ``DataFrameSchema`` is still recommended, but +For multi-column use cases, the :class:`~pandera.schemas.DataFrameSchema` is still recommended, but if you have one or a small number of columns to verify, using ``Column`` objects by themselves is appropriate. @@ -594,12 +607,13 @@ indexes by composing a list of ``pandera.Index`` objects. foo 2 3 -Get Pandas Datatypes --------------------- +Get Pandas Data Types +--------------------- Pandas provides a `dtype` parameter for casting a dataframe to a specific dtype -schema. ``DataFrameSchema`` provides a `dtype` property which returns a pandas -style dict. The keys of the dict are column names and values are the dtype. +schema. :class:`~pandera.schemas.DataFrameSchema` provides +a :attr:`~pandera.schemas.DataFrameSchema.dtypes` property which returns a +dictionary whose keys are column names and values are :class:`~pandera.dtypes.DataType`.
Some examples of where this can be provided to pandas are: diff --git a/docs/source/dtypes.rst b/docs/source/dtypes.rst new file mode 100644 index 000000000..4e54b2b69 --- /dev/null +++ b/docs/source/dtypes.rst @@ -0,0 +1,183 @@ +.. pandera documentation for check_input and check_output decorators + +.. currentmodule:: pandera + +.. _dtypes: + +Extending Data Types (new) +========================== + +*new in 0.7.0* + +Motivations +~~~~~~~~~~~ + +Pandera defines its own inferface for data types in order to abstract the +specificities of dataframe-like data structures in the python ecosystem, such +as Apache Spark, Apache Arrow and xarray. + +.. note:: In the following section ``Pandera Data Type`` refers to a + :class:`pandera.dtypes.DataType` object whereas ``native data type`` refers + to data types used by third-party libraries that Pandera supports (e.g. pandas). + +Most of the time, it is transparent to end users since pandera columns and +indexes accept native data types. However, it is possible to extend the pandera +interface to: + +* modify the **data type check** performed during schema validation. +* modify the behavior of the **coerce** argument for :class:`~pandera.schemas.DataFrameSchema`. +* add your **own custom data types**. + +DataType basics +~~~~~~~~~~~~~~~ + +All pandera data types inherit from :class:`pandera.dtypes.DataType` and must +be hashable. + +A data type implements key methods: + +* :meth:`pandera.dtypes.DataType.check` which validates that data types are equivalent. +* :meth:`pandera.dtypes.DataType.coerce` which coerces a data container + (e.g. :class:`pandas.Series`) to the data type. +* The dunder method ``__str__()`` which should output the native alias. + For example ``str(pandera.Float64) == "float64"`` + + +For pandera's validation methods to be aware of a data type, it has to be +registered with the targeted engine via :meth:`pandera.engines.engine.Engine.register_dtype`.
+An engine is in charge of mapping a pandera :class:`~pandera.dtypes.DataType` +with a native data type counterpart belonging to a third-party library. The mapping +can be queried with :meth:`pandera.engines.engine.Engine.dtype`. + +As of pandera 0.7, only the pandas :class:`~pandera.engines.pandas_engine.Engine` +is supported. + + +Example +~~~~~~~ + +Let's extend :class:`pandas.BooleanDtype` coercion to handle the string +litterals "True" and "False". + +.. testcode:: dtypes + + import pandas as pd + import pandera as pa + from pandera import dtypes + from pandera.engines import pandas_engine + + + @pandas_engine.Engine.register_dtype # 1 + @dtypes.immutable # 2 + class LiteralBool(pandas_engine.BOOL): # 3 + def coerce(self, series: pd.Series) -> pd.Series: + """Coerce a pandas.Series to date types.""" + if pd.api.types.is_string_dtype(series): + series = series.replace({"True": 1, "False": 0}) + return series.astype("boolean") + + + data = pd.Series(["True", "False"], name="literal_bools") + pa.SeriesSchema(LiteralBool(), coerce=True, name="literal_bools").validate(data).dtype # 4 + +.. testoutput:: dtypes + + boolean + +1. Register the data type with the pandas engine. +2. :func:`pandera.dtypes.immutable` creates an immutable (and hashable) + :func:`dataclass`. +3. Inherit :class:`pandera.engines.pandas_engine.BOOL`, which is the pandera + representation of :class:`pandas.BooleanDtype`. This is not mandatory but + it makes our life easier by having already implemented all the required + methods. +4. Check that our new data type can coerce the string literals. + +So far we did not override the default behavior: + +.. testcode:: dtypes + + import pandera as pa + + pa.SeriesSchema("boolean", coerce=True).validate(data) + + +.. testoutput:: dtypes + + Traceback (most recent call last): + ... 
+ pandera.errors.SchemaError: Error while coercing 'literal_bools' to type boolean: Need to pass bool-like values + +To completely replace the default :class:`~pandera.engines.pandas_engine.BOOL`, +we need to supply all the equivalent representations to +:meth:`~pandera.engines.engine.Engine.register_dtype`. Behind the scenes, when +``pa.SeriesSchema("boolean")`` is called the corresponding pandera data type +is looked up using :meth:`pandera.engines.engine.Engine.dtype`. + +.. testcode:: dtypes + + print(f"before: {pandas_engine.Engine.dtype('boolean').__class__}") + + + @pandas_engine.Engine.register_dtype + equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()], + ) + @dtypes.immutable + class LiteralBool(pandas_engine.BOOL): + def coerce(self, series: pd.Series) -> pd.Series: + """Coerce a pandas.Series to date types.""" + if pd.api.types.is_string_dtype(series): + series = series.replace({"True": 1, "False": 0}) + return series.astype("boolean") + + print(f"after: {pandas_engine.Engine.dtype('boolean')}") + + for dtype in ["boolean", pd.BooleanDtype, pd.BooleanDtype()]: + pa.SeriesSchema(dtype, coerce=True).validate(data) + +.. testoutput:: dtypes + + before: + after: + +.. note:: For convenience, we specified both ``pd.BooleanDtype`` and + ``pd.BooleanDtype()`` as equivalents. That gives us more flexibility in + what pandera schemas can recognize (see last for-loop above). + +Parametrized data types +~~~~~~~~~~~~~~~~~~~~~~~ + +Some data types can be parametrized. One common example is +`pandas.CategoricalDtype(categories=None, ordered=False) <(https://pandas.pydata.org/docs/reference/api/pandas.CategoricalDtype.html)>`_. + +The ``equivalents`` argument of +:meth:`~pandera.engines.engine.Engine.register_dtype` does not handle +this situation but will automatically register a :func:`classmethod` with +signature ``from_parametrized_dtype(cls, equivalent:...)`` if the decorated +:class:`~pandera.dtypes.DataType` defines it. 
The ``equivalent`` argument must +be type-annotated because it is leveraged to dispatch the input of +:class:`~pandera.engines.engine.Engine.dtype` to the appropriate +``from_parametrized_dtype`` class method. + +For example, here is a snippet from :class:`pandera.engines.pandas_engine.Category`: + +.. code-block:: python + + import pandas as pd + from pandera import dtypes + + ... + + @classmethod + def from_parametrized_dtype( + cls, cat: Union[dtypes.Category, pd.CategoricalDtype] + ): + """Convert a categorical to + a Pandera :class:`pandera.dtypes.pandas_engine.Category`.""" + return cls( # type: ignore + categories=cat.categories, ordered=cat.ordered + ) + + +.. note:: The dispatch mechanism relies on :func:`functools.singledispatch`. + Unlike the built-in implementation, :data:`typing.Union` is recognized. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 0e5e72096..2c967e954 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -312,6 +312,7 @@ Submit issues, feature requests or bugfixes on lazy_validation data_synthesis_strategies extensions + dtypes .. toctree:: :maxdepth: 6 diff --git a/pandera/dtypes.py b/pandera/dtypes.py index a07d0b015..53ced27f4 100644 --- a/pandera/dtypes.py +++ b/pandera/dtypes.py @@ -139,7 +139,6 @@ def __str__(self) -> str: return "bool" -Boolean = Bool ############################################################################### # signed integer ############################################################################### diff --git a/pandera/engines/engine.py b/pandera/engines/engine.py index 198c243e3..f0cf3198d 100644 --- a/pandera/engines/engine.py +++ b/pandera/engines/engine.py @@ -130,14 +130,32 @@ def register_dtype( *, equivalents: Optional[List[Any]] = None, ) -> Callable: - """Register a Pandera :class:`DataType`. + """Register a Pandera :class:`~pandera.dtypes.DataType` with the engine, + as class decorator. :param pandera_dtype: The DataType to register. 
- :param equivalents: Equivalent scalar data type class or - non-parametrized data type instance. + :param equivalents: Equivalent scalar data type classes or + non-parametrized data type instances. .. note:: The classmethod ``from_parametrized_dtype`` will also be registered. + See :ref:`here` for more usage details. + + :example: + + >>> import pandera as pa + >>> + >>> class MyDataType(pa.DataType): + ... pass + >>> + >>> class MyEngine( + ... metaclass=pa.engines.engine.Engine, base_pandera_dtypes=MyDataType + ... ): + ... pass + >>> + >>> @MyEngine.register_dtype(equivalents=[bool]) + >>> class MyBool(MyDataType): + ... pass """ def _wrapper(pandera_dtype_cls: Type[_DataType]) -> Type[_DataType]: diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index b8549537c..e862fa887 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -65,7 +65,7 @@ def check(self, pandera_dtype: dtypes.DataType) -> bool: pandera_dtype = Engine.dtype(pandera_dtype) except TypeError: return False - return super().check(pandera_dtype) + return self.type == pandera_dtype.type def __str__(self) -> str: return str(self.type) diff --git a/pandera/schemas.py b/pandera/schemas.py index 0ffb10ca6..908716ab8 100644 --- a/pandera/schemas.py +++ b/pandera/schemas.py @@ -232,9 +232,10 @@ def _set_column_handler(column, column_name): @property def dtypes(self) -> Dict[str, DataType]: + # pylint:disable=anomalous-backslash-in-string """ A dict where the keys are column names and values are - :class:`~pandera.dtypes.DataType` for the column. Excludes columns + :class:`~pandera.dtypes.DataType`\s for the column. Excludes columns where `regex=True`. :returns: dictionary of columns and their associated dtypes. 
From f17c473db61cf72e28b445d3436264780c8c1cf1 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 29 Jun 2021 23:32:04 +0200 Subject: [PATCH 09/23] unpin sphinx and drop sphinx_rtd_theme --- environment.yml | 5 ++--- requirements-dev.txt | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index a3ae8dcdc..ec578f31b 100644 --- a/environment.yml +++ b/environment.yml @@ -30,12 +30,11 @@ dependencies: - pytest-cov - pytest-xdist - setuptools >= 52.0.0 - - nox = 2020.12.31 # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122 + - nox = 2020.12.31 # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122 - importlib_metadata # required if python < 3.8 # documentation - - sphinx = 3.5.4 # pinned due to doc-building error https://github.com/pandera-dev/pandera/runs/2601459267 - - sphinx_rtd_theme + - sphinx - sphinx-autodoc-typehints - sphinx-copybutton - recommonmark diff --git a/requirements-dev.txt b/requirements-dev.txt index 7d1f16941..b7a3544de 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -23,8 +23,7 @@ pytest-xdist setuptools >= 52.0.0 nox == 2020.12.31 importlib_metadata -sphinx == 3.5.4 -sphinx_rtd_theme +sphinx sphinx-autodoc-typehints sphinx-copybutton recommonmark From b9f19ca6d58238d034fc7a4eef272fdb0f59662e Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 29 Jun 2021 23:33:55 +0200 Subject: [PATCH 10/23] add xdoctest --- .github/workflows/ci-tests.yml | 7 +++++++ environment.yml | 1 + noxfile.py | 13 +++++++++---- requirements-dev.txt | 1 + 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 6d940cb0a..a3bd01e22 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -209,6 +209,13 @@ jobs: - name: Upload coverage to Codecov uses: 
"codecov/codecov-action@v1" + - name: Check Docstrings + run: > + nox + -db conda -r -v + --non-interactive + --session "doctests-${{ matrix.python-version }}" + - name: Check Docs run: > nox diff --git a/environment.yml b/environment.yml index ec578f31b..b9edb80a6 100644 --- a/environment.yml +++ b/environment.yml @@ -29,6 +29,7 @@ dependencies: - pytest - pytest-cov - pytest-xdist + - xdoctest - setuptools >= 52.0.0 - nox = 2020.12.31 # pinning due to UnicodeDecodeError, see https://github.com/pandera-dev/pandera/pull/504/checks?check_run_id=2841360122 - importlib_metadata # required if python < 3.8 diff --git a/noxfile.py b/noxfile.py index 065e7ec9d..586bf506e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -22,6 +22,7 @@ "mypy", "tests", "docs", + "doctests", ) DEFAULT_PYTHON = "3.8" @@ -294,10 +295,7 @@ def mypy(session: Session) -> None: @nox.parametrize("extra", EXTRA_NAMES) def tests(session: Session, extra: str) -> None: """Run the test suite.""" - install_extras( - session, - extra, - ) + install_extras(session, extra) if session.posargs: args = session.posargs @@ -324,6 +322,13 @@ def tests(session: Session, extra: str) -> None: session.run("pytest", *args) +@nox.session(python=PYTHON_VERSIONS) +def doctests(session: Session) -> None: + """Build the documentation.""" + install_extras(session, extra="all") + session.run("xdoctest", PACKAGE, "--quiet") + + @nox.session(python=PYTHON_VERSIONS) def docs(session: Session) -> None: """Build the documentation.""" diff --git a/requirements-dev.txt b/requirements-dev.txt index b7a3544de..af2ed6b92 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -20,6 +20,7 @@ pylint >= 2.7.2 pytest pytest-cov pytest-xdist +xdoctest setuptools >= 52.0.0 nox == 2020.12.31 importlib_metadata From ed205a4aaa8872113b9c9b5ba357f55b3dcb427f Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 29 Jun 2021 23:34:22 +0200 Subject: [PATCH 11/23] ignore prompt when copying example from doc --- docs/source/conf.py | 5 
+++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 9c0446fb8..32feb47d7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -174,6 +174,11 @@ "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), } +# strip prompts +copybutton_prompt_text = ( + r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +) +copybutton_prompt_is_regexp = True # this is a workaround to filter out forward reference issue in # sphinx_autodoc_typehints From c274afe086cb2c9e4f1e3afdbdfcd6e4725a5475 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 29 Jun 2021 23:46:03 +0200 Subject: [PATCH 12/23] add doctest builder when running sphinx-build locally --- noxfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/noxfile.py b/noxfile.py index 586bf506e..e4db7c09d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -347,6 +347,7 @@ def docs(session: Session) -> None: "-W", "-T", "-b=html", + "-b=doctest", "-d", os.path.join("_build", "doctrees", ""), "source", From 659d0beb26bfa9adcce6de7d76c0135df991e699 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Tue, 29 Jun 2021 23:51:13 +0200 Subject: [PATCH 13/23] fix dtypes doc examples --- docs/source/dtypes.rst | 128 +++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/docs/source/dtypes.rst b/docs/source/dtypes.rst index 4e54b2b69..94328956b 100644 --- a/docs/source/dtypes.rst +++ b/docs/source/dtypes.rst @@ -49,7 +49,7 @@ An engine is in charge of mapping a pandera :class:`~pandera.dtypes.DataType` with a native data type counterpart belonging to a third-party library. The mapping can be queried with :meth:`pandera.engines.engine.Engine.dtype`. -As of pandera 0.7, only the pandas :class:`~pandera.engines.pandas_engine.Engine` +As of pandera 0.7.0, only the pandas :class:`~pandera.engines.pandas_engine.Engine` is supported. @@ -61,24 +61,28 @@ litterals "True" and "False". .. 
testcode:: dtypes - import pandas as pd - import pandera as pa - from pandera import dtypes - from pandera.engines import pandas_engine - - - @pandas_engine.Engine.register_dtype # 1 - @dtypes.immutable # 2 - class LiteralBool(pandas_engine.BOOL): # 3 - def coerce(self, series: pd.Series) -> pd.Series: - """Coerce a pandas.Series to date types.""" - if pd.api.types.is_string_dtype(series): - series = series.replace({"True": 1, "False": 0}) - return series.astype("boolean") - - - data = pd.Series(["True", "False"], name="literal_bools") - pa.SeriesSchema(LiteralBool(), coerce=True, name="literal_bools").validate(data).dtype # 4 + import pandas as pd + import pandera as pa + from pandera import dtypes + from pandera.engines import pandas_engine + + + @pandas_engine.Engine.register_dtype # 1 + @dtypes.immutable # 2 + class LiteralBool(pandas_engine.BOOL): # 3 + def coerce(self, series: pd.Series) -> pd.Series: + """Coerce a pandas.Series to date types.""" + if pd.api.types.is_string_dtype(series): + series = series.replace({"True": 1, "False": 0}) + return series.astype("boolean") + + + data = pd.Series(["True", "False"], name="literal_bools") + print( # 4 + pa.SeriesSchema(LiteralBool(), coerce=True, name="literal_bools") + .validate(data) + .dtype + ) .. testoutput:: dtypes @@ -97,16 +101,16 @@ So far we did not override the default behavior: .. testcode:: dtypes - import pandera as pa - - pa.SeriesSchema("boolean", coerce=True).validate(data) + import pandera as pa + + pa.SeriesSchema("boolean", coerce=True).validate(data) .. testoutput:: dtypes - Traceback (most recent call last): - ... - pandera.errors.SchemaError: Error while coercing 'literal_bools' to type boolean: Need to pass bool-like values + Traceback (most recent call last): + ... 
+ pandera.errors.SchemaError: Error while coercing 'literal_bools' to type boolean: Need to pass bool-like values To completely replace the default :class:`~pandera.engines.pandas_engine.BOOL`, we need to supply all the equivalent representations to @@ -116,33 +120,33 @@ is looked up using :meth:`pandera.engines.engine.Engine.dtype`. .. testcode:: dtypes - print(f"before: {pandas_engine.Engine.dtype('boolean').__class__}") - - - @pandas_engine.Engine.register_dtype - equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()], - ) - @dtypes.immutable - class LiteralBool(pandas_engine.BOOL): - def coerce(self, series: pd.Series) -> pd.Series: - """Coerce a pandas.Series to date types.""" - if pd.api.types.is_string_dtype(series): - series = series.replace({"True": 1, "False": 0}) - return series.astype("boolean") - - print(f"after: {pandas_engine.Engine.dtype('boolean')}") - - for dtype in ["boolean", pd.BooleanDtype, pd.BooleanDtype()]: - pa.SeriesSchema(dtype, coerce=True).validate(data) + print(f"before: {pandas_engine.Engine.dtype('boolean').__class__}") + + + @pandas_engine.Engine.register_dtype( + equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()], + ) + @dtypes.immutable + class LiteralBool(pandas_engine.BOOL): + def coerce(self, series: pd.Series) -> pd.Series: + """Coerce a pandas.Series to date types.""" + if pd.api.types.is_string_dtype(series): + series = series.replace({"True": 1, "False": 0}) + return series.astype("boolean") + + print(f"after: {pandas_engine.Engine.dtype('boolean').__class__}") + + for dtype in ["boolean", pd.BooleanDtype, pd.BooleanDtype()]: + pa.SeriesSchema(dtype, coerce=True).validate(data) .. testoutput:: dtypes - before: - after: + before: + after: .. note:: For convenience, we specified both ``pd.BooleanDtype`` and - ``pd.BooleanDtype()`` as equivalents. That gives us more flexibility in - what pandera schemas can recognize (see last for-loop above). + ``pd.BooleanDtype()`` as equivalents. 
That gives us more flexibility in + what pandera schemas can recognize (see last for-loop above). Parametrized data types ~~~~~~~~~~~~~~~~~~~~~~~ @@ -163,21 +167,21 @@ For example, here is a snippet from :class:`pandera.engines.pandas_engine.Catego .. code-block:: python - import pandas as pd - from pandera import dtypes - - ... - - @classmethod - def from_parametrized_dtype( - cls, cat: Union[dtypes.Category, pd.CategoricalDtype] - ): - """Convert a categorical to - a Pandera :class:`pandera.dtypes.pandas_engine.Category`.""" - return cls( # type: ignore - categories=cat.categories, ordered=cat.ordered - ) + import pandas as pd + from pandera import dtypes + + ... + + @classmethod + def from_parametrized_dtype( + cls, cat: Union[dtypes.Category, pd.CategoricalDtype] + ): + """Convert a categorical to + a Pandera :class:`pandera.dtypes.pandas_engine.Category`.""" + return cls( # type: ignore + categories=cat.categories, ordered=cat.ordered + ) .. note:: The dispatch mechanism relies on :func:`functools.singledispatch`. - Unlike the built-in implementation, :data:`typing.Union` is recognized. \ No newline at end of file + Unlike the built-in implementation, :data:`typing.Union` is recognized. 
\ No newline at end of file From a772a05b2eb298dd606faa4e80152c2c941ece66 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Wed, 30 Jun 2021 00:02:28 +0200 Subject: [PATCH 14/23] fix pandas_engine.DataType.check --- pandera/engines/pandas_engine.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index e862fa887..404a7053c 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -65,7 +65,14 @@ def check(self, pandera_dtype: dtypes.DataType) -> bool: pandera_dtype = Engine.dtype(pandera_dtype) except TypeError: return False - return self.type == pandera_dtype.type + + # attempts to compare pandas native type if possible + # to let subclass inherit check + # (super will compare that DataType classes are exactly the same) + try: + return self.type == pandera_dtype.type + except Exception: + return super().check(pandera_dtype) def __str__(self) -> str: return str(self.type) From 032f9cbd05240d06352e9fee4aed5c118a5a72a6 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Wed, 30 Jun 2021 00:03:57 +0200 Subject: [PATCH 15/23] fix pylint --- pandera/checks.py | 12 ++++++------ pandera/engines/pandas_engine.py | 2 +- tests/core/checks_fixtures.py | 2 +- tests/io/test_io.py | 2 +- tests/strategies/test_strategies.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index 0345dc39b..ce67eaaa1 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -464,13 +464,13 @@ def __eq__(self, other): are_strategy_fn_objects_equal = True are_all_other_check_attributes_equal = { - i: self.__dict__[i] - for i in self.__dict__ - if i not in ["_check_fn", "strategy"] + k: v + for k, v in self.__dict__.items() + if k not in ["_check_fn", "strategy"] } == { - i: other.__dict__[i] - for i in other.__dict__ - if i not in ["_check_fn", "strategy"] + k: v + for k, v in other.__dict__.items() + if k 
not in ["_check_fn", "strategy"] } return ( diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index 404a7053c..0369a2d54 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -71,7 +71,7 @@ def check(self, pandera_dtype: dtypes.DataType) -> bool: # (super will compare that DataType classes are exactly the same) try: return self.type == pandera_dtype.type - except Exception: + except TypeError: return super().check(pandera_dtype) def __str__(self) -> str: diff --git a/tests/core/checks_fixtures.py b/tests/core/checks_fixtures.py index baa99e81e..08b11dbd2 100644 --- a/tests/core/checks_fixtures.py +++ b/tests/core/checks_fixtures.py @@ -1,5 +1,5 @@ """Pytest fixtures for testing custom checks.""" -import unittest.mock as mock +from unittest import mock import pandas as pd import pytest diff --git a/tests/io/test_io.py b/tests/io/test_io.py index 00da31761..f1b7507c5 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -2,8 +2,8 @@ import platform import tempfile -import unittest.mock as mock from pathlib import Path +from unittest import mock import pandas as pd import pytest diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py index 850ed1292..6b68f6bb9 100644 --- a/tests/strategies/test_strategies.py +++ b/tests/strategies/test_strategies.py @@ -10,7 +10,7 @@ import pytest import pandera as pa -import pandera.strategies as strategies +from pandera import strategies from pandera.checks import _CheckBase, register_check_statistics from pandera.dtypes import is_category, is_complex, is_float from pandera.engines import pandas_engine From d8567f87c38cded44e3dca8885cc30f73cf231c7 Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Wed, 30 Jun 2021 00:36:46 +0200 Subject: [PATCH 16/23] remove whitespaces in dtypes doc --- docs/source/dtypes.rst | 104 ++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git 
a/docs/source/dtypes.rst b/docs/source/dtypes.rst index 94328956b..c5f74e513 100644 --- a/docs/source/dtypes.rst +++ b/docs/source/dtypes.rst @@ -13,14 +13,14 @@ Motivations ~~~~~~~~~~~ Pandera defines its own inferface for data types in order to abstract the -specificities of dataframe-like data structures in the python ecosystem, such +specificities of dataframe-like data structures in the python ecosystem, such as Apache Spark, Apache Arrow and xarray. -.. note:: In the following section ``Pandera Data Type`` refers to a +.. note:: In the following section ``Pandera Data Type`` refers to a :class:`pandera.dtypes.DataType` object whereas ``native data type`` refers to data types used by third-party libraries that Pandera supports (e.g. pandas). -Most of the time, it is transparent to end users since pandera columns and +Most of the time, it is transparent to end users since pandera columns and indexes accept native data types. However, it is possible to extend the pandera interface to: @@ -37,26 +37,26 @@ be hashable. A data type implements key methods: * :meth:`pandera.dtypes.DataType.check` which validates that data types are equivalent. -* :meth:`pandera.dtypes.DataType.coerce` which coerces a data container +* :meth:`pandera.dtypes.DataType.coerce` which coerces a data container (e.g. :class:`pandas.Series`) to the data type. -* The dunder method ``__str__()`` which should output the native alias. +* The dunder method ``__str__()`` which should output the native alias. For example ``str(pandera.Float64) == "float64"`` - -For pandera's validation methods to be aware of a data type, it has to be -registered with the targeted engine via :meth:`pandera.engines.engine.Engine.register_dtype`. -An engine is in charge of mapping a pandera :class:`~pandera.dtypes.DataType` + +For pandera's validation methods to be aware of a data type, it has to be +registered with the targeted engine via :meth:`pandera.engines.engine.Engine.register_dtype`. 
+An engine is in charge of mapping a pandera :class:`~pandera.dtypes.DataType` with a native data type counterpart belonging to a third-party library. The mapping -can be queried with :meth:`pandera.engines.engine.Engine.dtype`. +can be queried with :meth:`pandera.engines.engine.Engine.dtype`. -As of pandera 0.7.0, only the pandas :class:`~pandera.engines.pandas_engine.Engine` +As of pandera 0.7.0, only the pandas :class:`~pandera.engines.pandas_engine.Engine` is supported. Example ~~~~~~~ - -Let's extend :class:`pandas.BooleanDtype` coercion to handle the string + +Let's extend :class:`pandas.BooleanDtype` coercion to handle the string litterals "True" and "False". .. testcode:: dtypes @@ -65,31 +65,31 @@ litterals "True" and "False". import pandera as pa from pandera import dtypes from pandera.engines import pandas_engine - - - @pandas_engine.Engine.register_dtype # 1 - @dtypes.immutable # 2 - class LiteralBool(pandas_engine.BOOL): # 3 + + + @pandas_engine.Engine.register_dtype # 1 + @dtypes.immutable # 2 + class LiteralBool(pandas_engine.BOOL): # 3 def coerce(self, series: pd.Series) -> pd.Series: """Coerce a pandas.Series to date types.""" if pd.api.types.is_string_dtype(series): series = series.replace({"True": 1, "False": 0}) return series.astype("boolean") - - + + data = pd.Series(["True", "False"], name="literal_bools") - print( # 4 + print( # 4 pa.SeriesSchema(LiteralBool(), coerce=True, name="literal_bools") .validate(data) .dtype - ) + ) .. testoutput:: dtypes boolean 1. Register the data type with the pandas engine. -2. :func:`pandera.dtypes.immutable` creates an immutable (and hashable) +2. :func:`pandera.dtypes.immutable` creates an immutable (and hashable) :func:`dataclass`. 3. Inherit :class:`pandera.engines.pandas_engine.BOOL`, which is the pandera representation of :class:`pandas.BooleanDtype`. This is not mandatory but @@ -102,66 +102,67 @@ So far we did not override the default behavior: .. 
testcode:: dtypes import pandera as pa - + pa.SeriesSchema("boolean", coerce=True).validate(data) - + .. testoutput:: dtypes - + Traceback (most recent call last): ... pandera.errors.SchemaError: Error while coercing 'literal_bools' to type boolean: Need to pass bool-like values -To completely replace the default :class:`~pandera.engines.pandas_engine.BOOL`, -we need to supply all the equivalent representations to -:meth:`~pandera.engines.engine.Engine.register_dtype`. Behind the scenes, when +To completely replace the default :class:`~pandera.engines.pandas_engine.BOOL`, +we need to supply all the equivalent representations to +:meth:`~pandera.engines.engine.Engine.register_dtype`. Behind the scenes, when ``pa.SeriesSchema("boolean")`` is called the corresponding pandera data type is looked up using :meth:`pandera.engines.engine.Engine.dtype`. .. testcode:: dtypes print(f"before: {pandas_engine.Engine.dtype('boolean').__class__}") - - + + @pandas_engine.Engine.register_dtype( - equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()], + equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()], ) @dtypes.immutable class LiteralBool(pandas_engine.BOOL): def coerce(self, series: pd.Series) -> pd.Series: """Coerce a pandas.Series to date types.""" if pd.api.types.is_string_dtype(series): - series = series.replace({"True": 1, "False": 0}) + series = series.replace({"True": 1, "False": 0}) return series.astype("boolean") - + + print(f"after: {pandas_engine.Engine.dtype('boolean').__class__}") - + for dtype in ["boolean", pd.BooleanDtype, pd.BooleanDtype()]: pa.SeriesSchema(dtype, coerce=True).validate(data) .. testoutput:: dtypes - + before: after: -.. note:: For convenience, we specified both ``pd.BooleanDtype`` and +.. note:: For convenience, we specified both ``pd.BooleanDtype`` and ``pd.BooleanDtype()`` as equivalents. That gives us more flexibility in what pandera schemas can recognize (see last for-loop above). 
Parametrized data types ~~~~~~~~~~~~~~~~~~~~~~~ -Some data types can be parametrized. One common example is +Some data types can be parametrized. One common example is `pandas.CategoricalDtype(categories=None, ordered=False) <(https://pandas.pydata.org/docs/reference/api/pandas.CategoricalDtype.html)>`_. -The ``equivalents`` argument of -:meth:`~pandera.engines.engine.Engine.register_dtype` does not handle -this situation but will automatically register a :func:`classmethod` with -signature ``from_parametrized_dtype(cls, equivalent:...)`` if the decorated +The ``equivalents`` argument of +:meth:`~pandera.engines.engine.Engine.register_dtype` does not handle +this situation but will automatically register a :func:`classmethod` with +signature ``from_parametrized_dtype(cls, equivalent:...)`` if the decorated :class:`~pandera.dtypes.DataType` defines it. The ``equivalent`` argument must -be type-annotated because it is leveraged to dispatch the input of -:class:`~pandera.engines.engine.Engine.dtype` to the appropriate -``from_parametrized_dtype`` class method. +be type-annotated because it is leveraged to dispatch the input of +:class:`~pandera.engines.engine.Engine.dtype` to the appropriate +``from_parametrized_dtype`` class method. For example, here is a snippet from :class:`pandera.engines.pandas_engine.Category`: @@ -169,19 +170,18 @@ For example, here is a snippet from :class:`pandera.engines.pandas_engine.Catego import pandas as pd from pandera import dtypes - + ... - + + @classmethod def from_parametrized_dtype( cls, cat: Union[dtypes.Category, pd.CategoricalDtype] ): """Convert a categorical to a Pandera :class:`pandera.dtypes.pandas_engine.Category`.""" - return cls( # type: ignore - categories=cat.categories, ordered=cat.ordered - ) + return cls(categories=cat.categories, ordered=cat.ordered) # type: ignore -.. note:: The dispatch mechanism relies on :func:`functools.singledispatch`. - Unlike the built-in implementation, :data:`typing.Union` is recognized. 
\ No newline at end of file +.. note:: The dispatch mechanism relies on :func:`functools.singledispatch`. + Unlike the built-in implementation, :data:`typing.Union` is recognized. From 7b2d2fb6097b7972d8415842dd29a36574896d43 Mon Sep 17 00:00:00 2001 From: Niels Bantilan Date: Wed, 30 Jun 2021 09:47:30 -0400 Subject: [PATCH 17/23] Update docs/source/dtypes.rst --- docs/source/dtypes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/dtypes.rst b/docs/source/dtypes.rst index c5f74e513..32cade527 100644 --- a/docs/source/dtypes.rst +++ b/docs/source/dtypes.rst @@ -12,7 +12,7 @@ Extending Data Types (new) Motivations ~~~~~~~~~~~ -Pandera defines its own inferface for data types in order to abstract the +Pandera defines its own interface for data types in order to abstract the specificities of dataframe-like data structures in the python ecosystem, such as Apache Spark, Apache Arrow and xarray. From 1c94b0d8253fc0e208e6e0dda90fa91dceda4aae Mon Sep 17 00:00:00 2001 From: Niels Bantilan Date: Wed, 30 Jun 2021 09:49:56 -0400 Subject: [PATCH 18/23] Update dtypes.rst --- docs/source/dtypes.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/dtypes.rst b/docs/source/dtypes.rst index 32cade527..f05cbfb07 100644 --- a/docs/source/dtypes.rst +++ b/docs/source/dtypes.rst @@ -13,7 +13,7 @@ Motivations ~~~~~~~~~~~ Pandera defines its own interface for data types in order to abstract the -specificities of dataframe-like data structures in the python ecosystem, such +specifics of dataframe-like data structures in the python ecosystem, such as Apache Spark, Apache Arrow and xarray. .. note:: In the following section ``Pandera Data Type`` refers to a @@ -22,11 +22,11 @@ as Apache Spark, Apache Arrow and xarray. Most of the time, it is transparent to end users since pandera columns and indexes accept native data types. 
However, it is possible to extend the pandera -interface to: +interface by: -* modify the **data type check** performed during schema validation. -* modify the behavior of the **coerce** argument for :class:`~pandea.schemas.DataFrameSchema`. -* add your **own custom data types**. +* modifying the **data type check** performed during schema validation. +* modifying the behavior of the **coerce** argument for :class:`~pandera.schemas.DataFrameSchema`. +* adding your **own custom data types**. DataType basics ~~~~~~~~~~~~~~~ @@ -34,7 +34,7 @@ DataType basics All pandera data types inherit from :class:`pandera.dtypes.DataType` and must be hashable. -A data type implements key methods: +A data type implements three key methods: * :meth:`pandera.dtypes.DataType.check` which validates that data types are equivalent. * :meth:`pandera.dtypes.DataType.coerce` which coerces a data container @@ -49,7 +49,7 @@ An engine is in charge of mapping a pandera :class:`~pandera.dtypes.DataType` with a native data type counterpart belonging to a third-party library. The mapping can be queried with :meth:`pandera.engines.engine.Engine.dtype`. -As of pandera 0.7.0, only the pandas :class:`~pandera.engines.pandas_engine.Engine` +As of pandera ``0.7.0``, only the pandas :class:`~pandera.engines.pandas_engine.Engine` is supported. @@ -57,7 +57,7 @@ Example ~~~~~~~ Let's extend :class:`pandas.BooleanDtype` coercion to handle the string -litterals "True" and "False". +literals ``"True"`` and ``"False"``. .. 
testcode:: dtypes From f7ad5354b730cc75c5ddb9a11824a2dd73335a33 Mon Sep 17 00:00:00 2001 From: cosmicBboy Date: Wed, 30 Jun 2021 11:11:35 -0400 Subject: [PATCH 19/23] update docs structure --- Makefile | 2 +- docs/source/data_synthesis_strategies.rst | 4 +-- docs/source/dtypes.rst | 21 +++++++------ docs/source/extensions.rst | 4 +-- docs/source/index.rst | 2 +- docs/source/reference/checks.rst | 10 ------ docs/source/reference/core.rst | 35 +++++++++++++++++++++ docs/source/reference/decorators.rst | 2 ++ docs/source/reference/dtypes.rst | 13 +++++--- docs/source/reference/errors.rst | 2 ++ docs/source/reference/extensions.rst | 2 ++ docs/source/reference/index.rst | 30 +++++++++++++----- docs/source/reference/io.rst | 6 ++++ docs/source/reference/schema_components.rst | 12 ------- docs/source/reference/schema_inference.rst | 2 ++ docs/source/reference/schema_models.rst | 14 ++++++--- docs/source/reference/schemas.rst | 10 ------ docs/source/reference/strategies.rst | 2 ++ pandera/engines/engine.py | 3 +- pandera/engines/pandas_engine.py | 2 ++ 20 files changed, 114 insertions(+), 64 deletions(-) delete mode 100644 docs/source/reference/checks.rst create mode 100644 docs/source/reference/core.rst delete mode 100644 docs/source/reference/schema_components.rst delete mode 100644 docs/source/reference/schemas.rst diff --git a/Makefile b/Makefile index 983f3c6bf..98107bb49 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ requirements: pip install -r requirements-dev.txt docs: - rm -rf docs/source/generated && \ + rm -rf docs/**/generated docs/**/methods docs/_build && \ python -m sphinx -E "docs/source" "docs/_build" -W && \ make -C docs doctest diff --git a/docs/source/data_synthesis_strategies.rst b/docs/source/data_synthesis_strategies.rst index d49e4a115..3bbd410e0 100644 --- a/docs/source/data_synthesis_strategies.rst +++ b/docs/source/data_synthesis_strategies.rst @@ -4,8 +4,8 @@ .. 
_data synthesis strategies: -Data Synthesis Strategies (new) -=============================== +Data Synthesis Strategies +========================= *new in 0.6.0* diff --git a/docs/source/dtypes.rst b/docs/source/dtypes.rst index f05cbfb07..b687d0d19 100644 --- a/docs/source/dtypes.rst +++ b/docs/source/dtypes.rst @@ -4,8 +4,8 @@ .. _dtypes: -Extending Data Types (new) -========================== +Pandera Data Types (new) +======================== *new in 0.7.0* @@ -67,9 +67,9 @@ literals ``"True"`` and ``"False"``. from pandera.engines import pandas_engine - @pandas_engine.Engine.register_dtype # 1 - @dtypes.immutable # 2 - class LiteralBool(pandas_engine.BOOL): # 3 + @pandas_engine.Engine.register_dtype # step 1 + @dtypes.immutable # step 2 + class LiteralBool(pandas_engine.BOOL): # step 3 def coerce(self, series: pd.Series) -> pd.Series: """Coerce a pandas.Series to date types.""" if pd.api.types.is_string_dtype(series): @@ -78,7 +78,9 @@ literals ``"True"`` and ``"False"``. data = pd.Series(["True", "False"], name="literal_bools") - print( # 4 + + # step 4 + print( pa.SeriesSchema(LiteralBool(), coerce=True, name="literal_bools") .validate(data) .dtype @@ -88,6 +90,8 @@ literals ``"True"`` and ``"False"``. boolean +The example above performs the following steps: + 1. Register the data type with the pandas engine. 2. :func:`pandera.dtypes.immutable` creates an immutable (and hashable) :func:`dataclass`. @@ -153,7 +157,7 @@ Parametrized data types ~~~~~~~~~~~~~~~~~~~~~~~ Some data types can be parametrized. One common example is -`pandas.CategoricalDtype(categories=None, ordered=False) <(https://pandas.pydata.org/docs/reference/api/pandas.CategoricalDtype.html)>`_. +:class:`pandas.CategoricalDtype`. The ``equivalents`` argument of :meth:`~pandera.engines.engine.Engine.register_dtype` does not handle @@ -171,9 +175,6 @@ For example, here is a snippet from :class:`pandera.engines.pandas_engine.Catego import pandas as pd from pandera import dtypes - ... 
- - @classmethod def from_parametrized_dtype( cls, cat: Union[dtypes.Category, pd.CategoricalDtype] diff --git a/docs/source/extensions.rst b/docs/source/extensions.rst index de4928bd8..dd9be7344 100644 --- a/docs/source/extensions.rst +++ b/docs/source/extensions.rst @@ -4,8 +4,8 @@ .. _extensions: -Extensions (new) -================ +Extensions +========== *new in 0.6.0* diff --git a/docs/source/index.rst b/docs/source/index.rst index 2c967e954..c49cf791d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -306,13 +306,13 @@ Submit issues, feature requests or bugfixes on series_schemas checks hypothesis + dtypes decorators schema_inference schema_models lazy_validation data_synthesis_strategies extensions - dtypes .. toctree:: :maxdepth: 6 diff --git a/docs/source/reference/checks.rst b/docs/source/reference/checks.rst deleted file mode 100644 index 6085baa42..000000000 --- a/docs/source/reference/checks.rst +++ /dev/null @@ -1,10 +0,0 @@ -Checks -====== - -.. autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.checks.Check - pandera.hypotheses.Hypothesis diff --git a/docs/source/reference/core.rst b/docs/source/reference/core.rst new file mode 100644 index 000000000..c39175c6e --- /dev/null +++ b/docs/source/reference/core.rst @@ -0,0 +1,35 @@ +.. _api-core: + +Schemas +======= + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.schemas.DataFrameSchema + pandera.schemas.SeriesSchema + +Schema Components +================= + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.schema_components.Column + pandera.schema_components.Index + pandera.schema_components.MultiIndex + +Checks +====== + +.. 
autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + pandera.checks.Check + pandera.hypotheses.Hypothesis diff --git a/docs/source/reference/decorators.rst b/docs/source/reference/decorators.rst index 5b265f1e4..2506336f4 100644 --- a/docs/source/reference/decorators.rst +++ b/docs/source/reference/decorators.rst @@ -1,3 +1,5 @@ +.. _api-decorators: + Decorators ========== diff --git a/docs/source/reference/dtypes.rst b/docs/source/reference/dtypes.rst index f1c551c13..0fc166e75 100644 --- a/docs/source/reference/dtypes.rst +++ b/docs/source/reference/dtypes.rst @@ -1,8 +1,11 @@ +.. _api-dtypes: + Pandera Data Types ================== Library-agnostic dtypes -~~~~~~~~~~~~~~~~~~~~~~~ +----------------------- + .. autosummary:: :toctree: generated :template: dtype.rst @@ -37,7 +40,7 @@ Library-agnostic dtypes Pandas-specific Dtypes -~~~~~~~~~~~~~~~~~~~~~~ +---------------------- Listed here for compatibility with pandera versions < 0.7. Passing native pandas dtypes to pandera components is preferred. @@ -60,7 +63,8 @@ Passing native pandas dtypes to pandera components is preferred. pandera.engines.numpy_engine.Object Utility functions -~~~~~~~~~~~~~~~~~ +----------------- + .. autosummary:: :toctree: generated :nosignatures: @@ -78,7 +82,8 @@ Utility functions pandera.dtypes.immutable Engines -~~~~~~~ +------- + .. autosummary:: :toctree: generated :template: class.rst diff --git a/docs/source/reference/errors.rst b/docs/source/reference/errors.rst index 59cb986b8..74fac1bde 100644 --- a/docs/source/reference/errors.rst +++ b/docs/source/reference/errors.rst @@ -1,3 +1,5 @@ +.. _api-errors: + Errors ====== diff --git a/docs/source/reference/extensions.rst b/docs/source/reference/extensions.rst index 58474a7ea..617b5ed7a 100644 --- a/docs/source/reference/extensions.rst +++ b/docs/source/reference/extensions.rst @@ -1,3 +1,5 @@ +.. 
_api-extensions: + Extensions ========== diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 707a0ed51..3ec1d4713 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -5,17 +5,33 @@ API === -The ``io`` module and built-in ``Hypothesis`` checks require a pandera -installation with the corresponding extension, see the -:ref:`installation` instructions for more details. +.. list-table:: + :widths: 25 75 + + * - :ref:`Core <api-core>` + - The core objects for defining pandera schemas + * - :ref:`Data Types <api-dtypes>` + - Data types for type checking and coercion. + * - :ref:`Schema Models <api-schema-models>` + - Alternative class-based API for defining pandera schemas. + * - :ref:`Decorators <api-decorators>` + - Decorators for integrating pandera schemas with python functions. + * - :ref:`Schema Inference <api-schema-inference>` + - Bootstrap schemas from real data + * - :ref:`IO Utilities <api-io-utils>` + - Utility functions for reading/writing schemas + * - :ref:`Strategies <api-strategies>` + - Module of functions for generating data from schemas. + * - :ref:`Extensions <api-extensions>` + - Utility functions for extending pandera functionality + * - :ref:`Errors <api-errors>` + - Pandera-specific exceptions .. toctree:: - :maxdepth: 2 + :hidden: - schemas - schema_components + core schema_models - checks decorators schema_inference io diff --git a/docs/source/reference/io.rst b/docs/source/reference/io.rst index f7e334af4..2da272a14 100644 --- a/docs/source/reference/io.rst +++ b/docs/source/reference/io.rst @@ -1,6 +1,12 @@ +.. _api-io-utils: + IO Utils ======== +The ``io`` module and built-in ``Hypothesis`` checks require a pandera +installation with the corresponding extension, see the +:ref:`installation` instructions for more details. + .. 
autosummary:: :toctree: generated :nosignatures: diff --git a/docs/source/reference/schema_components.rst b/docs/source/reference/schema_components.rst deleted file mode 100644 index 113fc528a..000000000 --- a/docs/source/reference/schema_components.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Schema Components ------------------ - -.. autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.schema_components.Column - pandera.schema_components.Index - pandera.schema_components.MultiIndex diff --git a/docs/source/reference/schema_inference.rst b/docs/source/reference/schema_inference.rst index d7748ae70..179c151ac 100644 --- a/docs/source/reference/schema_inference.rst +++ b/docs/source/reference/schema_inference.rst @@ -1,3 +1,5 @@ +.. _api-schema-inference: + Schema Inference ================ diff --git a/docs/source/reference/schema_models.rst b/docs/source/reference/schema_models.rst index 0615fbbff..9468a3380 100644 --- a/docs/source/reference/schema_models.rst +++ b/docs/source/reference/schema_models.rst @@ -1,9 +1,13 @@ +.. _api-schema-models: + Schema Models ============= + .. currentmodule:: pandera Schema Model -~~~~~~~~~~~~ +------------ + .. autosummary:: :toctree: generated :template: class.rst @@ -11,7 +15,8 @@ Schema Model pandera.model.SchemaModel Model Components -~~~~~~~~~~~~~~~~ +---------------- + .. autosummary:: :toctree: generated @@ -20,7 +25,7 @@ Model Components pandera.model_components.dataframe_check Typing -~~~~~~ +------ .. autosummary:: :toctree: generated @@ -30,7 +35,8 @@ Typing pandera.typing Config -~~~~~~ +------ + .. autosummary:: :toctree: generated :template: model_component_class.rst diff --git a/docs/source/reference/schemas.rst b/docs/source/reference/schemas.rst deleted file mode 100644 index f9eff67aa..000000000 --- a/docs/source/reference/schemas.rst +++ /dev/null @@ -1,10 +0,0 @@ -Schemas -======= - -.. 
autosummary:: - :toctree: generated - :template: class.rst - :nosignatures: - - pandera.schemas.DataFrameSchema - pandera.schemas.SeriesSchema diff --git a/docs/source/reference/strategies.rst b/docs/source/reference/strategies.rst index dc8f51a99..16f9b1aaa 100644 --- a/docs/source/reference/strategies.rst +++ b/docs/source/reference/strategies.rst @@ -1,3 +1,5 @@ +.. _api-strategies: + Data Synthesis Strategies ========================= diff --git a/pandera/engines/engine.py b/pandera/engines/engine.py index f0cf3198d..e8a4e3b00 100644 --- a/pandera/engines/engine.py +++ b/pandera/engines/engine.py @@ -154,8 +154,9 @@ def register_dtype( ... pass >>> >>> @MyEngine.register_dtype(equivalents=[bool]) - >>> class MyBool(MyDataType): + ... class MyBool(MyDataType): ... pass + """ def _wrapper(pandera_dtype_cls: Type[_DataType]) -> Type[_DataType]: diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index 0369a2d54..55b9a400d 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -338,6 +338,8 @@ class UINT8(UINT16): ) @immutable(init=True) class Category(DataType, dtypes.Category): + """Semantic representation of a :class:`pandas.CategoricalDtype`.""" + type: pd.CategoricalDtype = dataclasses.field(default=None, init=False) def __init__( # pylint:disable=super-init-not-called From 9126b29bb0e7f6b1a3f55b72d9ac77b0c60e2b20 Mon Sep 17 00:00:00 2001 From: cosmicBboy Date: Thu, 1 Jul 2021 10:08:32 -0400 Subject: [PATCH 20/23] update nox file --- noxfile.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/noxfile.py b/noxfile.py index e4db7c09d..105c14faf 100644 --- a/noxfile.py +++ b/noxfile.py @@ -173,11 +173,12 @@ def install_extras( force_pip: bool = False, ) -> None: """Install dependencies.""" - specs = [ - spec if spec != "pandas" else "pandas" - for spec in REQUIRES[extra].values() - if spec not in ALWAYS_USE_PIP - ] + specs, pip_specs = [], [] + for spec in 
REQUIRES[extra].values(): + if spec.split("==")[0] in ALWAYS_USE_PIP: + pip_specs.append(spec) + else: + specs.append(spec if spec != "pandas" else "pandas") if extra == "core": specs.append(REQUIRES["all"]["hypothesis"]) @@ -191,7 +192,7 @@ def install_extras( print("using pip installer") session.install(*specs) - session.install(*ALWAYS_USE_PIP) + session.install(*pip_specs) # always use pip for these packages session.install("-e", ".", "--no-deps") # install pandera From f7b095cf67a0da4f0992cca1e54577399c25fe4d Mon Sep 17 00:00:00 2001 From: cosmicBboy Date: Thu, 1 Jul 2021 11:45:58 -0400 Subject: [PATCH 21/23] force pip on doctests --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 105c14faf..63a1d7be3 100644 --- a/noxfile.py +++ b/noxfile.py @@ -326,7 +326,7 @@ def tests(session: Session, extra: str) -> None: @nox.session(python=PYTHON_VERSIONS) def doctests(session: Session) -> None: """Build the documentation.""" - install_extras(session, extra="all") + install_extras(session, extra="all", force_pip=True) session.run("xdoctest", PACKAGE, "--quiet") From f953a32bcf32ac9cc7063a195a3b8f36cad007eb Mon Sep 17 00:00:00 2001 From: cosmicBboy Date: Thu, 1 Jul 2021 12:03:24 -0400 Subject: [PATCH 22/23] update test_schemas --- tests/core/test_schemas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py index f7ba37e38..f6db2e819 100644 --- a/tests/core/test_schemas.py +++ b/tests/core/test_schemas.py @@ -88,7 +88,6 @@ def test_dataframe_schema(): # checks if 'a' is converted to float, while schema says int, will a schema # error be thrown with pytest.raises(errors.SchemaError): - df.assign(a=[1.7, 2.3, 3.1]).info() schema.validate(df.assign(a=[1.7, 2.3, 3.1])) From f0a33c2750a49fd7f2de0804507d8f2494bc3bbe Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Thu, 1 Jul 2021 23:23:36 +0200 Subject: [PATCH 23/23] fix docs session not overriding html 
with doctest output --- noxfile.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/noxfile.py b/noxfile.py index 63a1d7be3..f2efaa02d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -343,17 +343,17 @@ def docs(session: Session) -> None: os.path.join("source", "reference", "generated"), ignore_errors=True, ) - session.run( - "sphinx-build", - "-W", - "-T", - "-b=html", - "-b=doctest", - "-d", - os.path.join("_build", "doctrees", ""), - "source", - os.path.join("_build", "html", ""), - ) + for builder in ["doctest", "html"]: + session.run( + "sphinx-build", + "-W", + "-T", + f"-b={builder}", + "-d", + os.path.join("_build", "doctrees", ""), + "source", + os.path.join("_build", builder, ""), + ) else: shutil.rmtree(os.path.join("_build"), ignore_errors=True) args = session.posargs or [