From 3e2fad0b64698d3a84a63af23e8f927632f5cefd Mon Sep 17 00:00:00 2001 From: cosmicBboy Date: Thu, 25 Feb 2021 16:42:43 -0500 Subject: [PATCH 01/21] increase cache version --- .github/workflows/ci-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 84af168cf..d5339e9d6 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -9,7 +9,7 @@ env: DEFAULT_PYTHON: 3.8 CI: "true" # Increase this value to reset cache if environment.yml has not changed - CACHE_VERSION: 0 + CACHE_VERSION: 1 jobs: codestyle: From b3ec21b9ae3bf9375a783ecaa6859c8be8c344fd Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 4 Mar 2021 10:07:24 -0800 Subject: [PATCH 02/21] ci: add dataframe checks tests --- tests/core/test_schema_statistics.py | 1 + tests/io/test_io.py | 49 +++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/tests/core/test_schema_statistics.py b/tests/core/test_schema_statistics.py index 640468eff..1c074d70f 100644 --- a/tests/core/test_schema_statistics.py +++ b/tests/core/test_schema_statistics.py @@ -388,6 +388,7 @@ def test_get_dataframe_schema_statistics(): ), ) expectation = { + "checks": None, "columns": { "int": { "pandas_dtype": pa.Int, diff --git a/tests/io/test_io.py b/tests/io/test_io.py index 68a0e5561..bb8038dbd 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -2,6 +2,7 @@ import platform import tempfile +import unittest.mock as mock from pathlib import Path import pandas as pd @@ -9,6 +10,7 @@ from packaging import version import pandera as pa +import pandera.extensions as pa_ext try: from pandera import io @@ -183,6 +185,7 @@ def _create_schema(index="single"): coerce: true required: false regex: true +checks: null index: - pandas_dtype: int nullable: false @@ -244,6 +247,7 @@ def _create_schema_null_index(): min_value: 1 max_value: 3 index: null +checks: null coerce: false strict: false """ @@ -272,6 +276,7 @@ def _create_schema_python_types(): pandas_dtype: str object_column: pandas_dtype: object +checks: null index: null coerce: false strict: false @@ -283,7 +288,7 @@ def _create_schema_python_types(): reason="pyyaml >= 5.1.0 required", ) def test_inferred_schema_io(): - """Test that inferred schema can be writted to yaml.""" + """Test that inferred schema can be written to yaml.""" df = pd.DataFrame( { "column1": [5, 10, 20], @@ -424,3 +429,45 @@ def test_to_yaml_lambda_check(): with pytest.warns(UserWarning): pa.io.to_yaml(schema) + + +@mock.patch("pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict) +def test_to_yaml_registered_dataframe_check(_): + """Tests that writing DataFrameSchema with a registered dataframe works.""" + ncols_gt_called = False + + @pa_ext.register_check_method(statistics=["column_count"]) + def ncols_gt(pandas_obj: pd.DataFrame, column_count: int) -> bool: + """test registered dataframe check""" + + # pylint: disable=unused-variable + nonlocal ncols_gt_called + ncols_gt_called = True + assert isinstance(column_count, int), "column_count must be integral" + assert isinstance( + pandas_obj, pd.DataFrame + ), "ncols_gt should only be applied to DataFrame" + return len(pandas_obj.columns) > column_count + + assert ( + len(pa.Check.REGISTERED_CUSTOM_CHECKS) == 1 + ), "custom check is registered" + + schema = pa.DataFrameSchema( + { + "a": pa.Column( + pa.Int, + ), + }, + checks=[pa.Check.ncols_gt(column_count=5)], + ) + + serialized = pa.io.to_yaml(schema) + loaded = pa.io.from_yaml(serialized) + + assert len(loaded.checks) == 1, "global check was stripped" + + with pytest.raises(pa.errors.SchemaError): + schema.validate(pd.DataFrame(data={"a": [1]})) + + assert ncols_gt_called, "did not call ncols_gt" From 49db091954f22c15d548a92379d52c32ebf138ff Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 4 Mar 2021 10:08:24 -0800 Subject: [PATCH 03/21] bugfix: allow serialization of dataframe checks to_yaml --- pandera/io.py | 40 +++++++++++++++++++++++++++++++----- pandera/schema_statistics.py | 1 + 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pandera/io.py b/pandera/io.py index aabdd1d8d..2ea9c2c64 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -25,15 +25,23 @@ NOT_JSON_SERIALIZABLE = {PandasDtype.DateTime, PandasDtype.Timedelta} -def _serialize_check_stats(check_stats, pandas_dtype): +def _serialize_check_stats(check_stats, pandas_dtype=None): """Serialize check statistics into json/yaml-compatible format.""" + # pylint: disable=unused-argument def handle_stat_dtype(stat): + # infer dtype if not passed + nonlocal pandas_dtype + + if pandas_dtype is None: + pandas_dtype = PandasDtype.get_dtype(type(stat)) + if pandas_dtype == PandasDtype.DateTime: return stat.strftime(DATETIME_FORMAT) elif pandas_dtype == PandasDtype.Timedelta: # serialize to int in nanoseconds return stat.delta + return stat # for unary checks, return a single value instead of a dictionary @@ -88,7 +96,7 @@ def _serialize_schema(dataframe_schema): statistics = get_dataframe_schema_statistics(dataframe_schema) - columns, index = None, None + columns, index, checks = None, None, None if statistics["columns"] is not None: columns = { col_name: _serialize_component_stats(column_stats) @@ -101,18 +109,33 @@ def _serialize_schema(dataframe_schema): for index_stats in statistics["index"] ] + if statistics["checks"] is not None: + checks = { + check_name: _serialize_check_stats( + check_stats, + ) + for check_name, check_stats in statistics["checks"].items() + } + return { "schema_type": "dataframe", "version": __version__, "columns": columns, + "checks": checks, "index": index, "coerce": dataframe_schema.coerce, "strict": dataframe_schema.strict, } -def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype): +def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype=None): + # pylint: disable=unused-argument def handle_stat_dtype(stat): + nonlocal pandas_dtype + + if pandas_dtype is None: + pandas_dtype = PandasDtype.get_dtype(type(stat)) + if pandas_dtype == PandasDtype.DateTime: return pd.to_datetime(stat, format=DATETIME_FORMAT) elif pandas_dtype == PandasDtype.Timedelta: @@ -167,9 +190,9 @@ def _deserialize_component_stats(serialized_component_stats): def _deserialize_schema(serialized_schema): # pylint: disable=import-outside-toplevel - from pandera import Column, DataFrameSchema, Index, MultiIndex + from pandera import Check, Column, DataFrameSchema, Index, MultiIndex - columns, index = None, None + columns, index, checks = None, None, None if serialized_schema["columns"] is not None: columns = { col_name: Column(**_deserialize_component_stats(column_stats)) @@ -182,6 +205,12 @@ def _deserialize_schema(serialized_schema): for index_component in serialized_schema["index"] ] + if serialized_schema["checks"] is not None: + checks = [ + _deserialize_check_stats(getattr(Check, check_name), check_stats) + for check_name, check_stats in serialized_schema["checks"].items() + ] + if index is None: pass elif len(index) == 1: @@ -193,6 +222,7 @@ def _deserialize_schema(serialized_schema): return DataFrameSchema( columns=columns, + checks=checks, index=index, coerce=serialized_schema["coerce"], strict=serialized_schema["strict"], diff --git a/pandera/schema_statistics.py b/pandera/schema_statistics.py index 3271442a1..99e8815ec 100644 --- a/pandera/schema_statistics.py +++ b/pandera/schema_statistics.py @@ -115,6 +115,7 @@ def get_dataframe_schema_statistics(dataframe_schema): } for col_name, column in dataframe_schema.columns.items() }, + "checks": parse_checks(dataframe_schema.checks), "index": ( None if dataframe_schema.index is None From c444d29f4c8d98cff46b46514ad6e49e11cf2246 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 4 Mar 2021 13:45:01 -0800 Subject: [PATCH 04/21] ci: add test to ensure serialization of lambda check fails --- tests/io/test_io.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/io/test_io.py b/tests/io/test_io.py index bb8038dbd..59a019b25 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -471,3 +471,19 @@ def ncols_gt(pandas_obj: pd.DataFrame, column_count: int) -> bool: schema.validate(pd.DataFrame(data={"a": [1]})) assert ncols_gt_called, "did not call ncols_gt" + + +def test_to_yaml_custom_dataframe_check(): + """Tests that writing DataFrameSchema with a registered dataframe raises.""" + + schema = pa.DataFrameSchema( + { + "a": pa.Column( + pa.Int, + ), + }, + checks=[pa.Check(lambda obj: len(obj.index) > 1)], + ) + + with pytest.raises(UserWarning, match=".*register custom checks.*"): + pa.io.to_yaml(schema) From 4f77a0d6abb50eb9bb346d5709d88f33fdf13581 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 4 Mar 2021 13:47:27 -0800 Subject: [PATCH 05/21] bugfix: ensure checks with no parameters generate appropriate schema --- pandera/checks.py | 19 ++++++++++++++++--- tests/core/test_schema_statistics.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index 8046b4ba2..b6993d2d0 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -397,9 +397,12 @@ def __call__( ) def __eq__(self, other): + if not isinstance(other, type(self)): + # we can only be equal if the same type + return False + are_check_fn_objects_equal = ( - self.__dict__["_check_fn"].__code__.co_code - == other.__dict__["_check_fn"].__code__.co_code + self._get_check_fn_code() == other._get_check_fn_code() ) try: @@ -427,8 +430,18 @@ def __eq__(self, other): and are_all_other_check_attributes_equal ) + def _get_check_fn_code(self): + check_fn = self.__dict__["_check_fn"] + try: + code = check_fn.__code__.co_code + except AttributeError: + # try accessing the functools.partial wrapper + code = check_fn.func.__code__.co_code + + return code + def __hash__(self): - return hash(self.__dict__["_check_fn"].__code__.co_code) + return hash(self._get_check_fn_code()) def __repr__(self): return ( diff --git a/tests/core/test_schema_statistics.py b/tests/core/test_schema_statistics.py index 1c074d70f..eb03e57b0 100644 --- a/tests/core/test_schema_statistics.py +++ b/tests/core/test_schema_statistics.py @@ -1,16 +1,32 @@ # pylint: disable=W0212 """Unit tests for inferring statistics of pandas objects.""" +import unittest.mock as mock import pandas as pd import pytest import pandera as pa +import pandera.extensions as pa_ext from pandera import PandasDtype, dtypes, schema_statistics DEFAULT_INT = PandasDtype.from_str_alias(dtypes._DEFAULT_PANDAS_INT_TYPE) DEFAULT_FLOAT = PandasDtype.from_str_alias(dtypes._DEFAULT_PANDAS_FLOAT_TYPE) +@pytest.fixture(scope="function") +def extra_registered_checks(): + """temporarily registers custom checks onto the Check class""" + with mock.patch( + "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict + ): + # register custom checks here + @pa_ext.register_check_method() + def no_param_check(_: pd.DataFrame) -> bool: + return True + + yield + + def _create_dataframe(multi_index=False, nullable=False): if multi_index: index = pd.MultiIndex.from_arrays( @@ -562,3 +578,15 @@ def test_parse_checks_and_statistics_roundtrip(checks, expectation): check_statistics = {check.name: check.statistics for check in checks} check_list = schema_statistics.parse_check_statistics(check_statistics) assert set(check_list) == set(checks) + + +def test_parse_checks_and_statistics_no_param(extra_registered_checks): + """Ensure that an edge case where a check does not have parameters is appropriately handled.""" + + checks = [pa.Check.no_param_check()] + expectation = {"no_param_check": {}} + assert schema_statistics.parse_checks(checks) == expectation + + check_statistics = {check.name: check.statistics for check in checks} + check_list = schema_statistics.parse_check_statistics(check_statistics) + assert set(check_list) == set(checks) From e23d6edf02138b42b1f79dde0878de0867279f43 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 4 Mar 2021 15:23:14 -0800 Subject: [PATCH 06/21] wip: allow looking up registered checks --- pandera/checks.py | 20 ++++++++++++++---- pandera/io.py | 39 +++++++++++++++++++++++++++--------- pandera/schema_statistics.py | 7 ++++++- 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index b6993d2d0..a1b5fcd16 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -5,7 +5,7 @@ import re from collections import namedtuple from functools import partial, wraps -from typing import Any, Callable, Dict, Iterable, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Union, TypeVar, Type import pandas as pd @@ -451,9 +451,14 @@ def __repr__(self): ) +T = TypeVar("T") + + class _CheckMeta(type): # pragma: no cover """Check metaclass.""" + REGISTERED_CUSTOM_CHECKS: Dict[str, Callable] = {} # noqa + def __getattr__(cls, name: str) -> Any: """Prevent attribute errors for registered checks.""" attr = cls.__dict__.get(name) @@ -461,12 +466,19 @@ def __getattr__(cls, name: str) -> Any: raise AttributeError(f"'{cls}' object has no attribute '{name}'") return attr + def __contains__(cls: Type[T], item: Union[T, str]) -> bool: + """Allow lookups for registered checks.""" + if isinstance(item, cls): + name = item.name + ref = getattr(cls, name, None) + return False if ref is None else ref == item + else: + # assume item is str + return hasattr(cls, item) + class Check(_CheckBase, metaclass=_CheckMeta): """Check a pandas Series or DataFrame for certain properties.""" - - REGISTERED_CUSTOM_CHECKS: Dict[str, Callable] = {} # noqa - @classmethod @st.register_check_strategy(st.eq_strategy) @register_check_statistics(["value"]) diff --git a/pandera/io.py b/pandera/io.py index 2ea9c2c64..2a95ff5d5 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -25,7 +25,7 @@ NOT_JSON_SERIALIZABLE = {PandasDtype.DateTime, PandasDtype.Timedelta} -def _serialize_check_stats(check_stats, pandas_dtype=None): +def _serialize_check_stats(check_stats, pandas_dtype = None): """Serialize check statistics into json/yaml-compatible format.""" # pylint: disable=unused-argument @@ -55,18 +55,44 @@ def handle_stat_dtype(stat): return serialized_check_stats +def _serialize_dataframe_stats(dataframe_checks): + """ + Serialize global dataframe check statistics into json/yaml-compatible format. + """ + from pandera.checks import Check + + serialized_checks = {} + + if dataframe_checks is None: + return serialized_checks + + for check_name, check_stats in dataframe_checks.items(): + if check_name not in Check: + warnings.warn( + f"Check {check_name} cannot be serialized. This check will be " + "ignored. Did you forget to register it with the extension API?" + ) + else: + # infer dtype and + serialized_checks[check_name] = _serialize_check_stats(check_stats) + + return serialized_checks + + def _serialize_component_stats(component_stats): """ Serialize column or index statistics into json/yaml-compatible format. """ + from pandera.checks import Check + serialized_checks = None if component_stats["checks"] is not None: serialized_checks = {} for check_name, check_stats in component_stats["checks"].items(): - if check_stats is None: + if check_name not in Check: warnings.warn( f"Check {check_name} cannot be serialized. This check will be " - f"ignored" + "ignored. Did you forget to register it with the extension API?" ) else: serialized_checks[check_name] = _serialize_check_stats( @@ -110,12 +136,7 @@ def _serialize_schema(dataframe_schema): ] if statistics["checks"] is not None: - checks = { - check_name: _serialize_check_stats( - check_stats, - ) - for check_name, check_stats in statistics["checks"].items() - } + checks = _serialize_dataframe_stats(statistics["checks"]) return { "schema_type": "dataframe", diff --git a/pandera/schema_statistics.py b/pandera/schema_statistics.py index 99e8815ec..899cbffde 100644 --- a/pandera/schema_statistics.py +++ b/pandera/schema_statistics.py @@ -159,7 +159,12 @@ def parse_checks(checks) -> Union[Dict[str, Any], None]: check_statistics = {} _check_memo = {} for check in checks: - check_statistics[check.name] = check.statistics + if check not in Check: + warnings.warn("only registered checks may be converted to statistics. " + f"Check `{check.name}` will be skipped.") + continue + + check_statistics[check.name] = {} if check.statistics is None else check.statistics _check_memo[check.name] = check # raise ValueError on incompatible checks From e7853448049bc25d6747e483ad68b08fc0c96c5b Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 18 Mar 2021 15:14:43 -0700 Subject: [PATCH 07/21] fix: compare checks by name rather than by object equality --- pandera/checks.py | 11 +++++------ tests/io/test_io.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index a1b5fcd16..36549c024 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -399,7 +399,7 @@ def __call__( def __eq__(self, other): if not isinstance(other, type(self)): # we can only be equal if the same type - return False + return NotImplemented are_check_fn_objects_equal = ( self._get_check_fn_code() == other._get_check_fn_code() @@ -470,11 +470,10 @@ def __contains__(cls: Type[T], item: Union[T, str]) -> bool: """Allow lookups for registered checks.""" if isinstance(item, cls): name = item.name - ref = getattr(cls, name, None) - return False if ref is None else ref == item - else: - # assume item is str - return hasattr(cls, item) + return hasattr(cls, name) + + # assume item is str + return hasattr(cls, item) class Check(_CheckBase, metaclass=_CheckMeta): diff --git a/tests/io/test_io.py b/tests/io/test_io.py index 59a019b25..f6e89a3c4 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -485,5 +485,5 @@ def test_to_yaml_custom_dataframe_check(): checks=[pa.Check(lambda obj: len(obj.index) > 1)], ) - with pytest.raises(UserWarning, match=".*register custom checks.*"): + with pytest.warns(UserWarning, match=".*only registered checks.*"): pa.io.to_yaml(schema) From ecbfaa615638adf513783a712e54f165a4853e35 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 18 Mar 2021 15:20:58 -0700 Subject: [PATCH 08/21] ci: black --- pandera/checks.py | 13 ++++++++++++- pandera/io.py | 2 +- pandera/schema_statistics.py | 10 +++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index 36549c024..e4dc4bb39 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -5,7 +5,17 @@ import re from collections import namedtuple from functools import partial, wraps -from typing import Any, Callable, Dict, Iterable, List, Optional, Union, TypeVar, Type +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Union, + TypeVar, + Type, +) import pandas as pd @@ -478,6 +488,7 @@ def __contains__(cls: Type[T], item: Union[T, str]) -> bool: class Check(_CheckBase, metaclass=_CheckMeta): """Check a pandas Series or DataFrame for certain properties.""" + @classmethod @st.register_check_strategy(st.eq_strategy) @register_check_statistics(["value"]) diff --git a/pandera/io.py b/pandera/io.py index 2a95ff5d5..7849e3946 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -25,7 +25,7 @@ NOT_JSON_SERIALIZABLE = {PandasDtype.DateTime, PandasDtype.Timedelta} -def _serialize_check_stats(check_stats, pandas_dtype = None): +def _serialize_check_stats(check_stats, pandas_dtype=None): """Serialize check statistics into json/yaml-compatible format.""" # pylint: disable=unused-argument diff --git a/pandera/schema_statistics.py b/pandera/schema_statistics.py index 899cbffde..8e6fd33ac 100644 --- a/pandera/schema_statistics.py +++ b/pandera/schema_statistics.py @@ -160,11 +160,15 @@ def parse_checks(checks) -> Union[Dict[str, Any], None]: _check_memo = {} for check in checks: if check not in Check: - warnings.warn("only registered checks may be converted to statistics. " - f"Check `{check.name}` will be skipped.") + warnings.warn( + "only registered checks may be converted to statistics. " + f"Check `{check.name}` will be skipped." + ) continue - check_statistics[check.name] = {} if check.statistics is None else check.statistics + check_statistics[check.name] = ( + {} if check.statistics is None else check.statistics + ) _check_memo[check.name] = check # raise ValueError on incompatible checks From 6f1df542065ed2b744ec4ca37fd9a31f359a2460 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 18 Mar 2021 15:40:03 -0700 Subject: [PATCH 09/21] ci: lint --- pandera/checks.py | 4 ++-- pandera/io.py | 4 +++- pandera/schema_statistics.py | 3 ++- tests/core/test_schema_statistics.py | 6 ++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index e4dc4bb39..1223f324e 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -461,7 +461,7 @@ def __repr__(self): ) -T = TypeVar("T") +_T = TypeVar("_T") class _CheckMeta(type): # pragma: no cover @@ -476,7 +476,7 @@ def __getattr__(cls, name: str) -> Any: raise AttributeError(f"'{cls}' object has no attribute '{name}'") return attr - def __contains__(cls: Type[T], item: Union[T, str]) -> bool: + def __contains__(cls: Type[_T], item: Union[_T, str]) -> bool: """Allow lookups for registered checks.""" if isinstance(item, cls): name = item.name diff --git a/pandera/io.py b/pandera/io.py index 7849e3946..11a4cee52 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -59,6 +59,7 @@ def _serialize_dataframe_stats(dataframe_checks): """ Serialize global dataframe check statistics into json/yaml-compatible format. """ + # pylint: disable=import-outside-toplevel from pandera.checks import Check serialized_checks = {} @@ -69,7 +70,7 @@ def _serialize_dataframe_stats(dataframe_checks): for check_name, check_stats in dataframe_checks.items(): if check_name not in Check: warnings.warn( - f"Check {check_name} cannot be serialized. This check will be " + f"Check `{check_name}` cannot be serialized. This check will be " "ignored. Did you forget to register it with the extension API?" ) else: @@ -83,6 +84,7 @@ def _serialize_component_stats(component_stats): """ Serialize column or index statistics into json/yaml-compatible format. """ + # pylint: disable=import-outside-toplevel from pandera.checks import Check serialized_checks = None diff --git a/pandera/schema_statistics.py b/pandera/schema_statistics.py index 8e6fd33ac..865335143 100644 --- a/pandera/schema_statistics.py +++ b/pandera/schema_statistics.py @@ -161,7 +161,8 @@ def parse_checks(checks) -> Union[Dict[str, Any], None]: for check in checks: if check not in Check: warnings.warn( - "only registered checks may be converted to statistics. " + "Only registered checks may be converted to statistics. " + "Did you forget to register it with the extension API? " f"Check `{check.name}` will be skipped." ) continue diff --git a/tests/core/test_schema_statistics.py b/tests/core/test_schema_statistics.py index eb03e57b0..62001cc6c 100644 --- a/tests/core/test_schema_statistics.py +++ b/tests/core/test_schema_statistics.py @@ -16,6 +16,7 @@ @pytest.fixture(scope="function") def extra_registered_checks(): """temporarily registers custom checks onto the Check class""" + # pylint: disable=unused-variable with mock.patch( "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict ): @@ -580,6 +581,8 @@ def test_parse_checks_and_statistics_roundtrip(checks, expectation): assert set(check_list) == set(checks) +# The next line is a workaround for pylint's confusion about pytest fixtures +# pylint: disable=redefined-outer-name,unused-argument def test_parse_checks_and_statistics_no_param(extra_registered_checks): """Ensure that an edge case where a check does not have parameters is appropriately handled.""" @@ -590,3 +593,6 @@ def test_parse_checks_and_statistics_no_param(extra_registered_checks): check_statistics = {check.name: check.statistics for check in checks} check_list = schema_statistics.parse_check_statistics(check_statistics) assert set(check_list) == set(checks) + + +# pylint: enable=redefined-outer-name,unused-argument From c3b61bab17b71f2977de29ea12da20b69f301536 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 18 Mar 2021 16:38:04 -0700 Subject: [PATCH 10/21] enh: use REGISTERED_CUSTOM_CHECKS for attribute lookup, add dir method --- pandera/checks.py | 9 ++++++-- pandera/extensions.py | 5 ++--- pandera/io.py | 4 +++- tests/core/checks_fixtures.py | 32 ++++++++++++++++++++++++++++ tests/core/conftest.py | 4 ++++ tests/core/test_extensions.py | 10 +++------ tests/core/test_schema_statistics.py | 22 ++----------------- tests/io/test_io.py | 2 +- 8 files changed, 54 insertions(+), 34 deletions(-) create mode 100644 tests/core/checks_fixtures.py create mode 100644 tests/core/conftest.py diff --git a/pandera/checks.py b/pandera/checks.py index 1223f324e..e69ce6c3c 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -3,8 +3,9 @@ import inspect import operator import re -from collections import namedtuple +from collections import namedtuple, ChainMap from functools import partial, wraps +from itertools import chain from typing import ( Any, Callable, @@ -471,11 +472,15 @@ class _CheckMeta(type): # pragma: no cover def __getattr__(cls, name: str) -> Any: """Prevent attribute errors for registered checks.""" - attr = cls.__dict__.get(name) + attr = ChainMap(cls.__dict__, cls.REGISTERED_CUSTOM_CHECKS).get(name) if attr is None: raise AttributeError(f"'{cls}' object has no attribute '{name}'") return attr + def __dir__(cls) -> Iterable[str]: + """Allow custom checks to show up as attributes when autocompleting.""" + return chain(super().__dir__(), cls.REGISTERED_CUSTOM_CHECKS.keys()) + def __contains__(cls: Type[_T], item: Union[_T, str]) -> bool: """Allow lookups for registered checks.""" if isinstance(item, cls): diff --git a/pandera/extensions.py b/pandera/extensions.py index 308476f9f..32f0eb606 100644 --- a/pandera/extensions.py +++ b/pandera/extensions.py @@ -161,9 +161,8 @@ def check_method(cls, *args, **kwargs): if strategy is not None: check_method = st.register_check_strategy(strategy)(check_method) - setattr(Check, check_fn.__name__, classmethod(check_method)) - Check.REGISTERED_CUSTOM_CHECKS[check_fn.__name__] = getattr( - Check, check_fn.__name__ + Check.REGISTERED_CUSTOM_CHECKS[check_fn.__name__] = partial( + check_method, Check ) return register_check_wrapper(check_fn) diff --git a/pandera/io.py b/pandera/io.py index 11a4cee52..36e16f5c4 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -1,4 +1,5 @@ """Module for reading and writing schema objects.""" +# pylint: disable=fixme import warnings from functools import partial @@ -30,7 +31,7 @@ def _serialize_check_stats(check_stats, pandas_dtype=None): # pylint: disable=unused-argument def handle_stat_dtype(stat): - # infer dtype if not passed + # fixme: change interface to not require a dtype spec nonlocal pandas_dtype if pandas_dtype is None: @@ -154,6 +155,7 @@ def _serialize_schema(dataframe_schema): def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype=None): # pylint: disable=unused-argument def handle_stat_dtype(stat): + # fixme: change interface to not require a dtype spec nonlocal pandas_dtype if pandas_dtype is None: diff --git a/tests/core/checks_fixtures.py b/tests/core/checks_fixtures.py new file mode 100644 index 000000000..54f804b13 --- /dev/null +++ b/tests/core/checks_fixtures.py @@ -0,0 +1,32 @@ +"""Pytest fixtures for testing custom checks.""" +import unittest.mock as mock +import pytest +import pandas as pd + +import pandera as pa +import pandera.extensions as pa_ext + +__all__ = "custom_check_teardown", "extra_registered_checks" + + +@pytest.fixture(scope="function") +def custom_check_teardown(): + """Remove all custom checks after execution of each pytest function.""" + yield + for check_name in list(pa.Check.REGISTERED_CUSTOM_CHECKS): + del pa.Check.REGISTERED_CUSTOM_CHECKS[check_name] + + +@pytest.fixture(scope="function") +def extra_registered_checks(): + """temporarily registers custom checks onto the Check class""" + # pylint: disable=unused-variable + with mock.patch( + "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict + ): + # register custom checks here + @pa_ext.register_check_method() + def no_param_check(_: pd.DataFrame) -> bool: + return True + + yield diff --git a/tests/core/conftest.py b/tests/core/conftest.py new file mode 100644 index 000000000..06f637dd9 --- /dev/null +++ b/tests/core/conftest.py @@ -0,0 +1,4 @@ +"""Registers fixtures for core""" + +# pylint: disable=unused-import +from .checks_fixtures import custom_check_teardown, extra_registered_checks diff --git a/tests/core/test_extensions.py b/tests/core/test_extensions.py index 689268a0a..17fc3b24e 100644 --- a/tests/core/test_extensions.py +++ b/tests/core/test_extensions.py @@ -13,13 +13,9 @@ from pandera.checks import Check -@pytest.fixture(scope="function") -def custom_check_teardown(): - """Remove all custom checks after execution of each pytest function.""" - yield - for check_name in list(pa.Check.REGISTERED_CUSTOM_CHECKS): - delattr(pa.Check, check_name) - del pa.Check.REGISTERED_CUSTOM_CHECKS[check_name] +def test_custom_checks_in_dir(extra_registered_checks): + """Ensures that autocomplete works with registered custom checks.""" + assert "no_param_check" in dir(pa.Check) @pytest.mark.parametrize( diff --git a/tests/core/test_schema_statistics.py b/tests/core/test_schema_statistics.py index 62001cc6c..d82f85f9d 100644 --- a/tests/core/test_schema_statistics.py +++ b/tests/core/test_schema_statistics.py @@ -1,33 +1,16 @@ # pylint: disable=W0212 """Unit tests for inferring statistics of pandas objects.""" -import unittest.mock as mock import pandas as pd import pytest import pandera as pa -import pandera.extensions as pa_ext from pandera import PandasDtype, dtypes, schema_statistics DEFAULT_INT = PandasDtype.from_str_alias(dtypes._DEFAULT_PANDAS_INT_TYPE) DEFAULT_FLOAT = PandasDtype.from_str_alias(dtypes._DEFAULT_PANDAS_FLOAT_TYPE) -@pytest.fixture(scope="function") -def extra_registered_checks(): - """temporarily registers custom checks onto the Check class""" - # pylint: disable=unused-variable - with mock.patch( - "pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict - ): - # register custom checks here - @pa_ext.register_check_method() - def no_param_check(_: pd.DataFrame) -> bool: - return True - - yield - - def _create_dataframe(multi_index=False, nullable=False): if multi_index: index = pd.MultiIndex.from_arrays( @@ -581,8 +564,7 @@ def test_parse_checks_and_statistics_roundtrip(checks, expectation): assert set(check_list) == set(checks) -# The next line is a workaround for pylint's confusion about pytest fixtures -# pylint: disable=redefined-outer-name,unused-argument +# pylint: disable=unused-argument def test_parse_checks_and_statistics_no_param(extra_registered_checks): """Ensure that an edge case where a check does not have parameters is appropriately handled.""" @@ -595,4 +577,4 @@ def test_parse_checks_and_statistics_no_param(extra_registered_checks): assert set(check_list) == set(checks) -# pylint: enable=redefined-outer-name,unused-argument +# pylint: enable=unused-argument diff --git a/tests/io/test_io.py b/tests/io/test_io.py index f6e89a3c4..3a75f9c88 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -485,5 +485,5 @@ def test_to_yaml_custom_dataframe_check(): checks=[pa.Check(lambda obj: len(obj.index) > 1)], ) - with pytest.warns(UserWarning, match=".*only registered checks.*"): + with pytest.warns(UserWarning, match=".*registered checks.*"): pa.io.to_yaml(schema) From a1a51fd6346d50d500038740076f4b541d0541ab Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 18 Mar 2021 17:19:12 -0700 Subject: [PATCH 11/21] enh: add to_yaml method to Schema, add unit test --- pandera/model.py | 8 ++++++++ tests/io/test_io.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/pandera/model.py b/pandera/model.py index c5f69b5c6..0472bc8a2 100644 --- a/pandera/model.py +++ b/pandera/model.py @@ -2,6 +2,7 @@ import inspect import re import sys +import os import typing from typing import ( Any, @@ -169,6 +170,13 @@ def to_schema(cls) -> DataFrameSchema: MODEL_CACHE[cls] = cls.__schema__ return cls.__schema__ + @classmethod + def to_yaml(cls, stream: Optional[os.PathLike] = None): + """ + Convert `Schema` to yaml using `io.to_yaml`. + """ + return cls.to_schema().to_yaml(stream) + @classmethod @pd.util.Substitution(validate_doc=DataFrameSchema.validate.__doc__) def validate( diff --git a/tests/io/test_io.py b/tests/io/test_io.py index 3a75f9c88..fb654ce54 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -11,6 +11,7 @@ import pandera as pa import pandera.extensions as pa_ext +import pandera.typing as pat try: from pandera import io @@ -487,3 +488,22 @@ def test_to_yaml_custom_dataframe_check(): with pytest.warns(UserWarning, match=".*registered checks.*"): pa.io.to_yaml(schema) + + +def test_to_yaml_bugfix_419(): + """Ensure that GH#419 is fixed""" + # pylint: disable=no-self-use + + class CheckedSchemaModel(pa.SchemaModel): + """Schema with a global check""" + + a: pat.Series[pat.Int64] + b: pat.Series[pat.Int64] + + @pa.dataframe_check() + def unregistered_check(self, _): + """sample unregistered check""" + ... + + with pytest.warns(UserWarning, match=".*registered checks.*"): + CheckedSchemaModel.to_yaml() From 92dc2f0f862f3cacf098e2a2dea6eaa78fbba10c Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Thu, 18 Mar 2021 18:13:29 -0700 Subject: [PATCH 12/21] ci: disable typechecking on _CheckMeta --- pandera/checks.py | 11 ++++++++++- pandera/schemas.py | 6 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index e69ce6c3c..78c5a2279 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -1,4 +1,5 @@ """Data validation checks.""" +# pylint: disable=fixme import inspect import operator @@ -16,6 +17,7 @@ Union, TypeVar, Type, + no_type_check, ) import pandas as pd @@ -462,12 +464,14 @@ def __repr__(self): ) -_T = TypeVar("_T") +_T = TypeVar("_T", bound=_CheckBase) class _CheckMeta(type): # pragma: no cover """Check metaclass.""" + # FIXME: this should probably just be moved to _CheckBase + REGISTERED_CUSTOM_CHECKS: Dict[str, Callable] = {} # noqa def __getattr__(cls, name: str) -> Any: @@ -481,6 +485,11 @@ def __dir__(cls) -> Iterable[str]: """Allow custom checks to show up as attributes when autocompleting.""" return chain(super().__dir__(), cls.REGISTERED_CUSTOM_CHECKS.keys()) + # pylint: disable=line-too-long + # mypy has limited metaclass support so this doesn't pass typecheck + # see https://mypy.readthedocs.io/en/stable/metaclasses.html#gotchas-and-limitations-of-metaclass-support + # pylint: enable=line-too-long + @no_type_check def __contains__(cls: Type[_T], item: Union[_T, str]) -> bool: """Allow lookups for registered checks.""" if isinstance(item, cls): diff --git a/pandera/schemas.py b/pandera/schemas.py index cbad5e4d2..237ba0a07 100644 --- a/pandera/schemas.py +++ b/pandera/schemas.py @@ -1,6 +1,7 @@ """Core pandera schema class definitions.""" # pylint: disable=too-many-lines +import os import copy import itertools import warnings @@ -1186,17 +1187,16 @@ def from_yaml(cls, yaml_schema) -> "DataFrameSchema": return pandera.io.from_yaml(yaml_schema) - def to_yaml(self, fp: Union[str, Path] = None): + def to_yaml(self, stream: Optional[os.PathLike] = None): """Write DataFrameSchema to yaml file. - :param dataframe_schema: schema to write to file or dump to string. :param stream: file stream to write to. If None, dumps to string. :returns: yaml string if stream is None, otherwise returns None. """ # pylint: disable=import-outside-toplevel,cyclic-import import pandera.io - return pandera.io.to_yaml(self, fp) + return pandera.io.to_yaml(self, stream=stream) def set_index( self, keys: List[str], drop: bool = True, append: bool = False From d3a9b1bb57ba09a7a832a53dc9c5b8feabde67df Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Fri, 19 Mar 2021 09:17:15 -0700 Subject: [PATCH 13/21] ci: isort --- pandera/checks.py | 6 +++--- pandera/model.py | 2 +- pandera/schemas.py | 2 +- tests/core/checks_fixtures.py | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index 78c5a2279..f2be4022c 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -4,7 +4,7 @@ import inspect import operator import re -from collections import namedtuple, ChainMap +from collections import ChainMap, namedtuple from functools import partial, wraps from itertools import chain from typing import ( @@ -14,9 +14,9 @@ Iterable, List, Optional, - Union, - TypeVar, Type, + TypeVar, + Union, no_type_check, ) diff --git a/pandera/model.py b/pandera/model.py index e3275a0da..1efecfac2 100644 --- a/pandera/model.py +++ b/pandera/model.py @@ -1,8 +1,8 @@ """Class-based api""" import inspect +import os import re import sys -import os import typing from typing import ( Any, diff --git a/pandera/schemas.py b/pandera/schemas.py index 237ba0a07..e197f648c 100644 --- a/pandera/schemas.py +++ b/pandera/schemas.py @@ -1,9 +1,9 @@ """Core pandera schema class definitions.""" # pylint: disable=too-many-lines -import os import copy import itertools +import os import warnings from functools import wraps from pathlib import Path diff --git a/tests/core/checks_fixtures.py b/tests/core/checks_fixtures.py index 54f804b13..baa99e81e 100644 --- a/tests/core/checks_fixtures.py +++ b/tests/core/checks_fixtures.py @@ -1,7 +1,8 @@ """Pytest fixtures for testing custom checks.""" import unittest.mock as mock -import pytest + import pandas as pd +import pytest import pandera as pa import pandera.extensions as pa_ext From cbfaa983d12ef2502aa5094ad179776d6a0eeef6 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Fri, 19 Mar 2021 09:18:05 -0700 Subject: [PATCH 14/21] ci: doctests --- docs/source/schema_inference.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/schema_inference.rst b/docs/source/schema_inference.rst index 04f184b53..bba6889bf 100644 --- a/docs/source/schema_inference.rst +++ b/docs/source/schema_inference.rst @@ -214,6 +214,7 @@ is a convenience method for this functionality. coerce: false required: true regex: false + checks: null index: - pandas_dtype: int64 nullable: false From ebaf219b700611b1caea20b95caf1aa90c079dbe Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Fri, 19 Mar 2021 11:52:18 -0700 Subject: [PATCH 15/21] ci: improve coverage --- pandera/checks.py | 7 ++++-- pandera/io.py | 17 +++++-------- pandera/schema_statistics.py | 2 +- tests/core/test_checks.py | 4 ++- tests/io/test_io.py | 47 +++++++++++++++++++++++++++++++++--- 5 files changed, 59 insertions(+), 18 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index f2be4022c..65d5c504b 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -411,7 +411,6 @@ def __call__( def __eq__(self, other): if not isinstance(other, type(self)): - # we can only be equal if the same type return NotImplemented are_check_fn_objects_equal = ( @@ -478,7 +477,11 @@ def __getattr__(cls, name: str) -> Any: """Prevent attribute errors for registered checks.""" attr = ChainMap(cls.__dict__, cls.REGISTERED_CUSTOM_CHECKS).get(name) if attr is None: - raise AttributeError(f"'{cls}' object has no attribute '{name}'") + raise AttributeError( + f"'{cls}' object has no attribute '{name}'. " + "Make sure any custom checks have been registered " + "using the extensions api." + ) return attr def __dir__(cls) -> Iterable[str]: diff --git a/pandera/io.py b/pandera/io.py index fdc775973..746ea64c6 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -60,23 +60,17 @@ def _serialize_dataframe_stats(dataframe_checks): """ Serialize global dataframe check statistics into json/yaml-compatible format. """ - # pylint: disable=import-outside-toplevel - from pandera.checks import Check - serialized_checks = {} if dataframe_checks is None: return serialized_checks for check_name, check_stats in dataframe_checks.items(): - if check_name not in Check: - warnings.warn( - f"Check `{check_name}` cannot be serialized. This check will be " - "ignored. Did you forget to register it with the extension API?" - ) - else: - # infer dtype and - serialized_checks[check_name] = _serialize_check_stats(check_stats) + # The case that `check_name` is not registered is handled in `parse_checks`, + # so we know that `check_name` exists. + + # infer dtype of statistics and serialize them + serialized_checks[check_name] = _serialize_check_stats(check_stats) return serialized_checks @@ -237,6 +231,7 @@ def _deserialize_schema(serialized_schema): ] if serialized_schema["checks"] is not None: + # handles unregistered checks by raising AttributeErrors from getattr checks = [ _deserialize_check_stats(getattr(Check, check_name), check_stats) for check_name, check_stats in serialized_schema["checks"].items() diff --git a/pandera/schema_statistics.py b/pandera/schema_statistics.py index 865335143..6b30782dc 100644 --- a/pandera/schema_statistics.py +++ b/pandera/schema_statistics.py @@ -161,7 +161,7 @@ def parse_checks(checks) -> Union[Dict[str, Any], None]: for check in checks: if check not in Check: warnings.warn( - "Only registered checks may be converted to statistics. " + "Only registered checks may be serialized to statistics. " "Did you forget to register it with the extension API? " f"Check `{check.name}` will be skipped." ) diff --git a/tests/core/test_checks.py b/tests/core/test_checks.py index 5f2f3678d..b2fb9f027 100644 --- a/tests/core/test_checks.py +++ b/tests/core/test_checks.py @@ -353,12 +353,14 @@ def test_reshape_failure_cases_exceptions(): def test_check_equality_operators(): - """Test the usage of == between a Check and an entirely different Check.""" + """Test the usage of == between a Check and an entirely different Check, + and a non-Check.""" check = Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3") not_equal_check = Check(lambda x: x.isna().sum() == 0) assert check == copy.deepcopy(check) assert check != not_equal_check + assert check != "not a check" def test_equality_operators_functional_equivalence(): diff --git a/tests/io/test_io.py b/tests/io/test_io.py index 08c974470..69a6a1312 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -293,6 +293,28 @@ def _create_schema_python_types(): """ +YAML_SCHEMA_MISSING_CHECKS = f""" +schema_type: dataframe +version: {pa.__version__} +columns: + int_column: + pandas_dtype: int64 + float_column: + pandas_dtype: float64 + str_column: + pandas_dtype: str + object_column: + pandas_dtype: object +checks: + unregistered_check: + stat1: missing_str_stat + stat2: 11 +index: null +coerce: false +strict: false +""" + + @pytest.mark.skipif( SKIP_YAML_TESTS, reason="pyyaml >= 5.1.0 required", @@ -346,6 +368,13 @@ def test_from_yaml(yaml_str, schema_creator): assert expected_schema == schema_from_yaml +def test_from_yaml_unregistered_checks(): + """Test that from_yaml raises an exception when deserializing unregistered checks.""" + + with pytest.raises(AttributeError, match=".*custom checks.*"): + io.from_yaml(YAML_SCHEMA_MISSING_CHECKS) + + def test_io_yaml_file_obj(): """Test read and write operation on file object.""" schema = _create_schema() @@ -413,7 +442,7 @@ def test_to_script(index): def test_to_script_lambda_check(): """Test writing DataFrameSchema to a script with lambda check.""" - schema = pa.DataFrameSchema( + schema1 = pa.DataFrameSchema( { "a": pa.Column( pa.Int, @@ -423,7 +452,19 @@ def test_to_script_lambda_check(): ) with pytest.warns(UserWarning): - pa.io.to_script(schema) + pa.io.to_script(schema1) + + schema2 = pa.DataFrameSchema( + { + "a": pa.Column( + pa.Int, + ), + }, + checks=pa.Check(lambda s: s.mean() > 5, element_wise=False), + ) + + with pytest.warns(UserWarning, match=".*registered checks.*"): + pa.io.to_script(schema2) def test_to_yaml_lambda_check(): @@ -443,7 +484,7 @@ def test_to_yaml_lambda_check(): @mock.patch("pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict) def test_to_yaml_registered_dataframe_check(_): - """Tests that writing DataFrameSchema with a registered dataframe works.""" + """Tests that writing DataFrameSchema with a registered dataframe check works.""" ncols_gt_called = False @pa_ext.register_check_method(statistics=["column_count"]) From 87438ad884b998fc29f638bd8fb33f86293f1f0b Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Fri, 19 Mar 2021 12:52:51 -0700 Subject: [PATCH 16/21] ci: codecov In these lines, dataframe_checks cannot be None based on the call condition. --- pandera/io.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandera/io.py b/pandera/io.py index 746ea64c6..3208c688a 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -62,9 +62,6 @@ def _serialize_dataframe_stats(dataframe_checks): """ serialized_checks = {} - if dataframe_checks is None: - return serialized_checks - for check_name, check_stats in dataframe_checks.items(): # The case that `check_name` is not registered is handled in `parse_checks`, # so we know that `check_name` exists. From 90d6871d14aaea7a25168b686837e71beb810f9b Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Sat, 20 Mar 2021 12:46:31 +0100 Subject: [PATCH 17/21] fix unrecognized check dtype during (de)serialization --- pandera/io.py | 27 ++++++--------------------- tests/io/test_io.py | 13 ++++++++++--- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/pandera/io.py b/pandera/io.py index 3208c688a..0cf7a7d80 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -1,5 +1,4 @@ """Module for reading and writing schema objects.""" -# pylint: disable=fixme import warnings from functools import partial @@ -28,15 +27,8 @@ def _serialize_check_stats(check_stats, pandas_dtype=None): """Serialize check statistics into json/yaml-compatible format.""" - # pylint: disable=unused-argument - - def handle_stat_dtype(stat): - # fixme: change interface to not require a dtype spec - nonlocal pandas_dtype - - if pandas_dtype is None: - pandas_dtype = PandasDtype.get_dtype(type(stat)) + def handle_stat_dtype(stat, pandas_dtype): if pandas_dtype == PandasDtype.DateTime: return stat.strftime(DATETIME_FORMAT) elif pandas_dtype == PandasDtype.Timedelta: @@ -47,12 +39,12 @@ def handle_stat_dtype(stat): # for unary checks, return a single value instead of a dictionary if len(check_stats) == 1: - return handle_stat_dtype(list(check_stats.values())[0]) + return handle_stat_dtype(list(check_stats.values())[0], pandas_dtype) # otherwise return a dictionary of keyword args needed to create the Check serialized_check_stats = {} for arg, stat in check_stats.items(): - serialized_check_stats[arg] = handle_stat_dtype(stat) + serialized_check_stats[arg] = handle_stat_dtype(stat, pandas_dtype) return serialized_check_stats @@ -149,14 +141,7 @@ def _serialize_schema(dataframe_schema): def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype=None): - # pylint: disable=unused-argument - def handle_stat_dtype(stat): - # fixme: change interface to not require a dtype spec - nonlocal pandas_dtype - - if pandas_dtype is None: - pandas_dtype = PandasDtype.get_dtype(type(stat)) - + def handle_stat_dtype(stat, pandas_dtype): if pandas_dtype == PandasDtype.DateTime: return pd.to_datetime(stat, format=DATETIME_FORMAT) elif pandas_dtype == PandasDtype.Timedelta: @@ -169,10 +154,10 @@ def handle_stat_dtype(stat): # dictionary mapping Check arg names to values. check_stats = {} for arg, stat in serialized_check_stats.items(): - check_stats[arg] = handle_stat_dtype(stat) + check_stats[arg] = handle_stat_dtype(stat, pandas_dtype) return check(**check_stats) # otherwise assume unary check function signature - return check(handle_stat_dtype(serialized_check_stats)) + return check(handle_stat_dtype(serialized_check_stats, pandas_dtype)) def _deserialize_component_stats(serialized_component_stats): diff --git a/tests/io/test_io.py b/tests/io/test_io.py index 69a6a1312..a7f1a1598 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -102,7 +102,9 @@ def _create_schema(index="single"): regex=True, checks=[pa.Check.str_length(1, 3)], ), - "empty_column": pa.Column(), + "notype_column": pa.Column( + checks=pa.Check.isin(["foo", "bar", "x", "xy"]), + ), }, index=index, coerce=False, @@ -187,10 +189,15 @@ def _create_schema(index="single"): coerce: true required: false regex: true - empty_column: + notype_column: pandas_dtype: null nullable: false - checks: null + checks: + isin: + - foo + - bar + - x + - xy allow_duplicates: true coerce: false required: true From c067b7f8092d0b51f95bbce93ec658b6c8a7d9ba Mon Sep 17 00:00:00 2001 From: Jean-Francois Zinque Date: Mon, 22 Mar 2021 15:22:27 +0100 Subject: [PATCH 18/21] fix handle_stat__dtype closures --- pandera/io.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandera/io.py b/pandera/io.py index 0cf7a7d80..08f4f7a5d 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -28,7 +28,7 @@ def _serialize_check_stats(check_stats, pandas_dtype=None): """Serialize check statistics into json/yaml-compatible format.""" - def handle_stat_dtype(stat, pandas_dtype): + def handle_stat_dtype(stat): if pandas_dtype == PandasDtype.DateTime: return stat.strftime(DATETIME_FORMAT) elif pandas_dtype == PandasDtype.Timedelta: @@ -39,12 +39,12 @@ def handle_stat_dtype(stat, pandas_dtype): # for unary checks, return a single value instead of a dictionary if len(check_stats) == 1: - return handle_stat_dtype(list(check_stats.values())[0], pandas_dtype) + return handle_stat_dtype(list(check_stats.values())[0]) # otherwise return a dictionary of keyword args needed to create the Check serialized_check_stats = {} for arg, stat in check_stats.items(): - serialized_check_stats[arg] = handle_stat_dtype(stat, pandas_dtype) + serialized_check_stats[arg] = handle_stat_dtype(stat) return serialized_check_stats @@ -141,7 +141,7 @@ def _serialize_schema(dataframe_schema): def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype=None): - def handle_stat_dtype(stat, pandas_dtype): + def handle_stat_dtype(stat): if pandas_dtype == PandasDtype.DateTime: return pd.to_datetime(stat, format=DATETIME_FORMAT) elif pandas_dtype == PandasDtype.Timedelta: @@ -154,10 +154,10 @@ def handle_stat_dtype(stat, pandas_dtype): # dictionary mapping Check arg names to values. check_stats = {} for arg, stat in serialized_check_stats.items(): - check_stats[arg] = handle_stat_dtype(stat, pandas_dtype) + check_stats[arg] = handle_stat_dtype(stat) return check(**check_stats) # otherwise assume unary check function signature - return check(handle_stat_dtype(serialized_check_stats, pandas_dtype)) + return check(handle_stat_dtype(serialized_check_stats)) def _deserialize_component_stats(serialized_component_stats): From c3108b9391a8ef5a196e0a319dee894d85ee996b Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Mon, 22 Mar 2021 11:27:08 -0700 Subject: [PATCH 19/21] enh: move metaclass onto _CheckBase --- pandera/checks.py | 83 +++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/pandera/checks.py b/pandera/checks.py index 65d5c504b..0345dc39b 100644 --- a/pandera/checks.py +++ b/pandera/checks.py @@ -1,5 +1,4 @@ """Data validation checks.""" -# pylint: disable=fixme import inspect import operator @@ -64,7 +63,45 @@ def _wrapper(cls, *args, **kwargs): return register_check_statistics_decorator -class _CheckBase: +_T = TypeVar("_T", bound="_CheckBase") + + +class _CheckMeta(type): # pragma: no cover + """Check metaclass.""" + + REGISTERED_CUSTOM_CHECKS: Dict[str, Callable] = {} # noqa + + def __getattr__(cls, name: str) -> Any: + """Prevent attribute errors for registered checks.""" + attr = ChainMap(cls.__dict__, cls.REGISTERED_CUSTOM_CHECKS).get(name) + if attr is None: + raise AttributeError( + f"'{cls}' object has no attribute '{name}'. " + "Make sure any custom checks have been registered " + "using the extensions api." + ) + return attr + + def __dir__(cls) -> Iterable[str]: + """Allow custom checks to show up as attributes when autocompleting.""" + return chain(super().__dir__(), cls.REGISTERED_CUSTOM_CHECKS.keys()) + + # pylint: disable=line-too-long + # mypy has limited metaclass support so this doesn't pass typecheck + # see https://mypy.readthedocs.io/en/stable/metaclasses.html#gotchas-and-limitations-of-metaclass-support + # pylint: enable=line-too-long + @no_type_check + def __contains__(cls: Type[_T], item: Union[_T, str]) -> bool: + """Allow lookups for registered checks.""" + if isinstance(item, cls): + name = item.name + return hasattr(cls, name) + + # assume item is str + return hasattr(cls, item) + + +class _CheckBase(metaclass=_CheckMeta): """Check base class.""" def __init__( @@ -463,47 +500,7 @@ def __repr__(self): ) -_T = TypeVar("_T", bound=_CheckBase) - - -class _CheckMeta(type): # pragma: no cover - """Check metaclass.""" - - # FIXME: this should probably just be moved to _CheckBase - - REGISTERED_CUSTOM_CHECKS: Dict[str, Callable] = {} # noqa - - def __getattr__(cls, name: str) -> Any: - """Prevent attribute errors for registered checks.""" - attr = ChainMap(cls.__dict__, cls.REGISTERED_CUSTOM_CHECKS).get(name) - if attr is None: - raise AttributeError( - f"'{cls}' object has no attribute '{name}'. " - "Make sure any custom checks have been registered " - "using the extensions api." - ) - return attr - - def __dir__(cls) -> Iterable[str]: - """Allow custom checks to show up as attributes when autocompleting.""" - return chain(super().__dir__(), cls.REGISTERED_CUSTOM_CHECKS.keys()) - - # pylint: disable=line-too-long - # mypy has limited metaclass support so this doesn't pass typecheck - # see https://mypy.readthedocs.io/en/stable/metaclasses.html#gotchas-and-limitations-of-metaclass-support - # pylint: enable=line-too-long - @no_type_check - def __contains__(cls: Type[_T], item: Union[_T, str]) -> bool: - """Allow lookups for registered checks.""" - if isinstance(item, cls): - name = item.name - return hasattr(cls, name) - - # assume item is str - return hasattr(cls, item) - - -class Check(_CheckBase, metaclass=_CheckMeta): +class Check(_CheckBase): """Check a pandas Series or DataFrame for certain properties.""" @classmethod From 0b2c45375b1b653ecbaab81530082e9f808123f4 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Mon, 22 Mar 2021 11:28:16 -0700 Subject: [PATCH 20/21] ci: add test that ensures to_yaml warns on unregistered checks --- tests/io/test_io.py | 50 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/tests/io/test_io.py b/tests/io/test_io.py index a7f1a1598..ed753295d 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -300,7 +300,7 @@ def _create_schema_python_types(): """ -YAML_SCHEMA_MISSING_CHECKS = f""" +YAML_SCHEMA_MISSING_GLOBAL_CHECK = f""" schema_type: dataframe version: {pa.__version__} columns: @@ -322,6 +322,28 @@ def _create_schema_python_types(): """ +YAML_SCHEMA_MISSING_COLUMN_CHECK = f""" +schema_type: dataframe +version: {pa.__version__} +columns: + int_column: + pandas_dtype: int64 + checks: + unregistered_check: + stat1: missing_str_stat + stat2: 11 + float_column: + pandas_dtype: float64 + str_column: + pandas_dtype: str + object_column: + pandas_dtype: object +index: null +coerce: false +strict: false +""" + + @pytest.mark.skipif( SKIP_YAML_TESTS, reason="pyyaml >= 5.1.0 required", @@ -355,6 +377,27 @@ def test_to_yaml(): assert yaml_str_schema_method.strip() == YAML_SCHEMA.strip() +@pytest.mark.skipif( + SKIP_YAML_TESTS, + reason="pyyaml >= 5.1.0 required", +) +def test_to_yaml_missing_checks(): + """Test that to_yaml warns when using unregistered checks on columns/globally.""" + schema = _create_schema() + unregistered = pa.Check(lambda _: False) + schema.columns["int_column"]._checks.append(unregistered) + + with pytest.warns(UserWarning, match=".*registered checks.*"): + io.to_yaml(schema) + + del schema.columns["int_column"]._checks[-1] + + schema.checks.append(unregistered) + + with pytest.warns(UserWarning, match=".*registered checks.*"): + io.to_yaml(schema) + + @pytest.mark.skipif( SKIP_YAML_TESTS, reason="pyyaml >= 5.1.0 required", @@ -379,7 +422,10 @@ def test_from_yaml_unregistered_checks(): """Test that from_yaml raises an exception when deserializing unregistered checks.""" with pytest.raises(AttributeError, match=".*custom checks.*"): - io.from_yaml(YAML_SCHEMA_MISSING_CHECKS) + io.from_yaml(YAML_SCHEMA_MISSING_COLUMN_CHECK) + + with pytest.raises(AttributeError, match=".*custom checks.*"): + io.from_yaml(YAML_SCHEMA_MISSING_GLOBAL_CHECK) def test_io_yaml_file_obj(): From e1ee5d5c0f2b1fe49d0849e86090dec0195ec568 Mon Sep 17 00:00:00 2001 From: Anton Loukianov Date: Mon, 22 Mar 2021 11:38:30 -0700 Subject: [PATCH 21/21] ci: revert adding duplicate test --- tests/io/test_io.py | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/tests/io/test_io.py b/tests/io/test_io.py index ed753295d..d82542a01 100644 --- a/tests/io/test_io.py +++ b/tests/io/test_io.py @@ -377,27 +377,6 @@ def test_to_yaml(): assert yaml_str_schema_method.strip() == YAML_SCHEMA.strip() -@pytest.mark.skipif( - SKIP_YAML_TESTS, - reason="pyyaml >= 5.1.0 required", -) -def test_to_yaml_missing_checks(): - """Test that to_yaml warns when using unregistered checks on columns/globally.""" - schema = _create_schema() - unregistered = pa.Check(lambda _: False) - schema.columns["int_column"]._checks.append(unregistered) - - with pytest.warns(UserWarning, match=".*registered checks.*"): - io.to_yaml(schema) - - del schema.columns["int_column"]._checks[-1] - - schema.checks.append(unregistered) - - with pytest.warns(UserWarning, match=".*registered checks.*"): - io.to_yaml(schema) - - @pytest.mark.skipif( SKIP_YAML_TESTS, reason="pyyaml >= 5.1.0 required", @@ -578,7 +557,7 @@ def ncols_gt(pandas_obj: pd.DataFrame, column_count: int) -> bool: def test_to_yaml_custom_dataframe_check(): - """Tests that writing DataFrameSchema with a registered dataframe raises.""" + """Tests that writing DataFrameSchema with an unregistered check raises.""" schema = pa.DataFrameSchema( { @@ -592,6 +571,8 @@ def test_to_yaml_custom_dataframe_check(): with pytest.warns(UserWarning, match=".*registered checks.*"): pa.io.to_yaml(schema) + # the unregistered column check case is tested in `test_to_yaml_lambda_check` + def test_to_yaml_bugfix_419(): """Ensure that GH#419 is fixed"""