-
-
Notifications
You must be signed in to change notification settings - Fork 312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix to_yaml serialization dropping global checks #428
Changes from 17 commits
3e2fad0
b3ec21b
49db091
c444d29
4f77a0d
e23d6ed
e785344
ecbfaa6
6f1df54
c3b61ba
a1a51fd
8f0790c
92dc2f0
d3a9b1b
cbfaa98
ebaf219
87438ad
90d6871
c067b7f
c3108b9
0b2c453
e1ee5d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
"""Module for reading and writing schema objects.""" | ||
# pylint: disable=fixme | ||
|
||
import warnings | ||
from functools import partial | ||
|
@@ -25,15 +26,23 @@ | |
NOT_JSON_SERIALIZABLE = {PandasDtype.DateTime, PandasDtype.Timedelta} | ||
|
||
|
||
def _serialize_check_stats(check_stats, pandas_dtype): | ||
def _serialize_check_stats(check_stats, pandas_dtype=None): | ||
"""Serialize check statistics into json/yaml-compatible format.""" | ||
# pylint: disable=unused-argument | ||
|
||
def handle_stat_dtype(stat): | ||
# fixme: change interface to not require a dtype spec | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume this is fixed with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was worried that my fix is a hack and that there's a better way to do this (I don't like nonlocal). In the component-wise case it's an optimization that prevents a dtype lookup because you assume that each component has the same types for the statistics. Is this a valid assumption? Maybe we can just drop the dtype argument. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've committed a change that circumvents An example of failure was: import pandera as pa
schema = pa.DataFrameSchema(
columns={
"notype_column": pa.Column(
str,
checks=pa.Check.isin(["foo", "bar", "x", "xy"]), # list is not a PandasDtype
)
}
)
schema.to_yaml() |
||
nonlocal pandas_dtype | ||
|
||
if pandas_dtype is None: | ||
pandas_dtype = PandasDtype.get_dtype(type(stat)) | ||
|
||
if pandas_dtype == PandasDtype.DateTime: | ||
return stat.strftime(DATETIME_FORMAT) | ||
elif pandas_dtype == PandasDtype.Timedelta: | ||
# serialize to int in nanoseconds | ||
return stat.delta | ||
|
||
return stat | ||
|
||
# for unary checks, return a single value instead of a dictionary | ||
|
@@ -47,18 +56,37 @@ def handle_stat_dtype(stat): | |
return serialized_check_stats | ||
|
||
|
||
def _serialize_dataframe_stats(dataframe_checks): | ||
""" | ||
Serialize global dataframe check statistics into json/yaml-compatible format. | ||
""" | ||
serialized_checks = {} | ||
|
||
for check_name, check_stats in dataframe_checks.items(): | ||
# The case that `check_name` is not registered is handled in `parse_checks`, | ||
# so we know that `check_name` exists. | ||
|
||
# infer dtype of statistics and serialize them | ||
serialized_checks[check_name] = _serialize_check_stats(check_stats) | ||
|
||
return serialized_checks | ||
|
||
|
||
def _serialize_component_stats(component_stats): | ||
""" | ||
Serialize column or index statistics into json/yaml-compatible format. | ||
""" | ||
# pylint: disable=import-outside-toplevel | ||
from pandera.checks import Check | ||
|
||
serialized_checks = None | ||
if component_stats["checks"] is not None: | ||
serialized_checks = {} | ||
for check_name, check_stats in component_stats["checks"].items(): | ||
if check_stats is None: | ||
if check_name not in Check: | ||
warnings.warn( | ||
f"Check {check_name} cannot be serialized. This check will be " | ||
f"ignored" | ||
"ignored. Did you forget to register it with the extension API?" | ||
) | ||
else: | ||
serialized_checks[check_name] = _serialize_check_stats( | ||
|
@@ -93,7 +121,7 @@ def _serialize_schema(dataframe_schema): | |
|
||
statistics = get_dataframe_schema_statistics(dataframe_schema) | ||
|
||
columns, index = None, None | ||
columns, index, checks = None, None, None | ||
if statistics["columns"] is not None: | ||
columns = { | ||
col_name: _serialize_component_stats(column_stats) | ||
|
@@ -106,18 +134,29 @@ def _serialize_schema(dataframe_schema): | |
for index_stats in statistics["index"] | ||
] | ||
|
||
if statistics["checks"] is not None: | ||
checks = _serialize_dataframe_stats(statistics["checks"]) | ||
|
||
return { | ||
"schema_type": "dataframe", | ||
"version": __version__, | ||
"columns": columns, | ||
"checks": checks, | ||
"index": index, | ||
"coerce": dataframe_schema.coerce, | ||
"strict": dataframe_schema.strict, | ||
} | ||
|
||
|
||
def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype): | ||
def _deserialize_check_stats(check, serialized_check_stats, pandas_dtype=None): | ||
# pylint: disable=unused-argument | ||
def handle_stat_dtype(stat): | ||
# fixme: change interface to not require a dtype spec | ||
antonl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
nonlocal pandas_dtype | ||
|
||
if pandas_dtype is None: | ||
pandas_dtype = PandasDtype.get_dtype(type(stat)) | ||
|
||
if pandas_dtype == PandasDtype.DateTime: | ||
return pd.to_datetime(stat, format=DATETIME_FORMAT) | ||
elif pandas_dtype == PandasDtype.Timedelta: | ||
|
@@ -173,9 +212,9 @@ def _deserialize_component_stats(serialized_component_stats): | |
|
||
def _deserialize_schema(serialized_schema): | ||
# pylint: disable=import-outside-toplevel | ||
from pandera import Column, DataFrameSchema, Index, MultiIndex | ||
from pandera import Check, Column, DataFrameSchema, Index, MultiIndex | ||
|
||
columns, index = None, None | ||
columns, index, checks = None, None, None | ||
if serialized_schema["columns"] is not None: | ||
columns = { | ||
col_name: Column(**_deserialize_component_stats(column_stats)) | ||
|
@@ -188,6 +227,13 @@ def _deserialize_schema(serialized_schema): | |
for index_component in serialized_schema["index"] | ||
] | ||
|
||
if serialized_schema["checks"] is not None: | ||
# handles unregistered checks by raising AttributeErrors from getattr | ||
checks = [ | ||
_deserialize_check_stats(getattr(Check, check_name), check_stats) | ||
for check_name, check_stats in serialized_schema["checks"].items() | ||
] | ||
|
||
if index is None: | ||
pass | ||
elif len(index) == 1: | ||
|
@@ -199,6 +245,7 @@ def _deserialize_schema(serialized_schema): | |
|
||
return DataFrameSchema( | ||
columns=columns, | ||
checks=checks, | ||
index=index, | ||
coerce=serialized_schema["coerce"], | ||
strict=serialized_schema["strict"], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""Pytest fixtures for testing custom checks.""" | ||
import unittest.mock as mock | ||
|
||
import pandas as pd | ||
import pytest | ||
|
||
import pandera as pa | ||
import pandera.extensions as pa_ext | ||
|
||
__all__ = "custom_check_teardown", "extra_registered_checks" | ||
|
||
|
||
@pytest.fixture(scope="function") | ||
def custom_check_teardown(): | ||
"""Remove all custom checks after execution of each pytest function.""" | ||
yield | ||
for check_name in list(pa.Check.REGISTERED_CUSTOM_CHECKS): | ||
del pa.Check.REGISTERED_CUSTOM_CHECKS[check_name] | ||
|
||
|
||
@pytest.fixture(scope="function") | ||
def extra_registered_checks(): | ||
"""temporarily registers custom checks onto the Check class""" | ||
# pylint: disable=unused-variable | ||
with mock.patch( | ||
"pandera.Check.REGISTERED_CUSTOM_CHECKS", new_callable=dict | ||
): | ||
# register custom checks here | ||
@pa_ext.register_check_method() | ||
def no_param_check(_: pd.DataFrame) -> bool: | ||
return True | ||
|
||
yield |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
"""Registers fixtures for core""" | ||
|
||
# pylint: disable=unused-import | ||
from .checks_fixtures import custom_check_teardown, extra_registered_checks |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you elaborate why it'd be better in
CheckBase
?REGISTERED_CUSTOM_CHECKS
is referenced by__getattr__
, it seems to make sense to have them sitting next to each others.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment got moved by black. Essentially, I think we could merge the
_CheckMeta
with_CheckBase
. The metaclass actually confusesmypy
, and I don't see the benefit of using this mixin pattern here. Since this is a style choice, I didn't want to just do it though, so I added a comment so that we could discuss.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you move
__getattr__
to_CheckBase
it's going to act on instances of the class and not on the class itself.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right. Metaclasses confuse me, had to look up the descriptor protocol docs again. 👍
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I'm proposing we just move the metaclass to
_CheckBase
. It's not usable on anything else anyway because it requires a name property for example. I'll fix it and remove the fixme comment.