From 84ddccbd5bacd5f5409447e0001efefc3296332b Mon Sep 17 00:00:00 2001 From: Niels Bantilan Date: Fri, 30 Apr 2021 08:37:15 -0400 Subject: [PATCH] coerce=True and pandas_dtype=None should be a noop (#476) * coerce=True and pandas_dtype=None should be a noop before these two options were incompatible and pandera would raise a SchemaInitError. This diff loosens this by allowing for this combination, in which case no coercion will happen if pandas_dtype=None * ignore python=3.7 pandas=0.25.3 in noxfile, minor changes --- environment.yml | 4 ++- noxfile.py | 56 +++++++++++++++++++++++++---- pandera/io.py | 10 +++--- pandera/schema_components.py | 8 ++--- pandera/schemas.py | 25 +++++-------- requirements-dev.txt | 4 +-- tests/core/test_schemas.py | 22 +++++------- tests/strategies/test_strategies.py | 9 +++++ 8 files changed, 89 insertions(+), 49 deletions(-) diff --git a/environment.yml b/environment.yml index 1541e5b7d..9aa7f2c4e 100644 --- a/environment.yml +++ b/environment.yml @@ -38,7 +38,6 @@ dependencies: - sphinx-autodoc-typehints - sphinx-copybutton - recommonmark - - furo # packaging - twine @@ -48,3 +47,6 @@ dependencies: # optional - pre_commit + + - pip: + - furo diff --git a/noxfile.py b/noxfile.py index 580447a70..02eac2c76 100644 --- a/noxfile.py +++ b/noxfile.py @@ -32,6 +32,7 @@ SOURCE_PATHS = PACKAGE, "tests", "noxfile.py" REQUIREMENT_PATH = "requirements-dev.txt" +ALWAYS_USE_PIP = ["furo", "mypy"] CI_RUN = os.environ.get("CI") == "true" if CI_RUN: @@ -153,7 +154,9 @@ def install(session: Session, *args: str): def install_from_requirements(session: Session, *packages: str) -> None: - """Install dependencies, respecting the version specified in requirements.""" + """ + Install dependencies, respecting the version specified in requirements. + """ for package in packages: try: specs = REQUIRES["all"][package] @@ -165,20 +168,28 @@ def install_from_requirements(session: Session, *packages: str) -> None: def install_extras( - session: Session, pandas: str = "latest", extra: str = "core" + session: Session, + pandas: str = "latest", + extra: str = "core", + force_pip=False, ) -> None: """Install dependencies.""" pandas_version = "" if pandas == "latest" else f"=={pandas}" specs = [ spec if spec != "pandas" else f"pandas{pandas_version}" for spec in REQUIRES[extra].values() + if spec not in ALWAYS_USE_PIP ] - if isinstance(session.virtualenv, nox.virtualenv.CondaEnv): + if ( + isinstance(session.virtualenv, nox.virtualenv.CondaEnv) + and not force_pip + ): print("using conda installer") conda_install(session, *specs) else: print("using pip installer") session.install(*specs) + # always use pip for these packages session.install("-e", ".", "--no-deps") # install pandera @@ -265,14 +276,26 @@ def lint(session: Session) -> None: @nox.session(python=PYTHON_VERSIONS) def mypy(session: Session) -> None: """Type-check using mypy.""" - install_extras(session, extra="all") + python_version = version.parse(cast(str, session.python)) + install_extras( + session, + extra="all", + # this is a hack until typed-ast conda package starts working again, + # basically this issue comes up: + # https://github.com/python/mypy/pull/2906 + force_pip=python_version == version.parse("3.7"), + ) args = session.posargs or SOURCE_PATHS session.run("mypy", "--follow-imports=silent", *args, silent=True) def _invalid_python_pandas_versions(session: Session, pandas: str) -> bool: python_version = version.parse(cast(str, session.python)) - if pandas == "0.25.3" and python_version >= version.parse("3.9"): + if pandas == "0.25.3" and ( + python_version >= version.parse("3.9") + # this is just a bandaid until support for 0.25.3 is dropped + or python_version == version.parse("3.7") + ): print("Python 3.9 does not support pandas 0.25.3") return True return False @@ -292,7 +315,16 @@ def tests(session: Session, pandas: str, extra: str) -> None: """Run the test suite.""" if _invalid_python_pandas_versions(session, pandas): return - install_extras(session, pandas, extra) + python_version = version.parse(cast(str, session.python)) + install_extras( + session, + pandas, + extra, + # this is a hack until typed-ast conda package starts working again, + # basically this issue comes up: + # https://github.com/python/mypy/pull/2906 + force_pip=python_version == version.parse("3.7"), + ) if session.posargs: args = session.posargs @@ -325,9 +357,19 @@ def docs(session: Session, pandas: str) -> None: """Build the documentation.""" if _invalid_python_pandas_versions(session, pandas): return - install_extras(session, pandas, extra="all") + python_version = version.parse(cast(str, session.python)) + install_extras( + session, + pandas, + extra="all", + # this is a hack until typed-ast conda package starts working again, + # basically this issue comes up: + # https://github.com/python/mypy/pull/2906 + force_pip=python_version == version.parse("3.7"), + ) session.chdir("docs") + shutil.rmtree(os.path.join("_build"), ignore_errors=True) args = session.posargs or ["-W", "-E", "-b=doctest", "source", "_build"] session.run("sphinx-build", *args) diff --git a/pandera/io.py b/pandera/io.py index bff044cdf..05441a04d 100644 --- a/pandera/io.py +++ b/pandera/io.py @@ -50,13 +50,14 @@ def handle_stat_dtype(stat): def _serialize_dataframe_stats(dataframe_checks): """ - Serialize global dataframe check statistics into json/yaml-compatible format. + Serialize global dataframe check statistics into json/yaml-compatible + format. """ serialized_checks = {} for check_name, check_stats in dataframe_checks.items(): - # The case that `check_name` is not registered is handled in `parse_checks`, - # so we know that `check_name` exists. + # The case that `check_name` is not registered is handled in + # `parse_checks` so we know that `check_name` exists. # infer dtype of statistics and serialize them serialized_checks[check_name] = _serialize_check_stats(check_stats) @@ -305,7 +306,8 @@ def _format_checks(checks_dict): for check_name, check_kwargs in checks_dict.items(): if check_kwargs is None: warnings.warn( - f"Check {check_name} cannot be serialized. This check will be ignored" + f"Check {check_name} cannot be serialized. " + "This check will be ignored" ) else: args = ", ".join( diff --git a/pandera/schema_components.py b/pandera/schema_components.py index 39185b1b6..eead56190 100644 --- a/pandera/schema_components.py +++ b/pandera/schema_components.py @@ -50,7 +50,8 @@ def __init__( :param allow_duplicates: Whether or not column can contain duplicate values. :param coerce: If True, when schema.validate is called the column will - be coerced into the specified dtype. + be coerced into the specified dtype. This has no effect on columns + where ``pandas_dtype=None``. :param required: Whether or not column is allowed to be missing :param name: column name in dataframe to validate. :param regex: whether the ``name`` attribute should be treated as a @@ -90,11 +91,6 @@ def __init__( self._name = name self._regex = regex - if coerce and self._pandas_dtype is None: - raise errors.SchemaInitError( - "Must specify dtype if coercing a Column's type" - ) - @property def regex(self) -> bool: """True if ``name`` attribute should be treated as a regex pattern.""" diff --git a/pandera/schemas.py b/pandera/schemas.py index 041600d15..64a1eef64 100644 --- a/pandera/schemas.py +++ b/pandera/schemas.py @@ -91,7 +91,8 @@ def __init__( `validate` will verify properties of the columns and return the transformed dataframe object. :param coerce: whether or not to coerce all of the columns on - validation. + validation. This has no effect on columns where + ``pandas_dtype=None`` :param strict: ensure that all and only the columns defined in the schema are present in the dataframe. If set to 'filter', only the columns in the schema will be passed to the validated @@ -140,19 +141,6 @@ def __init__( self.columns = {} if columns is None else columns - if coerce: - missing_pandas_type = [ - name - for name, col in self.columns.items() - if col.pandas_dtype is None - ] - if missing_pandas_type: - raise errors.SchemaInitError( - "Must specify dtype in all Columns if coercing " - "DataFrameSchema ; columns with missing pandas_type:" - + ", ".join(missing_pandas_type) - ) - if transformer is not None: warnings.warn( "The `transformers` argument has been deprecated and will no " @@ -318,7 +306,10 @@ def _coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame: try: return obj.astype(self.pdtype.str_alias) except (ValueError, TypeError) as exc: - msg = f"Error while coercing '{self.name}' to type {self.dtype}: {exc}" + msg = ( + f"Error while coercing '{self.name}' to type {self.dtype}: " + f"{exc}" + ) raise errors.SchemaError( self, obj, @@ -1639,7 +1630,9 @@ def coerce_dtype(self, obj: Union[pd.Series, pd.Index]) -> pd.Series: (including time series). :returns: ``Series`` with coerced data type """ - if ( + if self._pandas_dtype is None: + return obj + elif ( self._pandas_dtype is PandasDtype.String or self._pandas_dtype is str or self._pandas_dtype == "str" diff --git a/requirements-dev.txt b/requirements-dev.txt index b1ad45fdc..dca83a883 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -27,7 +27,7 @@ sphinx_rtd_theme sphinx-autodoc-typehints sphinx-copybutton recommonmark -furo twine asv -pre_commit \ No newline at end of file +pre_commit +furo \ No newline at end of file diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py index 73a4ecb71..cef8054ef 100644 --- a/tests/core/test_schemas.py +++ b/tests/core/test_schemas.py @@ -657,11 +657,12 @@ def test_no_dtype_series(): def test_coerce_without_dtype(): """Test that an error is thrown when a dtype isn't specified and coerce is True.""" - with pytest.raises(errors.SchemaInitError): - DataFrameSchema({"col": Column(coerce=True)}) - - with pytest.raises(errors.SchemaInitError): - DataFrameSchema({"col": Column()}, coerce=True) + df = pd.DataFrame({"col": [1, 2, 3]}) + for schema in [ + DataFrameSchema({"col": Column(coerce=True)}), + DataFrameSchema({"col": Column()}, coerce=True), + ]: + assert isinstance(schema(df), pd.DataFrame) def test_required(): @@ -1027,15 +1028,10 @@ def test_rename_columns(): # Check if new column names are indeed present in the new schema assert all( - [ - col_name in rename_dict.values() - for col_name in schema_renamed.columns - ] + col_name in rename_dict.values() for col_name in schema_renamed.columns ) # Check if original schema didn't change in the process - assert all( - [col_name in schema_original.columns for col_name in rename_dict] - ) + assert all(col_name in schema_original.columns for col_name in rename_dict) with pytest.raises(errors.SchemaInitError): schema_original.rename_columns({"foo": "bar"}) @@ -1525,7 +1521,7 @@ def test_invalid_keys(schema_simple): def test_update_columns(schema_simple): - """ Catch-all test for update columns functionality """ + """Catch-all test for update columns functionality""" # Basic function test_schema = schema_simple.update_columns({"col2": {"pandas_dtype": Int}}) diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py index 13f33f797..715c595eb 100644 --- a/tests/strategies/test_strategies.py +++ b/tests/strategies/test_strategies.py @@ -86,6 +86,9 @@ def test_pandas_dtype_strategy(pdtype, data): @pytest.mark.parametrize("pdtype", NUMERIC_DTYPES) @hypothesis.given(st.data()) +@hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.too_slow], +) def test_check_strategy_continuous(pdtype, data): """Test built-in check strategies can generate continuous data.""" value = data.draw( @@ -416,6 +419,9 @@ def test_series_example(): @hypothesis.given(st.data()) +@hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.too_slow], +) def test_column_strategy(data): """Test Column schema strategy.""" column_schema = pa.Column(pa.Int, pa.Check.gt(0), name="column") @@ -803,6 +809,9 @@ class Schema(pa.SchemaModel): @hypothesis.given(st.data()) +@hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.too_slow], +) def test_schema_model_strategy(schema_model, data): """Test that strategy can be created from a SchemaModel.""" strat = schema_model.strategy(size=10)