Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

coerce=True and pandas_dtype=None should be a noop #476

Merged
merged 9 commits into from
Apr 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ env:
DEFAULT_PYTHON: 3.8
CI: "true"
# Increase this value to reset cache if environment.yml has not changed
CACHE_VERSION: 1
CACHE_VERSION: 2

jobs:
codestyle:
Expand Down
4 changes: 3 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ dependencies:
- sphinx-autodoc-typehints
- sphinx-copybutton
- recommonmark
- furo

# packaging
- twine
Expand All @@ -48,3 +47,6 @@ dependencies:

# optional
- pre_commit

- pip:
- furo
56 changes: 49 additions & 7 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

SOURCE_PATHS = PACKAGE, "tests", "noxfile.py"
REQUIREMENT_PATH = "requirements-dev.txt"
ALWAYS_USE_PIP = ["furo", "mypy"]

CI_RUN = os.environ.get("CI") == "true"
if CI_RUN:
Expand Down Expand Up @@ -153,7 +154,9 @@ def install(session: Session, *args: str):


def install_from_requirements(session: Session, *packages: str) -> None:
"""Install dependencies, respecting the version specified in requirements."""
"""
Install dependencies, respecting the version specified in requirements.
"""
for package in packages:
try:
specs = REQUIRES["all"][package]
Expand All @@ -165,20 +168,28 @@ def install_from_requirements(session: Session, *packages: str) -> None:


def install_extras(
session: Session, pandas: str = "latest", extra: str = "core"
session: Session,
pandas: str = "latest",
extra: str = "core",
force_pip=False,
) -> None:
"""Install dependencies."""
pandas_version = "" if pandas == "latest" else f"=={pandas}"
specs = [
spec if spec != "pandas" else f"pandas{pandas_version}"
for spec in REQUIRES[extra].values()
if spec not in ALWAYS_USE_PIP
]
if isinstance(session.virtualenv, nox.virtualenv.CondaEnv):
if (
isinstance(session.virtualenv, nox.virtualenv.CondaEnv)
and not force_pip
):
print("using conda installer")
conda_install(session, *specs)
else:
print("using pip installer")
session.install(*specs)
# always use pip for these packages
session.install("-e", ".", "--no-deps") # install pandera


Expand Down Expand Up @@ -265,14 +276,26 @@ def lint(session: Session) -> None:
@nox.session(python=PYTHON_VERSIONS)
def mypy(session: Session) -> None:
"""Type-check using mypy."""
install_extras(session, extra="all")
python_version = version.parse(cast(str, session.python))
install_extras(
session,
extra="all",
# this is a hack until typed-ast conda package starts working again,
# basically this issue comes up:
# https://github.com/python/mypy/pull/2906
force_pip=python_version == version.parse("3.7"),
)
args = session.posargs or SOURCE_PATHS
session.run("mypy", "--follow-imports=silent", *args, silent=True)


def _invalid_python_pandas_versions(session: Session, pandas: str) -> bool:
python_version = version.parse(cast(str, session.python))
if pandas == "0.25.3" and python_version >= version.parse("3.9"):
if pandas == "0.25.3" and (
python_version >= version.parse("3.9")
# this is just a bandaid until support for 0.25.3 is dropped
or python_version == version.parse("3.7")
):
    print(f"Python {python_version} does not support pandas 0.25.3")
return True
return False
Expand All @@ -292,7 +315,16 @@ def tests(session: Session, pandas: str, extra: str) -> None:
"""Run the test suite."""
if _invalid_python_pandas_versions(session, pandas):
return
install_extras(session, pandas, extra)
python_version = version.parse(cast(str, session.python))
install_extras(
session,
pandas,
extra,
# this is a hack until typed-ast conda package starts working again,
# basically this issue comes up:
# https://github.com/python/mypy/pull/2906
force_pip=python_version == version.parse("3.7"),
)

if session.posargs:
args = session.posargs
Expand Down Expand Up @@ -325,9 +357,19 @@ def docs(session: Session, pandas: str) -> None:
"""Build the documentation."""
if _invalid_python_pandas_versions(session, pandas):
return
install_extras(session, pandas, extra="all")
python_version = version.parse(cast(str, session.python))
install_extras(
session,
pandas,
extra="all",
# this is a hack until typed-ast conda package starts working again,
# basically this issue comes up:
# https://github.com/python/mypy/pull/2906
force_pip=python_version == version.parse("3.7"),
)
session.chdir("docs")

shutil.rmtree(os.path.join("_build"), ignore_errors=True)
args = session.posargs or ["-W", "-E", "-b=doctest", "source", "_build"]
session.run("sphinx-build", *args)

Expand Down
10 changes: 6 additions & 4 deletions pandera/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,14 @@ def handle_stat_dtype(stat):

def _serialize_dataframe_stats(dataframe_checks):
"""
Serialize global dataframe check statistics into json/yaml-compatible format.
Serialize global dataframe check statistics into json/yaml-compatible
format.
"""
serialized_checks = {}

for check_name, check_stats in dataframe_checks.items():
# The case that `check_name` is not registered is handled in `parse_checks`,
# so we know that `check_name` exists.
# The case that `check_name` is not registered is handled in
# `parse_checks` so we know that `check_name` exists.

# infer dtype of statistics and serialize them
serialized_checks[check_name] = _serialize_check_stats(check_stats)
Expand Down Expand Up @@ -305,7 +306,8 @@ def _format_checks(checks_dict):
for check_name, check_kwargs in checks_dict.items():
if check_kwargs is None:
warnings.warn(
f"Check {check_name} cannot be serialized. This check will be ignored"
f"Check {check_name} cannot be serialized. "
"This check will be ignored"
)
else:
args = ", ".join(
Expand Down
8 changes: 2 additions & 6 deletions pandera/schema_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def __init__(
:param allow_duplicates: Whether or not column can contain duplicate
values.
:param coerce: If True, when schema.validate is called the column will
be coerced into the specified dtype.
be coerced into the specified dtype. This has no effect on columns
where ``pandas_dtype=None``.
:param required: Whether or not column is allowed to be missing
:param name: column name in dataframe to validate.
:param regex: whether the ``name`` attribute should be treated as a
Expand Down Expand Up @@ -90,11 +91,6 @@ def __init__(
self._name = name
self._regex = regex

if coerce and self._pandas_dtype is None:
raise errors.SchemaInitError(
"Must specify dtype if coercing a Column's type"
)

@property
def regex(self) -> bool:
"""True if ``name`` attribute should be treated as a regex pattern."""
Expand Down
25 changes: 9 additions & 16 deletions pandera/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def __init__(
`validate` will verify properties of the columns and return the
transformed dataframe object.
:param coerce: whether or not to coerce all of the columns on
validation.
validation. This has no effect on columns where
``pandas_dtype=None``
:param strict: ensure that all and only the columns defined in the
schema are present in the dataframe. If set to 'filter',
only the columns in the schema will be passed to the validated
Expand Down Expand Up @@ -140,19 +141,6 @@ def __init__(

self.columns = {} if columns is None else columns

if coerce:
missing_pandas_type = [
name
for name, col in self.columns.items()
if col.pandas_dtype is None
]
if missing_pandas_type:
raise errors.SchemaInitError(
"Must specify dtype in all Columns if coercing "
"DataFrameSchema ; columns with missing pandas_type:"
+ ", ".join(missing_pandas_type)
)

if transformer is not None:
warnings.warn(
"The `transformers` argument has been deprecated and will no "
Expand Down Expand Up @@ -318,7 +306,10 @@ def _coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame:
try:
return obj.astype(self.pdtype.str_alias)
except (ValueError, TypeError) as exc:
msg = f"Error while coercing '{self.name}' to type {self.dtype}: {exc}"
msg = (
f"Error while coercing '{self.name}' to type {self.dtype}: "
f"{exc}"
)
raise errors.SchemaError(
self,
obj,
Expand Down Expand Up @@ -1639,7 +1630,9 @@ def coerce_dtype(self, obj: Union[pd.Series, pd.Index]) -> pd.Series:
(including time series).
:returns: ``Series`` with coerced data type
"""
if (
if self._pandas_dtype is None:
return obj
elif (
self._pandas_dtype is PandasDtype.String
or self._pandas_dtype is str
or self._pandas_dtype == "str"
Expand Down
4 changes: 2 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ sphinx_rtd_theme
sphinx-autodoc-typehints
sphinx-copybutton
recommonmark
furo
twine
asv
pre_commit
pre_commit
furo
22 changes: 9 additions & 13 deletions tests/core/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,11 +657,12 @@ def test_no_dtype_series():
def test_coerce_without_dtype():
    """Test that setting coerce=True without specifying a dtype is a no-op:
    validation succeeds and the dataframe passes through unchanged."""
with pytest.raises(errors.SchemaInitError):
DataFrameSchema({"col": Column(coerce=True)})

with pytest.raises(errors.SchemaInitError):
DataFrameSchema({"col": Column()}, coerce=True)
df = pd.DataFrame({"col": [1, 2, 3]})
for schema in [
DataFrameSchema({"col": Column(coerce=True)}),
DataFrameSchema({"col": Column()}, coerce=True),
]:
assert isinstance(schema(df), pd.DataFrame)


def test_required():
Expand Down Expand Up @@ -1027,15 +1028,10 @@ def test_rename_columns():

# Check if new column names are indeed present in the new schema
assert all(
[
col_name in rename_dict.values()
for col_name in schema_renamed.columns
]
col_name in rename_dict.values() for col_name in schema_renamed.columns
)
# Check if original schema didn't change in the process
assert all(
[col_name in schema_original.columns for col_name in rename_dict]
)
assert all(col_name in schema_original.columns for col_name in rename_dict)

with pytest.raises(errors.SchemaInitError):
schema_original.rename_columns({"foo": "bar"})
Expand Down Expand Up @@ -1525,7 +1521,7 @@ def test_invalid_keys(schema_simple):


def test_update_columns(schema_simple):
""" Catch-all test for update columns functionality """
"""Catch-all test for update columns functionality"""

# Basic function
test_schema = schema_simple.update_columns({"col2": {"pandas_dtype": Int}})
Expand Down
9 changes: 9 additions & 0 deletions tests/strategies/test_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ def test_pandas_dtype_strategy(pdtype, data):

@pytest.mark.parametrize("pdtype", NUMERIC_DTYPES)
@hypothesis.given(st.data())
@hypothesis.settings(
suppress_health_check=[hypothesis.HealthCheck.too_slow],
)
def test_check_strategy_continuous(pdtype, data):
"""Test built-in check strategies can generate continuous data."""
value = data.draw(
Expand Down Expand Up @@ -416,6 +419,9 @@ def test_series_example():


@hypothesis.given(st.data())
@hypothesis.settings(
suppress_health_check=[hypothesis.HealthCheck.too_slow],
)
def test_column_strategy(data):
"""Test Column schema strategy."""
column_schema = pa.Column(pa.Int, pa.Check.gt(0), name="column")
Expand Down Expand Up @@ -803,6 +809,9 @@ class Schema(pa.SchemaModel):


@hypothesis.given(st.data())
@hypothesis.settings(
suppress_health_check=[hypothesis.HealthCheck.too_slow],
)
def test_schema_model_strategy(schema_model, data):
"""Test that strategy can be created from a SchemaModel."""
strat = schema_model.strategy(size=10)
Expand Down