Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor PandasDtype #490

Merged
merged 16 commits into from
May 24, 2021
Merged
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- pyyaml >=5.1
- typing_inspect >= 0.6.0
- typing_extensions >= 3.7.4.3
- frictionless

# testing and dependencies
- black >= 20.8b1
Expand Down
3 changes: 3 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ def install_extras(
for spec in REQUIRES[extra].values()
if spec not in ALWAYS_USE_PIP
]
if extra == "core":
specs.append(REQUIRES["all"]["hypothesis"])

session.install(*ALWAYS_USE_PIP)
if (
isinstance(session.virtualenv, nox.virtualenv.CondaEnv)
Expand Down
44 changes: 14 additions & 30 deletions pandera/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,18 @@
"""A flexible and expressive pandas validation library."""
from pandera.dtypes_ import *
jeffzi marked this conversation as resolved.
Show resolved Hide resolved
from pandera.engines.numpy_engine import Object
from pandera.engines.pandas_engine import (
BOOL,
INT8,
INT16,
INT32,
INT64,
STRING,
UINT8,
UINT16,
UINT32,
UINT64,
)

from . import constants, errors, pandas_accessor
from .checks import Check
Expand All @@ -11,33 +25,3 @@
from .schema_inference import infer_schema
from .schemas import DataFrameSchema, SeriesSchema
from .version import __version__

# pylint: disable=invalid-name
Bool = PandasDtype.Bool
DateTime = PandasDtype.DateTime
Category = PandasDtype.Category
Float = PandasDtype.Float
Float16 = PandasDtype.Float16
Float32 = PandasDtype.Float32
Float64 = PandasDtype.Float64
Int = PandasDtype.Int
Int8 = PandasDtype.Int8
Int16 = PandasDtype.Int16
Int32 = PandasDtype.Int32
Int64 = PandasDtype.Int64
UInt8 = PandasDtype.UInt8
UInt16 = PandasDtype.UInt16
UInt32 = PandasDtype.UInt32
UInt64 = PandasDtype.UInt64
INT8 = PandasDtype.INT8
INT16 = PandasDtype.INT16
INT32 = PandasDtype.INT32
INT64 = PandasDtype.INT64
UINT8 = PandasDtype.UINT8
UINT16 = PandasDtype.UINT16
UINT32 = PandasDtype.UINT32
UINT64 = PandasDtype.UINT64
Object = PandasDtype.Object
String = PandasDtype.String
STRING = PandasDtype.STRING
Timedelta = PandasDtype.Timedelta
286 changes: 286 additions & 0 deletions pandera/dtypes_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
import functools
from dataclasses import dataclass, field
from typing import Any, Tuple, Type, Union

try: # python 3.8+
from typing import Literal # type: ignore
except ImportError:
from typing_extensions import Literal # type: ignore


def immutable(dtype=None, **kwargs) -> Union[Type, functools.partial]:
    """Turn a class into a frozen dataclass with no generated init/repr.

    Works both as a bare decorator (``@immutable``) and as a decorator
    factory (``@immutable(eq=False)``).

    :param dtype: the class to decorate, or ``None`` when used as a factory.
    :param kwargs: extra ``dataclasses.dataclass`` options. They take
        precedence over the defaults ``frozen=True, init=False, repr=False``.
    :returns: the decorated class when ``dtype`` is given, otherwise a
        ``functools.partial`` of ``dataclass`` that is waiting for the class.
    """
    dataclass_kwargs = {"frozen": True, "init": False, "repr": False}
    dataclass_kwargs.update(kwargs)

    if dtype is None:
        # Factory form: defer the decoration until the class is supplied.
        return functools.partial(dataclass, **dataclass_kwargs)
    return dataclass(**dataclass_kwargs)(dtype)


class DisableInitMixin:
    """Mixin whose ``__init__`` takes no arguments and does nothing."""

    def __init__(self) -> None:
        return None


class DataType:
    """Base class of the semantic data types.

    The base class itself cannot be instantiated. Subclasses are expected to
    provide ``coerce``, ``__str__`` and ``__hash__``.
    """

    def __init__(self):
        # Only the base class is blocked; subclasses instantiate normally.
        if self.__class__ is DataType:
            raise TypeError(
                f"{self.__class__.__name__} may not be instantiated."
            )

    def __call__(self, obj: Any):
        """Coerce object to the dtype."""
        return self.coerce(obj)

    def coerce(self, obj: Any):
        """Coerce object to the dtype. Must be implemented by subclasses."""
        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"DataType({str(self)})"

    def __str__(self) -> str:
        """Must be implemented by subclasses."""
        raise NotImplementedError()

    def check(self, datatype: "DataType") -> bool:
        """Return whether ``datatype`` is a DataType equal to this one.

        :param datatype: the object to compare against.
        :returns: ``False`` for anything that is not a ``DataType``,
            otherwise the result of ``self == datatype``.
        """
        if not isinstance(datatype, DataType):
            return False
        return self == datatype

    def __hash__(self) -> int:
        """Must be implemented by subclasses."""
        # Returning None (the previous ``pass`` body) made ``hash()`` fail
        # with an unrelated-looking TypeError; fail explicitly instead.
        raise NotImplementedError()


################################################################################
# boolean
################################################################################


@immutable
class Bool(DataType):
    """Semantic representation of a boolean data type."""

    def __str__(self) -> str:
        """Return the type name ``"bool"``."""
        return "bool"


Boolean = Bool

################################################################################
# number
################################################################################


@immutable
class _Number(DataType):
    """Common base of the numeric data types."""

    continuous: bool = None
    exact: bool = None

    def check(self, datatype: "DataType") -> bool:
        """Check ``datatype`` against this numeric type.

        A bare ``_Number`` accepts any ``Int``, ``Float`` or ``Complex``
        instance; subclasses defer to the parent ``check``.
        """
        if self.__class__ is not _Number:
            return super().check(datatype)
        return isinstance(datatype, (Int, Float, Complex))


@immutable
class _PhysicalNumber(_Number):
    """Numeric type described by a base name and a bit width."""

    bit_width: int = None
    _base_name: str = field(default=None, init=False, repr=False)

    def __eq__(self, obj: object) -> bool:
        """Compare by ``bit_width`` when ``obj`` is an instance of our type."""
        if not isinstance(obj, type(self)):
            # Unrelated object: defer to the parent comparison.
            return super().__eq__(obj)
        return obj.bit_width == self.bit_width

    def __str__(self) -> str:
        """Return the base name followed by the bit width."""
        return f"{self._base_name}{self.bit_width}"


################################################################################
## signed integer
################################################################################


# eq=False: no dataclass ``__eq__`` is generated, so the one inherited from
# ``_PhysicalNumber`` stays in effect.
@immutable(eq=False)
class Int(_PhysicalNumber):
    """Semantic representation of a signed integer data type (64-bit)."""

    _base_name = "int"
    continuous = False
    exact = True
    bit_width = 64
    signed: bool = field(default=True, init=False)


@immutable
class Int64(Int, _PhysicalNumber):
    """Semantic representation of a 64-bit integer data type."""

    bit_width = 64


@immutable
class Int32(Int64):
    """Semantic representation of a 32-bit integer data type."""

    bit_width = 32


@immutable
class Int16(Int32):
    """Semantic representation of a 16-bit integer data type."""

    bit_width = 16


@immutable
class Int8(Int16):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you clarify the purpose of this inheritance chain? e.g. can we have all the Int* types inherit from Int?

Copy link
Collaborator Author

@jeffzi jeffzi May 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use the chain to soften the dtype check, i.e. allowing a subtype (inspired by [numpy.can_cast](https://numpy.org/doc/stable/reference/generated/numpy.can_cast.html#numpy.can_cast)). I played with a casting argument in DataType.check that could be False for strict check (current behavior) or True to allow safe downcasting. Then I realized almost anything can be cast to a string and gave up the idea for now.

In the same vein, Pandas and Numpy dtypes inherit the appropriate DataType so that we can implement a cross-engine is_numeric, etc. with a call to isinstance.

bit_width = 8


################################################################################
## unsigned integer
################################################################################


@immutable
class UInt(Int):
    """Semantic representation of an unsigned integer data type."""

    _base_name = "uint"
    signed: bool = field(default=False, init=False)


@immutable
class UInt64(UInt):
    """Semantic representation of a 64-bit unsigned integer data type."""

    bit_width = 64


@immutable
class UInt32(UInt64):
    """Semantic representation of a 32-bit unsigned integer data type."""

    bit_width = 32


@immutable
class UInt16(UInt32):
    """Semantic representation of a 16-bit unsigned integer data type."""

    bit_width = 16


@immutable
class UInt8(UInt16):
    """Semantic representation of an 8-bit unsigned integer data type."""

    bit_width = 8


################################################################################
## float
################################################################################


# eq=False: no dataclass ``__eq__`` is generated, so the one inherited from
# ``_PhysicalNumber`` stays in effect.
@immutable(eq=False)
class Float(_PhysicalNumber):
    """Semantic representation of a floating-point data type (64-bit)."""

    _base_name = "float"
    continuous = True
    exact = False
    bit_width = 64


@immutable
class Float128(Float):
    """Semantic representation of a 128-bit floating-point data type."""

    bit_width = 128


@immutable
class Float64(Float128):
    """Semantic representation of a 64-bit floating-point data type."""

    bit_width = 64


@immutable
class Float32(Float64):
    """Semantic representation of a 32-bit floating-point data type."""

    bit_width = 32


@immutable
class Float16(Float32):
    """Semantic representation of a 16-bit floating-point data type."""

    bit_width = 16


################################################################################
## complex
################################################################################


# eq=False: no dataclass ``__eq__`` is generated, so the one inherited from
# ``_PhysicalNumber`` stays in effect.
@immutable(eq=False)
class Complex(_PhysicalNumber):
    """Semantic representation of a complex number data type (128-bit)."""

    _base_name = "complex"
    bit_width = 128


@immutable
class Complex256(Complex):
    """Semantic representation of a 256-bit complex number data type."""

    bit_width = 256


@immutable
class Complex128(Complex):
    """Semantic representation of a 128-bit complex number data type."""

    bit_width = 128


@immutable
class Complex64(Complex128):
    """Semantic representation of a 64-bit complex number data type."""

    bit_width = 64


################################################################################
# nominal
################################################################################


@dataclass(frozen=True)
jeffzi marked this conversation as resolved.
Show resolved Hide resolved
class Category(DataType):
    """Semantic representation of a categorical data type.

    :param categories: the categories, stored as a tuple; ``None`` (the
        default) means no categories were listed.
    :param ordered: the ``ordered`` flag, ``False`` by default.
    """

    # immutable sequence to ensure safe hash
    categories: Union[Tuple[Any, ...], None] = None
    ordered: bool = False

    def __post_init__(self) -> None:
        """Convert a non-tuple ``categories`` value to a tuple."""
        if self.categories is not None and not isinstance(
            self.categories, tuple
        ):
            # The dataclass is frozen, so go around its __setattr__ guard.
            object.__setattr__(self, "categories", tuple(self.categories))

    def check(self, datatype: "DataType") -> bool:
        """Return whether ``datatype`` is compatible with this category.

        :param datatype: the object to compare against.
        :returns: ``True`` when both are ``Category`` and at least one of
            them has no categories listed; otherwise the parent check.
        """
        # The parentheses matter: ``and`` binds tighter than ``or``, so
        # without them a non-Category argument would reach
        # ``datatype.categories`` and raise AttributeError.
        if isinstance(datatype, Category) and (
            self.categories is None or datatype.categories is None
        ):
            # Category without categories is a superset of any Category
            # Allow end-users to not list categories when validating.
            return True

        return super().check(datatype)

    def __str__(self) -> str:
        """Return the type name ``"category"``."""
        return "category"


@immutable
class String(DataType):
    """Semantic representation of a string data type."""

    def __str__(self) -> str:
        """Return the type name ``"string"``."""
        return "string"


################################################################################
# time
################################################################################


@immutable
class Date(DataType):
    """Semantic representation of a date data type."""

    def __str__(self) -> str:
        """Return the type name ``"date"``."""
        return "date"


@immutable
class Timestamp(Date):
    """Semantic representation of a timestamp data type (a ``Date``)."""

    def __str__(self) -> str:
        """Return the type name ``"timestamp"``."""
        return "timestamp"


DateTime = Timestamp


@immutable
class Timedelta(DataType):
    """Semantic representation of a timedelta data type."""

    def __str__(self) -> str:
        """Return the type name ``"timedelta"``."""
        return "timedelta"
Empty file added pandera/engines/__init__.py
Empty file.
Loading