diff --git a/Makefile b/Makefile index 60b5acb60..02dc3610d 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ pypi_package: check-manifest python setup.py sdist bdist_wheel twine check dist/* - twine upload --skip-existing dist/* + twine upload --skip-existing dist/* -u __token__ ## Run black linting lint: diff --git a/README.md b/README.md index c1c706578..0a6849302 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@
And these visions of data types, they kept us up past the dawn. -

@@ -121,7 +120,8 @@ df.head(2) -The most important abstraction in `visions` are Types - these represent semantic notions about data. You have access to a +The most important abstraction in `visions` are Types - these represent semantic notions about data. You have access to +a range of well tested types like `Integer`, `Float`, and `Files` covering the most common software development use cases. Types can be bundled together into typesets. Behind the scenes, `visions` builds a traversable graph for any collection of types. diff --git a/build/lib/visions/__init__.py b/build/lib/visions/__init__.py new file mode 100644 index 000000000..57086c4e0 --- /dev/null +++ b/build/lib/visions/__init__.py @@ -0,0 +1,15 @@ +"""Core functionality""" + +from visions import types, typesets, utils +from visions.backends import * +from visions.declarative import create_type +from visions.functional import ( + cast_to_detected, + cast_to_inferred, + detect_type, + infer_type, +) +from visions.types import * +from visions.typesets import * + +__version__ = "0.7.6" diff --git a/build/lib/visions/backends/__init__.py b/build/lib/visions/backends/__init__.py new file mode 100644 index 000000000..3bdda55da --- /dev/null +++ b/build/lib/visions/backends/__init__.py @@ -0,0 +1,45 @@ +import logging + +logger = logging.getLogger(__name__) + + +try: + import pandas as pd + + import visions.backends.pandas + from visions.backends.pandas.test_utils import pandas_version + + if pandas_version[0] < 1: + from visions.dtypes.boolean import BoolDtype + logger.info(f"Pandas backend loaded {pd.__version__}") + +except ImportError: + logger.info("Pandas backend NOT loaded") + + +try: + import numpy as np + + import visions.backends.numpy + + logger.info(f"Numpy backend loaded {np.__version__}") +except ImportError: + logger.info("Numpy backend NOT loaded") + + +try: + import pyspark + + import visions.backends.spark + + logger.info(f"Pyspark backend loaded {pyspark.__version__}") +except ImportError: + 
logger.info("Pyspark backend NOT loaded") + + +try: + import visions.backends.python + + logger.info("Python backend loaded") +except ImportError: + logger.info("Python backend NOT loaded") diff --git a/build/lib/visions/backends/numpy/__init__.py b/build/lib/visions/backends/numpy/__init__.py new file mode 100644 index 000000000..868b4572c --- /dev/null +++ b/build/lib/visions/backends/numpy/__init__.py @@ -0,0 +1,3 @@ +# https://het.as.utexas.edu/HET/Software/Numpy/reference/arrays.scalars.html +import visions.backends.numpy.types +from visions.backends.numpy.array_utils import array_handle_nulls, array_not_empty diff --git a/build/lib/visions/backends/numpy/array_utils.py b/build/lib/visions/backends/numpy/array_utils.py new file mode 100644 index 000000000..373d1a39b --- /dev/null +++ b/build/lib/visions/backends/numpy/array_utils.py @@ -0,0 +1,67 @@ +import functools +from typing import Callable, Sequence, Tuple, TypeVar, Union + +import numpy as np + +from visions.backends.shared.nan_handling import nan_mask +from visions.backends.shared.utilities import has_import + +has_numba = has_import("numba") + +if has_numba: + import numba as nb + +T = TypeVar("T") + + +def array_handle_nulls(fn: Callable[..., bool]) -> Callable[..., bool]: + """Decorator for nullable arrays""" + + handles_missing = array_not_empty(fn) + + @functools.wraps(fn) + def inner(array: np.ndarray, *args, **kwargs) -> bool: + array = array[nan_mask(array)] + return handles_missing(array, *args, **kwargs) + + return inner + + +def array_not_empty(fn: Callable[..., bool]) -> Callable[..., bool]: + """Decorator to exclude empty arrays""" + + @functools.wraps(fn) + def inner(array: np.ndarray, *args, **kwargs) -> bool: + if array.shape[0] == 0: + return False + return fn(array, *args, **kwargs) + + return inner + + +def _base_all_type(array: np.ndarray, dtypes: Union[type, Tuple[type, ...]]) -> bool: + return all(isinstance(v, dtypes) for v in array) + + +if has_numba: + # TODO: This only works 
when the numpy array dtype falls under a few categories + # There are alternative implementations with forceobj=True which work in all cases + # including the use of isinstance, but in those cases worst case performance can be substantially worse + # than the default python implementation. + def all_type_numba(dtype: Union[Tuple, T]): + @nb.jit(nopython=True) + def inner(array: np.ndarray) -> bool: + for i in nb.prange(array.size): + if type(array[i]) is not dtype: + return False + return True + + return inner + + def all_type(array: np.ndarray, dtypes: Union[type, Tuple[type, ...]]) -> bool: + return _base_all_type(array, dtypes) + +else: + + def all_type(array: np.ndarray, dtypes: Union[type, Tuple[type, ...]]) -> bool: + return _base_all_type(array, dtypes) diff --git a/build/lib/visions/backends/numpy/sequences.py b/build/lib/visions/backends/numpy/sequences.py new file mode 100644 index 000000000..f74b33465 --- /dev/null +++ b/build/lib/visions/backends/numpy/sequences.py @@ -0,0 +1,61 @@ +from typing import Dict, Sequence +from urllib.parse import urlparse + +import numpy as np + + +def get_sequences() -> Dict[str, Sequence]: + sequences = { + "complex_series_float": [ + complex(0, 0), + complex(1, 0), + complex(3, 0), + complex(-1, 0), + ], + "url_nan_series": [ + urlparse("http://www.cwi.nl:80/%7Eguido/Python.html"), + urlparse("https://github.com/dylan-profiling/hurricane"), + np.nan, + ], + "mixed": [True, False, np.nan], + "float_nan_series": [1.0, 2.5, np.nan], + "float_series5": [np.nan, 1.2], + "float_with_inf": [np.inf, np.NINF, np.PINF, 1000000.0, 5.5], + "inf_series": [np.inf, np.NINF, np.Infinity, np.PINF], + "int_nan_series": [1, 2, np.nan], + "nan_series": [np.nan], + "nan_series_2": [np.nan, np.nan, np.nan, np.nan], + "string_num_nan": ["1.0", "2.0", np.nan], + "string_with_sep_num_nan": ["1,000.0", "2.1", np.nan], + "string_flt_nan": ["1.0", "45.67", np.nan], + "string_str_nan": [ + "I was only robbing the register,", + "I hope you 
understand", + "One of us had better call up the cops", + "In the hot New Jersey night", + np.nan, + ], + "float_series3": np.array([1.2, 2, 3, 4], dtype=np.float64), + "np_uint32": np.array([1, 2, 3, 4], dtype=np.uint32), + "string_np_unicode_series": np.array(["upper", "hall"], dtype=np.unicode_), + "complex_series": [ + complex(0, 0), + complex(1, 2), + complex(3, -1), + ], + "bool_series3": np.array([1, 0, 0, 1], dtype=np.bool_), + "complex_series_nan": [complex(0, 0), complex(1, 2), complex(3, -1), None], + "complex_series_nan_2": [ + complex(0, 0), + complex(1, 2), + complex(3, -1), + np.nan, + ], + "complex_series_py_nan": [ + complex(0, 0), + complex(1, 2), + complex(3, -1), + np.nan, + ], + } + return sequences diff --git a/build/lib/visions/backends/numpy/test_utils.py b/build/lib/visions/backends/numpy/test_utils.py new file mode 100644 index 000000000..fcfbd2c6d --- /dev/null +++ b/build/lib/visions/backends/numpy/test_utils.py @@ -0,0 +1,172 @@ +""" +A selection of testing utilities for visions. +""" + +import functools +from typing import Callable, Dict, List, Optional, Type, Union + +import numpy as np + +from visions.backends.numpy.array_utils import array_handle_nulls + + +def option_coercion_evaluator( + fn: Callable[[np.ndarray], np.ndarray], + extra_errors: Optional[List[Type[Exception]]] = None, +) -> Callable[[np.ndarray], Optional[np.ndarray]]: + """A coercion test evaluator + Evaluates a coercion function and optionally returns the coerced array. + Args: + fn: A function coercing a array to another array. + extra_errors: Additional exceptions to catch + Returns: + The coerced array if the coercion succeeds otherwise None. 
+ """ + + error_list = [ValueError, TypeError, AttributeError] + if extra_errors: + error_list.extend(extra_errors) + + @functools.wraps(fn) + def f(array: np.ndarray) -> Optional[np.ndarray]: + try: + return fn(array) + except tuple(error_list): + return None + + return f + + +def coercion_test( + fn: Callable[[np.ndarray], np.ndarray], + extra_errors: Optional[List[Type[Exception]]] = None, +) -> Callable[[np.ndarray], bool]: + """A coercion test generator + Creates a coercion test based on a provided coercion function. + Args: + fn: A function coercing a array to another type. + extra_errors: Additional exceptions to catch + Returns: + Whether the coercion failed or was successful. + """ + # Returns True or False if the coercion succeeds + tester = option_coercion_evaluator(fn, extra_errors) + + @functools.wraps(fn) + def f(array: np.ndarray) -> bool: + result = tester(array) + return True if result is not None else False + + return f + + +def coercion_true_test( + fn: Callable[[np.ndarray], np.ndarray], + extra_errors: Optional[List[Type[Exception]]] = None, +) -> Callable[[np.ndarray], bool]: + """A coercion equality test generator + Creates a coercion test based on a provided coercion function which also enforces + equality constraints on the output. This is useful when you want to change the + data type of a array without necessarily changing the data, for example, + when converting an integer to a float. + Args: + fn: A function coercing a array to another type. + extra_errors: Additional exceptions to catch + Returns: + Whether the coercion failed or was successful. 
+ """ + tester = option_coercion_evaluator(fn, extra_errors) + + @functools.wraps(tester) + def f(array: np.ndarray) -> bool: + result = tester(array) + return False if result is None else array.all() + + return f + + +def coercion_equality_test( + fn: Callable[[np.ndarray], np.ndarray] +) -> Callable[[np.ndarray], bool]: + """A coercion equality test generator + Creates a coercion test based on a provided coercion function which also enforces + equality constraints on the output. This is useful when you want to change the + data type of a array without necessarily changing the data, for example, + when converting an integer to a float. + Args: + fn: A function coercing a array to another type. + Returns: + Whether the coercion failed or was successful. + """ + tester = option_coercion_evaluator(fn) + + @functools.wraps(tester) + def f(array: np.ndarray) -> bool: + result = tester(array) + return False if result is None else np.array_equal(array, result) + + return f + + +def coercion_single_map_test(mapping: List[Dict]) -> Callable[[np.ndarray, Dict], bool]: + @array_handle_nulls + def f(array: np.ndarray, state: dict = {}) -> bool: + return any( + np.isin(array, list(single_map.keys())).all() for single_map in mapping + ) + + return f + + +def coercion_multi_map_test(mapping: Dict) -> Callable[[np.ndarray, Dict], bool]: + @array_handle_nulls + def f(array: np.ndarray, state: dict = {}) -> bool: + return np.isin(array, list(mapping.keys())).all() + + return f + + +def coercion_map_test( + mapping: Union[List[Dict], Dict] +) -> Callable[[np.ndarray, Dict], bool]: + """Create a testing function for a single mapping or a list of mappings. 
+ Args: + mapping: A dict with a mapping or a list of dicts + Returns: + Callable that checks if a array consists of the mappable values + Examples: + >>> coercion_map_test({"Yes": True, "No": False}) + >>> coercion_map_test( + >>> [ + >>> {"Yes": True, "No": False}, + >>> {"Y": True, "N": False}, + >>> ] + >>> ) + """ + + if isinstance(mapping, list): + f = coercion_single_map_test(mapping) + elif isinstance(mapping, dict): + f = coercion_multi_map_test(mapping) + else: + raise ValueError("Mapping should be dict or list of dicts") + return f + + +def coercion_map( + mapping: Union[List[Dict], Dict] +) -> Callable[[np.ndarray], np.ndarray]: + """Maps a array given a mapping + Args: + mapping: a dict to map, or a list of dicts. + Returns: + A callable that maps the array. + """ + if isinstance(mapping, list): + mapping = {k: v for d in mapping for k, v in d.items()} + elif not isinstance(mapping, dict): + raise ValueError("Mapping should be dict or list of dicts") + + f = np.vectorize(lambda value: mapping.get(value, np.nan)) + + return f diff --git a/build/lib/visions/backends/numpy/types/__init__.py b/build/lib/visions/backends/numpy/types/__init__.py new file mode 100644 index 000000000..e1dea331c --- /dev/null +++ b/build/lib/visions/backends/numpy/types/__init__.py @@ -0,0 +1,8 @@ +import visions.backends.numpy.types.boolean +import visions.backends.numpy.types.complex +import visions.backends.numpy.types.date_time +import visions.backends.numpy.types.float +import visions.backends.numpy.types.integer +import visions.backends.numpy.types.object +import visions.backends.numpy.types.string +import visions.backends.numpy.types.time_delta diff --git a/build/lib/visions/backends/numpy/types/boolean.py b/build/lib/visions/backends/numpy/types/boolean.py new file mode 100644 index 000000000..96ad43eab --- /dev/null +++ b/build/lib/visions/backends/numpy/types/boolean.py @@ -0,0 +1,61 @@ +from typing import Dict, List + +import numpy as np + +from 
visions.backends.numpy.array_utils import ( + all_type, + array_handle_nulls, + array_not_empty, +) +from visions.backends.numpy.test_utils import coercion_map, coercion_map_test +from visions.backends.python.types.boolean import get_boolean_coercions +from visions.backends.shared.nan_handling import nan_mask +from visions.types.boolean import Boolean +from visions.types.object import Object +from visions.types.string import String + +string_coercions = get_boolean_coercions("en") + + +@Boolean.register_relationship(Object, np.ndarray) +@array_handle_nulls +def object_is_boolean(array: np.ndarray, state: dict) -> bool: + return all_type(array, bool) + + +@Boolean.register_transformer(Object, np.ndarray) +def object_to_boolean(array: np.ndarray, state: dict) -> np.ndarray: + return array + + +@Boolean.register_relationship(String, np.ndarray) +def string_is_boolean(array: np.ndarray, state: dict) -> bool: + try: + mask = nan_mask(array) + # TODO: Nan handling not implemented for generators yet + val_generator = np.array([val.lower() for val in array[mask]]) + return coercion_map_test(string_coercions)(val_generator, state) + except (ValueError, TypeError, AttributeError): + return False + + +@Boolean.register_transformer(String, np.ndarray) +def string_to_boolean(array: np.ndarray, state: dict) -> np.ndarray: + array = array.copy() + mask = nan_mask(array) + # TODO: Nan handling not implemented for generators yet + val_generator = np.array([val.lower() for val in array[mask]]) + array[mask] = object_to_boolean( + coercion_map(string_coercions)(val_generator), state + ) + return array + + +@Boolean.contains_op.register +@array_handle_nulls +@array_not_empty +def boolean_contains(array: np.ndarray, state: dict) -> bool: + if np.issubdtype(array.dtype, np.bool_): + return True + + return all_type(array, bool) diff --git a/build/lib/visions/backends/numpy/types/complex.py b/build/lib/visions/backends/numpy/types/complex.py new file mode 100644 index 000000000..9eeebaf87 
--- /dev/null +++ b/build/lib/visions/backends/numpy/types/complex.py @@ -0,0 +1,40 @@ +import numpy as np +from packaging import version + +from visions.backends.numpy import test_utils +from visions.backends.numpy.array_utils import array_not_empty +from visions.backends.numpy.types.float import string_is_float +from visions.types.complex import Complex +from visions.types.string import String + +_OLD_NUMPY = version.parse(np.version.version) <= version.parse("1.19.0") + + +def imaginary_in_string(array: np.ndarray, imaginary_indicator: tuple = ("j", "i")): + return any(any(v in s for v in imaginary_indicator) for s in array) + + +@Complex.register_transformer(String, np.ndarray) +def string_to_complex(array: np.array, state: dict) -> np.ndarray: + if _OLD_NUMPY: + return np.array([complex(v) for v in array]) + else: + return array.astype(complex) + + +@Complex.register_relationship(String, np.ndarray) +def string_is_complex(array: np.ndarray, state: dict) -> bool: + coerced_array = test_utils.option_coercion_evaluator( + lambda x: string_to_complex(x, state) + )(array) + return ( + coerced_array is not None + and not string_is_float(array, state) + and imaginary_in_string(array) + ) + + +@Complex.contains_op.register +@array_not_empty +def complex_contains(array: np.ndarray, state: dict) -> bool: + return np.issubdtype(array.dtype, complex) diff --git a/build/lib/visions/backends/numpy/types/date_time.py b/build/lib/visions/backends/numpy/types/date_time.py new file mode 100644 index 000000000..9d1206740 --- /dev/null +++ b/build/lib/visions/backends/numpy/types/date_time.py @@ -0,0 +1,46 @@ +from datetime import datetime +from functools import partial + +import numpy as np +import pandas as pd + +from visions.backends.numpy import test_utils +from visions.backends.numpy.array_utils import array_handle_nulls, array_not_empty +from visions.backends.pandas.types.date_time import pandas_infer_datetime +from visions.types import DateTime, String + + 
+@DateTime.register_relationship(String, np.ndarray) +@array_handle_nulls +def string_is_datetime(array: np.ndarray, state: dict) -> bool: + exceptions = [OverflowError, TypeError] + + if len(array) == 0: + return False + + coerced_array = test_utils.option_coercion_evaluator( + partial(string_to_datetime, state=state), exceptions + )(array) + + if coerced_array is None: + return False + elif np.isnat(coerced_array).any(): + return False + + return True + + +@DateTime.register_transformer(String, np.ndarray) +def string_to_datetime(array: np.ndarray, state: dict) -> np.ndarray: + # return array.astype(np.datetime64) + return pandas_infer_datetime(pd.Series(array), state).to_numpy() + + +@DateTime.contains_op.register +@array_handle_nulls +@array_not_empty +def datetime_contains(array: np.ndarray, state: dict) -> bool: + if np.issubdtype(array.dtype, np.datetime64): + return True + + return all(isinstance(v, datetime) for v in array) diff --git a/build/lib/visions/backends/numpy/types/float.py b/build/lib/visions/backends/numpy/types/float.py new file mode 100644 index 000000000..a6eaabaf3 --- /dev/null +++ b/build/lib/visions/backends/numpy/types/float.py @@ -0,0 +1,48 @@ +import numpy as np + +from visions.backends.numpy import test_utils +from visions.backends.numpy.array_utils import array_handle_nulls, array_not_empty +from visions.types.complex import Complex +from visions.types.float import Float +from visions.types.string import String +from visions.utils.warning_handling import suppress_warnings + + +def test_string_leading_zeros(array: np.ndarray, coerced_array: np.ndarray): + return not any(s[0] == "0" for s in array[coerced_array > 1]) + + +@Float.register_relationship(String, np.ndarray) +@array_handle_nulls +def string_is_float(array: np.ndarray, state: dict) -> bool: + coerced_array = test_utils.option_coercion_evaluator(lambda s: s.astype(np.float_))( + array + ) + + return ( + coerced_array is not None + and float_contains(coerced_array, state) + 
and test_string_leading_zeros(array, coerced_array) + ) + + +@Float.register_transformer(String, np.ndarray) +def string_to_float(array: np.array, state: dict) -> np.ndarray: + return array.astype(np.float_) + + +@Float.register_relationship(Complex, np.ndarray) +def complex_is_float(array: np.array, state: dict) -> bool: + return all(np.imag(array) == 0) + + +@Float.register_transformer(Complex, np.ndarray) +def complex_to_float(array: np.array, state: dict) -> np.ndarray: + return suppress_warnings(lambda s: s.astype(np.float_))(array) + + +@Float.contains_op.register +@array_handle_nulls +@array_not_empty +def float_contains(array: np.ndarray, state: dict) -> bool: + return np.issubdtype(array.dtype, np.floating) diff --git a/build/lib/visions/backends/numpy/types/integer.py b/build/lib/visions/backends/numpy/types/integer.py new file mode 100644 index 000000000..c7634be3b --- /dev/null +++ b/build/lib/visions/backends/numpy/types/integer.py @@ -0,0 +1,31 @@ +import numpy as np + +from visions.backends.numpy.array_utils import array_handle_nulls +from visions.types.float import Float +from visions.types.integer import Integer + + +@Integer.register_relationship(Float, np.ndarray) +def float_is_integer(series: np.ndarray, state: dict) -> bool: + return np.all(np.mod(series[~np.isnan(series)], 1) == 0) + + +# TODO: The array_handle_nulls is actually removing nulls from the result. 
This is _far_ from ideal but there is no +# other native way to represent nullable integers in numpy +@Integer.register_transformer(Float, np.ndarray) +@array_handle_nulls +def float_to_integer(series: np.ndarray, state: dict) -> np.ndarray: + return series.astype(int) + + +@Integer.contains_op.register +@array_handle_nulls +def integer_contains(sequence: np.ndarray, state: dict) -> bool: + if sequence.shape[0] == 0 or np.issubdtype(sequence.dtype, np.timedelta64): + return False + elif np.issubdtype(sequence.dtype, np.integer): + return True + elif np.issubdtype(sequence.dtype, np.object_): + return all(isinstance(v, int) and not isinstance(v, bool) for v in sequence) + + return False diff --git a/build/lib/visions/backends/numpy/types/object.py b/build/lib/visions/backends/numpy/types/object.py new file mode 100644 index 000000000..c1b07b1f8 --- /dev/null +++ b/build/lib/visions/backends/numpy/types/object.py @@ -0,0 +1,32 @@ +from datetime import datetime + +import numpy as np + +from visions.backends.numpy.array_utils import ( + all_type, + array_handle_nulls, + array_not_empty, +) +from visions.types.object import Object + + +def not_excluded_type(array: np.ndarray, excludes) -> bool: + + if len(array) == 0 or not isinstance(array[0], excludes): + return True + + dtype = type(array[0]) + return not all_type(array, dtype) + + +@Object.contains_op.register +@array_handle_nulls +@array_not_empty +def object_contains(array: np.ndarray, state: dict) -> bool: + if np.issubdtype(array.dtype, np.str_): + return True + + if not np.issubdtype(array.dtype, np.object_): + return False + + return not_excluded_type(array, (bool, int, datetime)) diff --git a/build/lib/visions/backends/numpy/types/string.py b/build/lib/visions/backends/numpy/types/string.py new file mode 100644 index 000000000..ee956599e --- /dev/null +++ b/build/lib/visions/backends/numpy/types/string.py @@ -0,0 +1,23 @@ +import numpy as np + +from visions.backends.numpy.array_utils import 
array_handle_nulls, array_not_empty +from visions.types.string import String + + +@array_handle_nulls +def _is_string(array: np.ndarray, state: dict): + if not all(isinstance(v, str) for v in array[0:5]): + return False + try: + return (array.astype(str) == array).all() + except (TypeError, ValueError): + return False + + +@String.contains_op.register +@array_not_empty +def string_contains(array: np.ndarray, state: dict) -> bool: + if np.issubdtype(array.dtype, np.str_): + return True + + return _is_string(array, state) diff --git a/build/lib/visions/backends/numpy/types/time_delta.py b/build/lib/visions/backends/numpy/types/time_delta.py new file mode 100644 index 000000000..e5e941491 --- /dev/null +++ b/build/lib/visions/backends/numpy/types/time_delta.py @@ -0,0 +1,16 @@ +import numpy as np + +from visions.backends.numpy.array_utils import array_not_empty +from visions.types.time_delta import TimeDelta + + +@TimeDelta.contains_op.register +@array_not_empty +def time_delta_contains(array: np.ndarray, state: dict) -> bool: + """ + Example: + >>> x = pd.array([pd.Timedelta(days=i) for i in range(3)]) + >>> x in visions.Timedelta + True + """ + return np.issubdtype(array.dtype, np.timedelta64) diff --git a/build/lib/visions/backends/pandas/__init__.py b/build/lib/visions/backends/pandas/__init__.py new file mode 100644 index 000000000..9911dfad7 --- /dev/null +++ b/build/lib/visions/backends/pandas/__init__.py @@ -0,0 +1,2 @@ +import visions.backends.pandas.traversal +import visions.backends.pandas.types diff --git a/build/lib/visions/backends/pandas/sequences.py b/build/lib/visions/backends/pandas/sequences.py new file mode 100644 index 000000000..e80312eb3 --- /dev/null +++ b/build/lib/visions/backends/pandas/sequences.py @@ -0,0 +1,127 @@ +import datetime +from typing import Dict, Iterable + +import numpy as np +import pandas as pd + +from visions.backends.pandas.test_utils import pandas_version +from visions.backends.pandas.types.boolean import hasnan_bool_name 
+ + +def get_sequences() -> Dict[str, Iterable]: + sequences = { + "float_series6": pd.Series([np.nan, 1.1], dtype=np.single), + "bool_series2": pd.Series([True, False, False, True], dtype=bool), + "nullable_bool_series": pd.Series([True, False, None], dtype=hasnan_bool_name), + "int_str_range": pd.Series(range(20)).astype("str"), + "Int64_int_series": pd.Series([1, 2, 3], dtype="Int64"), + "Int64_int_nan_series": pd.Series([1, 2, 3, np.nan], dtype="Int64"), + "pd_uint32": pd.Series([1, 2, 3, 4], dtype="UInt32"), + "categorical_int_series": pd.Series([1, 2, 3], dtype="category"), + "categorical_char": pd.Series( + pd.Categorical( + ["A", "B", "C", "C", "B", "A"], + categories=["A", "B", "C"], + ordered=False, + ), + ), + "categorical_float_series": pd.Series([1.0, 2.0, 3.1], dtype="category"), + "categorical_string_series": pd.Series(["Georgia", "Sam"], dtype="category"), + "categorical_complex_series": pd.Series( + [complex(0, 0), complex(1, 2), complex(3, -1)], + dtype="category", + ), + "ordinal": pd.Series( + pd.Categorical( + ["A", "B", "C", "C", "B", "A"], + categories=["A", "B", "C"], + ordered=True, + ), + ), + "timestamp_series": pd.to_datetime( + pd.Series( + [ + datetime.datetime(2017, 3, 5, 12, 2), + datetime.datetime(2019, 12, 4), + ], + ) + ), + "timestamp_series_nat": pd.to_datetime( + pd.Series( + [ + datetime.datetime(2017, 3, 5), + datetime.datetime(2019, 12, 4, 3, 2, 0), + pd.NaT, + ], + ) + ), + "date_series_nat": pd.to_datetime( + pd.Series( + [ + datetime.datetime(2017, 3, 5), + datetime.datetime(2019, 12, 4), + pd.NaT, + ], + ) + ), + "timestamp_aware_series": pd.Series( + pd.date_range( + start="2013-05-18 12:00:01", + periods=2, + freq="h", + tz="Europe/Brussels", + ) + ), + "datetime": pd.to_datetime( + pd.Series( + [ + datetime.date(2011, 1, 1), + datetime.date(2012, 1, 2), + datetime.date(2013, 1, 1), + ], + ) + ), + # http://pandas-docs.github.io/pandas-docs-travis/user_guide/timeseries.html#timestamp-limitations + # pd.to_datetime( + # 
pd.Series( + # [ + # datetime.datetime(year=1, month=1, day=1, hour=8, minute=43, second=12), + # datetime.datetime(year=1, month=1, day=1, hour=9, minute=43, second=12), + # datetime.datetime( + # year=1, month=1, day=1, hour=10, minute=43, second=12 + # ), + # ], + # name="datetime_to_time", + # ) + # ), + "timedelta_series": pd.Series([pd.Timedelta(days=i) for i in range(3)]), + "timedelta_series_nat": pd.Series( + [pd.Timedelta(days=i) for i in range(3)] + [pd.NaT], + ), + "timedelta_negative": pd.Series( + [ + pd.Timedelta("1 days 00:03:43"), + pd.Timedelta("5 days 12:33:57"), + pd.Timedelta("0 days 01:25:07"), + pd.Timedelta("-2 days 13:46:56"), + pd.Timedelta("1 days 23:49:25"), + ], + ), + "empty_float": pd.Series([], dtype=float), + "empty_int64": pd.Series([], dtype="Int64"), + "empty_object": pd.Series([], dtype="object"), + "empty_bool": pd.Series([], dtype=bool), + "float_series4": pd.Series([1, 2, 3.05, 4], dtype=np.float64), + # Null Sequences + "all_null_none": pd.Series([None, None]), + "all_null_nan": pd.Series([np.nan, np.nan]), + "all_null_nat": pd.Series([pd.NaT, pd.NaT]), + "all_null_empty_str": pd.Series(["", ""]), + } + + if pandas_version[0] >= 1: + sequences["string_dtype_series"] = pd.Series( + ["Patty", "Valentine"], dtype="string" + ) + + return sequences diff --git a/build/lib/visions/backends/pandas/series_utils.py b/build/lib/visions/backends/pandas/series_utils.py new file mode 100644 index 000000000..b4533c28c --- /dev/null +++ b/build/lib/visions/backends/pandas/series_utils.py @@ -0,0 +1,93 @@ +import functools +from typing import Callable + +import pandas as pd + + +# For future reference: get the dtype from the subtype when the series is sparse +def series_handle_sparse_dtype(fn: Callable[..., bool]) -> Callable[..., bool]: + """Decorator to include the dtype of a sparse subtype.""" + + @functools.wraps(fn) + def inner(series: pd.Series, state: dict, *args, **kwargs) -> bool: + if isinstance(series.dtype, pd.SparseDtype): + 
dtype = series.dtype.subtype + else: + dtype = series.dtype + state["dtype"] = dtype + + return fn(series, state, *args, **kwargs) + + return inner + + +def series_handle_nulls(fn: Callable[..., bool]) -> Callable[..., bool]: + """Decorator for nullable series""" + + @functools.wraps(fn) + def inner(series: pd.Series, *args, **kwargs) -> bool: + if series.hasnans: + series = series.dropna() + # TODO: use series_not_empty? + if series.empty: + return False + + return fn(series, *args, **kwargs) + + return inner + + +def series_not_sparse(fn: Callable[..., bool]) -> Callable[..., bool]: + """Decorator to exclude sparse series""" + + @functools.wraps(fn) + def inner(series: pd.Series, *args, **kwargs) -> bool: + if isinstance(series, pd.SparseDtype): + return False + return fn(series, *args, **kwargs) + + return inner + + +def series_not_empty(fn: Callable[..., bool]) -> Callable[..., bool]: + """Decorator to exclude empty series""" + + @functools.wraps(fn) + def inner(series: pd.Series, *args, **kwargs) -> bool: + if series.empty: + return False + return fn(series, *args, **kwargs) + + return inner + + +# TODO: What is the type signature on is_method???? +def _contains_instance_attrs( + series: pd.Series, is_method, class_name: str, attrs: list, sample_size: int = 1 +) -> bool: + # TODO: user configurable .head or .sample + # TODO: performance testing for series[0], series.iloc[0], series.head, series.sample + if not all(is_method(x, class_name) for x in series.head(sample_size)): + return False + + try: + return all(all(hasattr(x, attr) for attr in attrs) for x in series) + except AttributeError: + return False + + +# TODO: What is the type signature on class_name???? 
+def class_name_attrs( + series: pd.Series, class_name, attrs: list, sample_size: int = 1 +) -> bool: + def func(instance, class_name): + return instance.__class__.__name__ == class_name.__name__ + + return _contains_instance_attrs(series, func, class_name, attrs, sample_size) + + +# TODO: What is the type signature on class_name???? +def isinstance_attrs( + series: pd.Series, class_name, attrs: list, sample_size: int = 1 +) -> bool: + return _contains_instance_attrs(series, isinstance, class_name, attrs, sample_size) diff --git a/build/lib/visions/backends/pandas/test_utils.py b/build/lib/visions/backends/pandas/test_utils.py new file mode 100644 index 000000000..bd3076dcd --- /dev/null +++ b/build/lib/visions/backends/pandas/test_utils.py @@ -0,0 +1,173 @@ +""" +A selection of testing utilities for visions. +""" + +import functools +from typing import Callable, Dict, List, Optional, Type, Union + +import pandas as pd + +from visions.backends.pandas.series_utils import series_handle_nulls + +pandas_version = tuple(int(i) for i in pd.__version__.split(".")) +pandas_na_value = pd.NA if hasattr(pd, "NA") else None + + +def option_coercion_evaluator( + fn: Callable[[pd.Series], pd.Series], + extra_errors: Optional[List[Type[Exception]]] = None, +) -> Callable[[pd.Series], Optional[pd.Series]]: + """A coercion test evaluator + Evaluates a coercion function and optionally returns the coerced series. + Args: + fn: A function coercing a Series to another Series. + extra_errors: Additional exceptions to catch + Returns: + The coerced series if the coercion succeeds otherwise None. 
def coercion_true_test(
    fn: Callable[[pd.Series], pd.Series],
    extra_errors: Optional[List[Type[Exception]]] = None,
) -> Callable[[pd.Series], bool]:
    """A coercion truthiness test generator
    Creates a coercion test based on a provided coercion function which also
    requires every value of the coerced output to be truthy. This is useful
    when the coercion function itself yields the per-element verdict, for
    example a parsed value that is empty when the input was not valid.
    Args:
        fn: A function coercing a Series to another type.
        extra_errors: Additional exceptions to catch
    Returns:
        A test that is True only when the coercion succeeded and all coerced
        values are truthy.
    """
    tester = option_coercion_evaluator(fn, extra_errors)

    @functools.wraps(tester)
    def f(series: pd.Series) -> bool:
        result = tester(series)
        # option_coercion_evaluator signals a failed coercion with None.
        if result is None:
            return False
        # Judge the coerced output. Checking the input series instead would
        # accept any successful coercion of truthy inputs.
        # bool() normalises whatever scalar .all() yields to a plain bool.
        return bool(result.all())

    return f
+ Args: + mapping: A dict with a mapping or a list of dicts + Returns: + Callable that checks if a series consists of the mappable values + Examples: + >>> coercion_map_test({"Yes": True, "No": False}) + >>> coercion_map_test( + >>> [ + >>> {"Yes": True, "No": False}, + >>> {"Y": True, "N": False}, + >>> ] + >>> ) + """ + + if isinstance(mapping, list): + f = coercion_single_map_test(mapping) + elif isinstance(mapping, dict): + f = coercion_multi_map_test(mapping) + else: + raise ValueError("Mapping should be dict or list of dicts") + return f + + +def coercion_map(mapping: Union[List[Dict], Dict]) -> Callable[[pd.Series], pd.Series]: + """Maps a series given a mapping + Args: + mapping: a dict to map, or a list of dicts. + Returns: + A callable that maps the series. + """ + if isinstance(mapping, list): + mapping = {k: v for d in mapping for k, v in d.items()} + + elif not isinstance(mapping, dict): + raise ValueError("Mapping should be dict or list of dicts") + + def f(series: pd.Series) -> pd.Series: + return series.map(mapping) + + return f diff --git a/build/lib/visions/backends/pandas/traversal.py b/build/lib/visions/backends/pandas/traversal.py new file mode 100644 index 000000000..07f6491e6 --- /dev/null +++ b/build/lib/visions/backends/pandas/traversal.py @@ -0,0 +1,37 @@ +from typing import Dict, List, Tuple, Type + +import networkx as nx +import pandas as pd + +from visions.types.type import VisionsBaseType +from visions.typesets.typeset import traverse_graph, traverse_graph_with_series + +T = Type[VisionsBaseType] + + +@traverse_graph.register(pd.Series) +def _traverse_graph_series( + series: pd.Series, root_node: T, graph: nx.DiGraph +) -> Tuple[pd.Series, List[T], dict]: + return traverse_graph_with_series(root_node, series, graph) + + +@traverse_graph.register(pd.DataFrame) +def _traverse_graph_dataframe( + df: pd.DataFrame, root_node: T, graph: nx.DiGraph +) -> Tuple[pd.DataFrame, Dict[str, List[T]], Dict[str, dict]]: + inferred_values = { + col: 
traverse_graph(df[col], root_node, graph) for col in df.columns + } + + inferred_series = {} + inferred_paths: Dict[str, List[T]] = {} + inferred_states: Dict[str, dict] = {} + for col, (inf_series, inf_path, inf_state) in inferred_values.items(): + assert isinstance(inf_path, list) # Placate the MyPy Gods + + inferred_series[col] = inf_series + inferred_paths[col] = inf_path + inferred_states[col] = inf_state + + return pd.DataFrame(inferred_series), inferred_paths, inferred_states diff --git a/build/lib/visions/backends/pandas/types/__init__.py b/build/lib/visions/backends/pandas/types/__init__.py new file mode 100644 index 000000000..7a83235d7 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/__init__.py @@ -0,0 +1,23 @@ +import visions.backends.pandas.types.boolean +import visions.backends.pandas.types.categorical +import visions.backends.pandas.types.complex +import visions.backends.pandas.types.count +import visions.backends.pandas.types.date +import visions.backends.pandas.types.date_time +import visions.backends.pandas.types.email_address +import visions.backends.pandas.types.file +import visions.backends.pandas.types.float +import visions.backends.pandas.types.geometry +import visions.backends.pandas.types.image +import visions.backends.pandas.types.integer +import visions.backends.pandas.types.ip_address +import visions.backends.pandas.types.numeric +import visions.backends.pandas.types.object +import visions.backends.pandas.types.ordinal +import visions.backends.pandas.types.path +import visions.backends.pandas.types.sparse +import visions.backends.pandas.types.string +import visions.backends.pandas.types.time +import visions.backends.pandas.types.time_delta +import visions.backends.pandas.types.url +import visions.backends.pandas.types.uuid diff --git a/build/lib/visions/backends/pandas/types/boolean.py b/build/lib/visions/backends/pandas/types/boolean.py new file mode 100644 index 000000000..083bade99 --- /dev/null +++ 
b/build/lib/visions/backends/pandas/types/boolean.py @@ -0,0 +1,63 @@ +from typing import Dict, List + +import pandas as pd +import pandas.api.types as pdt + +from visions.backends.pandas.series_utils import ( + series_handle_nulls, + series_not_empty, + series_not_sparse, +) +from visions.backends.pandas.test_utils import ( + coercion_map, + coercion_map_test, + pandas_version, +) +from visions.backends.python.types.boolean import get_boolean_coercions +from visions.types.boolean import Boolean +from visions.types.object import Object +from visions.types.string import String + +hasnan_bool_name = "boolean" if pandas_version[0] >= 1 else "Bool" + + +string_coercions = get_boolean_coercions("en") + + +@Boolean.register_relationship(Object, pd.Series) +@series_handle_nulls +def object_is_boolean(series: pd.Series, state: dict) -> bool: + bool_set = {True, False} + try: + ret = all(item in bool_set for item in series.values) + except (ValueError, TypeError, AttributeError): + ret = False + + return ret + + +@Boolean.register_transformer(Object, pd.Series) +def object_to_boolean(series: pd.Series, state: dict) -> pd.Series: + dtype = hasnan_bool_name if series.hasnans else bool + return series.astype(dtype) + + +@Boolean.register_relationship(String, pd.Series) +def string_is_boolean(series: pd.Series, state: dict) -> bool: + try: + return coercion_map_test(string_coercions)(series.str.lower(), state) + except (ValueError, TypeError, AttributeError): + return False + + +@Boolean.register_transformer(String, pd.Series) +def string_to_boolean(series: pd.Series, state: dict) -> pd.Series: + return object_to_boolean(coercion_map(string_coercions)(series.str.lower()), state) + + +@Boolean.contains_op.register +@series_not_sparse +@series_handle_nulls +@series_not_empty +def boolean_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_bool_dtype(series) and not pdt.is_categorical_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/categorical.py 
def imaginary_in_string(
    series: pd.Series, imaginary_indicator: tuple = ("j", "i")
) -> bool:
    """Report whether any string in the series contains an imaginary marker.

    Args:
        series: Values to scan. Entries that are not ``str`` (such as ``None``
            or a float NaN) are skipped rather than searched.
        imaginary_indicator: Substrings that mark an imaginary component.
    Returns:
        True if at least one string entry contains one of the indicators.
    """
    return any(
        indicator in value
        for value in series
        # A substring test on None or a float raises TypeError, so only
        # genuine strings are inspected.
        if isinstance(value, str)
        for indicator in imaginary_indicator
    )
not all(v.imag == 0 for v in coerced_series.dropna()) + and imaginary_in_string(series) + ) + + +@Complex.register_transformer(String, pd.Series) +def string_to_complex(series: pd.Series, state: dict) -> pd.Series: + return convert_to_complex_series(series) + + +@Complex.contains_op.register +@series_not_sparse +@series_not_empty +def complex_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_complex_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/count.py b/build/lib/visions/backends/pandas/types/count.py new file mode 100644 index 000000000..fe0c4de80 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/count.py @@ -0,0 +1,12 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import series_not_empty, series_not_sparse +from visions.types.count import Count + + +@Count.contains_op.register +@series_not_sparse +@series_not_empty +def count_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_unsigned_integer_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/date.py b/build/lib/visions/backends/pandas/types/date.py new file mode 100644 index 000000000..353db2d4e --- /dev/null +++ b/build/lib/visions/backends/pandas/types/date.py @@ -0,0 +1,31 @@ +from datetime import date, time + +import pandas as pd + +from visions.backends.pandas.series_utils import ( + class_name_attrs, + series_handle_nulls, + series_not_empty, +) +from visions.types.date import Date +from visions.types.date_time import DateTime + + +@Date.register_relationship(DateTime, pd.Series) +@series_handle_nulls +def datetime_is_date(series: pd.Series, state: dict) -> bool: + dtseries = series.dt.time + value = time(0, 0) + return all(v == value for v in dtseries) + + +@Date.register_transformer(DateTime, pd.Series) +def datetime_to_date(series: pd.Series, state: dict) -> pd.Series: + return series.dt.date + + +@Date.contains_op.register +@series_handle_nulls +@series_not_empty 
+def date_contains(series: pd.Series, state: dict) -> bool: + return class_name_attrs(series, date, ["year", "month", "day"]) diff --git a/build/lib/visions/backends/pandas/types/date_time.py b/build/lib/visions/backends/pandas/types/date_time.py new file mode 100644 index 000000000..a57e48e0e --- /dev/null +++ b/build/lib/visions/backends/pandas/types/date_time.py @@ -0,0 +1,49 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas import test_utils +from visions.backends.pandas.series_utils import ( + series_handle_nulls, + series_not_empty, + series_not_sparse, +) +from visions.types import DateTime, String + + +def pandas_infer_datetime(series: pd.Series, state: dict) -> pd.Series: + try: + return pd.to_datetime(series) + except Exception: + pass + + return pd.to_datetime(series, format="mixed") + + +@DateTime.register_relationship(String, pd.Series) +@series_handle_nulls +def string_is_datetime(series: pd.Series, state: dict) -> bool: + def string_to_datetime_func(series: pd.Series) -> pd.Series: + return pandas_infer_datetime(series, state) + + exceptions = [OverflowError, TypeError] + coerced_series = test_utils.option_coercion_evaluator( + string_to_datetime_func, exceptions + )(series) + + if coerced_series is None: + return False + else: + return not coerced_series.dropna().empty + + +@DateTime.register_transformer(String, pd.Series) +def string_to_datetime(series: pd.Series, state: dict) -> pd.Series: + return pandas_infer_datetime(series, state) + + +@DateTime.contains_op.register +@series_not_sparse +@series_handle_nulls +@series_not_empty +def datetime_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_datetime64_any_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/email_address.py b/build/lib/visions/backends/pandas/types/email_address.py new file mode 100644 index 000000000..5947ab464 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/email_address.py @@ -0,0 +1,31 @@ 
+import pandas as pd + +from visions.backends.pandas import test_utils +from visions.backends.pandas.series_utils import ( + isinstance_attrs, + series_handle_nulls, + series_not_empty, +) +from visions.backends.shared.parallelization_engines import pandas_apply +from visions.types.email_address import FQDA, EmailAddress, _to_email +from visions.types.string import String + + +@EmailAddress.register_relationship(String, pd.Series) +def string_is_email(series: pd.Series, state: dict) -> bool: + def test_email(s): + return pandas_apply(pandas_apply(s, _to_email), lambda x: x.local and x.fqdn) + + return test_utils.coercion_true_test(test_email)(series) + + +@EmailAddress.register_transformer(String, pd.Series) +def string_to_email(series: pd.Series, state: dict) -> pd.Series: + return pandas_apply(series, _to_email) + + +@EmailAddress.contains_op.register +@series_not_empty +@series_handle_nulls +def email_address_contains(series: pd.Series, state: dict) -> bool: + return isinstance_attrs(series, FQDA, ["local", "fqdn"]) diff --git a/build/lib/visions/backends/pandas/types/file.py b/build/lib/visions/backends/pandas/types/file.py new file mode 100644 index 000000000..b6d7cb97c --- /dev/null +++ b/build/lib/visions/backends/pandas/types/file.py @@ -0,0 +1,13 @@ +import pathlib + +import pandas as pd + +from visions.backends.pandas.series_utils import series_handle_nulls, series_not_empty +from visions.types.file import File + + +@File.contains_op.register +@series_not_empty +@series_handle_nulls +def file_contains(series: pd.Series, state: dict) -> bool: + return all(isinstance(p, pathlib.Path) and p.exists() for p in series) diff --git a/build/lib/visions/backends/pandas/types/float.py b/build/lib/visions/backends/pandas/types/float.py new file mode 100644 index 000000000..34a3290dc --- /dev/null +++ b/build/lib/visions/backends/pandas/types/float.py @@ -0,0 +1,66 @@ +import numpy as np +import pandas as pd +from pandas.api import types as pdt + +from 
visions.backends.pandas import test_utils +from visions.backends.pandas.series_utils import ( + series_handle_nulls, + series_not_empty, + series_not_sparse, +) +from visions.types.complex import Complex +from visions.types.float import Float +from visions.types.string import String +from visions.utils.warning_handling import suppress_warnings + + +def test_string_leading_zeros(series: pd.Series, coerced_series: pd.Series): + if coerced_series.hasnans: + notna = coerced_series.notna() + coerced_series = coerced_series[notna] + + if coerced_series.empty: + return False + series = series[notna] + return not any(s[0] == "0" for s in series[coerced_series > 1]) + + +@Float.register_relationship(String, pd.Series) +@series_handle_nulls +def string_is_float(series: pd.Series, state: dict) -> bool: + coerced_series = test_utils.option_coercion_evaluator(lambda s: s.astype(float))( + series + ) + + return ( + coerced_series is not None + and float_contains(coerced_series, state) + and test_string_leading_zeros(series, coerced_series) + ) + + +@Float.register_transformer(String, pd.Series) +def string_to_float(series: pd.Series, state: dict) -> pd.Series: + # Slightly faster to check for the character if it's not present than to + # attempt the replacement + # if any("," in x for x in series): + # series = series.str.replace(",", "") + return series.astype(float) + + +@Float.register_relationship(Complex, pd.Series) +def complex_is_float(series: pd.Series, state: dict) -> bool: + return all(np.imag(series.values) == 0) + + +@Float.register_transformer(Complex, pd.Series) +def complex_to_float(series: pd.Series, state: dict) -> pd.Series: + return suppress_warnings(lambda s: s.astype(float))(series) + + +@Float.contains_op.register +@series_not_sparse +@series_handle_nulls +@series_not_empty +def float_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_float_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/geometry.py 
# TODO: Evaluate https://jorisvandenbossche.github.io/blog/2019/08/13/geopandas-extension-array-refactor/
@Geometry.register_relationship(String, pd.Series)
def string_is_geometry(sequence: pd.Series, state: dict) -> bool:
    """Check whether every value in the sequence loads as WKT.

    Shapely logs failures at a silly severity, so stderr is silenced while the
    values are parsed.

    Args:
        sequence: Values handed to ``shapely.wkt.loads`` one by one.
        state: Traversal state; not used by this check.
    Returns:
        True when every value loads and the loaded object is truthy; False when
        loading raises one of the handled errors.
    """
    import contextlib

    from shapely import wkt
    from shapely.errors import WKTReadingError

    # only way to get rid of sys output when wkt.loads hits a bad value
    # TODO: use coercion wrapper for this
    # The with-block closes the devnull handle, and redirect_stderr restores
    # whichever stream was active before (not unconditionally sys.__stderr__).
    with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
        try:
            return all(wkt.loads(value) for value in sequence)
        except (WKTReadingError, AttributeError, UnicodeEncodeError, TypeError):
            return False
import series_handle_nulls, series_not_empty +from visions.types.image import Image + + +@Image.contains_op.register +@series_not_empty +@series_handle_nulls +def image_contains(series: pd.Series, state: dict) -> bool: + return all(isinstance(p, Path) and p.exists() and imghdr.what(p) for p in series) diff --git a/build/lib/visions/backends/pandas/types/integer.py b/build/lib/visions/backends/pandas/types/integer.py new file mode 100644 index 000000000..b9f84089f --- /dev/null +++ b/build/lib/visions/backends/pandas/types/integer.py @@ -0,0 +1,38 @@ +import numpy as np +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import ( + series_handle_nulls, + series_not_empty, + series_not_sparse, +) +from visions.types.float import Float +from visions.types.integer import Integer + + +@Integer.register_relationship(Float, pd.Series) +@series_handle_nulls +def float_is_integer(series: pd.Series, state: dict) -> bool: + def check_equality(series): + try: + if not np.isfinite(series).all(): + return False + return series.eq(series.astype(int)).all() + except (ValueError, TypeError, AttributeError): + return False + + return check_equality(series) + + +@Integer.register_transformer(Float, pd.Series) +def float_to_integer(series: pd.Series, state: dict) -> pd.Series: + dtype = "Int64" if series.hasnans else np.int64 + return series.astype(dtype) + + +@Integer.contains_op.register +@series_not_sparse +@series_not_empty +def integer_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_integer_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/ip_address.py b/build/lib/visions/backends/pandas/types/ip_address.py new file mode 100644 index 000000000..41f1b293f --- /dev/null +++ b/build/lib/visions/backends/pandas/types/ip_address.py @@ -0,0 +1,26 @@ +from ipaddress import _BaseAddress, ip_address + +import pandas as pd + +from visions.backends.pandas import test_utils +from 
visions.backends.pandas.series_utils import series_handle_nulls, series_not_empty +from visions.backends.shared.parallelization_engines import pandas_apply +from visions.types.ip_address import IPAddress +from visions.types.string import String + + +@IPAddress.register_relationship(String, pd.Series) +def string_is_ip_address(series: pd.Series, state: dict) -> bool: + return test_utils.coercion_test(lambda s: pandas_apply(s, ip_address))(series) + + +@IPAddress.register_transformer(String, pd.Series) +def string_to_ip_address(series: pd.Series, state: dict) -> pd.Series: + return pandas_apply(series, ip_address) + + +@IPAddress.contains_op.register +@series_not_empty +@series_handle_nulls +def ip_address_contains(series: pd.Series, state: dict) -> bool: + return all(isinstance(x, _BaseAddress) for x in series) diff --git a/build/lib/visions/backends/pandas/types/numeric.py b/build/lib/visions/backends/pandas/types/numeric.py new file mode 100644 index 000000000..dff865a7b --- /dev/null +++ b/build/lib/visions/backends/pandas/types/numeric.py @@ -0,0 +1,12 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import series_not_empty, series_not_sparse +from visions.types.numeric import Numeric + + +@Numeric.contains_op.register +@series_not_sparse +@series_not_empty +def numeric_contains_op(series: pd.Series, state: dict) -> bool: + return pdt.is_numeric_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/object.py b/build/lib/visions/backends/pandas/types/object.py new file mode 100644 index 000000000..49352ff3d --- /dev/null +++ b/build/lib/visions/backends/pandas/types/object.py @@ -0,0 +1,26 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import ( + series_handle_nulls, + series_not_empty, + series_not_sparse, +) +from visions.types.object import Object + +pandas_has_string_dtype_flag = hasattr(pdt, "is_string_dtype") + + 
+@Object.contains_op.register +@series_not_sparse +@series_handle_nulls +@series_not_empty +def object_contains(series: pd.Series, state: dict) -> bool: + is_object = pdt.is_object_dtype(series) + if is_object: + ret = True + elif pandas_has_string_dtype_flag: + ret = pdt.is_string_dtype(series) and not pdt.is_categorical_dtype(series) + else: + ret = False + return ret diff --git a/build/lib/visions/backends/pandas/types/ordinal.py b/build/lib/visions/backends/pandas/types/ordinal.py new file mode 100644 index 000000000..4f253fd33 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/ordinal.py @@ -0,0 +1,17 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import series_not_empty +from visions.types.ordinal import Ordinal + +# @Ordinal.register_transformer(Categorical, pd.Series) +# def categorical_to_ordinal(series: pd.Series) -> pd.Categorical: +# return pd.Categorical( +# series, categories=sorted(series.dropna().unique()), ordered=True +# ) + + +@Ordinal.contains_op.register +@series_not_empty +def ordinal_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_categorical_dtype(series) and series.cat.ordered diff --git a/build/lib/visions/backends/pandas/types/path.py b/build/lib/visions/backends/pandas/types/path.py new file mode 100644 index 000000000..c373d1854 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/path.py @@ -0,0 +1,33 @@ +import pathlib + +import pandas as pd + +from visions.backends.pandas.series_utils import series_handle_nulls, series_not_empty +from visions.backends.shared.parallelization_engines import pandas_apply +from visions.types.path import Path +from visions.types.string import String + + +@Path.register_relationship(String, pd.Series) +def string_is_path(series: pd.Series, state: dict) -> bool: + try: + s = string_to_path(series.copy(), state) + return pandas_apply(s, lambda x: x.is_absolute()).all() + except TypeError: + return False + + 
+@Path.register_transformer(String, pd.Series) +def string_to_path(series: pd.Series, state: dict) -> pd.Series: + s = pandas_apply(series, pathlib.PureWindowsPath) + if not pandas_apply(s, lambda x: x.is_absolute()).all(): + return pandas_apply(series, pathlib.PurePosixPath) + else: + return s + + +@Path.contains_op.register +@series_not_empty +@series_handle_nulls +def path_contains(series: pd.Series, state: dict) -> bool: + return all(isinstance(x, pathlib.PurePath) and x.is_absolute() for x in series) diff --git a/build/lib/visions/backends/pandas/types/sparse.py b/build/lib/visions/backends/pandas/types/sparse.py new file mode 100644 index 000000000..331f5e179 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/sparse.py @@ -0,0 +1,9 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.types.sparse import Sparse + + +@Sparse.contains_op.register +def sparse_contains(series: pd.Series, state: dict) -> bool: + return pdt.is_sparse(series) diff --git a/build/lib/visions/backends/pandas/types/string.py b/build/lib/visions/backends/pandas/types/string.py new file mode 100644 index 000000000..fabc9c44e --- /dev/null +++ b/build/lib/visions/backends/pandas/types/string.py @@ -0,0 +1,33 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import ( + series_handle_nulls, + series_not_empty, + series_not_sparse, +) +from visions.types.string import String + +pandas_has_string_dtype_flag = hasattr(pdt, "is_string_dtype") + + +@series_handle_nulls +def _is_string(series: pd.Series, state: dict): + if not all(isinstance(v, str) for v in series.values[0:5]): + return False + try: + return (series.astype(str).values == series.values).all() + except (TypeError, ValueError): + return False + + +@String.contains_op.register +@series_not_sparse +@series_not_empty +def string_contains(series: pd.Series, state: dict) -> bool: + if pdt.is_categorical_dtype(series): + return False + elif not 
pdt.is_object_dtype(series): + return pandas_has_string_dtype_flag and pdt.is_string_dtype(series) + + return _is_string(series, state) diff --git a/build/lib/visions/backends/pandas/types/time.py b/build/lib/visions/backends/pandas/types/time.py new file mode 100644 index 000000000..8776212aa --- /dev/null +++ b/build/lib/visions/backends/pandas/types/time.py @@ -0,0 +1,29 @@ +from datetime import time + +import pandas as pd + +from visions.backends.pandas.series_utils import ( + class_name_attrs, + series_handle_nulls, + series_not_empty, +) +from visions.types.time import Time + +# @Time.register_relationship(DateTime, pd.Series) +# @series_handle_nulls +# def datetime_is_time(series: pd.Series) -> bool: +# dtseries = series.dt.date +# value = date(1, 1, 1) +# return all(v == value for v in dtseries) +# +# +# @Time.register_transformer(DateTime, pd.Series) +# def datetime_to_time(series: pd.Series, state: dict) -> pd.Series: +# return series.dt.time + + +@Time.contains_op.register +@series_handle_nulls +@series_not_empty +def time_contains(series: pd.Series, state: dict) -> bool: + return class_name_attrs(series, time, ["microsecond", "hour"]) diff --git a/build/lib/visions/backends/pandas/types/time_delta.py b/build/lib/visions/backends/pandas/types/time_delta.py new file mode 100644 index 000000000..cc6d0cc7a --- /dev/null +++ b/build/lib/visions/backends/pandas/types/time_delta.py @@ -0,0 +1,18 @@ +import pandas as pd +from pandas.api import types as pdt + +from visions.backends.pandas.series_utils import series_not_empty, series_not_sparse +from visions.types.time_delta import TimeDelta + + +@TimeDelta.contains_op.register +@series_not_sparse +@series_not_empty +def time_delta_contains(series: pd.Series, state: dict) -> bool: + """ + Example: + >>> x = pd.Series([pd.Timedelta(days=i) for i in range(3)]) + >>> x in visions.Timedelta + True + """ + return pdt.is_timedelta64_dtype(series) diff --git a/build/lib/visions/backends/pandas/types/url.py 
b/build/lib/visions/backends/pandas/types/url.py new file mode 100644 index 000000000..9ecd5a901 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/url.py @@ -0,0 +1,35 @@ +from urllib.parse import ParseResult, urlparse + +import pandas as pd + +from visions.backends.pandas.series_utils import ( + isinstance_attrs, + series_handle_nulls, + series_not_empty, +) +from visions.backends.shared.parallelization_engines import pandas_apply +from visions.types.string import String +from visions.types.url import URL + + +@URL.register_relationship(String, pd.Series) +@series_handle_nulls +def string_is_url(series: pd.Series, state: dict) -> bool: + try: + return pandas_apply( + string_to_url(series, state), lambda x: x.netloc and x.scheme + ).all() + except AttributeError: + return False + + +@URL.register_transformer(String, pd.Series) +def string_to_url(series: pd.Series, state: dict) -> pd.Series: + return pandas_apply(series, urlparse) + + +@URL.contains_op.register +@series_handle_nulls +@series_not_empty +def url_contains(series: pd.Series, state: dict) -> bool: + return isinstance_attrs(series, ParseResult, ["netloc", "scheme"]) diff --git a/build/lib/visions/backends/pandas/types/uuid.py b/build/lib/visions/backends/pandas/types/uuid.py new file mode 100644 index 000000000..b6e0e29c0 --- /dev/null +++ b/build/lib/visions/backends/pandas/types/uuid.py @@ -0,0 +1,33 @@ +import uuid + +import pandas as pd + +from visions.backends.pandas.series_utils import ( + isinstance_attrs, + series_handle_nulls, + series_not_empty, +) +from visions.backends.pandas.test_utils import coercion_true_test +from visions.backends.shared.parallelization_engines import pandas_apply +from visions.types.string import String +from visions.types.uuid import UUID + + +@UUID.register_relationship(String, pd.Series) +def uuid_is_string(series: pd.Series, state: dict) -> bool: + def f(s): + return pandas_apply(s, uuid.UUID) + + return coercion_true_test(f)(series) + + 
+@UUID.register_transformer(String, pd.Series) +def uuid_to_string(series: pd.Series, state: dict) -> pd.Series: + return pandas_apply(series, uuid.UUID) + + +@UUID.contains_op.register +@series_not_empty +@series_handle_nulls +def uuid_contains(series: pd.Series, state: dict) -> bool: + return isinstance_attrs(series, uuid.UUID, ["time_low", "hex"]) diff --git a/build/lib/visions/backends/python/__init__.py b/build/lib/visions/backends/python/__init__.py new file mode 100644 index 000000000..daf7b9291 --- /dev/null +++ b/build/lib/visions/backends/python/__init__.py @@ -0,0 +1 @@ +import visions.backends.python.types diff --git a/build/lib/visions/backends/python/sequences.py b/build/lib/visions/backends/python/sequences.py new file mode 100644 index 000000000..71eab8f1a --- /dev/null +++ b/build/lib/visions/backends/python/sequences.py @@ -0,0 +1,186 @@ +import datetime +import os +import uuid +from ipaddress import IPv4Address, IPv6Address +from pathlib import Path, PurePosixPath, PureWindowsPath +from typing import Dict, Sequence, cast +from urllib.parse import urlparse + +from visions.types.email_address import FQDA + + +def get_sequences() -> Dict[str, Sequence]: + base_path = Path(__file__).parent.parent.parent.absolute() + + sequences = { + "int_series": [1, 2, 3], + "int_range": range(10), + "int_series_boolean": [1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0], + "float_series": [1.0, 2.1, 3.0], + "float_series2": [1.0, 2.0, 3.0, 4.0], + "string_series": ["Patty", "Valentine"], + "string_unicode_series": ["mack", "the", "finger"], + "string_num": ["1.0", "2.0", "3.0"], + "string_flt": ["1.0", "45.67", "3.5"], + "string_bool_nan": ["True", "False", None], + "str_url": [ + "http://www.cwi.nl:80/%7Eguido/Python.html", + "https://github.com/dylan-profiling/hurricane", + ], + "path_series_windows_str": [ + r"C:\\home\\user\\file.txt", + r"C:\\home\\user\\test2.txt", + ], + "path_series_linux_str": [r"/home/user/file.txt", r"/home/user/test2.txt"], + 
"str_int_leading_zeros": ["0011", "12"], + "str_float_non_leading_zeros": ["0.0", "0.04", "0"], + "str_int_zeros": ["0.0", "0.000", "0", "2"], + "bool_series": [True, False], + "bool_nan_series": [True, False, None], + "str_complex": ["(1+1j)", "(2+2j)", "(10+100j)"], + "str_complex_nan": ["(1+1j)", "(2+2j)", "(10+100j)", "NaN"], + "complex_series_py": [complex(0, 0), complex(1, 2), complex(3, -1)], + "complex_series_py_float": [complex(0, 0), complex(1, 0), complex(3, 0)], + "string_date": ["1937-05-06", "20/4/2014"], + "timestamp_string_series": ["1941-05-24", "13/10/2016"], + "py_datetime_str": ["1941-05-24 00:05:00", "2016-10-13 00:10:00"], + "date": [ + datetime.date(2011, 1, 1), + datetime.date(2012, 1, 2), + datetime.date(2013, 1, 1), + ], + "time": [ + datetime.time(8, 43, 12), + datetime.time(9, 43, 12), + datetime.time(10, 43, 12), + ], + "path_series_linux": [ + PurePosixPath("/home/user/file.txt"), + PurePosixPath("/home/user/test2.txt"), + ], + "path_series_linux_missing": [ + PurePosixPath("/home/user/file.txt"), + PurePosixPath("/home/user/test2.txt"), + None, + ], + "path_series_windows": [ + PureWindowsPath("C:\\home\\user\\file.txt"), + PureWindowsPath("C:\\home\\user\\test2.txt"), + ], + "url_series": [ + urlparse("http://www.cwi.nl:80/%7Eguido/Python.html"), + urlparse("https://github.com/dylan-profiling/hurricane"), + ], + "url_none_series": [ + urlparse("http://www.cwi.nl:80/%7Eguido/Python.html"), + urlparse("https://github.com/dylan-profiling/hurricane"), + None, + ], + "uuid_series": [ + uuid.UUID("0b8a22ca-80ad-4df5-85ac-fa49c44b7ede"), + uuid.UUID("aaa381d6-8442-4f63-88c8-7c900e9a23c6"), + uuid.UUID("00000000-0000-0000-0000-000000000000"), + ], + "uuid_series_missing": [ + uuid.UUID("0b8a22ca-80ad-4df5-85ac-fa49c44b7ede"), + uuid.UUID("aaa381d6-8442-4f63-88c8-7c900e9a23c6"), + uuid.UUID("00000000-0000-0000-0000-000000000000"), + None, + ], + "uuid_series_str": [ + "0b8a22ca-80ad-4df5-85ac-fa49c44b7ede", + 
"aaa381d6-8442-4f63-88c8-7c900e9a23c6", + "00000000-0000-0000-0000-000000000000", + ], + "mixed_list[str,int]": [[1, ""], [2, "Rubin"], [3, "Carter"]], + "mixed_dict": [ + {"why": "did you"}, + {"bring him": "in for he"}, + {"aint": "the guy"}, + ], + "callable": [os.getcwd, os.stat, os.kill], + "module": [os, uuid], + "textual_float": ["1.1", "2"], + "textual_float_nan": ["1.1", "2", "NAN"], + "mixed_integer": ["a", 1], + "mixed_list": [[True], [False], [False]], + "ip_str": ["127.0.0.1", "127.0.0.1"], + "empty": [], + "ip": [IPv4Address("127.0.0.1"), IPv4Address("127.0.0.1")], + "ip_missing": [IPv4Address("127.0.0.1"), None, IPv4Address("127.0.0.1")], + "ip_mixed_v4andv6": [IPv6Address("0:0:0:0:0:0:0:1"), IPv4Address("127.0.0.1")], + "file_test_py": [ + Path(os.path.join(base_path, "test/series.py")).absolute(), + Path(os.path.join(base_path, "test/__init__.py")).absolute(), + Path(os.path.join(base_path, "test/utils.py")).absolute(), + ], + "file_mixed_ext": [ + Path(os.path.join(base_path, "py.typed")).absolute(), + Path(os.path.join(base_path, "test/data", "file.html")).absolute(), + Path(os.path.join(base_path, "test/series.py")).absolute(), + ], + "file_test_py_missing": [ + Path(os.path.join(base_path, "test/series.py")).absolute(), + None, + Path(os.path.join(base_path, "test/__init__.py")).absolute(), + None, + Path(os.path.join(base_path, "test/utils.py")).absolute(), + ], + "image_png": [ + Path( + os.path.join( + base_path, + "test/data", + "img.png", + ) + ).absolute(), + Path( + os.path.join( + base_path, + "test/data", + "img.jpeg", + ) + ).absolute(), + Path( + os.path.join( + base_path, + "test/data", + "img.jpg", + ) + ).absolute(), + ], + "image_png_missing": [ + Path( + os.path.join( + base_path, + "test/data", + "img.png", + ) + ).absolute(), + Path( + os.path.join( + base_path, + "test/data", + "img.jpeg", + ) + ).absolute(), + None, + Path( + os.path.join( + base_path, + "test/data", + "img.jpg", + ) + ).absolute(), + None, + ], + 
"email_address": [FQDA("test", "example.com"), FQDA("info", "example.eu")],
+        "email_address_missing": [
+            FQDA("test", "example.com"),
+            FQDA("info", "example.eu"),
+            None,
+        ],
+        "email_address_str": ["test@example.com", "info@example.eu"],
+    }
+    assert all(isinstance(v, Sequence) for v in sequences.values())
+    return cast(Dict[str, Sequence], sequences)
diff --git a/build/lib/visions/backends/python/series_utils.py b/build/lib/visions/backends/python/series_utils.py
new file mode 100644
index 000000000..c83643b13
--- /dev/null
+++ b/build/lib/visions/backends/python/series_utils.py
@@ -0,0 +1,27 @@
+import functools
+from typing import Callable, Sequence
+
+
+def sequence_not_empty(fn: Callable[..., bool]) -> Callable[..., bool]:
+    """Decorator to exclude empty series"""
+
+    @functools.wraps(fn)
+    def inner(sequence: Sequence, *args, **kwargs) -> bool:
+        if not any(True for _ in sequence):
+            return False
+
+        return fn(sequence, *args, **kwargs)
+
+    return inner
+
+
+def sequence_handle_none(fn: Callable[..., bool]) -> Callable[..., bool]:
+    """Decorator for nullable series: drops None entries before calling the wrapped predicate"""
+
+    @functools.wraps(fn)
+    def inner(sequence: Sequence, *args, **kwargs) -> bool:
+        # Remove only None; filter(None, ...) would also discard 0, "", False and 0.0.
+        sequence = tuple(value for value in sequence if value is not None)
+        return fn(sequence, *args, **kwargs)
+
+    return inner
diff --git a/build/lib/visions/backends/python/types/__init__.py b/build/lib/visions/backends/python/types/__init__.py
new file mode 100644
index 000000000..0b9192ccc
--- /dev/null
+++ b/build/lib/visions/backends/python/types/__init__.py
@@ -0,0 +1,22 @@
+import visions.backends.python.types.boolean
+import visions.backends.python.types.categorical
+import visions.backends.python.types.complex
+import visions.backends.python.types.count
+import visions.backends.python.types.date
+import visions.backends.python.types.date_time
+import visions.backends.python.types.email_address
+import visions.backends.python.types.file
+import visions.backends.python.types.float
+import visions.backends.python.types.geometry
+import visions.backends.python.types.image +import visions.backends.python.types.integer +import visions.backends.python.types.ip_address +import visions.backends.python.types.numeric +import visions.backends.python.types.object +import visions.backends.python.types.ordinal +import visions.backends.python.types.path +import visions.backends.python.types.string +import visions.backends.python.types.time +import visions.backends.python.types.time_delta +import visions.backends.python.types.url +import visions.backends.python.types.uuid diff --git a/build/lib/visions/backends/python/types/boolean.py b/build/lib/visions/backends/python/types/boolean.py new file mode 100644 index 000000000..8ece6320c --- /dev/null +++ b/build/lib/visions/backends/python/types/boolean.py @@ -0,0 +1,60 @@ +from typing import Dict, List, Sequence + +from visions.backends.python.series_utils import ( + sequence_handle_none, + sequence_not_empty, +) +from visions.types import Boolean, Object, String + + +def get_boolean_coercions(id: str) -> List[Dict]: + coercion_map = { + "default": [{"true": True, "false": False}], + "en": [ + {"true": True, "false": False}, + {"y": True, "n": False}, + {"yes": True, "no": False}, + ], + "nl": [ + {"true": True, "false": False}, + {"ja": True, "nee": False}, + {"j": True, "n": False}, + ], + } + return coercion_map[id] + + +@sequence_not_empty +@sequence_handle_none +def is_bool(sequence: Sequence, state: dict): + return all(isinstance(value, bool) for value in sequence) + + +def to_bool(sequence: Sequence, state: dict): + return map(bool, sequence) + + +@Boolean.register_relationship(Object, Sequence) +def object_is_bool(sequence: Sequence, state: dict) -> bool: + return is_bool(sequence, state) + + +@Boolean.register_transformer(Object, Sequence) +def object_to_bool(sequence: Sequence, state: dict) -> Sequence: + return to_bool(sequence, state) + + +@Boolean.register_relationship(String, Sequence) +@sequence_handle_none +def string_is_bool(sequence: 
Sequence, state: dict): + return all(value.lower() in {"true", "false"} for value in sequence) + + +@Boolean.register_transformer(String, Sequence) +def string_to_bool(sequence: Sequence, state: dict): + return map(lambda v: v.lower() == "true" if isinstance(v, str) else v, sequence) + + +@Boolean.contains_op.register +def boolean_contains(sequence: Sequence, state: dict) -> bool: + return is_bool(sequence, state) diff --git a/build/lib/visions/backends/python/types/categorical.py b/build/lib/visions/backends/python/types/categorical.py new file mode 100644 index 000000000..8299fc356 --- /dev/null +++ b/build/lib/visions/backends/python/types/categorical.py @@ -0,0 +1,8 @@ +from typing import Sequence + +from visions.types.categorical import Categorical + + +@Categorical.contains_op.register +def categorical_contains(sequence: Sequence, state: dict) -> bool: + return False diff --git a/build/lib/visions/backends/python/types/complex.py b/build/lib/visions/backends/python/types/complex.py new file mode 100644 index 000000000..71febe9fb --- /dev/null +++ b/build/lib/visions/backends/python/types/complex.py @@ -0,0 +1,26 @@ +from typing import Sequence + +from visions.backends.python.series_utils import sequence_not_empty +from visions.backends.python.types.float import no_leading_zeros +from visions.types.complex import Complex +from visions.types.string import String + + +@Complex.register_relationship(String, Sequence) +def string_is_complex(sequence: Sequence, state: dict) -> bool: + try: + coerced = list(string_to_complex(sequence, state)) + return no_leading_zeros(sequence, [r.real for r in coerced]) + except (ValueError, TypeError, AttributeError): + return False + + +@Complex.register_transformer(String, Sequence) +def string_to_complex(sequence: Sequence, state: dict) -> Sequence: + return list(map(complex, sequence)) + + +@Complex.contains_op.register +@sequence_not_empty +def complex_contains(sequence: Sequence, state: dict) -> bool: + return 
all(isinstance(value, complex) for value in sequence)
diff --git a/build/lib/visions/backends/python/types/count.py b/build/lib/visions/backends/python/types/count.py
new file mode 100644
index 000000000..22078aff9
--- /dev/null
+++ b/build/lib/visions/backends/python/types/count.py
@@ -0,0 +1,8 @@
+from typing import Sequence
+
+from visions.types.count import Count
+
+
+@Count.contains_op.register
+def count_contains(sequence: Sequence, state: dict) -> bool:
+    return all(isinstance(value, int) and value >= 0 for value in sequence)
diff --git a/build/lib/visions/backends/python/types/date.py b/build/lib/visions/backends/python/types/date.py
new file mode 100644
index 000000000..da2828c97
--- /dev/null
+++ b/build/lib/visions/backends/python/types/date.py
@@ -0,0 +1,23 @@
+from datetime import date, time
+from typing import Sequence
+
+from visions.types.date import Date
+from visions.types.date_time import DateTime
+
+
+@Date.register_relationship(DateTime, Sequence)
+def datetime_is_date(sequence: Sequence, state: dict) -> bool:
+    """Return True when every element's time-of-day equals midnight (00:00)."""
+    midnight = time(0, 0)
+    # Compare the time-of-day part: a datetime object never compares equal to a time object.
+    return all(v.time() == midnight for v in sequence)
+
+
+@Date.register_transformer(DateTime, Sequence)
+def datetime_to_date(sequence: Sequence, state: dict) -> Sequence:
+    return tuple(map(lambda v: v.date(), sequence))
+
+
+@Date.contains_op.register
+def date_contains(sequence: Sequence, state: dict) -> bool:
+    return all(isinstance(value, date) for value in sequence)
diff --git a/build/lib/visions/backends/python/types/date_time.py b/build/lib/visions/backends/python/types/date_time.py
new file mode 100644
index 000000000..4a1a418e7
--- /dev/null
+++ b/build/lib/visions/backends/python/types/date_time.py
@@ -0,0 +1,30 @@
+from datetime import datetime
+from typing import Sequence
+
+from visions.backends.python.series_utils import sequence_not_empty
+from visions.types.date_time import DateTime
+from visions.types.string import String
+
+
+@DateTime.register_relationship(String, Sequence)
+def string_is_datetime(sequence:
Sequence, state: dict) -> bool: + try: + _ = list(string_to_datetime(sequence, state)) + return True + except (OverflowError, TypeError, ValueError): + return False + + +@DateTime.register_transformer(String, Sequence) +def string_to_datetime(sequence: Sequence, state: dict) -> Sequence: + """ + Python 3.7+ + return map(datetime.fromisoformat, sequence) + """ + return tuple(map(lambda s: datetime.strptime(s, "%Y-%m-%d %H:%M:%S"), sequence)) + + +@DateTime.contains_op.register +@sequence_not_empty +def datetime_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(value, datetime) for value in sequence) diff --git a/build/lib/visions/backends/python/types/email_address.py b/build/lib/visions/backends/python/types/email_address.py new file mode 100644 index 000000000..bb156fec6 --- /dev/null +++ b/build/lib/visions/backends/python/types/email_address.py @@ -0,0 +1,24 @@ +from typing import Sequence + +from visions.types.email_address import FQDA, EmailAddress, _to_email +from visions.types.string import String + + +@EmailAddress.register_relationship(String, Sequence) +def string_is_email(sequence: Sequence, state: dict) -> bool: + try: + return all( + value.local and value.fqdn for value in string_to_email(sequence, state) + ) + except (ValueError, TypeError, AttributeError): + return False + + +@EmailAddress.register_transformer(String, Sequence) +def string_to_email(sequence: Sequence, state: dict) -> Sequence: + return tuple(map(_to_email, sequence)) + + +@EmailAddress.contains_op.register +def email_address_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(value, FQDA) for value in sequence) diff --git a/build/lib/visions/backends/python/types/file.py b/build/lib/visions/backends/python/types/file.py new file mode 100644 index 000000000..2886a3bce --- /dev/null +++ b/build/lib/visions/backends/python/types/file.py @@ -0,0 +1,9 @@ +import pathlib +from typing import Sequence + +from visions.types.file import File + + 
+@File.contains_op.register +def file_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(p, pathlib.Path) and p.exists() for p in sequence) diff --git a/build/lib/visions/backends/python/types/float.py b/build/lib/visions/backends/python/types/float.py new file mode 100644 index 000000000..4559b9883 --- /dev/null +++ b/build/lib/visions/backends/python/types/float.py @@ -0,0 +1,43 @@ +from typing import Sequence + +from visions.backends.python.series_utils import sequence_not_empty +from visions.types.complex import Complex +from visions.types.float import Float +from visions.types.string import String + + +def no_leading_zeros(sequence, coerced_sequence) -> bool: + return not any(s[0] == "0" and c > 1 for s, c in zip(sequence, coerced_sequence)) + + +@Float.register_relationship(String, Sequence) +def string_is_float(sequence: Sequence, state: dict) -> bool: + try: + coerced = list(string_to_float(sequence, state)) + return no_leading_zeros(sequence, coerced) + except ValueError: + return False + + +@Float.register_transformer(String, Sequence) +def string_to_float(sequence: Sequence, state: dict) -> Sequence: + return tuple(map(float, sequence)) + + +@Float.register_relationship(Complex, Sequence) +def complex_is_float(sequence: Sequence, state: dict) -> bool: + try: + return all(value.imag == 0 for value in sequence) + except ValueError: + return False + + +@Float.register_transformer(Complex, Sequence) +def complex_to_float(sequence: Sequence, state: dict) -> Sequence: + return list(map(lambda v: v.real, sequence)) + + +@Float.contains_op.register +@sequence_not_empty +def float_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(value, float) for value in sequence) diff --git a/build/lib/visions/backends/python/types/geometry.py b/build/lib/visions/backends/python/types/geometry.py new file mode 100644 index 000000000..5dd6cd17c --- /dev/null +++ b/build/lib/visions/backends/python/types/geometry.py @@ -0,0 +1,38 
@@ +import os +import sys +from typing import Sequence + +from visions.types.geometry import Geometry +from visions.types.string import String + + +@Geometry.register_relationship(String, Sequence) +def string_is_geometry(sequence: Sequence, state: dict) -> bool: + """Shapely logs failures at a silly severity, just trying to suppress it's output on failures.""" + from shapely import wkt + from shapely.errors import WKTReadingError + + # only way to get rid of sys output when wkt.loads hits a bad value + # TODO: use coercion wrapper for this + sys.stderr = open(os.devnull, "w") + try: + result = all(wkt.loads(value) for value in sequence) + except (WKTReadingError, AttributeError, UnicodeEncodeError, TypeError): + result = False + finally: + sys.stderr = sys.__stderr__ + return result + + +@Geometry.register_transformer(String, Sequence) +def string_to_geometry(sequence: Sequence, state: dict) -> Sequence: + from shapely import wkt + + return tuple(map(wkt.loads, sequence)) + + +@Geometry.contains_op.register +def geometry_contains(sequence: Sequence, state: dict) -> bool: + from shapely.geometry.base import BaseGeometry + + return all(issubclass(type(x), BaseGeometry) for x in sequence) diff --git a/build/lib/visions/backends/python/types/image.py b/build/lib/visions/backends/python/types/image.py new file mode 100644 index 000000000..f5ff00296 --- /dev/null +++ b/build/lib/visions/backends/python/types/image.py @@ -0,0 +1,12 @@ +import imghdr +import pathlib +from typing import Sequence + +from visions.types.image import Image + + +@Image.contains_op.register +def image_contains(sequence: Sequence, state: dict) -> bool: + return all( + isinstance(p, pathlib.Path) and p.exists() and imghdr.what(p) for p in sequence + ) diff --git a/build/lib/visions/backends/python/types/integer.py b/build/lib/visions/backends/python/types/integer.py new file mode 100644 index 000000000..79d697e58 --- /dev/null +++ b/build/lib/visions/backends/python/types/integer.py @@ -0,0 +1,26 
@@ +from typing import Sequence + +from visions.backends.python.series_utils import sequence_not_empty +from visions.types.float import Float +from visions.types.integer import Integer + + +@Integer.register_relationship(Float, Sequence) +def float_is_int(sequence: Sequence, state: dict) -> bool: + try: + return all(int(value) == value for value in sequence) + except (ValueError, TypeError, OverflowError): + return False + + +@Integer.register_transformer(Float, Sequence) +def float_to_int(sequence: Sequence, state: dict) -> Sequence: + return tuple(map(int, sequence)) + + +@Integer.contains_op.register +@sequence_not_empty +def integer_contains(sequence: Sequence, state: dict) -> bool: + return all( + isinstance(value, int) and not isinstance(value, bool) for value in sequence + ) diff --git a/build/lib/visions/backends/python/types/ip_address.py b/build/lib/visions/backends/python/types/ip_address.py new file mode 100644 index 000000000..cba9dbd5c --- /dev/null +++ b/build/lib/visions/backends/python/types/ip_address.py @@ -0,0 +1,24 @@ +from ipaddress import _BaseAddress, ip_address +from typing import Sequence + +from visions.types.ip_address import IPAddress +from visions.types.string import String + + +@IPAddress.register_relationship(String, Sequence) +def string_is_ip_address(sequence: Sequence, state: dict) -> bool: + try: + _ = list(string_to_ip_address(sequence, state)) + return True + except (ValueError, TypeError, AttributeError): + return False + + +@IPAddress.register_transformer(String, Sequence) +def string_to_ip_address(sequence: Sequence, state: dict) -> Sequence: + return tuple(map(ip_address, sequence)) + + +@IPAddress.contains_op.register +def ip_address_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(x, _BaseAddress) for x in sequence) diff --git a/build/lib/visions/backends/python/types/numeric.py b/build/lib/visions/backends/python/types/numeric.py new file mode 100644 index 000000000..12e3e4aac --- /dev/null +++ 
b/build/lib/visions/backends/python/types/numeric.py @@ -0,0 +1,12 @@ +import numbers +from typing import Sequence + +from visions.types.numeric import Numeric + + +@Numeric.contains_op.register +def numeric_contains_op(sequence: Sequence, state: dict): + return all( + isinstance(value, numbers.Number) and not isinstance(value, bool) + for value in sequence + ) diff --git a/build/lib/visions/backends/python/types/object.py b/build/lib/visions/backends/python/types/object.py new file mode 100644 index 000000000..5d59bad82 --- /dev/null +++ b/build/lib/visions/backends/python/types/object.py @@ -0,0 +1,14 @@ +from typing import Sequence + +from visions.backends.python.series_utils import ( + sequence_handle_none, + sequence_not_empty, +) +from visions.types.object import Object + + +@Object.contains_op.register +@sequence_not_empty +@sequence_handle_none +def object_contains(sequence: Sequence, state: dict) -> bool: + return any(not isinstance(value, (float, bool, int, complex)) for value in sequence) diff --git a/build/lib/visions/backends/python/types/ordinal.py b/build/lib/visions/backends/python/types/ordinal.py new file mode 100644 index 000000000..dfd500652 --- /dev/null +++ b/build/lib/visions/backends/python/types/ordinal.py @@ -0,0 +1,8 @@ +from typing import Sequence + +from visions.types.ordinal import Ordinal + + +@Ordinal.contains_op.register +def ordinal_contains(sequence: Sequence, state: dict) -> bool: + return False diff --git a/build/lib/visions/backends/python/types/path.py b/build/lib/visions/backends/python/types/path.py new file mode 100644 index 000000000..9cada4e1c --- /dev/null +++ b/build/lib/visions/backends/python/types/path.py @@ -0,0 +1,28 @@ +import pathlib +from typing import Sequence + +from visions.types.path import Path +from visions.types.string import String + + +@Path.register_relationship(String, Sequence) +def string_is_path(series, state: dict) -> bool: + try: + s = string_to_path(series.copy(), state) + return 
all(value.is_absolute() for value in s) + except TypeError: + return False + + +@Path.register_transformer(String, Sequence) +def string_to_path(sequence: Sequence, state: dict) -> Sequence: + s = tuple(map(pathlib.PureWindowsPath, sequence)) + if not all(value.is_absolute() for value in s): + return tuple(map(pathlib.PurePosixPath, sequence)) + else: + return s + + +@Path.contains_op.register +def path_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(x, pathlib.PurePath) and x.is_absolute() for x in sequence) diff --git a/build/lib/visions/backends/python/types/string.py b/build/lib/visions/backends/python/types/string.py new file mode 100644 index 000000000..29bc952bb --- /dev/null +++ b/build/lib/visions/backends/python/types/string.py @@ -0,0 +1,14 @@ +from typing import Sequence + +from visions.backends.python.series_utils import ( + sequence_handle_none, + sequence_not_empty, +) +from visions.types.string import String + + +@String.contains_op.register +@sequence_not_empty +@sequence_handle_none +def string_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(v, str) for v in sequence) diff --git a/build/lib/visions/backends/python/types/time.py b/build/lib/visions/backends/python/types/time.py new file mode 100644 index 000000000..407b482a1 --- /dev/null +++ b/build/lib/visions/backends/python/types/time.py @@ -0,0 +1,20 @@ +from datetime import time +from typing import Sequence + +# from visions.types.date_time import DateTime +from visions.types.time import Time + +# @Time.register_relationship(DateTime, Sequence) +# def datetime_is_time(sequence: Sequence, state: dict) -> bool: +# value = date(1, 1, 1) +# return all(v == value for v in sequence) +# +# +# @Time.register_transformer(DateTime, Sequence) +# def datetime_to_time(sequence: Sequence, state: dict) -> Sequence: +# return map(lambda v: v.time(), sequence) + + +@Time.contains_op.register +def time_contains(sequence: Sequence, state: dict) -> bool: + 
return all(isinstance(value, time) for value in sequence) diff --git a/build/lib/visions/backends/python/types/time_delta.py b/build/lib/visions/backends/python/types/time_delta.py new file mode 100644 index 000000000..e35c0e916 --- /dev/null +++ b/build/lib/visions/backends/python/types/time_delta.py @@ -0,0 +1,11 @@ +from datetime import timedelta +from typing import Sequence + +from visions.backends.python.series_utils import sequence_not_empty +from visions.types.time_delta import TimeDelta + + +@TimeDelta.contains_op.register +@sequence_not_empty +def time_delta_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(value, timedelta) for value in sequence) diff --git a/build/lib/visions/backends/python/types/url.py b/build/lib/visions/backends/python/types/url.py new file mode 100644 index 000000000..aa5c880d8 --- /dev/null +++ b/build/lib/visions/backends/python/types/url.py @@ -0,0 +1,23 @@ +from typing import Sequence +from urllib.parse import ParseResult, urlparse + +from visions.types.string import String +from visions.types.url import URL + + +@URL.contains_op.register +def url_contains(sequence: Sequence, state: dict) -> bool: + return all(isinstance(value, ParseResult) for value in sequence) + + +@URL.register_transformer(String, Sequence) +def string_to_url(sequence: Sequence, state: dict) -> Sequence: + return tuple(map(urlparse, sequence)) + + +@URL.register_relationship(String, Sequence) +def string_is_url(sequence: Sequence, state: dict) -> bool: + try: + return all(x.netloc and x.scheme for x in string_to_url(sequence, {})) + except (ValueError, TypeError, AttributeError): + return False diff --git a/build/lib/visions/backends/python/types/uuid.py b/build/lib/visions/backends/python/types/uuid.py new file mode 100644 index 000000000..f7672fe29 --- /dev/null +++ b/build/lib/visions/backends/python/types/uuid.py @@ -0,0 +1,24 @@ +import uuid +from typing import Sequence + +from visions.types.string import String +from 
visions.types.uuid import UUID
+
+
+@UUID.contains_op.register
+def uuid_contains(sequence: Sequence, state: dict) -> bool:
+    return all(isinstance(value, uuid.UUID) for value in sequence)
+
+
+@UUID.register_transformer(String, Sequence)
+def string_to_uuid(sequence: Sequence, state: dict) -> Sequence:
+    return [uuid.UUID(value) for value in sequence]
+
+
+@UUID.register_relationship(String, Sequence)
+def string_is_uuid(sequence: Sequence, state: dict) -> bool:
+    try:
+        string_to_uuid(sequence, state)  # forward state: string_to_uuid is declared with (sequence, state)
+        return True
+    except (ValueError, TypeError, AttributeError):
+        return False
diff --git a/build/lib/visions/backends/shared/__init__.py b/build/lib/visions/backends/shared/__init__.py
new file mode 100644
index 000000000..a66517c65
--- /dev/null
+++ b/build/lib/visions/backends/shared/__init__.py
@@ -0,0 +1 @@
+from . import nan_handling, parallelization_engines, utilities
diff --git a/build/lib/visions/backends/shared/nan_handling.py b/build/lib/visions/backends/shared/nan_handling.py
new file mode 100644
index 000000000..cc2e6dcea
--- /dev/null
+++ b/build/lib/visions/backends/shared/nan_handling.py
@@ -0,0 +1,60 @@
+import math
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
+
+from .utilities import has_import
+
+has_numba = has_import("numba")
+
+if has_numba:
+    import numba as nb
+
+
+def nan_mask(array: np.ndarray) -> np.ndarray:
+    # TODO: Fails for values like None, pandas resolves this but it's complicated some links:
+    # https://github.com/pandas-dev/pandas/blob/3391a348f3f7cd07a96c8e6a4b05e3e9f60c8567/pandas/core/series.py#L192
+    # https://github.com/pandas-dev/pandas/blob/65319af6e563ccbb02fb5152949957b6aef570ef/pandas/core/base.py#L816
+    # https://github.com/pandas-dev/pandas/blob/65319af6e563ccbb02fb5152949957b6aef570ef/pandas/core/dtypes/missing.py#L133
+    # https://github.com/pandas-dev/pandas/blob/65319af6e563ccbb02fb5152949957b6aef570ef/pandas/core/dtypes/missing.py#L202
+    # raise NotImplementedError('Robust missing
value detection not implemented for numpy arrays') + try: + mask = ~np.isnan(array) + except TypeError: + # mask = np.array([not pd. for v in array], dtype=bool) + mask = ~pd.isna(array) + return mask + + +# TODO: There are optimizations here, just have to define precisely the desired missing ruleset in the +# generated jit +if has_numba: + + def is_missing(x): + """ + Return True if the value is missing, False otherwise. + """ + if isinstance(x, float): + return np.isnan(x) + elif isinstance(x, (datetime, timedelta)): + missing = x("NaT") + return x == missing + elif x is None: + return True + else: + return False + + nb.extending.overload(is_missing)(lambda x: is_missing) + + @nb.jit(nopython=True) + def hasna(x: np.ndarray) -> bool: + for item in x: + if is_missing(item): + return True + return False + +else: + + def anynan(array: np.ndarray) -> bool: + return any(math.isnan(v) for v in array) diff --git a/build/lib/visions/backends/shared/parallelization_engines.py b/build/lib/visions/backends/shared/parallelization_engines.py new file mode 100644 index 000000000..f08970501 --- /dev/null +++ b/build/lib/visions/backends/shared/parallelization_engines.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Type + +import attr +import pandas as pd + +from visions.backends.shared.utilities import has_import + + +@attr.s +class Engine: + name = attr.ib() + + @classmethod + def setup(cls, *args, **kwargs) -> None: + raise NotImplementedError("No setup defined for generic engine") + + @staticmethod + def apply(series: pd.Series) -> Callable[[Callable], pd.Series]: + raise NotImplementedError("No apply defined for generic engine") + + +class PandasEngine(Engine): + name = "pandas" + _is_setup = True + + @classmethod + def setup(cls, *args, **kwargs) -> None: + pass + + @staticmethod + def apply(series: pd.Series) -> Callable[[Callable], pd.Series]: + return series.apply + + +class SwifterEngine(Engine): + name = "swifter" + _is_setup = False + + @classmethod + def 
def has_import(module: str) -> bool:
    """Return whether ``module`` can be located by the import system.

    Args:
        module: Absolute module name, e.g. ``"swifter"`` or
            ``"package.submodule"``.

    Returns:
        True when ``importlib.util.find_spec`` finds a spec for the module,
        False when no spec is found or the parent package of a dotted name
        cannot be imported.
    """
    try:
        spec = import_util.find_spec(module)
    except ModuleNotFoundError:
        # For a dotted name find_spec imports the parent package first and
        # raises when that parent is missing; the module is simply unavailable.
        return False
    return spec is not None
@Boolean.contains_op.register
def boolean_contains(sequence: DataFrame, state: dict) -> bool:
    """Check whether a single-column Spark DataFrame holds a boolean column.

    Args:
        sequence: Spark DataFrame to inspect.
        state: Not used by this check.

    Returns:
        True only if the schema has exactly one field and that field's data
        type is a ``BooleanType``; False otherwise.
    """
    schema = sequence.schema
    # Anything other than exactly one column is rejected outright.
    return len(schema) == 1 and isinstance(schema[0].dataType, BooleanType)
@Float.contains_op.register
def float_contains(sequence: DataFrame, state: dict) -> bool:
    """Check whether a single-column Spark DataFrame holds a floating column.

    Args:
        sequence: Spark DataFrame to inspect.
        state: Not used by this check.

    Returns:
        True only if the schema has exactly one field and that field's data
        type is a ``FloatType``, ``DoubleType`` or ``DecimalType``.
    """
    schema = sequence.schema
    if len(schema) == 1:
        accepted = (FloatType, DoubleType, DecimalType)
        return isinstance(schema[0].dataType, accepted)
    # Zero or several columns never count as a float sequence.
    return False
@Object.contains_op.register
def object_contains(sequence: DataFrame, state: dict) -> bool:
    """Check whether a single-column Spark DataFrame holds an object-like column.

    Args:
        sequence: Spark DataFrame to inspect.
        state: Not used by this check.

    Returns:
        True only if the schema has exactly one field and that field's data
        type is one of ``StringType``, ``DateType``, ``ArrayType``,
        ``MapType`` or ``StructType``.
    """
    schema = sequence.schema
    if len(schema) != 1:
        return False

    object_like = (StringType, DateType, ArrayType, MapType, StructType)
    column_type = schema[0].dataType
    return isinstance(column_type, object_like)
Guidelines + +Contributions here will go through a standard review process +to be promoted to standard types. Contributions made +to the contrib folder will also receive help / guidance whenever +requested. + + +TODO: +* Add reference in the docs \ No newline at end of file diff --git a/build/lib/visions/contrib/__init__.py b/build/lib/visions/contrib/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/contrib/relations/__init__.py b/build/lib/visions/contrib/relations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/contrib/relations/categorical_to_ordinal.py b/build/lib/visions/contrib/relations/categorical_to_ordinal.py new file mode 100644 index 000000000..5284f2243 --- /dev/null +++ b/build/lib/visions/contrib/relations/categorical_to_ordinal.py @@ -0,0 +1,27 @@ +# import pandas as pd +# +# from visions.relations.relations import InferenceRelation +# from visions.relations.relations_utils import values_are_consecutive +# from visions.types.ordinal import to_ordinal +# from visions.utils import func_nullable_series_contains +# +# +# @func_nullable_series_contains +# def is_ordinal_cat(series: pd.Series, state: dict) -> bool: +# initial_element = "a" +# s = series.astype(str) +# if s.str.len().max() == 1: +# distinct_values = list(s.str.lower().unique()) +# return initial_element in distinct_values and values_are_consecutive( +# list(map(ord, distinct_values)) +# ) +# else: +# return False +# +# +# def categorical_to_ordinal() -> InferenceRelation: +# from visions.types import Categorical +# +# return InferenceRelation( +# Categorical, relationship=is_ordinal_cat, transformer=to_ordinal +# ) diff --git a/build/lib/visions/contrib/relations/integer_to_count.py b/build/lib/visions/contrib/relations/integer_to_count.py new file mode 100644 index 000000000..4c64cf7ae --- /dev/null +++ b/build/lib/visions/contrib/relations/integer_to_count.py @@ -0,0 +1,22 @@ +import numpy as np +import 
def is_unsigned_int(series: pd.Series, state: dict) -> bool:
    """Tell whether every value in ``series`` is greater than or equal to zero.

    Args:
        series: Values to check.
        state: Not used by this check.

    Returns:
        The element-wise ``>= 0`` comparison reduced with ``all``.
    """
    # TODO: add coercion, ensure that > uint.MAX raises error
    non_negative = series >= 0
    return non_negative.all()
def values_are_consecutive(sequence: Sequence) -> bool:
    """Check that ``sequence`` holds exactly the integers ``min..max``.

    The sorted values are compared with ``range(min, max + 1)``, so the
    order of the input does not matter, but repeated or missing values make
    the check fail.

    Args:
        sequence: Values to check.

    Returns:
        True when the sorted values equal the full run from the smallest to
        the largest value, False otherwise.
    """
    ordered = sorted(sequence)
    lowest, highest = min(sequence), max(sequence)
    expected = list(range(lowest, highest + 1))
    return ordered == expected
+# +# Returns: +# relation +# """ +# # TODO: only when not any other string relation (either exclude others or have ordering and evaluate last) +# return InferenceRelation( +# relationship=lambda s, state: s.nunique() / len(s) < 0.5, +# transformer=lambda s: s.astype("category"), +# related_type=String, +# ) diff --git a/build/lib/visions/contrib/relations/string_to_datetime.py b/build/lib/visions/contrib/relations/string_to_datetime.py new file mode 100644 index 000000000..5fb6c80fb --- /dev/null +++ b/build/lib/visions/contrib/relations/string_to_datetime.py @@ -0,0 +1,64 @@ +# import pandas as pd +# +# from visions import String +# from visions.backends.pandas_be import test_utils +# from visions.relations import InferenceRelation +# +# +# def to_datetime_year_week(series: pd.Series) -> pd.Series: +# """Convert a series of the format YYYY/UU (year, week) to datetime. +# A '0' is added as day dummy value, as pandas requires a day value to parse. +# +# Args: +# series: the Series to parse +# +# Returns: +# A datetime series +# +# Examples: +# >>> series = pd.Series(['2018/47', '2018/12', '2018/03']) +# >>> parsed_series = to_datetime_year_week(series) +# >>> print(parsed_series.dt.week) +# 0 47 +# 1 12 +# 2 3 +# dtype: int64 +# """ +# return pd.to_datetime(series + "0", format="%Y/%U%w") +# +# +# def to_datetime_year_month_day(series: pd.Series) -> pd.Series: +# """Convert a series of the format YYYYMMDD (year, month, day) to datetime. 
+# +# Args: +# series: the Series to parse +# +# Returns: +# A datetime series +# +# Examples: +# >>> series = pd.Series(['20181201', '20181202', '20181203']) +# >>> parsed_series = to_datetime_year_week(series) +# >>> print(parsed_series.dt.day) +# 0 1 +# 1 2 +# 2 3 +# dtype: int64 +# """ +# return pd.to_datetime(series, format="%Y%m%d") +# +# +# def _to_datetime(func) -> InferenceRelation: +# return InferenceRelation( +# relationship=test_utils.coercion_test(func), +# transformer=func, +# related_type=String, +# ) +# +# +# def string_to_datetime_year_week() -> InferenceRelation: +# return _to_datetime(to_datetime_year_week) +# +# +# def string_to_datetime_year_month_day() -> InferenceRelation: +# return _to_datetime(to_datetime_year_month_day) diff --git a/build/lib/visions/contrib/relations/string_to_ordinal.py b/build/lib/visions/contrib/relations/string_to_ordinal.py new file mode 100644 index 000000000..2815cdd8e --- /dev/null +++ b/build/lib/visions/contrib/relations/string_to_ordinal.py @@ -0,0 +1,25 @@ +# import pandas as pd +# +# from visions.relations.relations import InferenceRelation +# from visions.relations.relations_utils import values_are_consecutive +# from visions.types.ordinal import to_ordinal +# +# +# def is_ordinal_str(s: pd.Series, state: dict) -> bool: +# if s.str.len().max() == 1: +# unique_values = list(s[s.notna()].str.lower().unique()) +# return "a" in unique_values and values_are_consecutive( +# list(map(ord, unique_values)) +# ) +# else: +# return False +# +# +# def string_to_ordinal() -> InferenceRelation: +# from visions.types import String +# +# return InferenceRelation( +# related_type=String, +# relationship=is_ordinal_str, +# transformer=to_ordinal, +# ) diff --git a/build/lib/visions/contrib/types/__init__.py b/build/lib/visions/contrib/types/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/contrib/typesets/__init__.py b/build/lib/visions/contrib/typesets/__init__.py new file mode 
def process_relation(items: Union[dict, Type[VisionsBaseType]]) -> IdentityRelation:
    """Build an ``IdentityRelation`` from a parameter dict or a related type.

    Args:
        items: Either a dict of keyword arguments for ``IdentityRelation`` or
            a ``VisionsBaseType`` subclass to use as ``related_type``.

    Returns:
        The constructed ``IdentityRelation``.

    Raises:
        TypeError: If ``items`` is neither a dict nor a ``VisionsBaseType``
            subclass.
    """
    if isinstance(items, dict):
        return IdentityRelation(**items)

    # issubclass() raises its own generic TypeError for non-class arguments,
    # so check for a class first to make sure the descriptive error is used.
    if isinstance(items, type) and issubclass(items, VisionsBaseType):
        return IdentityRelation(related_type=items)

    raise TypeError("identity should be a list, a dict of params or related_type.")
file mode 100644 index 000000000..508b20659 --- /dev/null +++ b/build/lib/visions/dtypes/boolean.py @@ -0,0 +1,776 @@ +import numbers +import warnings +from typing import Type + +import numpy as np +import pandas +from pandas._libs import lib +from pandas.compat import set_function_name +from pandas.core import nanops, ops +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import register_extension_dtype + +if tuple(map(int, pandas.__version__.split("."))) < (1, 3): + from pandas.core.dtypes.generic import ABCIndexClass + + dtg = ABCIndexClass +else: + from pandas.core.dtypes.generic import ABCIndex + + dtg = ABCIndex + +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import isna, notna +from pandas.core.tools.numeric import to_numeric +from pandas.util._decorators import cache_readonly + + +class _BoolDtype(ExtensionDtype): + """ + An ExtensionDtype to hold a single size & kind of integer dtype. + + These specific implementations are subclasses of the non-public + _BoolDtype. For example we have Int8Dtype to represent signed int 8s. + + The attributes name & type are set when these subclasses are created. 
def safe_cast(values, dtype, copy):
    """Cast ``values`` to ``dtype`` only when no value changes.

    The cast is first attempted under NumPy's ``"safe"`` casting rule. If
    NumPy rejects it with a ``TypeError``, a plain cast is performed and
    accepted only when every converted element still compares equal to the
    original.

    Args:
        values: Array to convert.
        dtype: Target dtype.
        copy: Forwarded to ``astype``.

    Returns:
        The converted array.

    Raises:
        TypeError: If the plain cast changes at least one value.
    """
    try:
        result = values.astype(dtype, casting="safe", copy=copy)
    except TypeError:
        # Not allowed under the "safe" rule: cast anyway, then verify that
        # the round trip left every element equal to its original.
        converted = values.astype(dtype, copy=copy)
        if not (converted == values).all():
            raise TypeError(
                "cannot safely cast non-equivalent {} to {}".format(
                    values.dtype, np.dtype(dtype)
                )
            )
        return converted
    return result
np.array(values, dtype=int, copy=copy) + + elif not ( + is_integer_dtype(values) or is_float_dtype(values) or is_bool_dtype(values) + ): + raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + + if mask is None: + mask = isna(values) + else: + assert len(mask) == len(values) + + if not values.ndim == 1: + raise TypeError("values must be a 1D list-like") + if not mask.ndim == 1: + raise TypeError("mask must be a 1D list-like") + + # infer dtype if needed + if dtype is None: + dtype = np.dtype("bool") + else: + dtype = dtype.type + + # if we are float, let's make sure that we can + # safely cast + + # we copy as need to coerce here + if mask.any(): + values = values.copy() + values[mask] = 1 + values = safe_cast(values, dtype, copy=False) + else: + values = safe_cast(values, dtype, copy=False) + + return values, mask + + +class BoolArray(ExtensionArray, ExtensionOpsMixin): + """ + Array of integer (optional missing) values. + + .. versionadded:: 0.24.0 + + .. warning:: + + BoolArray is currently experimental, and its API or internal + implementation may change without warning. + + We represent an BoolArray with 2 numpy arrays: + + - data: contains a numpy integer array of the appropriate dtype + - mask: a boolean array holding a mask on the data, True is missing + + To construct an BoolArray from generic array-like input, use + :func:`pandas.array` with one of the integer dtypes (see examples). + + See :ref:`integer_na` for more. + + Parameters + ---------- + values : numpy.ndarray + A 1-d integer-dtype array. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values. + copy : bool, default False + Whether to copy the `values` and `mask`. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + BoolArray + + Examples + -------- + Create an BoolArray with :func:`pandas.array`. 
def __init__(self, values, mask, copy=False):
    """Store the data and mask arrays backing this array.

    Args:
        values: numpy array with an integer or boolean dtype.
        mask: boolean numpy array; True marks a missing value.
        copy: Whether to copy ``values`` and ``mask`` before storing them.

    Raises:
        TypeError: If ``values`` is not an integer/boolean numpy array or
            ``mask`` is not a boolean numpy array.
    """
    # The dtype alternatives are grouped explicitly: ``and`` binds tighter
    # than ``or``, so without the parentheses any object with a boolean dtype
    # bypassed the ndarray check and objects without ``dtype`` raised
    # AttributeError instead of TypeError.
    if not (
        isinstance(values, np.ndarray)
        and (is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype))
    ):
        raise TypeError(
            "values should be a boolean or integer numpy array. Use "
            "the 'boolean_array' function instead"
        )
    if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
        raise TypeError(
            "mask should be boolean numpy array. Use "
            "the 'boolean_array' function instead"
        )

    if copy:
        values = values.copy()
        mask = mask.copy()

    self._data = values
    self._mask = mask
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Apply a NumPy ufunc to the underlying data and re-apply the mask.

    Args:
        ufunc: The ufunc being invoked.
        method: Name of the ufunc method being called (e.g. ``"__call__"``).
        *inputs: Operands passed to the ufunc.
        **kwargs: Keyword arguments passed to the ufunc, including ``out``.

    Returns:
        ``NotImplemented`` when an operand has an unsupported type; the
        result of the dunder-op dispatch when that handles the call;
        otherwise the reconstructed result, or a tuple of reconstructed
        results when the ufunc produces several outputs.

    Raises:
        NotImplementedError: If ``method`` is ``"reduce"``.
    """
    # For BoolArray inputs, we apply the ufunc to ._data
    # and mask the result.
    if method == "reduce":
        # Not clear how to handle missing values in reductions. Raise.
        raise NotImplementedError("The 'reduce' method is not supported.")
    out = kwargs.get("out", ())

    for x in inputs + out:
        if not isinstance(x, self._HANDLED_TYPES + (BoolArray,)):
            return NotImplemented

    # for binary ops, use our custom dunder methods
    result = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if result is not NotImplemented:
        return result

    # Combine the masks of every BoolArray operand and unwrap their data.
    mask = np.zeros(len(self), dtype=bool)
    inputs2 = []
    for x in inputs:
        if isinstance(x, BoolArray):
            mask |= x._mask
            inputs2.append(x._data)
        else:
            inputs2.append(x)

    def reconstruct(x):
        # we don't worry about scalar `x` here, since we
        # raise for reduce up above.

        if is_integer_dtype(x.dtype):
            m = mask.copy()
            return BoolArray(x, m)
        else:
            x[mask] = _BoolDtype.na_value
            return x

    result = getattr(ufunc, method)(*inputs2, **kwargs)
    if isinstance(result, tuple):
        # Multi-output ufuncs: the reconstructed tuple must be returned,
        # otherwise the caller receives None.
        return tuple(reconstruct(x) for x in result)
    return reconstruct(result)
def astype(self, dtype, copy=True):
    """Cast to a NumPy array or to an array of this class with ``dtype``.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.
    copy : bool, default True
        Accepted as part of the signature; this implementation does not
        read it.

    Returns
    -------
    array : ndarray or BoolArray
        An array of this class when ``dtype`` is a ``_BoolDtype`` instance,
        otherwise the result of ``astype_nansafe`` on the object-dtype data.
    """
    if not isinstance(dtype, _BoolDtype):
        # General path: go through the object ndarray representation.
        as_objects = self._coerce_to_ndarray()
        return astype_nansafe(as_objects, dtype, copy=None)

    # Fast path: only the underlying data needs converting, the mask is reused.
    converted = self._data.astype(dtype.numpy_dtype, copy=False)
    return type(self)(converted, mask=self._mask, copy=False)
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + + """ + + from pandas import Index, Series + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + array = value_counts.values + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.astype(object) + + # if we want nans, count the mask + if not dropna: + # TODO(extension) + # appending to an Index *always* infers + # w/o passing the dtype + array = np.append(array, [self._mask.sum()]) + index = Index( + np.concatenate([index.values, np.array([np.nan], dtype=object)]), + dtype=object, + ) + + return Series(array, index=index) + + def _values_for_argsort(self) -> np.ndarray: + """Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self._data.copy() + data[self._mask] = data.min() - 1 + return data + + @classmethod + def _create_comparison_method(cls, op): + def cmp_method(self, other): + op_name = op.__name__ + mask = None + + if isinstance(other, (ABCSeries, dtg)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + + if isinstance(other, BoolArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 0 and len(self) != len(other): + raise ValueError("Lengths must match to compare") + + other = lib.item_from_zerodim(other) + + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + result[mask] = op_name == "ne" + return result + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype("float64") + data[mask] = self._na_value + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask) + + # if we have a boolean op, don't coerce + if name in ["any", "all"]: + pass + + # if we have a preservable numeric op, + # provide coercion back to an integer type if possible + elif name in ["sum", "min", "max", "prod"] and notna(result): + int_result = int(result) + if int_result == result: + result = int_result + + return result + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + + # may need to fill infs + # and mask wraparound + if is_float_dtype(result): + mask |= (result == np.inf) | (result == -np.inf) + + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv"] + ): + result[mask] 
= np.nan + return result + + return type(self)(result, mask, copy=False) + + @classmethod + def _create_arithmetic_method(cls, op): + def integer_arithmetic_method(self, other): + + op_name = op.__name__ + mask = None + + if isinstance(other, (ABCSeries, dtg)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, BoolArray): + other, mask = other._data, other._mask + + elif getattr(other, "ndim", None) == 0: + other = other.item() + + elif is_list_like(other): + other = np.asarray(other) + if not other.ndim: + other = other.item() + elif other.ndim == 1: + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + else: + if not (is_float(other) or is_integer(other)): + raise TypeError("can only perform ops with numeric values") + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + # 1 ** np.nan is 1. So we have to unmask those. + if op_name == "pow": + mask = np.where(self == 1, False, mask) + + elif op_name == "rpow": + mask = np.where(other == 1, False, mask) + + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op.__name__}__" + return set_function_name(integer_arithmetic_method, name, cls) + + +BoolArray._add_arithmetic_ops() +BoolArray._add_comparison_ops() + +_dtype_docstring = """ +An ExtensionDtype for {dtype} data. 
+ +Attributes +---------- +None + +Methods +------- +None +""" + +# create the Dtype +BoolDtype = register_extension_dtype( + type( + "BoolDtype", + (_BoolDtype,), + { + "type": np.bool_, + "name": "Bool", + "__doc__": _dtype_docstring.format(dtype="bool"), + }, + ) +) + +_dtypes = {"bool": BoolDtype()} diff --git a/build/lib/visions/functional.py b/build/lib/visions/functional.py new file mode 100644 index 000000000..005d5e9de --- /dev/null +++ b/build/lib/visions/functional.py @@ -0,0 +1,140 @@ +from typing import Dict, List, Sequence, Tuple, Type, Union + +import pandas as pd + +from visions.types.type import VisionsBaseType +from visions.typesets.typeset import VisionsTypeset + +T = Type[VisionsBaseType] + + +def cast_to_detected(data: Sequence, typeset: VisionsTypeset) -> Sequence: + """Casts a DataFrame into a typeset by first performing column wise type detection against + a provided typeset + + Args: + data: the DataFrame to cast + typeset: the Typeset in which we cast + + Returns: + A tuple of the casted DataFrame and the types to which the columns were cast + """ + return typeset.cast_to_detected(data) + + +def cast_to_inferred(data: Sequence, typeset: VisionsTypeset) -> Sequence: + """Casts a DataFrame into a typeset by first performing column wise type inference against + a provided typeset + + Args: + data: the DataFrame to cast + typeset: the Typeset in which we cast + + Returns: + A tuple of the casted DataFrame and the types to which the columns were cast + """ + return typeset.cast_to_inferred(data) + + +def infer_type(data: Sequence, typeset: VisionsTypeset) -> Union[Dict[str, T], T]: + """Infer the current types of each column in the DataFrame given the typeset. 
+ + Args: + data: the DataFrame to infer types on + typeset: the Typeset that provides the type context + + Returns: + A dictionary with a mapping from column name to type + """ + return typeset.infer_type(data) + + +def detect_type(data: Sequence, typeset: VisionsTypeset) -> Union[Dict[str, T], T]: + """Detect the type in the base graph + + Args: + data: the DataFrame to detect types on + typeset: the Typeset that provides the type context + + Returns: + A dictionary with a mapping from column name to type + """ + return typeset.detect_type(data) + + +def compare_detect_inference_frame( + data: Sequence, typeset: VisionsTypeset +) -> List[Tuple[str, T, T]]: + """Compare the types given by inference on the base graph and the relational graph + + Args: + data: the sequence to detect types on + typeset: the Typeset that provides the type context + + Examples: + >>> for column, type_before, type_after in compare_detect_inference_frame(data, typeset): + >>> print(f"{column} was {type_before} is {type_after}") + + See Also: + :doc:`type_inference_report_frame `: + Formatted report of the output of this function + """ + comparisons = [] + detected_types = detect_type(data, typeset) + inferred_types = infer_type(data, typeset) + + assert isinstance(detected_types, dict) and isinstance( + inferred_types, dict + ) # Placate the MyPy Gods + + for key in detected_types.keys() & inferred_types.keys(): + comparisons.append((key, detected_types[key], inferred_types[key])) + return comparisons + + +# TODO: make independent of pandas +def type_inference_report_frame(df: pd.DataFrame, typeset: VisionsTypeset) -> str: + """Return formatted report of the output of `compare_detect_inference_frame`. 
+ + Args: + df: the DataFrame to detect types on + typeset: the Typeset that provides the type context + + Returns: + Text-based comparative type inference report + + Examples: + >>> import pandas as pd + >>> from visions.functional import type_inference_report_frame + >>> from visions.typesets import StandardSet + >>> + >>> typeset = StandardSet() + >>> df = pd.read_csv('dataset.csv') + >>> + >>> report = type_inference_report_frame(df, typeset) + >>> print(report) + """ + padding = 5 + max_column_length = max(len(column) for column in df.columns) + padding + max_type_length = 30 + + report = "" + change_count = 0 + for column, type_before, type_after in compare_detect_inference_frame(df, typeset): + changed = type_before != type_after + if changed: + fill = "!=" + change_count += 1 + else: + fill = "==" + report += ( + f"{column: <{max_column_length}} {type_before: <{max_type_length}} " + f"{fill} " + f"{type_after: <{max_type_length}} \n" + ) + report += ( + "In total {change_count} out of {type_count} types were changed.\n".format( + change_count=change_count, type_count=len(df.columns) + ) + ) + return report diff --git a/build/lib/visions/py.typed b/build/lib/visions/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/relations/__init__.py b/build/lib/visions/relations/__init__.py new file mode 100644 index 000000000..b48c2456d --- /dev/null +++ b/build/lib/visions/relations/__init__.py @@ -0,0 +1,13 @@ +"""This module contains (predefined) relations.""" + +from visions.relations.relations import ( + IdentityRelation, + InferenceRelation, + TypeRelation, +) + +__all__ = [ + "IdentityRelation", + "InferenceRelation", + "TypeRelation", +] diff --git a/build/lib/visions/relations/relations.py b/build/lib/visions/relations/relations.py new file mode 100644 index 000000000..e74d5fe40 --- /dev/null +++ b/build/lib/visions/relations/relations.py @@ -0,0 +1,88 @@ +from typing import Any, Callable, Optional, TypeVar + +import attr 
+from multimethod import multimethod + +T = TypeVar("T") + + +def func_repr(func: Callable) -> str: + return func.__name__ if hasattr(func, "__name__") else "lambda" + + +def identity_transform(series: Any, state: dict = dict()) -> Any: + return series + + +def default_relation(series: Any, state: dict = dict()) -> bool: + raise NotImplementedError + + +@attr.s(frozen=True) +class TypeRelation: + """Relationship encoder between implementations of :class:`visions.types.type.VisionsBaseType` + + Defines a one to one relationship between two :class:`visions.types.type.VisionsBaseType` implementations, + A and B, with respect to an underlying data series. In order to define a relationship we need + two methods: + + - **is_relationship**, determines whether a series of type B can be alternatively represented as type A. + - **transform**, provides a mechanism to convert the series from B -> A. + + For example, the series `pd.Series([1.0, 2.0, 3.0])` is encoded as a sequence of + floats but in reality they are all integers. 
+ + Examples: + >>> from visions.types import Integer, Float + >>> x = pd.Series([1.0, 2.0, 3.0]) + >>> state = dict() + >>> relation = TypeRelation(Integer, Float) + >>> relation.is_relation(x, state) + True + + >>> relation.transform(x, state) + pd.Series([1, 2, 3]) + """ + + related_type = attr.ib() + inferential: bool = attr.ib() + transformer: Callable[[T, dict], T] = attr.ib( + converter=multimethod, repr=func_repr # type: ignore + ) + relationship: Callable[[Any, dict], bool] = attr.ib( + default=default_relation, converter=multimethod, repr=func_repr # type: ignore + ) + type = attr.ib(default=None) + + def is_relation(self, series: Any, state: Optional[dict] = None) -> bool: + if state is None: + state = {} + return self.relationship(series, state) + + def transform(self, series: T, state: Optional[dict] = None) -> T: + if state is None: + state = {} + return self.transformer(series, state) + + def __str__(self): + return f"{self.related_type}->{self.type}" + + +@attr.s(frozen=True) +class IdentityRelation(TypeRelation): + relationship: Callable[[T, dict], bool] = attr.ib(repr=func_repr, default=None) + transformer: Callable[[T, dict], T] = attr.ib( + default=identity_transform, repr=func_repr + ) + inferential: bool = attr.ib(default=False) + + +@attr.s(frozen=True) +class InferenceRelation(TypeRelation): + relationship: Callable[[T, dict], bool] = attr.ib( + repr=func_repr, default=default_relation + ) + transformer: Callable[[T, dict], T] = attr.ib( + repr=func_repr, default=identity_transform + ) + inferential: bool = attr.ib(default=True) diff --git a/build/lib/visions/test/__init__.py b/build/lib/visions/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/test/data/__init__.py b/build/lib/visions/test/data/__init__.py new file mode 100644 index 000000000..cc9602125 --- /dev/null +++ b/build/lib/visions/test/data/__init__.py @@ -0,0 +1 @@ +"""Small files used for test sequences""" diff --git 
a/build/lib/visions/test/data/file.html b/build/lib/visions/test/data/file.html new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/test/data/img.jpeg b/build/lib/visions/test/data/img.jpeg new file mode 100644 index 000000000..f835344f1 Binary files /dev/null and b/build/lib/visions/test/data/img.jpeg differ diff --git a/build/lib/visions/test/data/img.jpg b/build/lib/visions/test/data/img.jpg new file mode 100644 index 000000000..f835344f1 Binary files /dev/null and b/build/lib/visions/test/data/img.jpg differ diff --git a/build/lib/visions/test/data/img.png b/build/lib/visions/test/data/img.png new file mode 100644 index 000000000..c5916f289 Binary files /dev/null and b/build/lib/visions/test/data/img.png differ diff --git a/build/lib/visions/test/series.py b/build/lib/visions/test/series.py new file mode 100644 index 000000000..fe1557775 --- /dev/null +++ b/build/lib/visions/test/series.py @@ -0,0 +1,21 @@ +from typing import Dict + +import pandas as pd + + +def get_series() -> Dict[str, pd.Series]: + from visions.backends.numpy.sequences import get_sequences as get_numpy_sequences + from visions.backends.pandas.sequences import get_sequences as get_pandas_sequences + from visions.backends.python.sequences import get_sequences as get_builtin_sequences + + sequences = get_builtin_sequences() + sequences.update(get_numpy_sequences()) + + test_series = {} + for name, sequence in sequences.items(): + test_series[name] = pd.Series(sequence) + + test_series.update(get_pandas_sequences()) + assert all(isinstance(v, pd.Series) for v in test_series.values()) + + return test_series diff --git a/build/lib/visions/test/series_geometry.py b/build/lib/visions/test/series_geometry.py new file mode 100644 index 000000000..6bb7c0418 --- /dev/null +++ b/build/lib/visions/test/series_geometry.py @@ -0,0 +1,29 @@ +from typing import Dict + +import pandas as pd + + +def get_geometry_series() -> Dict[str, pd.Series]: + from shapely import wkt + + series = 
{ + "geometry_string_series": pd.Series( + ["POINT (-92 42)", "POINT (-92 42.1)", "POINT (-92 42.2)"], + ), + "geometry_series": pd.Series( + [ + wkt.loads("POINT (-92 42)"), + wkt.loads("POINT (-92 42.1)"), + wkt.loads("POINT (-92 42.2)"), + ], + ), + "geometry_series_missing": pd.Series( + [ + wkt.loads("POINT (-92 42)"), + wkt.loads("POINT (-92 42.1)"), + wkt.loads("POINT (-92 42.2)"), + None, + ], + ), + } + return series diff --git a/build/lib/visions/test/series_sparse.py b/build/lib/visions/test/series_sparse.py new file mode 100644 index 000000000..83ada9cc4 --- /dev/null +++ b/build/lib/visions/test/series_sparse.py @@ -0,0 +1,56 @@ +from typing import Dict + +import numpy as np +import pandas as pd + +from visions.backends.pandas.test_utils import pandas_version + +not_pandas_1_0_5 = not ( + (pandas_version[0] == 1) and (pandas_version[1] == 0) and (pandas_version[2] == 5) +) + + +def get_sparse_series() -> Dict[str, pd.Series]: + test_series = { + "int_sparse": pd.Series([-1, 0, 1, 2, 3], dtype=pd.SparseDtype(np.int32, 0)), + "float_sparse": pd.Series( + [np.nan, 0.2, 1, 2, 3], + dtype=pd.SparseDtype(np.float64, np.nan), + ), + "complex_sparse": pd.Series( + [np.nan, complex(0, 1), complex(1, -1), complex(2, 4), complex(3, -12)], + dtype=pd.SparseDtype(np.complex128, np.nan), + ), + "bool_sparse": pd.Series( + [True, False, False], + dtype=pd.SparseDtype(np.bool_, False), + ), + "str_obj_sparse": pd.Series( + pd.arrays.SparseArray([None, None, "gold", "black", "silver"]), + ), + # Pending https://github.com/pandas-dev/pandas/issues/35762 + # pd.Series([None, 0, 1, 2, 3, 4], name="datetime_sparse", dtype=pd.SparseDtype(np.datetime64)), + # Pandas dtypes + "pd_int64_sparse": pd.Series( + [0, 1, 2, 3, None], + dtype=pd.SparseDtype("int", np.nan), + ), + # Pending https://github.com/pandas-dev/pandas/issues/35793 + # pd.Series( + # ["a", "b", "c", None], + # name="pd_categorical_sparse", + # dtype=pd.SparseDtype(pd.CategoricalDtype(['a', 'b', 'c', 'd'])) + # 
) + } + + if pandas_version[0] >= 1 and not_pandas_1_0_5: + test_series["pd_string_sparse"] = pd.Series( + ["Patty", "Valentine", "Upper", "", "", ""], + dtype=pd.SparseDtype(pd.StringDtype(), ""), + ) + test_series["pd_bool_sparse"] = pd.Series( + [True, False, False, None], + dtype=pd.SparseDtype("bool", pd.NA), + ) + + return test_series diff --git a/build/lib/visions/test/utils.py b/build/lib/visions/test/utils.py new file mode 100644 index 000000000..8d30a41a9 --- /dev/null +++ b/build/lib/visions/test/utils.py @@ -0,0 +1,246 @@ +from typing import Any, Dict, Iterable, Optional, Sequence, Set, Tuple, Type + +import networkx as nx +import pandas as pd +import pytest + +from visions import VisionsBaseType, VisionsTypeset + +T = Type[VisionsBaseType] + + +def is_iter(v: Any) -> bool: + return isinstance(v, Iterable) and not isinstance(v, (str, bytes)) + + +def sequences_equal(s1: Sequence, s2: Sequence) -> bool: + for v1, v2 in zip(s1, s2): + if is_iter(v1) and is_iter(v2): + if not sequences_equal(v1, v2): + return False + elif not (pd.isna(v1) and pd.isna(v2)) and not v1 == v2: + return False + + return True + + +def all_series_included( + series_list: Dict[str, Sequence], series_map: Dict[T, Set[str]] +): + """Check that all names are indeed used""" + used_names = {name for names in series_map.values() for name in names} + names = set(series_list.keys()) + + if not names == used_names: + unused = names - used_names + not_provided = used_names - names + # TODO: warning? 
+ if len(unused) > 0: + raise ValueError(f"{len(unused)} series not included in tests {unused}") + if len(not_provided) > 0: + raise ValueError( + f"{len(not_provided)} series are included, not not provided {not_provided}" + ) + + +def get_contains_cases( + _test_suite: Dict[str, Sequence], + _series_map: Dict[T, Set[str]], + typeset: VisionsTypeset, +): + """Parametrize contains tests + + Args: + _test_suite: mapping from sequence identifiers to sequences + _series_map: mapping from type to a set of sequence identifiers + typeset: A VisionsTypeset + + Returns: + the args for the generated tests + """ + + # Include children's series in parent + reversed_topological_edge_sort = list( + reversed(list(nx.topological_sort(nx.line_graph(typeset.base_graph)))) + ) + for parent, child in reversed_topological_edge_sort: + _series_map[parent] |= _series_map[child] + + all_series_included(_test_suite, _series_map) + + argsvalues = [] + for name, item in _test_suite.items(): + for type, series_list in _series_map.items(): + args: Dict[str, Any] = {"id": f"{name} x {type}"} + + member = name in series_list + argsvalues.append(pytest.param(name, item, type, member, **args)) + + return { + "argnames": ["name", "series", "contains_type", "member"], + "argvalues": argsvalues, + } + + +def contains(name: str, series: Sequence, type: T, member: bool) -> Tuple[bool, str]: + return ( + member == (series in type), + f"{name} in {type}; expected {member}, got {series in type}", + ) + + +def get_inference_cases( + _test_suite: Dict[str, Sequence], + inferred_series_type_map: Dict[str, T], + typeset: VisionsTypeset, +) -> Dict[str, Any]: + argsvalues = [] + for name, series in _test_suite.items(): + if name not in inferred_series_type_map: + raise ValueError( + f"{name} has no defined inference type, please add one to the test case mapping" + ) + + expected_type = inferred_series_type_map[name] + for test_type in typeset.types: + expected = test_type == expected_type + args: Dict[str, 
Any] = {"id": f"{name} x {test_type} expected {expected}"} + difference = test_type != expected_type + argsvalues.append( + pytest.param(name, series, test_type, typeset, difference, **args) + ) + return { + "argnames": "name,series,inference_type,typeset,difference", + "argvalues": argsvalues, + } + + +def infers( + name: str, + series: Sequence, + expected_type: T, + typeset: VisionsTypeset, + difference: bool, +) -> Tuple[bool, str]: + from visions.typesets.typeset import get_type_from_path + + _, paths, _ = typeset.infer(series) + inferred_type = get_type_from_path(paths) + + # inferred_type = typeset.infer_type(series) + return ( + (inferred_type == expected_type) != difference, + f"inference of {name} expected {expected_type} to be {not difference} (typeset={typeset}). Path: {paths}", + ) + # return series in inferred_type, f"series should be member of inferred type" + + +def all_relations_tested(series_map, typeset): + # Convert data structure for mapping + series_map_lookup = {} + for map_to_type, map_from_type, items in series_map: + try: + series_map_lookup[map_to_type][map_from_type] = items + except KeyError: + series_map_lookup[map_to_type] = {map_from_type: items} + + missing_relations = set() + for node in typeset.types: + for relation in node.relations: + from_type, to_type = relation.related_type, relation.type + if relation.inferential and ( + to_type not in series_map_lookup + or from_type not in series_map_lookup[to_type] + or len(series_map_lookup[to_type][from_type]) == 0 + ): + missing_relations.add(str(relation)) + + if len(missing_relations) > 0: + raise ValueError( + f"Not all inferential relations are tested {missing_relations}" + ) + + +def get_convert_cases(_test_suite, _series_map, typeset): + all_relations_tested(_series_map, typeset) + + argsvalues = [] + for name, item in _test_suite.items(): + for source_type, relation_type, series_list in _series_map: + for namex in series_list: + if namex not in _test_suite.keys(): + raise 
ValueError( + f"{namex} specified in convert_map, but not in provided sequences" + ) + + if item in relation_type: + args: Dict[str, Any] = { + "id": f"{name}: {relation_type} -> {source_type}" + } + member = name in series_list + argsvalues.append( + pytest.param(name, source_type, relation_type, item, member, **args) + ) + + return dict( + argnames=["name", "source_type", "relation_type", "series", "member"], + argvalues=argsvalues, + ) + + +def convert( + name: str, source_type: T, relation_type: T, series: Sequence, member: bool +) -> Tuple[bool, str]: + relation = source_type.relations.get(relation_type, None) + is_relation = False if relation is None else relation.is_relation(series, {}) + + if not member: + return ( + (not is_relation), + f"{source_type}, {relation}, {member}, {name}, {series}", + ) + else: + # Note that the transformed series is not exactly the cast series + transformed_series = list(relation.transform(series, {})) + + return ( + is_relation, + f"Relationship {relation} for {series} tested false (but shouldn't have). 
" + f"Transform result would have been {transformed_series}", + ) + + +def get_cast_cases(_test_suite: Dict[str, Sequence], _results: Dict) -> Dict: + argsvalues = [] + for name, item in _test_suite.items(): + changed = name in _results + value = _results.get(name, "") + args: Dict[str, Any] = {"id": f"{name}: {changed}"} + argsvalues.append(pytest.param(name, item, value, **args)) + + return dict( + argnames=["name", "series", "expected"], + argvalues=argsvalues, + ) + + +def cast( + name: str, + series: Sequence, + typeset: VisionsTypeset, + expected: Optional[pd.Series] = None, +) -> Tuple[bool, str]: + result = typeset.cast_to_inferred(series) + # TODO: if error also print Path + if expected is None: + v = sequences_equal(result, series) + m = f"Series {name} cast expected {series} (no casting) got {result}" + + if v: + v = id(series) == id(result) + m = f"Series {name} memory addresses are not equal, while return value was" + else: + v = sequences_equal(result, expected) + m = f"Series {name} cast expected {expected} got {result}" + + return v, m diff --git a/build/lib/visions/types/__init__.py b/build/lib/visions/types/__init__.py new file mode 100644 index 000000000..8ab058fa4 --- /dev/null +++ b/build/lib/visions/types/__init__.py @@ -0,0 +1,53 @@ +from visions.types.boolean import Boolean +from visions.types.categorical import Categorical +from visions.types.complex import Complex +from visions.types.count import Count +from visions.types.date import Date +from visions.types.date_time import DateTime +from visions.types.email_address import EmailAddress +from visions.types.file import File +from visions.types.float import Float +from visions.types.generic import Generic +from visions.types.geometry import Geometry +from visions.types.image import Image +from visions.types.integer import Integer +from visions.types.ip_address import IPAddress +from visions.types.numeric import Numeric +from visions.types.object import Object +from visions.types.ordinal 
import Ordinal +from visions.types.path import Path +from visions.types.sparse import Sparse +from visions.types.string import String +from visions.types.time import Time +from visions.types.time_delta import TimeDelta +from visions.types.type import VisionsBaseType +from visions.types.url import URL +from visions.types.uuid import UUID + +__all__ = [ + "VisionsBaseType", + "Generic", + "String", + "Boolean", + "Categorical", + "Complex", + "Count", + "Date", + "DateTime", + "File", + "Float", + "Geometry", + "Image", + "Integer", + "IPAddress", + "Object", + "Ordinal", + "Path", + "TimeDelta", + "UUID", + "URL", + "Time", + "EmailAddress", + "Sparse", + "Numeric", +] diff --git a/build/lib/visions/types/boolean.py b/build/lib/visions/types/boolean.py new file mode 100644 index 000000000..fc5a2dbb2 --- /dev/null +++ b/build/lib/visions/types/boolean.py @@ -0,0 +1,38 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, InferenceRelation, TypeRelation +from visions.types.generic import Generic +from visions.types.object import Object +from visions.types.string import String +from visions.types.type import VisionsBaseType + + +class Boolean(VisionsBaseType): + """**Boolean** implementation of :class:`visions.types.type.VisionsBaseType`. 
+ + Examples: + >>> import visions + >>> x = [True, False, False, True] + >>> x in visions.Boolean + True + + >>> x = [True, False, None] + >>> x in visions.Boolean + True + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + relations = [ + IdentityRelation(Generic), + InferenceRelation(String), + InferenceRelation(Object), + ] + return relations + + @staticmethod + @multimethod + def contains_op(item: Any, state: dict) -> bool: + pass diff --git a/build/lib/visions/types/categorical.py b/build/lib/visions/types/categorical.py new file mode 100644 index 000000000..6826ef513 --- /dev/null +++ b/build/lib/visions/types/categorical.py @@ -0,0 +1,29 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, TypeRelation +from visions.types.generic import Generic +from visions.types.type import VisionsBaseType + + +class Categorical(VisionsBaseType): + """**Categorical** implementation of :class:`visions.types.type.VisionsBaseType`. 
+ + Examples: + >>> import pandas as pd + >>> import visions + >>> x = pd.Series([True, False, 1], dtype='category') + >>> x in visions.Categorical + True + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + relations = [IdentityRelation(Generic)] + return relations + + @staticmethod + @multimethod + def contains_op(item: Any, state: dict) -> bool: + pass diff --git a/build/lib/visions/types/complex.py b/build/lib/visions/types/complex.py new file mode 100644 index 000000000..1da4f77c9 --- /dev/null +++ b/build/lib/visions/types/complex.py @@ -0,0 +1,31 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, InferenceRelation, TypeRelation +from visions.types.generic import Generic +from visions.types.string import String +from visions.types.type import VisionsBaseType + + +class Complex(VisionsBaseType): + """**Complex** implementation of :class:`visions.types.type.VisionsBaseType`. + + Examples: + >>> x = [complex(0, 0), complex(1, 2), complex(3, -1)] + >>> x in visions.Complex + True + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + relations = [ + IdentityRelation(Generic), + InferenceRelation(String), + ] + return relations + + @staticmethod + @multimethod + def contains_op(item: Any, state: dict) -> bool: + pass diff --git a/build/lib/visions/types/count.py b/build/lib/visions/types/count.py new file mode 100644 index 000000000..33a9a79fe --- /dev/null +++ b/build/lib/visions/types/count.py @@ -0,0 +1,27 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, TypeRelation +from visions.types.integer import Integer +from visions.types.type import VisionsBaseType + + +class Count(VisionsBaseType): + """**Count** (positive integer) implementation of :class:`visions.types.type.VisionsBaseType`. 
+ + Examples: + >>> x = [1, 4, 10, 20] + >>> x in visions.Count + True + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + relations = [IdentityRelation(Integer)] + return relations + + @staticmethod + @multimethod + def contains_op(item: Any, state: dict) -> bool: + pass diff --git a/build/lib/visions/types/date.py b/build/lib/visions/types/date.py new file mode 100644 index 000000000..00922bb50 --- /dev/null +++ b/build/lib/visions/types/date.py @@ -0,0 +1,34 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, InferenceRelation, TypeRelation +from visions.types.date_time import DateTime +from visions.types.object import Object +from visions.types.type import VisionsBaseType + + +class Date(VisionsBaseType): + """**Date** implementation of :class:`visions.types.type.VisionsBaseType`. + All values are should be datetime.date or missing + + Examples: + >>> import datetime + >>> import visions + >>> x = [datetime.date(2017, 3, 5), datetime.date(2019, 12, 4)] + >>> x in visions.Date + True + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + relations = [ + IdentityRelation(Object), + InferenceRelation(DateTime), + ] + return relations + + @staticmethod + @multimethod + def contains_op(item: Any, state: dict) -> bool: + pass diff --git a/build/lib/visions/types/date_time.py b/build/lib/visions/types/date_time.py new file mode 100644 index 000000000..1698379a2 --- /dev/null +++ b/build/lib/visions/types/date_time.py @@ -0,0 +1,33 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, InferenceRelation, TypeRelation +from visions.types.generic import Generic +from visions.types.string import String +from visions.types.type import VisionsBaseType + + +class DateTime(VisionsBaseType): + """**Datetime** implementation of :class:`visions.types.type.VisionsBaseType`. 
def _to_email(s) -> FQDA:
    """Coerce ``s`` into an :class:`FQDA`.

    An existing ``FQDA`` is returned unchanged. A string is split on the
    first ``"@"`` and the resulting parts are passed positionally to
    ``FQDA``. Anything else is rejected.

    Raises:
        TypeError: if ``s`` is neither an ``FQDA`` nor a ``str``.
    """
    if isinstance(s, FQDA):
        return s

    if not isinstance(s, str):
        raise TypeError("Only strings supported")

    # At most two parts: everything after the first "@" stays in one piece.
    parts = s.split("@", maxsplit=1)
    return FQDA(*parts)
+ + Notes: + The email address should be a **fully qualified domain address** (FQDA) + FQDA = local part + @ + fully qualified domain name (FQDN) + This type + + Examples: + >>> import visions + >>> x = [FQDA('example','gmail.com'), FQDA.from_str('example@protonmail.com')] + >>> x in visions.EmailAddress + True + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + relations = [ + IdentityRelation(Object), + InferenceRelation(String), + ] + return relations + + @staticmethod + @multimethod + def contains_op(item: Any, state: dict) -> bool: + pass diff --git a/build/lib/visions/types/file.py b/build/lib/visions/types/file.py new file mode 100644 index 000000000..f8f8388c4 --- /dev/null +++ b/build/lib/visions/types/file.py @@ -0,0 +1,29 @@ +from typing import Any, Sequence + +from multimethod import multimethod + +from visions.relations import IdentityRelation, TypeRelation +from visions.types.path import Path +from visions.types.type import VisionsBaseType + + +class File(VisionsBaseType): + """**File** implementation of :class:`visions.types.type.VisionsBaseType`. + (i.e. 
class Float(VisionsBaseType):
    """**Float** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> import visions
        >>> x = [1.0, 2.5, 5.0]
        >>> x in visions.Float
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        # Order is significant: identity to Generic, then the two inference
        # relations (String, Complex) in their original sequence.
        return [
            IdentityRelation(Generic),
            InferenceRelation(String),
            InferenceRelation(Complex),
        ]

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class Geometry(VisionsBaseType):
    """**Geometry** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> import pandas as pd
        >>> import visions
        >>> from shapely import wkt
        >>> x = pd.Series([wkt.loads('POINT (-92 42)'), wkt.loads('POINT (-92 42.1)'), wkt.loads('POINT (-92 42.2)')])
        >>> x in visions.Geometry
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Return Geometry's relations: identity to Object, inference from String."""
        relations = [
            IdentityRelation(Object),
            InferenceRelation(String),
        ]
        return relations

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class Integer(VisionsBaseType):
    """**Integer** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> import visions
        >>> x = [-1, 1, 2, 3]
        >>> x in visions.Integer
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Return Integer's relations: identity to Generic, inference from Float."""
        relations = [
            IdentityRelation(Generic),
            InferenceRelation(Float),
        ]
        return relations

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class Object(VisionsBaseType):
    """**Object** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> import numpy as np
        >>> import visions
        >>> x = ['a', 1, np.nan]
        >>> x in visions.Object
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Return Object's single relation: identity to Generic."""
        relations = [IdentityRelation(Generic)]
        return relations

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class Sparse(VisionsBaseType):
    """**Sparse** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> import pandas as pd
        >>> import visions
        >>> x = pd.Series(pd.arrays.SparseArray([complex(0, 0), complex(1, 2), complex(3, -1)]))
        >>> x in visions.Sparse
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Return Sparse's single relation: identity to Generic."""
        relations = [IdentityRelation(Generic)]
        return relations

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class TimeDelta(VisionsBaseType):
    """**TimeDelta** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> from datetime import timedelta
        >>> import visions
        >>> x = [timedelta(hours=1), timedelta(hours=3)]
        >>> x in visions.TimeDelta
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Return TimeDelta's single relation: identity to Generic."""
        relations = [IdentityRelation(Generic)]
        return relations

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class VisionsBaseTypeMeta(ABCMeta):
    """Metaclass that gives visions type *classes* their class-level behaviour.

    It provides the ``in`` operator (``data in SomeType``), the lazily built
    ``relations`` property, ``+`` for combining types into a typeset, and a
    short ``str``/``repr`` consisting of the class name.
    """

    # Cache for the `relations` property; stays None until first access.
    # NOTE(review): `cls._relations` is resolved through the class MRO, so a
    # subclass of a type whose relations were already built may see the
    # parent's cached value - confirm this is intended.
    _relations: Optional[RelationsIterManager] = None

    def __contains__(cls, sequence: Sequence) -> bool:
        """Implement ``sequence in cls`` by calling ``cls.contains_op`` with a fresh state dict."""
        return cls.contains_op(sequence, dict())

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        """Placeholder; always raises ``NotImplementedError``."""
        raise NotImplementedError

    @staticmethod
    def contains_op(item: Any, state: dict) -> bool:
        """Placeholder; always raises ``NotImplementedError``."""
        raise NotImplementedError

    @property
    def relations(cls) -> RelationsIterManager:
        """Return the relations of ``cls`` bound to ``cls``, building them on first access.

        Each relation from ``cls.get_relations()`` is copied with
        ``attr.evolve`` so that its ``type`` field is ``cls``:

        * an ``IdentityRelation`` without a ``relationship`` gets
          ``cls.contains_op`` as its relationship;
        * any other relation has its ``relationship`` (when not None) and its
          ``transformer`` wrapped in ``multimethod``.
        """
        # Imported locally rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from visions.relations.relations import IdentityRelation

        if cls._relations is None:
            cls._relations = RelationsIterManager(
                [
                    (
                        attr.evolve(
                            r,
                            type=cls,
                            relationship=(
                                cls.contains_op
                                if r.relationship is None
                                else r.relationship
                            ),
                        )
                        if isinstance(r, IdentityRelation)
                        else attr.evolve(
                            r,
                            type=cls,
                            relationship=(
                                multimethod(r.relationship)
                                if r.relationship is not None
                                else None
                            ),
                            transformer=multimethod(r.transformer),
                        )
                    )
                    for r in cls.get_relations()
                ]
            )
        return cls._relations

    def __add__(cls, other):
        """Combine ``cls`` and ``other`` into a ``VisionsTypeset``.

        ``Generic`` is added to the set when neither operand is a subclass
        of ``Generic``.
        """
        # Local imports, as in `relations` above.
        from visions.types import Generic
        from visions.typesets import VisionsTypeset

        if not any(issubclass(x, Generic) for x in [cls, other]):
            return VisionsTypeset({Generic, cls, other})
        return VisionsTypeset({cls, other})

    def __str__(cls) -> str:
        """Return the bare class name."""
        return str(cls.__name__)

    def __repr__(cls) -> str:
        """Return the same text as ``str(cls)``."""
        return str(cls)
class URL(VisionsBaseType):
    """**Url** implementation of :class:`visions.types.type.VisionsBaseType`.

    Examples:
        >>> from urllib.parse import urlparse
        >>> urls = ['http://www.cwi.nl:80/%7Eguido/Python.html', 'https://github.com/pandas-profiling/pandas-profiling']
        >>> x = [urlparse(url) for url in urls]
        >>> x in visions.URL
        True
    """

    @staticmethod
    def get_relations() -> Sequence[TypeRelation]:
        # Order is significant: identity to Object first, then the
        # inference relation from String.
        return [IdentityRelation(Object), InferenceRelation(String)]

    @staticmethod
    @multimethod
    def contains_op(item: Any, state: dict) -> bool:
        # Fallback dispatch target: no membership logic is defined here.
        pass
class CompleteSet(VisionsTypeset):
    """Complete visions typeset with all supported types

    Includes support for the following types:

    - Float
    - Integer
    - Boolean
    - Object
    - String
    - Complex
    - Categorical
    - Ordinal
    - Count
    - DateTime
    - Date
    - Time
    - TimeDelta
    - Geometry
    - Path
    - File
    - Image
    - URL
    - IPAddress
    - EmailAddress
    - UUID

    ``Generic`` is also part of the set passed to the base class.

    Raises:
        ImportError: if ``imagehash``, ``PIL`` or ``shapely`` cannot be
            imported.
    """

    def __init__(self) -> None:
        types = {
            Generic,
            Boolean,
            Float,
            Object,
            Complex,
            Categorical,
            Ordinal,
            DateTime,
            TimeDelta,
            Integer,
            Count,
            String,
            Geometry,
            URL,
            Path,
            Date,
            Time,
            File,
            Image,
            IPAddress,
            EmailAddress,
            UUID,
        }
        super().__init__(types)

        # The imports below are availability probes only; the imported names
        # are not used. Note the probe runs after the base-class initialiser.
        try:
            import imagehash
            import PIL
            import shapely
        except ImportError as e:
            raise ImportError(
                f"This typeset requires dependencies that are currently not installed ({e}). "
                "You can follow the installation instructions to resolve this issue: "
                "https://dylan-profiler.github.io/visions/visions/getting_started/installation.html"
            )
def build_graph(nodes: Set[Type[VisionsBaseType]]) -> Tuple[nx.DiGraph, nx.DiGraph]:
    """Constructs a traversable relation graph between visions types

    Builds a type relation graph from a collection of :class:`visions.types.type.VisionsBaseType` where
    each node corresponds to a type and each edge is a relation defined on the type.

    Args:
        nodes: A set of :class:`visions.types.type.VisionsBaseType`

    Returns:
        A tuple ``(relation_graph, base_graph)``: the directed graph of all
        relations between the provided nodes, and the edge-subgraph of it
        that keeps only the non-inferential relations.

    Warns:
        A warning is issued (and no edge added) for every relation whose
        ``related_type`` is not among ``nodes``.
    """

    # Edge style by `relation.inferential`: inferential edges are dashed.
    style_map = {True: "dashed", False: "solid"}
    relation_graph = nx.DiGraph()
    relation_graph.add_nodes_from(nodes)

    # Collected so the base graph can be cut out of the full graph below.
    noninferential_edges = []

    for node in nodes:
        for relation in node.relations:
            if relation.related_type not in nodes:
                warnings.warn(
                    f"Provided relations included mapping from {relation.related_type} to {relation.type} "
                    f"but {relation.related_type} was not included in the provided list of nodes"
                )
            else:
                # Edges point from the related type towards the type that
                # declares the relation.
                relation_graph.add_edge(
                    relation.related_type,
                    relation.type,
                    relationship=relation,
                    style=style_map[relation.inferential],
                )

                if not relation.inferential:
                    noninferential_edges.append((relation.related_type, relation.type))

    # Validate the full graph before deriving the base graph from it.
    check_graph_constraints(relation_graph)

    base_graph = relation_graph.edge_subgraph(noninferential_edges)
    return relation_graph, base_graph
def traverse_graph_with_sampled_series(
    base_type: T,
    series: pd.Series,
    graph: nx.DiGraph,
    sample_size: int = 10,
    state: Optional[dict] = None,
) -> Tuple[Sequence, List[T], dict]:
    """Depth First Search traversal with sampling. There should be at most one successor that contains the series.

    For short series (fewer than 1000 rows, or fewer rows than
    ``sample_size``) this is a plain call to ``traverse_graph_with_series``.
    Otherwise a random sample is traversed first to obtain a candidate path,
    and that path is then replayed on the full series, stopping at the first
    relation that does not hold for the full data.

    Args:
        base_type: Entry-point for graph to start traversal
        series: the Series to check
        graph: the Graph to traverse
        sample_size: number of items used in heuristic traversal
        state: traversal state; a new dict is created when omitted

    Returns:
        A tuple of the (possibly transformed) series, the path of types that
        were confirmed on the full series, and the traversal state.
    """
    # A `dict()` default would be created once and shared by every call,
    # leaking state between unrelated traversals.
    if state is None:
        state = {}

    if (series.shape[0] < 1000) or (sample_size > series.shape[0]):
        return traverse_graph_with_series(base_type, series, graph, state=state)

    series_sample = series.sample(sample_size)
    _, path, _ = traverse_graph_with_series(
        base_type, series_sample, graph, state=state
    )
    if len(path) == 1:
        return series, path, state

    # Cast the full series along the sampled path. Only types whose relation
    # actually held on the full data are recorded, so the last entry of the
    # returned path always matches the returned series.
    confirmed_path = [path[0]]
    for to_type in path[1:]:
        relation = graph[confirmed_path[-1]][to_type]["relationship"]
        if not relation.is_relation(series, state):
            break
        series = relation.transform(series, state)
        confirmed_path.append(to_type)

    return series, confirmed_path, state
+ + Attributes: + types: The collection of Visions Types derived from :class:`visions.types.type.VisionsBaseType` + base_graph: The graph of relations composed exclusively of :class:`visions.relations.relations.IdentityRelation` + relation_graph: The full relation graph including both :class:`visions.relations.relations.IdentityRelation` + and :class:`visions.relations.relations.InferenceRelation` + """ + + def __init__(self, types: Set[Type[VisionsBaseType]]) -> None: + """ + Args: + types: a set of types + """ + self._root_node: Optional[T] = None + + if not isinstance(types, Iterable): + raise ValueError("types should be Sequence") + + self.relation_graph, self.base_graph = build_graph(set(types)) + + if not issubclass(self.root_node, Generic): + raise ValueError("`root_node` should be a subclass of Generic") + + self.types = set(self.relation_graph.nodes) + + @property + def root_node(self) -> T: + """Returns a cached copy of the relation_graphs root node + + Args: + + Returns: + A cached copy of the relation_graphs root node. + """ + if self._root_node is None: + self._root_node = next(nx.topological_sort(self.relation_graph)) + return self._root_node + + def detect(self, data: Any) -> Tuple[Sequence, Any, dict]: + """The results found after only considering IdentityRelations. + + Notes: + This is an advanced feature, consider using `detect_type` in case the type is what is needed. + + Args: + data: a DataFrame or Series to determine types over + + Returns: + A tuple of the coerced sequence, visited nodes and state + """ + return traverse_graph(data, self.root_node, self.base_graph) + + def detect_type(self, data: Sequence) -> Union[T, Dict[str, T]]: + """The inferred type found only considering IdentityRelations. 
+ + Args: + data: a DataFrame or Series to determine types over + + Returns: + A dictionary of {name: type} pairs in the case of DataFrame input or a type + """ + _, paths, _ = self.detect(data) + return get_type_from_path(paths) + + def infer(self, data: Sequence) -> Tuple[Sequence, Any, dict]: + """The results found after considering all relations. + + Notes: + This is an advanced feature, consider using `infer_type` in case the type is what is needed. + + Args: + data: a DataFrame or Series to determine types over + + Returns: + A tuple of the coerced sequence, visited nodes and state + """ + return traverse_graph(data, self.root_node, self.relation_graph) + + def infer_type(self, data: Sequence) -> Union[T, Dict[str, T]]: + """The inferred type found using all type relations. + + Args: + data: a DataFrame or Series to determine types over + + Returns: + A dictionary of {name: type} pairs in the case of DataFrame input or a type + """ + _, paths, _ = self.infer(data) + return get_type_from_path(paths) + + def cast_to_detected(self, data: Sequence) -> Sequence: + """Transforms input data into a canonical representation using only IdentityRelations + + Args: + data: a DataFrame or Series to determine types over + + Returns: + new_data: The transformed DataFrame or Series. + """ + data, _, _ = self.detect(data) + return data + + def cast_to_inferred(self, data: Sequence) -> Sequence: + """Transforms input data and returns it's corresponding new type relation using all relations. + + Args: + data: a DataFrame or Series to determine types over + + Returns: + new_data: The transformed DataFrame or Series. + types: A dictionary of {name: type} pairs in the case of DataFrame input or a type. + """ + data, _, _ = self.infer(data) + return data + + def output_graph( + self, + file_name: Union[str, Path], + base_only: bool = False, + dpi: Optional[int] = None, + ) -> None: + """Write the type graph to a file. 
+ + Args: + file_name: the file to save the output to + base_only: if True, plot the graph without relation mapping edges + dpi: set the dpi of the output image + """ + from visions.utils.graph import output_graph + + if base_only: + graph = self.base_graph.copy() + else: + graph = self.relation_graph.copy() + + graph.graph["node"] = {"shape": "box", "color": "red"} + if dpi is not None: + graph.graph["graph"] = {"dpi": dpi} + + output_graph(graph, file_name) + + def plot_graph( + self, + dpi: int = 800, + base_only: bool = False, + figsize: Optional[Tuple[int, int]] = None, + ): + """ + + Args: + dpi: dpi of the matplotlib figure. + figsize: figure size + base_only: Only display the typesets base_graph + Returns: + Displays the image + """ + import os + import tempfile + + from matplotlib import image as mpimg + from matplotlib import pyplot as plt + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file: + self.output_graph(temp_file.name, dpi=dpi, base_only=base_only) + img = mpimg.imread(temp_file.name) + plt.figure(dpi=dpi, figsize=figsize) + plt.axis("off") + plt.imshow(img) + os.unlink(temp_file.name) + + def _get_other_type(self, other: TypeOrTypeset) -> Set[T]: + """Converts input into a set of :class:`visions.types.type.VisionsBaseType` + + Args: + other: A :class:`visions.types.type.VisionsBaseType` or :class:`visions.typesets.typeset.VisionsTypeset` + + Raises: + NotImplementedError: + + Returns: + Set[Type[VisionsBaseType]]: + """ + if isinstance(other, VisionsTypeset): + other_types = set(other.types) + elif issubclass(other, VisionsBaseType): + other_types = {other} + else: + raise NotImplementedError( + f"Typeset operation not implemented for type {type(other)}" + ) + return other_types + + def replace(self, old: T, new: T) -> "VisionsTypeset": + """Create a new typeset having replace one type with another. + + Args: + old: Visions type to replace. + new: Replacement visions type. 
+ + Returns + A VisionsTypeset + """ + types = self.types.copy() + types.add(new) + types.remove(old) + return VisionsTypeset(types) + + def __add__(self, other: TypeOrTypeset) -> "VisionsTypeset": + """Adds a type or typeset into the current typeset. + + Args: + other: Type or typeset to be added + + Returns + A VisionsTypeset + """ + other_types = self._get_other_type(other) + return VisionsTypeset(self.types | other_types) + + def __iadd__(self, other: TypeOrTypeset) -> "VisionsTypeset": + """Adds a type or typeset into the current typeset. + + Args: + other: Type or typeset to be added + + Returns + A VisionsTypeset + """ + return self.__add__(other) + + def __sub__(self, other: TypeOrTypeset) -> "VisionsTypeset": + """Subtracts a type or typeset from the current typeset. + + Args: + other: Type or typeset to be removed + + Returns + A VisionsTypeset + """ + other_types = self._get_other_type(other) + return VisionsTypeset(self.types - other_types) + + def __isub__(self, other: TypeOrTypeset) -> "VisionsTypeset": + """Subtracts a type or typeset from the current typeset. + + Args: + other: Type or typeset to be removed + + Returns + A VisionsTypeset + """ + return self.__sub__(other) + + def __repr__(self) -> str: + """Pretty representation of the typeset. 
+ + Returns + A :class:`visions.typesets.typeset.VisionsTypeset` + """ + return self.__class__.__name__ diff --git a/build/lib/visions/utils/__init__.py b/build/lib/visions/utils/__init__.py new file mode 100644 index 000000000..ffdbaabd1 --- /dev/null +++ b/build/lib/visions/utils/__init__.py @@ -0,0 +1,11 @@ +""" Utilities suite for visions """ + +# from visions.utils.images import image_utils +from visions.utils.monkeypatches import imghdr_patch, pathlib_patch +from visions.utils.profiling import profile_type +from visions.utils.warning_handling import suppress_warnings + +__all__ = [ + "profile_type", + "suppress_warnings", +] diff --git a/build/lib/visions/utils/cache.py b/build/lib/visions/utils/cache.py new file mode 100644 index 000000000..77d491dc2 --- /dev/null +++ b/build/lib/visions/utils/cache.py @@ -0,0 +1,60 @@ +import functools +from collections import OrderedDict + +import pandas as pd + + +class LRUCacher: + def __init__(self, hash_func, max_length, value_func): + self.hash_func = hash_func + self.max_length = max_length + self.value_func = value_func + self.cache = OrderedDict() + + def __getitem__(self, key): + value = self.cache[key] + self.cache.move_to_end(key) + return value + + def __setitem__(self, key, value): + if key in self.cache: + self.cache.move_to_end(key) + self.cache[key] = value + if len(self.cache) > self.max_length: + oldest = next(iter(self.cache)) + del self.cache[oldest] + + def get_key(self, *args): + return self.hash_func(*args) + + def get(self, *args): + id_key = self.get_key(*args) + if id_key not in self.cache: + self[id_key] = self.value_func(*args) + return self[id_key] + + +def lru_cache(hash_func, max_length): + def func_inner(func): + cache = LRUCacher(hash_func, max_length, func) + + @functools.wraps(func) + def inner(*args): + return cache.get(*args) + + return inner + + return func_inner + + +def mutable_pseudo_hash(data, node, graph): + # return id((data, node, graph)) + try: + if isinstance(data, 
pd.DataFrame): + data_hash = hash(hash(tuple(data[col])) for col in data.columns) + else: + data_hash = hash(tuple(data.values)) + except (ValueError, TypeError, AttributeError): + return id((data, node, graph)) + + return hash((data_hash, node, graph)) diff --git a/build/lib/visions/utils/graph.py b/build/lib/visions/utils/graph.py new file mode 100644 index 000000000..b9e74f042 --- /dev/null +++ b/build/lib/visions/utils/graph.py @@ -0,0 +1,49 @@ +from pathlib import Path +from typing import Union + +import networkx as nx + + +def output_graph( + G: nx.DiGraph, file_name: Union[Path, str], sort: bool = True, file_format=None +) -> None: + """Output a graph to a file, either as image or as dot file. + + Args: + G: the DiGraph to write or plot + file_name: the file name to write to. + sort: create a copy of the graph with sorted keys + file_format: graphviz output format, if None, the file_name extension is used as format + https://graphviz.org/doc/info/output.html + + Returns: + Nothing + + Raises: + ValueError when the file_name does not end on .svg, .png or .dot + """ + + if sort: + # Create ordered graph for deterministic image outputs + G_sorted = nx.DiGraph() + G_sorted.graph["node"] = {"shape": "box", "color": "red"} + G_sorted.add_nodes_from(sorted(G.nodes, key=lambda x: str(x))) + + style = nx.get_edge_attributes(G, "style") + for edge in sorted(G.edges, key=lambda x: (str(x[0]), str(x[1]))): + G_sorted.add_edge(*edge, style=style.get(edge)) + G = G_sorted + + p = nx.drawing.nx_pydot.to_pydot(G) + if not isinstance(file_name, Path): + file_name = Path(file_name) + + if file_format is None: + file_format = file_name.suffix[1:].lower() + + try: + p.write(file_name, format=file_format) + except AssertionError: + raise ValueError( + "Could not write file. Please make sure that the format is accepted by pydot." 
+ ) diff --git a/build/lib/visions/utils/images/__init__.py b/build/lib/visions/utils/images/__init__.py new file mode 100644 index 000000000..6770ee051 --- /dev/null +++ b/build/lib/visions/utils/images/__init__.py @@ -0,0 +1 @@ +from visions.utils.images import image_utils diff --git a/build/lib/visions/utils/images/image_utils.py b/build/lib/visions/utils/images/image_utils.py new file mode 100644 index 000000000..4cef526e7 --- /dev/null +++ b/build/lib/visions/utils/images/image_utils.py @@ -0,0 +1,114 @@ +import imghdr +from pathlib import Path +from typing import Optional, Tuple, Union + +import imagehash +from PIL import ExifTags, Image + +from visions.utils.monkeypatches.imghdr_patch import * + + +def open_image(path: Path) -> Optional[Image.Image]: + """ + + Args: + path: + + Returns: + + """ + try: + return Image.open(path) + except (OSError, AttributeError): + return None + + +def is_image_truncated(image: Image) -> bool: + """Returns True if the path refers to a truncated image + + Args: + image: + + Returns: + True if the image is truncated + """ + try: + image.load() + return False + except (OSError, AttributeError): + return True + + +def get_image_shape(image: Image) -> Optional[Tuple[int, int]]: + """ + + Args: + image: + + Returns: + + """ + try: + return image.size + except (OSError, AttributeError): + return None + + +def hash_image(image: Image) -> Optional[str]: + """ + + Args: + image: + + Returns: + + """ + try: + return str(imagehash.phash(image)) + except (OSError, AttributeError): + return None + + +def decode_byte_exif(exif_val: Union[str, bytes]) -> str: + """Decode byte encodings + + Args: + exif_val: + + Returns: + + """ + if isinstance(exif_val, str): + return exif_val + else: + return exif_val.decode() + + +def extract_exif(image: Image) -> dict: + """ + + Args: + image: + + Returns: + + """ + try: + exif_data = image._getexif() + if exif_data is not None: + exif = { + ExifTags.TAGS[k]: decode_byte_exif(v) + for k, v in 
exif_data.items() + if k in ExifTags.TAGS + } + else: + exif = {} + except (AttributeError, OSError): + # Not all file types (e.g. .gif) have exif information. + exif = {} + + return exif + + +def path_is_image(p: Path) -> bool: + return imghdr.what(p) is not None diff --git a/build/lib/visions/utils/monkeypatches/__init__.py b/build/lib/visions/utils/monkeypatches/__init__.py new file mode 100644 index 000000000..d65f05b9d --- /dev/null +++ b/build/lib/visions/utils/monkeypatches/__init__.py @@ -0,0 +1,6 @@ +from visions.utils.monkeypatches import imghdr_patch, pathlib_patch + +__all__ = [ + "imghdr_patch", + "pathlib_patch", +] diff --git a/build/lib/visions/utils/monkeypatches/imghdr_patch.py b/build/lib/visions/utils/monkeypatches/imghdr_patch.py new file mode 100644 index 000000000..f9e7098b6 --- /dev/null +++ b/build/lib/visions/utils/monkeypatches/imghdr_patch.py @@ -0,0 +1,31 @@ +# Monkeypatch bug in imagehdr +from imghdr import tests + + +def test_jpeg1(h, f): + """JPEG data in JFIF format""" + if b"JFIF" in h[:23]: + return "jpeg" + + +JPEG_MARK = ( + b"\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06" + b"\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f" +) + + +def test_jpeg2(h, f): + """JPEG with small header""" + if len(h) >= 32 and 67 == h[5] and h[:32] == JPEG_MARK: + return "jpeg" + + +def test_jpeg3(h, f): + """JPEG data in JFIF or Exif format""" + if h[6:10] in (b"JFIF", b"Exif") or h[:2] == b"\xff\xd8": + return "jpeg" + + +tests.append(test_jpeg1) +tests.append(test_jpeg2) +tests.append(test_jpeg3) diff --git a/build/lib/visions/utils/monkeypatches/pathlib_patch.py b/build/lib/visions/utils/monkeypatches/pathlib_patch.py new file mode 100644 index 000000000..5170010f1 --- /dev/null +++ b/build/lib/visions/utils/monkeypatches/pathlib_patch.py @@ -0,0 +1,21 @@ +# type: ignore +from pathlib import Path + + +def _copy(self, target): + """Monkeypatch for pathlib + + Args: + self: + target: + + Returns: + + """ + import shutil + 
+ assert self.is_file() + shutil.copy(str(self), str(target)) # str() only there for Python < (3, 6) + + +Path.copy = _copy diff --git a/build/lib/visions/utils/profiling.py b/build/lib/visions/utils/profiling.py new file mode 100644 index 000000000..0724c0fcb --- /dev/null +++ b/build/lib/visions/utils/profiling.py @@ -0,0 +1,89 @@ +import functools +import timeit + +import numpy as np +import pandas as pd + + +def big_o_tester(test_func): + import big_o + + @functools.wraps(test_func) + def inner(test_series): + try: + best, _ = big_o.big_o( + test_func, lambda n: test_series[0:n], max_n=test_series.shape[0] + ) + return best + except np.linalg.LinAlgError: + return np.nan + + return inner + + +def profile_type(dtype, profile_data, run_count=10, normed_length=100000): + profile_data = { + name: pd.Series(np.random.choice(data, normed_length)) + for name, data in profile_data.items() + if len(data) > 0 + } + big_O_test = big_o_tester(lambda x: x in dtype) + return [ + { + "type": dtype, + "series": name, + "run count": run_count, + "average run time": timeit.timeit(lambda: data in dtype, number=run_count) + / run_count, + "big O": big_O_test(data), + } + for name, data in profile_data.items() + ] + + +def profile_relation_is_relation( + relation, profile_data, run_count=10, normed_length=100000 +): + profile_data = { + name: pd.Series(np.random.choice(data, normed_length)) + for name, data in profile_data.items() + if len(data) > 0 + } + big_O_test = big_o_tester(relation.is_relation) + return [ + { + "relation": relation, + "series": name, + "run count": run_count, + "average run time": timeit.timeit( + lambda: relation.is_relation, number=run_count + ) + / run_count, + "big O": big_O_test(data), + } + for name, data in profile_data.items() + ] + + +def profile_relation_transform( + relation, profile_data, run_count=10, normed_length=100000 +): + profile_data = { + name: pd.Series(np.random.choice(data, normed_length)) + for name, data in profile_data.items() + 
if len(data) > 0 + } + big_O_test = big_o_tester(relation.transform) + return [ + { + "relation": relation, + "series": name, + "run count": run_count, + "average run time": timeit.timeit( + lambda: relation.transform, number=run_count + ) + / run_count, + "big O": big_O_test(data), + } + for name, data in profile_data.items() + ] diff --git a/build/lib/visions/utils/warning_handling.py b/build/lib/visions/utils/warning_handling.py new file mode 100644 index 000000000..8d667b5a3 --- /dev/null +++ b/build/lib/visions/utils/warning_handling.py @@ -0,0 +1,33 @@ +import functools +import os +import sys +import warnings +from typing import Callable, TypeVar + +T = TypeVar("T") + + +def suppress_warnings(func: Callable[..., T]) -> Callable[..., T]: + """Suppress warnings produces while executing the wrapped function.""" + + @functools.wraps(func) + def inner(*args, **kwargs) -> T: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return func(*args, **kwargs) + + return inner + + +def discard_stderr(func: Callable[..., T]) -> Callable[..., T]: + """Shapely logs failures at a silly severity, just trying to suppress it's output on failures. 
+ Only known way to get rid of sys output when wkt.loads hits a bad value""" + + @functools.wraps(func) + def wrapper(*args, **kwargs) -> T: + sys.stderr = open(os.devnull, "w") + res = func(*args, **kwargs) + sys.stderr = sys.__stderr__ + return res + + return wrapper diff --git a/build/lib/visions/visualisation/__init__.py b/build/lib/visions/visualisation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/build/lib/visions/visualisation/circular_packing.html b/build/lib/visions/visualisation/circular_packing.html new file mode 100644 index 000000000..0c0698d04 --- /dev/null +++ b/build/lib/visions/visualisation/circular_packing.html @@ -0,0 +1,154 @@ + + + + Typeset circular packing + + + + + + + + + \ No newline at end of file diff --git a/build/lib/visions/visualisation/plot_circular_packing.py b/build/lib/visions/visualisation/plot_circular_packing.py new file mode 100644 index 000000000..b97fc7862 --- /dev/null +++ b/build/lib/visions/visualisation/plot_circular_packing.py @@ -0,0 +1,69 @@ +import json +import re +from itertools import chain +from pathlib import Path + +import networkx as nx + +from visions.typesets import CompleteSet + + +def update(data): + data["name"] = data.pop("id") + if "children" not in data: + data["size"] = 1 + else: + data["children"] = [update(child) for child in data["children"]] + return data + + +def write_html(data, output_file): + jdata = json.dumps(data) + string = f"\n\troot = {jdata};\n\t" + + file_name = Path(__file__).parent / "circular_packing.html" + out_file = Path(output_file) + fc = file_name.read_text() + fc = re.sub( + r"// START-REPLACE(.*)// END-REPLACE", + rf"// START-REPLACE{string}// END-REPLACE", + fc, + flags=re.MULTILINE | re.DOTALL, + ) + out_file.write_text(fc) + + +def to_json_tree_sorted(G, root): + # json_graph.tree_data with sorting + def add_children(n, G): + nbrs = G[n] + if len(nbrs) == 0: + return [] + children_ = [] + for child in nbrs: + d = 
dict(chain(G.nodes[child].items(), [("id", child)])) + c = add_children(child, G) + if c: + d["children"] = c + children_.append(d) + + children_ = sorted(children_, key=lambda x: x["id"]) + return children_ + + data = dict(chain(G.nodes[root].items(), [("id", root)])) + data["children"] = add_children(root, G) + return data + + +def plot_graph_circular_packing(typeset, output_file) -> None: + graph = typeset.base_graph.copy() + nx.relabel_nodes(graph, {n: str(n) for n in graph.nodes}, copy=False) + + data = to_json_tree_sorted(graph, root=str(typeset.root_node)) + data = update(data) + write_html(data, output_file) + + +if __name__ == "__main__": + complete_set = CompleteSet() + plot_graph_circular_packing(complete_set, "circular_packing.html") diff --git a/build/lib/visions/visualisation/plot_typesets.py b/build/lib/visions/visualisation/plot_typesets.py new file mode 100644 index 000000000..ae886033a --- /dev/null +++ b/build/lib/visions/visualisation/plot_typesets.py @@ -0,0 +1,36 @@ +from pathlib import Path + +from visions.typesets import CompleteSet, GeometrySet, StandardSet + +# Windows Note +# Tip for Python3/64-bit compatible version of pygraphviz +# https://github.com/CristiFati/Prebuilt-Binaries/raw/master/Windows/PyGraphviz/pygraphviz-1.5-cp37-cp37m-win_amd64.whl + + +def generate_typeset_plots() -> None: + typesets_dir = Path("typesets/") + typesets_dir.mkdir(exist_ok=True) + + # Initialize typeset + for name, tsc in [ + ("typeset_complete", CompleteSet()), + ("typeset_geometry", GeometrySet()), + ("typeset_standard", StandardSet()), + ]: + # Write graph to dot + tsc.output_graph(typesets_dir / f"{name}.dot") + + # Plot the graph (svg) + tsc.output_graph(typesets_dir / f"{name}.svg") + tsc.output_graph(typesets_dir / f"{name}_base.svg", base_only=True) + + # Plot the graph (pdf) + tsc.output_graph(typesets_dir / f"{name}.pdf") + tsc.output_graph(typesets_dir / f"{name}_base.pdf", base_only=True) + + # Plot the graph (png) + 
tsc.output_graph(typesets_dir / f"{name}.png", dpi=150) + + +if __name__ == "__main__": + generate_typeset_plots() diff --git a/setup.py b/setup.py index 7d27dec34..3c5a43487 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ tests_require=test_requirements, python_requires=">=3.8", long_description=long_description, - long_description_content_type="text/x-rst", + long_description_content_type="text/markdown", zip_safe=False, classifiers=[ "Programming Language :: Python :: 3",