From 9e550d0ce49a04db687125253b9c880cf330d189 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Mon, 6 Apr 2020 05:58:28 -0500 Subject: [PATCH] Integer as subset of Float Changes: - Integer should be a strict subset of Float - Fix pandas future warning --- src/visions/types/complex.py | 4 ++-- src/visions/types/float.py | 18 ++++++++++++++++-- src/visions/types/integer.py | 20 +++++++++++++------- tests/series.py | 15 +++++++++++---- tests/test_functional.py | 7 ++++--- tests/test_summarization.py | 16 ++++++++-------- tests/test_type_convert.py | 8 +++++--- 7 files changed, 59 insertions(+), 29 deletions(-) diff --git a/src/visions/types/complex.py b/src/visions/types/complex.py index 493df9d6..e44479eb 100644 --- a/src/visions/types/complex.py +++ b/src/visions/types/complex.py @@ -4,7 +4,7 @@ import pandas as pd import numpy as np -from visions.types.float import test_string_is_float +from visions.types.float import test_is_float from visions.relations import IdentityRelation, InferenceRelation, TypeRelation from visions.types import VisionsBaseType from visions.utils.coercion import test_utils @@ -12,7 +12,7 @@ def test_string_is_complex(series) -> bool: coerced_series = test_utils.option_coercion_evaluator(to_complex)(series) - return coerced_series is not None and not test_string_is_float(series) + return coerced_series is not None and not test_is_float(series) def to_complex(series: pd.Series) -> bool: diff --git a/src/visions/types/float.py b/src/visions/types/float.py index b8b1d222..899783a4 100644 --- a/src/visions/types/float.py +++ b/src/visions/types/float.py @@ -11,11 +11,25 @@ def test_string_is_float(series) -> bool: + coerced_series = test_utils.option_coercion_evaluator(string_to_float)(series) + return coerced_series is not None and coerced_series in Float + + +def string_to_float(series: pd.Series) -> pd.Series: + # Slightly faster to check for the character if it's not present than to + # attempt the replacement + # if any("," in x for x in series.dropna()): + # series = series.str.replace(",", "") + + return to_float(series) + + +def test_is_float(series: pd.Series) -> bool: coerced_series = test_utils.option_coercion_evaluator(to_float)(series) return coerced_series is not None and coerced_series in Float -def to_float(series: pd.Series) -> bool: +def to_float(series: pd.Series) -> pd.Series: return series.astype(float) @@ -25,7 +39,7 @@ def _get_relations(cls) -> Sequence[TypeRelation]: relations = [ IdentityRelation(cls, Generic), InferenceRelation( - cls, String, relationship=test_string_is_float, transformer=to_float + cls, String, relationship=test_string_is_float, transformer=string_to_float ), InferenceRelation( cls, diff --git a/src/visions/types/integer.py b/src/visions/types/integer.py index eb62678e..aaf0a94e 100644 --- a/src/visions/types/integer.py +++ b/src/visions/types/integer.py @@ -25,18 +25,24 @@ def check_equality(series): return check_equality(series.dropna() if series.hasnans else series) +def test_string_is_int(series) -> bool: + coerced_series = test_utils.option_coercion_evaluator(string_to_int)(series) + return coerced_series is not None and coerced_series in Integer + + +def string_to_int(series: pd.Series) -> pd.Series: + # if any("," in x for x in series.dropna()): + # series = series.str.replace(",", "") + + return to_int(series) + + def _get_relations(cls) -> List[TypeRelation]: - from visions.types import String, Generic, Float + from visions.types import Generic, Float relations = [ IdentityRelation(cls, Generic), InferenceRelation(cls, Float, relationship=float_is_int, transformer=to_int), - InferenceRelation( - cls, - String, - relationship=test_utils.coercion_test(to_int), - transformer=to_int, - ), ] return relations diff --git a/tests/series.py b/tests/series.py index 105a1b1b..c87ea7ac 100644 --- a/tests/series.py +++ b/tests/series.py @@ -91,6 +91,7 @@ def get_series(): name="string_np_unicode_series", ), pd.Series(["1.0", "2.0", np.nan], name="string_num_nan"), + pd.Series(["1,000.0", "2.1", np.nan], name="string_with_sep_num_nan"), pd.Series(["1.0", "2.0", "3.0"], name="string_num"), pd.Series(["1.0", "45.67", np.nan], name="string_flt_nan"), pd.Series(["1.0", "45.67", "3.5"], name="string_flt"), @@ -164,15 +165,19 @@ def get_series(): ), # Datetime Series pd.Series( - [pd.datetime(2017, 3, 5, 12, 2), pd.datetime(2019, 12, 4)], + [datetime.datetime(2017, 3, 5, 12, 2), datetime.datetime(2019, 12, 4)], name="timestamp_series", ), pd.Series( - [pd.datetime(2017, 3, 5), pd.datetime(2019, 12, 4, 3, 2, 0), pd.NaT], + [ + datetime.datetime(2017, 3, 5), + datetime.datetime(2019, 12, 4, 3, 2, 0), + pd.NaT, + ], name="timestamp_series_nat", ), pd.Series( - [pd.datetime(2017, 3, 5), pd.datetime(2019, 12, 4), pd.NaT], + [datetime.datetime(2017, 3, 5), datetime.datetime(2019, 12, 4), pd.NaT], name="date_series_nat", ), pd.Series( @@ -341,6 +346,7 @@ def get_contains_map(): TimeDelta: ["timedelta_series", "timedelta_series_nat"], String: [ "timestamp_string_series", + "string_with_sep_num_nan", "string_series", "geometry_string_series", "string_unicode_series", @@ -413,6 +419,7 @@ def infer_series_type_map(): "string_series": String, "categorical_string_series": Categorical, "timestamp_string_series": Date, + "string_with_sep_num_nan": String, # TODO: Introduce thousands separator "string_unicode_series": String, "string_np_unicode_series": String, "string_num_nan": Integer, @@ -476,7 +483,6 @@ def get_convert_map(): series_map = [ # Model type, Relation type (Integer, Float, ["int_nan_series", "float_series2"]), - (Integer, String, ["int_str_range"]), (Complex, String, ["str_complex"]), ( Float, @@ -489,6 +495,7 @@ def get_convert_map(): "textual_float", "textual_float_nan", "int_str_range", + # "string_with_sep_num_nan", ], ), (DateTime, String, ["timestamp_string_series", "string_date"]), diff --git a/tests/test_functional.py b/tests/test_functional.py index 91b07a54..3ca72cc0 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +import datetime from visions.functional import ( infer_frame_type, @@ -126,9 +127,9 @@ def test_type_detect_frame(): def test_type_detect_series(): datetime_series = pd.Series( [ - pd.datetime(2010, 1, 1), - pd.datetime(2010, 8, 2), - pd.datetime(2011, 2, 1), + datetime.datetime(2010, 1, 1), + datetime.datetime(2010, 8, 2), + datetime.datetime(2011, 2, 1), np.datetime64("NaT"), ] ) diff --git a/tests/test_summarization.py b/tests/test_summarization.py index db46feae..1524d68b 100644 --- a/tests/test_summarization.py +++ b/tests/test_summarization.py @@ -1,5 +1,5 @@ from urllib.parse import urlparse - +import datetime import pytest import pandas as pd import numpy as np @@ -124,16 +124,16 @@ def test_complex_missing_summary(summary, visions_type=Complex): def test_datetime_missing_summary(summary, visions_type=DateTime): test_series = pd.Series( [ - pd.datetime(2010, 1, 1), - pd.datetime(2010, 8, 2), - pd.datetime(2011, 2, 1), + datetime.datetime(2010, 1, 1), + datetime.datetime(2010, 8, 2), + datetime.datetime(2011, 2, 1), np.nan, ] ) correct_output = { "n_unique": 3, - "max": pd.datetime(2011, 2, 1), - "min": pd.datetime(2010, 1, 1), + "max": datetime.datetime(2011, 2, 1), + "min": datetime.datetime(2010, 1, 1), "n_records": 4, "na_count": 1, "range": test_series.max() - test_series.min(), @@ -143,10 +143,10 @@ def test_datetime_missing_summary(summary, visions_type=DateTime): def test_object_missing_summary(summary, visions_type=Object): - test_series = pd.Series([pd.datetime(2010, 1, 1), "test", 3, np.nan]) + test_series = pd.Series([datetime.datetime(2010, 1, 1), "test", 3, np.nan]) correct_output = { "n_unique": 3, - "frequencies": {"test": 1, 3: 1, pd.datetime(2010, 1, 1): 1}, + "frequencies": {"test": 1, 3: 1, datetime.datetime(2010, 1, 1): 1}, "n_records": 4, "na_count": 1, } diff --git a/tests/test_type_convert.py b/tests/test_type_convert.py index 8d23f6ea..ffea9d57 100644 --- a/tests/test_type_convert.py +++ b/tests/test_type_convert.py @@ -19,7 +19,7 @@ def all_relations_tested(series_map): missing_relations = set() for node in typeset.types: - for relation in node.get_relations(): + for relation in node.relations: from_type, to_type = relation.related_type, relation.type if relation.inferential and ( to_type not in series_map_lookup @@ -78,14 +78,16 @@ def pytest_generate_tests(metafunc): def test_relations(source_type, relation_type, series, member): relation_gen = ( - rel for rel in source_type.get_relations() if rel.related_type == relation_type + rel for rel in source_type.relations if rel.related_type == relation_type ) relation = next(relation_gen) is_relation = relation.is_relation(series) if not member: - assert not is_relation + assert ( + not is_relation + ), f"{source_type}, {relation}, {member}, {series.name}, {series[0]}" else: assert is_relation if relation.is_relation(series):