Skip to content

Commit

Permalink
Integer as subset of Float
Browse files Browse the repository at this point in the history
Changes: 

- Integer should be a strict subset of Float
- Fix pandas future warning
  • Loading branch information
ieaves authored Apr 6, 2020
1 parent 82519ef commit 9e550d0
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 29 deletions.
4 changes: 2 additions & 2 deletions src/visions/types/complex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
import pandas as pd
import numpy as np

from visions.types.float import test_string_is_float
from visions.types.float import test_is_float
from visions.relations import IdentityRelation, InferenceRelation, TypeRelation
from visions.types import VisionsBaseType
from visions.utils.coercion import test_utils


def test_string_is_complex(series) -> bool:
coerced_series = test_utils.option_coercion_evaluator(to_complex)(series)
return coerced_series is not None and not test_string_is_float(series)
return coerced_series is not None and not test_is_float(series)


def to_complex(series: pd.Series) -> bool:
Expand Down
18 changes: 16 additions & 2 deletions src/visions/types/float.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,25 @@


def test_string_is_float(series) -> bool:
    """Report whether ``series`` can be coerced from strings into ``Float``.

    The series is run through ``string_to_float`` wrapped by
    ``test_utils.option_coercion_evaluator``. A ``None`` result is treated
    as "not a float"; otherwise the coerced series must itself be contained
    in ``Float``.
    """
    attempt_coercion = test_utils.option_coercion_evaluator(string_to_float)
    converted = attempt_coercion(series)
    if converted is None:
        # presumably the evaluator yields None when coercion fails -- confirm
        return False
    return converted in Float


def string_to_float(series: pd.Series) -> pd.Series:
    """Delegate the string-to-float conversion of ``series`` to ``to_float``."""
    # Thousands-separator stripping is switched off for now; the disabled
    # snippet is kept below for reference. Checking for the character first
    # is slightly faster than attempting the replacement when it is absent.
    # if any("," in x for x in series.dropna()):
    #     series = series.str.replace(",", "")
    as_float = to_float(series)
    return as_float


def test_is_float(series: pd.Series) -> bool:
    """Report whether ``series`` can be coerced into ``Float`` via ``to_float``.

    ``to_float`` is wrapped by ``test_utils.option_coercion_evaluator`` and
    applied to the series. A ``None`` result is treated as failure;
    otherwise the coerced series must be contained in ``Float``.
    """
    attempt_coercion = test_utils.option_coercion_evaluator(to_float)
    converted = attempt_coercion(series)
    if converted is None:
        # presumably the evaluator yields None when coercion fails -- confirm
        return False
    return converted in Float


def to_float(series: pd.Series) -> pd.Series:
    """Cast ``series`` to the builtin ``float`` dtype.

    Returns the result of ``series.astype(float)``, which is a Series (not a
    bool). Any error raised by the cast propagates to the caller.
    """
    return series.astype(float)


Expand All @@ -25,7 +39,7 @@ def _get_relations(cls) -> Sequence[TypeRelation]:
relations = [
IdentityRelation(cls, Generic),
InferenceRelation(
cls, String, relationship=test_string_is_float, transformer=to_float
cls, String, relationship=test_string_is_float, transformer=string_to_float
),
InferenceRelation(
cls,
Expand Down
20 changes: 13 additions & 7 deletions src/visions/types/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,24 @@ def check_equality(series):
return check_equality(series.dropna() if series.hasnans else series)


def test_string_is_int(series) -> bool:
    """Report whether ``series`` can be coerced from strings into ``Integer``.

    The series is run through ``string_to_int`` wrapped by
    ``test_utils.option_coercion_evaluator``. A ``None`` result is treated
    as "not an integer"; otherwise the coerced series must itself be
    contained in ``Integer``.
    """
    attempt_coercion = test_utils.option_coercion_evaluator(string_to_int)
    converted = attempt_coercion(series)
    if converted is None:
        # presumably the evaluator yields None when coercion fails -- confirm
        return False
    return converted in Integer


def string_to_int(series: pd.Series) -> pd.Series:
    """Delegate the string-to-integer conversion of ``series`` to ``to_int``."""
    # Thousands-separator stripping is switched off for now; the disabled
    # snippet is kept below for reference.
    # if any("," in x for x in series.dropna()):
    #     series = series.str.replace(",", "")
    as_int = to_int(series)
    return as_int


def _get_relations(cls) -> List[TypeRelation]:
from visions.types import String, Generic, Float
from visions.types import Generic, Float

relations = [
IdentityRelation(cls, Generic),
InferenceRelation(cls, Float, relationship=float_is_int, transformer=to_int),
InferenceRelation(
cls,
String,
relationship=test_utils.coercion_test(to_int),
transformer=to_int,
),
]
return relations

Expand Down
15 changes: 11 additions & 4 deletions tests/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def get_series():
name="string_np_unicode_series",
),
pd.Series(["1.0", "2.0", np.nan], name="string_num_nan"),
pd.Series(["1,000.0", "2.1", np.nan], name="string_with_sep_num_nan"),
pd.Series(["1.0", "2.0", "3.0"], name="string_num"),
pd.Series(["1.0", "45.67", np.nan], name="string_flt_nan"),
pd.Series(["1.0", "45.67", "3.5"], name="string_flt"),
Expand Down Expand Up @@ -164,15 +165,19 @@ def get_series():
),
# Datetime Series
pd.Series(
[pd.datetime(2017, 3, 5, 12, 2), pd.datetime(2019, 12, 4)],
[datetime.datetime(2017, 3, 5, 12, 2), datetime.datetime(2019, 12, 4)],
name="timestamp_series",
),
pd.Series(
[pd.datetime(2017, 3, 5), pd.datetime(2019, 12, 4, 3, 2, 0), pd.NaT],
[
datetime.datetime(2017, 3, 5),
datetime.datetime(2019, 12, 4, 3, 2, 0),
pd.NaT,
],
name="timestamp_series_nat",
),
pd.Series(
[pd.datetime(2017, 3, 5), pd.datetime(2019, 12, 4), pd.NaT],
[datetime.datetime(2017, 3, 5), datetime.datetime(2019, 12, 4), pd.NaT],
name="date_series_nat",
),
pd.Series(
Expand Down Expand Up @@ -341,6 +346,7 @@ def get_contains_map():
TimeDelta: ["timedelta_series", "timedelta_series_nat"],
String: [
"timestamp_string_series",
"string_with_sep_num_nan",
"string_series",
"geometry_string_series",
"string_unicode_series",
Expand Down Expand Up @@ -413,6 +419,7 @@ def infer_series_type_map():
"string_series": String,
"categorical_string_series": Categorical,
"timestamp_string_series": Date,
"string_with_sep_num_nan": String, # TODO: Introduce thousands separator
"string_unicode_series": String,
"string_np_unicode_series": String,
"string_num_nan": Integer,
Expand Down Expand Up @@ -476,7 +483,6 @@ def get_convert_map():
series_map = [
# Model type, Relation type
(Integer, Float, ["int_nan_series", "float_series2"]),
(Integer, String, ["int_str_range"]),
(Complex, String, ["str_complex"]),
(
Float,
Expand All @@ -489,6 +495,7 @@ def get_convert_map():
"textual_float",
"textual_float_nan",
"int_str_range",
# "string_with_sep_num_nan",
],
),
(DateTime, String, ["timestamp_string_series", "string_date"]),
Expand Down
7 changes: 4 additions & 3 deletions tests/test_functional.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
import datetime

from visions.functional import (
infer_frame_type,
Expand Down Expand Up @@ -126,9 +127,9 @@ def test_type_detect_frame():
def test_type_detect_series():
datetime_series = pd.Series(
[
pd.datetime(2010, 1, 1),
pd.datetime(2010, 8, 2),
pd.datetime(2011, 2, 1),
datetime.datetime(2010, 1, 1),
datetime.datetime(2010, 8, 2),
datetime.datetime(2011, 2, 1),
np.datetime64("NaT"),
]
)
Expand Down
16 changes: 8 additions & 8 deletions tests/test_summarization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from urllib.parse import urlparse

import datetime
import pytest
import pandas as pd
import numpy as np
Expand Down Expand Up @@ -124,16 +124,16 @@ def test_complex_missing_summary(summary, visions_type=Complex):
def test_datetime_missing_summary(summary, visions_type=DateTime):
test_series = pd.Series(
[
pd.datetime(2010, 1, 1),
pd.datetime(2010, 8, 2),
pd.datetime(2011, 2, 1),
datetime.datetime(2010, 1, 1),
datetime.datetime(2010, 8, 2),
datetime.datetime(2011, 2, 1),
np.nan,
]
)
correct_output = {
"n_unique": 3,
"max": pd.datetime(2011, 2, 1),
"min": pd.datetime(2010, 1, 1),
"max": datetime.datetime(2011, 2, 1),
"min": datetime.datetime(2010, 1, 1),
"n_records": 4,
"na_count": 1,
"range": test_series.max() - test_series.min(),
Expand All @@ -143,10 +143,10 @@ def test_datetime_missing_summary(summary, visions_type=DateTime):


def test_object_missing_summary(summary, visions_type=Object):
test_series = pd.Series([pd.datetime(2010, 1, 1), "test", 3, np.nan])
test_series = pd.Series([datetime.datetime(2010, 1, 1), "test", 3, np.nan])
correct_output = {
"n_unique": 3,
"frequencies": {"test": 1, 3: 1, pd.datetime(2010, 1, 1): 1},
"frequencies": {"test": 1, 3: 1, datetime.datetime(2010, 1, 1): 1},
"n_records": 4,
"na_count": 1,
}
Expand Down
8 changes: 5 additions & 3 deletions tests/test_type_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def all_relations_tested(series_map):

missing_relations = set()
for node in typeset.types:
for relation in node.get_relations():
for relation in node.relations:
from_type, to_type = relation.related_type, relation.type
if relation.inferential and (
to_type not in series_map_lookup
Expand Down Expand Up @@ -78,14 +78,16 @@ def pytest_generate_tests(metafunc):

def test_relations(source_type, relation_type, series, member):
relation_gen = (
rel for rel in source_type.get_relations() if rel.related_type == relation_type
rel for rel in source_type.relations if rel.related_type == relation_type
)
relation = next(relation_gen)

is_relation = relation.is_relation(series)

if not member:
assert not is_relation
assert (
not is_relation
), f"{source_type}, {relation}, {member}, {series.name}, {series[0]}"
else:
assert is_relation
if relation.is_relation(series):
Expand Down

0 comments on commit 9e550d0

Please sign in to comment.