Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: fix tests for mixed int string index #55458

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9b8c639
add to indices_dict entry mixed-int-string
natmokval Sep 26, 2023
3645428
fix tests in test_old_base.py, test_numpy_compat.py
natmokval Sep 30, 2023
6b0e2c2
add except TypeError to def nargsort, fix tests
natmokval Oct 9, 2023
786c250
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Oct 9, 2023
533b4a2
fix tests
natmokval Oct 9, 2023
4ad1bc4
correct the error msg, add tm.assert_mixed_int_string_entry, correct …
natmokval Oct 13, 2023
87160c0
resolve conflict
natmokval Oct 16, 2023
a27bc13
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Nov 27, 2023
2d67a27
fix tests
natmokval Nov 27, 2023
531beda
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Nov 27, 2023
f09b0ac
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Jan 1, 2024
adf4929
correct def nargsort, correct docs and rename assert_mixed_int_strin…
natmokval Jan 11, 2024
fac6291
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Jan 11, 2024
5eef4fc
add import _testing
natmokval Jan 11, 2024
8a487e3
fix pre-commit errors
natmokval Jan 11, 2024
5fb2f55
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Jan 31, 2024
3081ad5
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Feb 2, 2024
e7edbbe
fix pre-commit error
natmokval Feb 2, 2024
14d6b6f
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Mar 8, 2024
454f0b6
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Mar 15, 2024
82ad988
replace is_mixed_int_string_entry with request.node.callspec.id, corr…
natmokval Mar 16, 2024
72435c6
Merge branch 'main' into TST-mixed-int-string-Index
natmokval Mar 25, 2024
7ebd7ef
resolve conflicts
natmokval Jul 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
assert_is_sorted,
assert_is_valid_plot_return_object,
assert_metadata_equivalent,
assert_mixed_int_string_entry,
assert_numpy_array_equal,
assert_period_array_equal,
assert_series_equal,
Expand Down Expand Up @@ -987,6 +988,7 @@ def shares_memory(left, right) -> bool:
"assert_produces_warning",
"assert_series_equal",
"assert_sp_array_equal",
"assert_mixed_int_string_entry",
"assert_timedelta_array_equal",
"assert_cow_warning",
"at",
Expand Down
17 changes: 17 additions & 0 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -1382,3 +1382,20 @@ def assert_metadata_equivalent(
assert val is None
else:
assert val == getattr(right, attr, None)


def assert_mixed_int_string_entry(arr) -> bool:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe "is_mixed_..." instead of "assert_mixed_..."?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I replaced assert_mixed_int_string_entry with is_mixed_int_string_entry.

"""
Check whether arr is a mixed-int-string entry.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will be clearer if it refers directly to the relevant fixture.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this is an instance of a more general pain point in test writing:

def test_something(fixture):
    if the_fixture_value_is_this_particular_entry(fixture):
        ...

usually we either inspect the object like this assert_mixed_int_string_entry is doing or in some cases we pass request and do something hacky with request.fixturenames or something. I wonder if there is a recommended pattern for like pytest.get_active_fixture_id("index") that would return "mixed-int-string" here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find a recommended pattern that would return "mixed-int-string" in this case.
Instead I passed request and used request.node.callspec.id to return "mixed-int-string".

Now I inspect the object by using request.node.callspec.id instead of is_mixed_int_string_entry. The function is_mixed_int_string_entry is removed.
Sorry it took so long. Could you please take a look at this PR?

"""
# we might have an entry [0, "a", 1, "b", 2, "c"] with duplicates
# or with None or without the first element
if len(arr) < 3:
return False
else:
if isinstance(arr[0], int):
return isinstance(arr[1], str)
elif isinstance(arr[0], str):
return isinstance(arr[1], int)
else:
return isinstance(arr[2], int) and isinstance(arr[3], str)
1 change: 1 addition & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,7 @@ def _create_mi_with_dt64tz_level():
"empty": Index([]),
"tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])),
"mi-with-dt64tz-level": _create_mi_with_dt64tz_level(),
"mixed-int-string": Index([0, "a", 1, "b", 2, "c"]),
"multi": _create_multiindex(),
"repeats": Index([0, 0, 1, 1, 2, 2]),
"nullable_int": Index(np.arange(100), dtype="Int64"),
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,13 @@ def nargsort(
if not ascending:
non_nans = non_nans[::-1]
non_nan_idx = non_nan_idx[::-1]
indexer = non_nan_idx[non_nans.argsort(kind=kind)]
# GH#54072
# argsort does not support mixed int/string Index
try:
indexer = non_nan_idx[non_nans.argsort(kind=kind)]
except TypeError as err:
msg = "'<' not supported between instances of 'int' and 'str'"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there cases other than this particular test case that might hit this?

raise TypeError(msg) from err
if not ascending:
indexer = indexer[::-1]
# Finally, place the NaNs at the end or the beginning according to
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ def test_searchsorted(request, index_or_series_obj):
mark = pytest.mark.xfail(reason="complex objects are not comparable")
request.applymarker(mark)

# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(obj.values):
with pytest.raises(
TypeError, match="'>' not supported between instances of 'str' and 'int'"
):
max_obj = max(obj, default=0)
index = np.searchsorted(obj, max_obj)
return

max_obj = max(obj, default=0)
index = np.searchsorted(obj, max_obj)
assert 0 <= index <= len(obj)
Expand Down
21 changes: 15 additions & 6 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@ def test_value_counts(index_or_series_obj):
# i.e IntegerDtype
expected = expected.astype("Int64")

# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
if obj.duplicated().any():
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(obj.values):
msg = "'<' not supported between instances of 'int' and 'str'"
with pytest.raises(TypeError, match=msg):
result.sort_index()
else:
# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
if obj.duplicated().any():
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("null_obj", [np.nan, None])
Expand All @@ -67,6 +73,9 @@ def test_value_counts_null(null_obj, index_or_series_obj):
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(orig, MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(orig.values):
pytest.skip("'<' not supported between instances of 'str' and 'int'")

values = obj._values
values[0:2] = null_obj
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,11 @@ def test_union_duplicates(index, request):
values = index.unique().values.tolist()
mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
result = mi2.union(mi1)
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index.values):
pytest.skip("'<' not supported between instances of 'str' and 'int'")
else:
result = mi2.union(mi1)
expected = mi2.sort_values()
tm.assert_index_equal(result, expected)

Expand Down
36 changes: 26 additions & 10 deletions pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,8 +448,15 @@ def test_hasnans_isnans(self, index_flat):
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.parametrize("na_position", [None, "middle"])
def test_sort_values_invalid_na_position(index_with_missing, na_position):
with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"):
index_with_missing.sort_values(na_position=na_position)
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index_with_missing.values):
with pytest.raises(
TypeError, match="'<' not supported between instances of 'int' and 'str'"
):
index_with_missing.sort_values(na_position=na_position)
else:
with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"):
index_with_missing.sort_values(na_position=na_position)


@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
Expand All @@ -467,17 +474,26 @@ def test_sort_values_with_missing(index_with_missing, na_position, request):

missing_count = np.sum(index_with_missing.isna())
not_na_vals = index_with_missing[index_with_missing.notna()].values
sorted_values = np.sort(not_na_vals)
if na_position == "first":
sorted_values = np.concatenate([[None] * missing_count, sorted_values])
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index_with_missing.values):
with pytest.raises(
TypeError, match="'<' not supported between instances of 'int' and 'str'"
):
np.sort(not_na_vals)
else:
sorted_values = np.concatenate([sorted_values, [None] * missing_count])
sorted_values = np.sort(not_na_vals)
if na_position == "first":
sorted_values = np.concatenate([[None] * missing_count, sorted_values])
else:
sorted_values = np.concatenate([sorted_values, [None] * missing_count])

# Explicitly pass dtype needed for Index backed by EA e.g. IntegerArray
expected = type(index_with_missing)(sorted_values, dtype=index_with_missing.dtype)
# Explicitly pass dtype needed for Index backed by EA e.g. IntegerArray
expected = type(index_with_missing)(
sorted_values, dtype=index_with_missing.dtype
)

result = index_with_missing.sort_values(na_position=na_position)
tm.assert_index_equal(result, expected)
result = index_with_missing.sort_values(na_position=na_position)
tm.assert_index_equal(result, expected)


def test_ndarray_compat_properties(index):
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/indexes/test_numpy_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ def test_numpy_ufuncs_reductions(index, func, request):
with pytest.raises(TypeError, match="is not ordered for"):
func.reduce(index)
return
# This check is written for the mixed-int-string entry
elif tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError, match=".* not supported between instances of 'int' and 'str'"
):
func.reduce(index)
return
else:
result = func.reduce(index)

Expand Down
34 changes: 25 additions & 9 deletions pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,18 +333,34 @@ def test_argsort(self, index):
if isinstance(index, CategoricalIndex):
pytest.skip(f"{type(self).__name__} separately tested")

result = index.argsort()
expected = np.array(index).argsort()
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError,
match="'<' not supported between instances of 'str' and 'int'",
):
index.argsort()
else:
result = index.argsort()
expected = np.array(index).argsort()
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

def test_numpy_argsort(self, index):
result = np.argsort(index)
expected = index.argsort()
tm.assert_numpy_array_equal(result, expected)
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError,
match="'<' not supported between instances of 'str' and 'int'",
):
np.argsort(index)
else:
result = np.argsort(index)
expected = index.argsort()
tm.assert_numpy_array_equal(result, expected)

result = np.argsort(index, kind="mergesort")
expected = index.argsort(kind="mergesort")
tm.assert_numpy_array_equal(result, expected)
result = np.argsort(index, kind="mergesort")
expected = index.argsort(kind="mergesort")
tm.assert_numpy_array_equal(result, expected)

# these are the only two types that perform
# pandas compatibility input validation - the
Expand Down
93 changes: 70 additions & 23 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,16 @@ def any_dtype_for_small_pos_integer_indexes(request):
def test_union_same_types(index):
# Union with a non-unique, non-monotonic index raises error
# Only needed for bool index factory
idx1 = index.sort_values()
idx2 = index.sort_values()
assert idx1.union(idx2).dtype == idx1.dtype
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError, match="'<' not supported between instances of 'int' and 'str'"
):
index.sort_values()
else:
idx1 = index.sort_values()
idx2 = index.sort_values()
assert idx1.union(idx2).dtype == idx1.dtype


def test_union_different_types(index_flat, index_flat2, request):
Expand Down Expand Up @@ -123,19 +130,28 @@ def test_union_different_types(index_flat, index_flat2, request):

# Union with a non-unique, non-monotonic index raises error
# This applies to the boolean index
idx1 = idx1.sort_values()
idx2 = idx2.sort_values()

with tm.assert_produces_warning(warn, match=msg):
res1 = idx1.union(idx2)
res2 = idx2.union(idx1)

if any_uint64 and (idx1_signed or idx2_signed):
assert res1.dtype == np.dtype("O")
assert res2.dtype == np.dtype("O")
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(
idx1.values
) or tm.assert_mixed_int_string_entry(idx2.values):
with pytest.raises(
TypeError, match="'<' not supported between instances of 'int' and 'str'"
):
idx1.sort_values()
idx2.sort_values()
else:
assert res1.dtype == common_dtype
assert res2.dtype == common_dtype
idx1 = idx1.sort_values()
idx2 = idx2.sort_values()
with tm.assert_produces_warning(warn, match=msg):
res1 = idx1.union(idx2)
res2 = idx2.union(idx1)

if any_uint64 and (idx1_signed or idx2_signed):
assert res1.dtype == np.dtype("O")
assert res2.dtype == np.dtype("O")
else:
assert res1.dtype == common_dtype
assert res2.dtype == common_dtype


@pytest.mark.parametrize(
Expand Down Expand Up @@ -254,7 +270,15 @@ def test_union_base(self, index):
everything = index

union = first.union(second)
tm.assert_index_equal(union.sort_values(), everything.sort_values())
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError,
match="'<' not supported between instances of 'int' and 'str'",
):
tm.assert_index_equal(union.sort_values(), everything.sort_values())
else:
tm.assert_index_equal(union.sort_values(), everything.sort_values())

if isinstance(index.dtype, DatetimeTZDtype):
# The second.values below will drop tz, so the rest of this test
Expand Down Expand Up @@ -319,7 +343,15 @@ def test_symmetric_difference(self, index):
second = index[:-1]
answer = index[[0, -1]]
result = first.symmetric_difference(second)
tm.assert_index_equal(result.sort_values(), answer.sort_values())
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError,
match="'<' not supported between instances of 'int' and 'str'",
):
tm.assert_index_equal(result.sort_values(), answer.sort_values())
else:
tm.assert_index_equal(result.sort_values(), answer.sort_values())

# GH#10149
cases = [second.to_numpy(), second.to_series(), second.to_list()]
Expand Down Expand Up @@ -398,9 +430,17 @@ def test_union_unequal(self, index_flat, fname, sname, expected_name):
# test copy.union(subset) - need sort for unicode and string
first = index.copy().set_names(fname)
second = index[1:].set_names(sname)
union = first.union(second).sort_values()
expected = index.set_names(expected_name).sort_values()
tm.assert_index_equal(union, expected)
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(second.values):
with pytest.raises(
TypeError,
match="'<' not supported between instances of 'int' and 'str'",
):
first.union(second).sort_values()
else:
union = first.union(second).sort_values()
expected = index.set_names(expected_name).sort_values()
tm.assert_index_equal(union, expected)

@pytest.mark.parametrize(
"fname, sname, expected_name",
Expand Down Expand Up @@ -467,9 +507,16 @@ def test_intersect_unequal(self, index_flat, fname, sname, expected_name):
# test copy.intersection(subset) - need sort for unicode and string
first = index.copy().set_names(fname)
second = index[1:].set_names(sname)
intersect = first.intersection(second).sort_values()
expected = index[1:].set_names(expected_name).sort_values()
tm.assert_index_equal(intersect, expected)
if tm.assert_mixed_int_string_entry(index.values):
with pytest.raises(
TypeError,
match="'<' not supported between instances of 'int' and 'str'",
):
first.intersection(second).sort_values()
else:
intersect = first.intersection(second).sort_values()
expected = index[1:].set_names(expected_name).sort_values()
tm.assert_index_equal(intersect, expected)

@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_intersection_name_retention_with_nameless(self, index):
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def test_factorize(self, index_or_series_obj, sort):
expected_uniques = expected_uniques.astype(object)

if sort:
# This check is written for the mixed-int-string entry
if tm.assert_mixed_int_string_entry(expected_uniques.values):
pytest.skip("'<' not supported between instances of 'str' and 'int'")
expected_uniques = expected_uniques.sort_values()

# construct an integer ndarray so that
Expand Down
Loading