From 16a292c939497a1eb3e6a19109250fcc681964a6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 30 Nov 2022 18:27:45 +0000 Subject: [PATCH] Only exclude join keys that are indices from key columns Previously, if any of the join keys were indices, we assumed that they all were, and provided an empty set of key columns with matching names in the left and right dataframe. This does the wrong thing for mixed join keys (on a combination of index and normal columns), producing more output columns than is correct. To avoid this, only skip matching key names if they name indices. Closes #11550. --- python/cudf/cudf/core/join/join.py | 6 ++++-- python/cudf/cudf/tests/test_joining.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0e5ac8dc02d..0c889fcbc81 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -147,12 +147,14 @@ def __init__( self._key_columns_with_same_name = ( set(_coerce_to_tuple(on)) if on - else set() - if (self._using_left_index or self._using_right_index) else { lkey.name for lkey, rkey in zip(self._left_keys, self._right_keys) if lkey.name == rkey.name + and not ( + isinstance(lkey, _IndexIndexer) + or isinstance(rkey, _IndexIndexer) + ) } ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 5066c5eef38..c787204735f 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1,5 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. +from itertools import combinations, product, repeat + import numpy as np import pandas as pd import pytest @@ -2106,6 +2108,28 @@ def test_string_join_values_nulls(): assert_join_results_equal(expect, got, how="left") +@pytest.mark.parametrize( + "left_on,right_on", + [ + *product(["a", "b", "c"], ["a", "b"]), + *zip(combinations(["a", "b", "c"], 2), repeat(["a", "b"])), + ], +) +def test_merge_mixed_index_columns(left_on, right_on): + left = pd.DataFrame({"a": [1, 2, 1, 2], "b": [2, 3, 3, 4]}).set_index("a") + right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index( + "a" + ) + + left["c"] = 10 + + expect = left.merge(right, left_on=left_on, right_on=right_on, how="outer") + cleft = cudf.from_pandas(left) + cright = cudf.from_pandas(right) + got = cleft.merge(cright, left_on=left_on, right_on=right_on, how="outer") + assert_join_results_equal(expect, got, how="outer") + + def test_merge_multiindex_columns(): lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])