Skip to content
/ cudf Public
forked from rapidsai/cudf

Commit

Permalink
Only exclude join keys that are indices from key columns
Browse files Browse the repository at this point in the history
Previously, if any of the join keys were indices, we assumed that they
all were, and provided an empty set of key columns with matching names
in the left and right dataframes. This does the wrong thing for mixed
join keys (on a combination of index and normal columns), producing
more output columns than is correct. To avoid this, only skip matching
key names if they name indices.

Closes rapidsai#11550.
  • Loading branch information
wence- committed Nov 30, 2022
1 parent d970c5a commit 16a292c
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/join/join.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,14 @@ def __init__(
self._key_columns_with_same_name = (
set(_coerce_to_tuple(on))
if on
else set()
if (self._using_left_index or self._using_right_index)
else {
lkey.name
for lkey, rkey in zip(self._left_keys, self._right_keys)
if lkey.name == rkey.name
and not (
isinstance(lkey, _IndexIndexer)
or isinstance(rkey, _IndexIndexer)
)
}
)

Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

from itertools import combinations, product, repeat

import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -2106,6 +2108,28 @@ def test_string_join_values_nulls():
assert_join_results_equal(expect, got, how="left")


@pytest.mark.parametrize(
    "left_on,right_on",
    [
        # Single join key on each side: every left name against every
        # right name.
        *product(["a", "b", "c"], ["a", "b"]),
        # Two join keys on the left, always paired with both right names.
        *zip(combinations(["a", "b", "c"], 2), repeat(["a", "b"])),
    ],
)
def test_merge_mixed_index_columns(left_on, right_on):
    """Outer-merge two frames on a mix of index and column keys.

    Both frames are indexed by "a"; the left frame has columns "b" and
    "c", the right frame has column "b". The cudf result must match the
    pandas result for every parametrized key combination.
    """
    pdf_left = pd.DataFrame(
        {"a": [1, 2, 1, 2], "b": [2, 3, 3, 4]}
    ).set_index("a")
    pdf_left["c"] = 10
    pdf_right = pd.DataFrame(
        {"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}
    ).set_index("a")

    # The same merge arguments are used for the pandas and cudf calls.
    merge_kwargs = {"left_on": left_on, "right_on": right_on, "how": "outer"}

    expect = pdf_left.merge(pdf_right, **merge_kwargs)
    gdf_left = cudf.from_pandas(pdf_left)
    gdf_right = cudf.from_pandas(pdf_right)
    got = gdf_left.merge(gdf_right, **merge_kwargs)

    assert_join_results_equal(expect, got, how="outer")


def test_merge_multiindex_columns():
lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
Expand Down

0 comments on commit 16a292c

Please sign in to comment.