Skip to content

Commit

Permalink
fix: Fix null index join with 'on' arg (#1153)
Browse files Browse the repository at this point in the history
  • Loading branch information
TrevorBergeron authored Nov 21, 2024
1 parent 0fae2e0 commit 9015c33
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 8 deletions.
20 changes: 12 additions & 8 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2696,6 +2696,7 @@ def join(
self, other: DataFrame, *, on: Optional[str] = None, how: str = "left"
) -> DataFrame:
left, right = self, other

if not left.columns.intersection(right.columns).empty:
raise NotImplementedError(
f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}"
Expand All @@ -2721,11 +2722,13 @@ def join(
)
# Switch left index with on column
left_columns = left.columns
left_idx_original_names = left.index.names
left_idx_original_names = left.index.names if left._has_index else ()
left_idx_names_in_cols = [
f"bigframes_left_idx_name_{i}" for i in range(len(left.index.names))
f"bigframes_left_idx_name_{i}"
for i in range(len(left_idx_original_names))
]
left.index.names = left_idx_names_in_cols
if left._has_index:
left.index.names = left_idx_names_in_cols
left = left.reset_index(drop=False)
left = left.set_index(on)

Expand All @@ -2736,11 +2739,12 @@ def join(
combined_df = combined_df.set_index(left_idx_names_in_cols)

# To be consistent with Pandas
combined_df.index.names = (
left_idx_original_names
if how in ("inner", "left")
else ([None] * len(combined_df.index.names))
)
if combined_df._has_index:
combined_df.index.names = (
left_idx_original_names
if how in ("inner", "left")
else ([None] * len(combined_df.index.names))
)

# Reorder columns
combined_df = combined_df[list(left_columns) + list(right.columns)]
Expand Down
38 changes: 38 additions & 0 deletions tests/system/small/test_null_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,25 @@ def test_null_index_series_self_join(
)


def test_null_index_series_self_join_on(
scalars_df_null_index, scalars_pandas_df_default_index
):
# caller doesn't need index, but do need index on arg to join with 'on'
bf_result = scalars_df_null_index[["int64_col", "string_col"]].join(
scalars_df_null_index[["int64_too", "bool_col"]].set_index("int64_too"),
on="int64_col",
)
pd_result = scalars_pandas_df_default_index[["int64_col", "string_col"]].join(
scalars_pandas_df_default_index[["int64_too", "bool_col"]].set_index(
"int64_too"
),
on="int64_col",
)
pd.testing.assert_frame_equal(
bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False
)


def test_null_index_series_self_aligns(
scalars_df_null_index, scalars_pandas_df_default_index
):
Expand Down Expand Up @@ -322,6 +341,25 @@ def test_null_index_df_concat(scalars_df_null_index, scalars_pandas_df_default_i
)


def test_null_index_map_dict_input(
scalars_df_null_index, scalars_pandas_df_default_index
):

local_map = dict()
# construct a local map, incomplete to cover <NA> behavior
for s in scalars_pandas_df_default_index.string_col[:-3]:
if isinstance(s, str):
local_map[s] = ord(s[0])

pd_result = scalars_pandas_df_default_index.string_col.map(local_map)
pd_result = pd_result.astype("Int64") # pandas type differences
bf_result = scalars_df_null_index.string_col.map(local_map)

pd.testing.assert_series_equal(
bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False
)


def test_null_index_align_error(scalars_df_null_index):
with pytest.raises(bigframes.exceptions.NullIndexError):
_ = (
Expand Down

0 comments on commit 9015c33

Please sign in to comment.