From 9015c33e73675ebb2299487dce3295732ea0527e Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 21 Nov 2024 14:02:40 -0800 Subject: [PATCH] fix: Fix null index join with 'on' arg (#1153) --- bigframes/dataframe.py | 20 ++++++++------ tests/system/small/test_null_index.py | 38 +++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0b639a5649..185a756fed 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2696,6 +2696,7 @@ def join( self, other: DataFrame, *, on: Optional[str] = None, how: str = "left" ) -> DataFrame: left, right = self, other + if not left.columns.intersection(right.columns).empty: raise NotImplementedError( f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}" @@ -2721,11 +2722,13 @@ def join( ) # Switch left index with on column left_columns = left.columns - left_idx_original_names = left.index.names + left_idx_original_names = left.index.names if left._has_index else () left_idx_names_in_cols = [ - f"bigframes_left_idx_name_{i}" for i in range(len(left.index.names)) + f"bigframes_left_idx_name_{i}" + for i in range(len(left_idx_original_names)) ] - left.index.names = left_idx_names_in_cols + if left._has_index: + left.index.names = left_idx_names_in_cols left = left.reset_index(drop=False) left = left.set_index(on) @@ -2736,11 +2739,12 @@ def join( combined_df = combined_df.set_index(left_idx_names_in_cols) # To be consistent with Pandas - combined_df.index.names = ( - left_idx_original_names - if how in ("inner", "left") - else ([None] * len(combined_df.index.names)) - ) + if combined_df._has_index: + combined_df.index.names = ( + left_idx_original_names + if how in ("inner", "left") + else ([None] * len(combined_df.index.names)) + ) # Reorder columns combined_df = combined_df[list(left_columns) + list(right.columns)] diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index c5be49a56b..da9baa0069 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -268,6 +268,25 @@ def test_null_index_series_self_join( ) +def test_null_index_series_self_join_on( + scalars_df_null_index, scalars_pandas_df_default_index +): + # caller doesn't need index, but do need index on arg to join with 'on' + bf_result = scalars_df_null_index[["int64_col", "string_col"]].join( + scalars_df_null_index[["int64_too", "bool_col"]].set_index("int64_too"), + on="int64_col", + ) + pd_result = scalars_pandas_df_default_index[["int64_col", "string_col"]].join( + scalars_pandas_df_default_index[["int64_too", "bool_col"]].set_index( + "int64_too" + ), + on="int64_col", + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_null_index_series_self_aligns( scalars_df_null_index, scalars_pandas_df_default_index ): @@ -322,6 +341,25 @@ def test_null_index_df_concat(scalars_df_null_index, scalars_pandas_df_default_i ) +def test_null_index_map_dict_input( + scalars_df_null_index, scalars_pandas_df_default_index +): + + local_map = dict() + # construct a local map, incomplete to cover behavior + for s in scalars_pandas_df_default_index.string_col[:-3]: + if isinstance(s, str): + local_map[s] = ord(s[0]) + + pd_result = scalars_pandas_df_default_index.string_col.map(local_map) + pd_result = pd_result.astype("Int64") # pandas type differences + bf_result = scalars_df_null_index.string_col.map(local_map) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_null_index_align_error(scalars_df_null_index): with pytest.raises(bigframes.exceptions.NullIndexError): _ = (