fix: Fix null index join with 'on' arg (#1153)

googleapis · Nov 21, 2024 · 9015c33 · 9015c33
1 parent 0fae2e0
commit 9015c33
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 8 deletions.
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2696,6 +2696,7 @@ def join(
         self, other: DataFrame, *, on: Optional[str] = None, how: str = "left"
     ) -> DataFrame:
         left, right = self, other
+
         if not left.columns.intersection(right.columns).empty:
             raise NotImplementedError(
                 f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}"
@@ -2721,11 +2722,13 @@ def join(
                 )
             # Switch left index with on column
             left_columns = left.columns
-            left_idx_original_names = left.index.names
+            left_idx_original_names = left.index.names if left._has_index else ()
             left_idx_names_in_cols = [
-                f"bigframes_left_idx_name_{i}" for i in range(len(left.index.names))
+                f"bigframes_left_idx_name_{i}"
+                for i in range(len(left_idx_original_names))
             ]
-            left.index.names = left_idx_names_in_cols
+            if left._has_index:
+                left.index.names = left_idx_names_in_cols
             left = left.reset_index(drop=False)
             left = left.set_index(on)
 
@@ -2736,11 +2739,12 @@ def join(
             combined_df = combined_df.set_index(left_idx_names_in_cols)
 
             # To be consistent with Pandas
-            combined_df.index.names = (
-                left_idx_original_names
-                if how in ("inner", "left")
-                else ([None] * len(combined_df.index.names))
-            )
+            if combined_df._has_index:
+                combined_df.index.names = (
+                    left_idx_original_names
+                    if how in ("inner", "left")
+                    else ([None] * len(combined_df.index.names))
+                )
 
             # Reorder columns
             combined_df = combined_df[list(left_columns) + list(right.columns)]

diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py
@@ -268,6 +268,25 @@ def test_null_index_series_self_join(
     )
 
 
+def test_null_index_series_self_join_on(
+    scalars_df_null_index, scalars_pandas_df_default_index
+):
+    # caller doesn't need index, but do need index on arg to join with 'on'
+    bf_result = scalars_df_null_index[["int64_col", "string_col"]].join(
+        scalars_df_null_index[["int64_too", "bool_col"]].set_index("int64_too"),
+        on="int64_col",
+    )
+    pd_result = scalars_pandas_df_default_index[["int64_col", "string_col"]].join(
+        scalars_pandas_df_default_index[["int64_too", "bool_col"]].set_index(
+            "int64_too"
+        ),
+        on="int64_col",
+    )
+    pd.testing.assert_frame_equal(
+        bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False
+    )
+
+
 def test_null_index_series_self_aligns(
     scalars_df_null_index, scalars_pandas_df_default_index
 ):
@@ -322,6 +341,25 @@ def test_null_index_df_concat(scalars_df_null_index, scalars_pandas_df_default_i
     )
 
 
+def test_null_index_map_dict_input(
+    scalars_df_null_index, scalars_pandas_df_default_index
+):
+
+    local_map = dict()
+    # construct a local map, incomplete to cover <NA> behavior
+    for s in scalars_pandas_df_default_index.string_col[:-3]:
+        if isinstance(s, str):
+            local_map[s] = ord(s[0])
+
+    pd_result = scalars_pandas_df_default_index.string_col.map(local_map)
+    pd_result = pd_result.astype("Int64")  # pandas type differences
+    bf_result = scalars_df_null_index.string_col.map(local_map)
+
+    pd.testing.assert_series_equal(
+        bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False
+    )
+
+
 def test_null_index_align_error(scalars_df_null_index):
     with pytest.raises(bigframes.exceptions.NullIndexError):
         _ = (