From 0b344023fe71384e49af9893e363a086f19e5258 Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Thu, 7 Mar 2024 08:20:17 +0000
Subject: [PATCH 1/3] chore: materialize result of `remote_function` early
 (#408)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes internal issue 327662690 🦕
---
 bigframes/series.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/bigframes/series.py b/bigframes/series.py
index 4aef959a76..dfa6fa4b0d 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1253,11 +1253,17 @@ def apply(
                     ex.message += f"\n{_remote_function_recommendation_message}"
                 raise
 
+        # We are working with a remote function at this point
         reprojected_series = Series(self._block._force_reproject())
-        return reprojected_series._apply_unary_op(
+        result_series = reprojected_series._apply_unary_op(
             ops.RemoteFunctionOp(func=func, apply_on_null=True)
         )
 
+        # return Series with materialized result so that any error in the remote
+        # function is caught early
+        materialized_series = result_series._cached()
+        return materialized_series
+
     def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series:
         return Series(self._get_block().add_prefix(prefix))
 

From 76b252f907055d72556e3e95f6cb5ee41de5b1c2 Mon Sep 17 00:00:00 2001
From: Duc Le Tu <tuduc93@gmail.com>
Date: Fri, 8 Mar 2024 00:36:16 +0700
Subject: [PATCH 2/3] fix: only do row identity based joins when joining by
 index (#356)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [x] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [x] Ensure the tests and linter pass
- [x] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)

Fixes #355 🦕
---
 bigframes/core/__init__.py                    |  2 +-
 bigframes/core/compile/single_column.py       |  4 +-
 bigframes/core/nodes.py                       |  2 +-
 tests/system/conftest.py                      |  7 ++
 .../test_issue355_merge_after_filter.py       | 70 +++++++++++++++++++
 5 files changed, 81 insertions(+), 4 deletions(-)
 create mode 100644 tests/system/small/regression/test_issue355_merge_after_filter.py

diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
index 4dc2e4d7af..9032993452 100644
--- a/bigframes/core/__init__.py
+++ b/bigframes/core/__init__.py
@@ -349,7 +349,7 @@ def join(
         self,
         other: ArrayValue,
         join_def: join_def.JoinDefinition,
-        allow_row_identity_join: bool = True,
+        allow_row_identity_join: bool = False,
     ):
         return ArrayValue(
             nodes.JoinNode(
diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py
index d26e71d1b4..7beebfcb66 100644
--- a/bigframes/core/compile/single_column.py
+++ b/bigframes/core/compile/single_column.py
@@ -33,7 +33,7 @@ def join_by_column_ordered(
     left: compiled.OrderedIR,
     right: compiled.OrderedIR,
     join: join_defs.JoinDefinition,
-    allow_row_identity_join: bool = True,
+    allow_row_identity_join: bool = False,
 ) -> compiled.OrderedIR:
     """Join two expressions by column equality.
 
@@ -134,7 +134,7 @@ def join_by_column_unordered(
     left: compiled.UnorderedIR,
     right: compiled.UnorderedIR,
     join: join_defs.JoinDefinition,
-    allow_row_identity_join: bool = True,
+    allow_row_identity_join: bool = False,
 ) -> compiled.UnorderedIR:
     """Join two expressions by column equality.
 
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index f637177a94..1cd3277cbc 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -115,7 +115,7 @@ class JoinNode(BigFrameNode):
     left_child: BigFrameNode
     right_child: BigFrameNode
     join: JoinDefinition
-    allow_row_identity_join: bool = True
+    allow_row_identity_join: bool = False
 
     @property
     def row_preserving(self) -> bool:
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index 7ca1882fe0..4b5ebc9d43 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -285,6 +285,13 @@ def scalars_table_id(test_data_tables) -> str:
     return test_data_tables["scalars"]
 
 
+@pytest.fixture(scope="session")
+def baseball_schedules_df(session: bigframes.Session) -> bigframes.dataframe.DataFrame:
+    """Read the public ``bigquery-public-data.baseball.schedules`` BigQuery table."""
+    df = session.read_gbq("bigquery-public-data.baseball.schedules")
+    return df
+
+
 @pytest.fixture(scope="session")
 def hockey_table_id(test_data_tables) -> str:
     return test_data_tables["hockey_players"]
diff --git a/tests/system/small/regression/test_issue355_merge_after_filter.py b/tests/system/small/regression/test_issue355_merge_after_filter.py
new file mode 100644
index 0000000000..24ee01cb7f
--- /dev/null
+++ b/tests/system/small/regression/test_issue355_merge_after_filter.py
@@ -0,0 +1,70 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import pytest
+
+from tests.system.utils import assert_pandas_df_equal
+
+
+@pytest.mark.parametrize(
+    ("merge_how",),
+    [
+        ("inner",),
+        ("outer",),
+        ("left",),
+        ("right",),
+    ],
+)
+def test_merge_after_filter(baseball_schedules_df, merge_how):
+    """Regression test for issue #355: merge two filtered views of one table."""
+    on = ["awayTeamName"]
+    left_columns = [
+        "gameId",
+        "year",
+        "homeTeamName",
+        "awayTeamName",
+        "duration_minutes",
+    ]
+    right_columns = [
+        "gameId",
+        "year",
+        "homeTeamName",
+        "awayTeamName",
+        "duration_minutes",
+    ]
+
+    # Filter each side on a different home team so outer joins can have an effect.
+    left = baseball_schedules_df[left_columns]
+    left = left[left["homeTeamName"] == "Rays"]
+    right = baseball_schedules_df[right_columns]
+    right = right[right["homeTeamName"] == "White Sox"]
+
+    bf_result = left.merge(right, on=on, how=merge_how).to_pandas()
+
+    schedules_pandas = baseball_schedules_df.to_pandas()  # download once, reuse
+    left_pandas = schedules_pandas[left_columns]
+    left_pandas = left_pandas[left_pandas["homeTeamName"] == "Rays"]
+    right_pandas = schedules_pandas[right_columns]
+    right_pandas = right_pandas[right_pandas["homeTeamName"] == "White Sox"]
+
+    pd_result = pd.merge(
+        left_pandas,
+        right_pandas,
+        how=merge_how,
+        on=on,
+        sort=True,
+    )
+
+    assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)

From 38bd2ba21bc1a3222635de22eecd97930bf5b1de Mon Sep 17 00:00:00 2001
From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com>
Date: Thu, 7 Mar 2024 10:38:40 -0800
Subject: [PATCH 3/3] docs: fix the note rendering for DataFrames methods:
 nlargest, nsmallest (#417)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes internal issue #328445384 🦕
---
 .../bigframes_vendored/pandas/core/frame.py   | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 84d2aa7fcb..f88649ca13 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -3935,6 +3935,11 @@ def nlargest(self, n: int, columns, keep: str = "first"):
         ``df.sort_values(columns, ascending=False).head(n)``, but more
         performant.
 
+        .. note::
+            This function cannot be used with all column types. For example, when
+            specifying columns with `object` or `category` dtypes, ``TypeError`` is
+            raised.
+
         **Examples:**
 
             >>> import bigframes.pandas as bpd
@@ -4002,11 +4007,6 @@ def nlargest(self, n: int, columns, keep: str = "first"):
 
         Returns:
             DataFrame: The first `n` rows ordered by the given columns in descending order.
-
-        .. note::
-            This function cannot be used with all column types. For example, when
-            specifying columns with `object` or `category` dtypes, ``TypeError`` is
-            raised.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
@@ -4022,6 +4022,12 @@ def nsmallest(self, n: int, columns, keep: str = "first"):
         ``df.sort_values(columns, ascending=True).head(n)``, but more
         performant.
 
+        .. note::
+
+            This function cannot be used with all column types. For example, when
+            specifying columns with `object` or `category` dtypes, ``TypeError`` is
+            raised.
+
         **Examples:**
 
             >>> import bigframes.pandas as bpd
@@ -4090,11 +4096,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"):
 
         Returns:
             DataFrame: The first `n` rows ordered by the given columns in ascending order.
-
-        .. note::
-            This function cannot be used with all column types. For example, when
-            specifying columns with `object` or `category` dtypes, ``TypeError`` is
-            raised.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)