From 0b344023fe71384e49af9893e363a086f19e5258 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 7 Mar 2024 08:20:17 +0000 Subject: [PATCH 1/3] chore: materialize result of `remote_function` early (#408) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 327662690 🦕 --- bigframes/series.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index 4aef959a76..dfa6fa4b0d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1253,11 +1253,17 @@ def apply( ex.message += f"\n{_remote_function_recommendation_message}" raise + # We are working with remote function at this point reprojected_series = Series(self._block._force_reproject()) - return reprojected_series._apply_unary_op( + result_series = reprojected_series._apply_unary_op( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) + # return Series with materialized result so that any error in the remote + # function is caught early + materialized_series = result_series._cached() + return materialized_series + def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) From 76b252f907055d72556e3e95f6cb5ee41de5b1c2 Mon Sep 17 00:00:00 2001 From: Duc Le Tu Date: Fri, 8 Mar 2024 00:36:16 +0700 Subject: [PATCH 2/3] fix: only do row identity based joins when joining by index (#356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes #355 🦕 --- bigframes/core/__init__.py | 2 +- bigframes/core/compile/single_column.py | 4 +- bigframes/core/nodes.py | 2 +- tests/system/conftest.py | 7 ++ .../test_issue355_merge_after_filter.py | 70 +++++++++++++++++++ 5 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 tests/system/small/regression/test_issue355_merge_after_filter.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4dc2e4d7af..9032993452 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -349,7 +349,7 @@ def join( self, other: ArrayValue, join_def: join_def.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ): return ArrayValue( nodes.JoinNode( diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index d26e71d1b4..7beebfcb66 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -33,7 +33,7 @@ def join_by_column_ordered( left: compiled.OrderedIR, right: compiled.OrderedIR, join: join_defs.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ) -> compiled.OrderedIR: """Join two expressions by column equality. @@ -134,7 +134,7 @@ def join_by_column_unordered( left: compiled.UnorderedIR, right: compiled.UnorderedIR, join: join_defs.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ) -> compiled.UnorderedIR: """Join two expressions by column equality. diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index f637177a94..1cd3277cbc 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -115,7 +115,7 @@ class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode join: JoinDefinition - allow_row_identity_join: bool = True + allow_row_identity_join: bool = False @property def row_preserving(self) -> bool: diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 7ca1882fe0..4b5ebc9d43 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -285,6 +285,13 @@ def scalars_table_id(test_data_tables) -> str: return test_data_tables["scalars"] +@pytest.fixture(scope="session") +def baseball_schedules_df(session: bigframes.Session) -> bigframes.dataframe.DataFrame: + """Public BQ table""" + df = session.read_gbq("bigquery-public-data.baseball.schedules") + return df + + @pytest.fixture(scope="session") def hockey_table_id(test_data_tables) -> str: return test_data_tables["hockey_players"] diff --git a/tests/system/small/regression/test_issue355_merge_after_filter.py b/tests/system/small/regression/test_issue355_merge_after_filter.py new file mode 100644 index 0000000000..24ee01cb7f --- /dev/null +++ b/tests/system/small/regression/test_issue355_merge_after_filter.py @@ -0,0 +1,70 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from tests.system.utils import assert_pandas_df_equal + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_after_filter(baseball_schedules_df, merge_how): + on = ["awayTeamName"] + left_columns = [ + "gameId", + "year", + "homeTeamName", + "awayTeamName", + "duration_minutes", + ] + right_columns = [ + "gameId", + "year", + "homeTeamName", + "awayTeamName", + "duration_minutes", + ] + + left = baseball_schedules_df[left_columns] + left = left[left["homeTeamName"] == "Rays"] + # Offset the rows somewhat so that outer join can have an effect. + right = baseball_schedules_df[right_columns] + right = right[right["homeTeamName"] == "White Sox"] + + df = left.merge(right, on=on, how=merge_how) + bf_result = df.to_pandas() + + left_pandas = baseball_schedules_df.to_pandas()[left_columns] + left_pandas = left_pandas[left_pandas["homeTeamName"] == "Rays"] + + right_pandas = baseball_schedules_df.to_pandas()[right_columns] + right_pandas = right_pandas[right_pandas["homeTeamName"] == "White Sox"] + + pd_result = pd.merge( + left_pandas, + right_pandas, + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) From 38bd2ba21bc1a3222635de22eecd97930bf5b1de Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:38:40 -0800 Subject: [PATCH 3/3] docs: fix the note rendering for DataFrames methods: nlargest, nsmallest (#417) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue #328445384 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 84d2aa7fcb..f88649ca13 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3935,6 +3935,11 @@ def nlargest(self, n: int, columns, keep: str = "first"): ``df.sort_values(columns, ascending=False).head(n)``, but more performant. + .. note:: + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + **Examples:** >>> import bigframes.pandas as bpd @@ -4002,11 +4007,6 @@ def nlargest(self, n: int, columns, keep: str = "first"): Returns: DataFrame: The first `n` rows ordered by the given columns in descending order. - - .. note:: - This function cannot be used with all column types. For example, when - specifying columns with `object` or `category` dtypes, ``TypeError`` is - raised. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4022,6 +4022,12 @@ def nsmallest(self, n: int, columns, keep: str = "first"): ``df.sort_values(columns, ascending=True).head(n)``, but more performant. + .. note:: + + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + **Examples:** >>> import bigframes.pandas as bpd @@ -4090,11 +4096,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): Returns: DataFrame: The first `n` rows ordered by the given columns in ascending order. - - .. note:: - This function cannot be used with all column types. For example, when - specifying columns with `object` or `category` dtypes, ``TypeError`` is - raised. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)