From 659a161a53e93f66334cd04d1c3dc1f1f47ecc16 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 4 Apr 2024 09:36:41 -0700 Subject: [PATCH] fix: Use bytes limit on frame inlining rather than element count (#576) --- bigframes/session/__init__.py | 8 ++++---- tests/system/small/test_dataframe.py | 7 +++++++ tests/system/small/test_progress_bar.py | 4 ++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c7605e89d7..671a3d65e7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -116,9 +116,9 @@ "UTF-32LE", } -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_DF_SIZE = 5000 +# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table. +# Also must assume that text encoding as literals is much less efficient than in-memory representation. +MAX_INLINE_DF_BYTES = 5000 logger = logging.getLogger(__name__) @@ -1051,7 +1051,7 @@ def _read_pandas_inline( ) -> Optional[dataframe.DataFrame]: import bigframes.dataframe as dataframe - if pandas_dataframe.size > MAX_INLINE_DF_SIZE: + if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES: return None try: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ae80a088b5..f28de37d68 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -66,6 +66,13 @@ def test_df_construct_pandas_default(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_construct_large_strings(): + data = [["hello", "w" + "o" * 50000 + "rld"]] + bf_result = dataframe.DataFrame(data).to_pandas() + pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow")) + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + def test_df_construct_pandas_load_job(scalars_dfs): # This should trigger the inlined codepath columns = [ diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 1c04b580fc..ea139b9802 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -20,7 +20,7 @@ import bigframes as bf import bigframes.formatting_helpers as formatting_helpers -from bigframes.session import MAX_INLINE_DF_SIZE +from bigframes.session import MAX_INLINE_DF_BYTES job_load_message_regex = r"\w+ job [\w-]+ is \w+\." @@ -70,7 +70,7 @@ def test_progress_bar_load_jobs( ): # repeat the DF to be big enough to trigger the load job. df = penguins_pandas_df_default_index - while len(df) < MAX_INLINE_DF_SIZE: + while len(df) < MAX_INLINE_DF_BYTES: df = pd.DataFrame(np.repeat(df.values, 2, axis=0)) bf.options.display.progress_bar = "terminal"