Skip to content

Commit

Permalink
fix: Use bytes limit on frame inlining rather than element count (#576)
Browse files Browse the repository at this point in the history
  • Loading branch information
TrevorBergeron authored Apr 4, 2024
1 parent 6d8f3af commit 659a161
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
8 changes: 4 additions & 4 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@
"UTF-32LE",
}

# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
# TODO(tbergeron): Convert to bytes-based limit
MAX_INLINE_DF_SIZE = 5000
# BigQuery has a 1 MB query size limit; inlining a table should not take up more than a few percent of that.
# We must also assume that encoding the data as text literals is much less efficient than its in-memory representation.
MAX_INLINE_DF_BYTES = 5000

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -1051,7 +1051,7 @@ def _read_pandas_inline(
) -> Optional[dataframe.DataFrame]:
import bigframes.dataframe as dataframe

if pandas_dataframe.size > MAX_INLINE_DF_SIZE:
if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
return None

try:
Expand Down
7 changes: 7 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ def test_df_construct_pandas_default(scalars_dfs):
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_df_construct_large_strings():
    """Round-trip a one-row frame holding a very long string cell.

    The second cell is a single string of more than 50,000 characters.
    NOTE(review): presumably large enough to exceed the session's inline
    size limit so that the non-inlined construction path is exercised --
    confirm against the session module.
    """
    long_word = "w" + "o" * 50000 + "rld"
    rows = [["hello", long_word]]

    # Expected frame uses pyarrow-backed strings for comparison with the
    # result of ``to_pandas()``.
    expected = pd.DataFrame(rows, dtype=pd.StringDtype(storage="pyarrow"))
    actual = dataframe.DataFrame(rows).to_pandas()

    # Index dtype is deliberately not compared.
    pandas.testing.assert_frame_equal(actual, expected, check_index_type=False)


def test_df_construct_pandas_load_job(scalars_dfs):
# This should trigger the inlined codepath
columns = [
Expand Down
4 changes: 2 additions & 2 deletions tests/system/small/test_progress_bar.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import bigframes as bf
import bigframes.formatting_helpers as formatting_helpers
from bigframes.session import MAX_INLINE_DF_SIZE
from bigframes.session import MAX_INLINE_DF_BYTES

job_load_message_regex = r"\w+ job [\w-]+ is \w+\."

Expand Down Expand Up @@ -70,7 +70,7 @@ def test_progress_bar_load_jobs(
):
# repeat the DF to be big enough to trigger the load job.
df = penguins_pandas_df_default_index
while len(df) < MAX_INLINE_DF_SIZE:
while len(df) < MAX_INLINE_DF_BYTES:
df = pd.DataFrame(np.repeat(df.values, 2, axis=0))

bf.options.display.progress_bar = "terminal"
Expand Down

0 comments on commit 659a161

Please sign in to comment.