From 0912b88dbb80fcfea96c190bd124e285b1edcbac Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 11 Nov 2020 12:06:47 -0600 Subject: [PATCH] perf: avoid extra API calls from `to_dataframe` if all rows are cached Follow-up to previous PR, which cached the first page of `getQueryResults`. If the first page is the only page (no `pageToken`), then it is unnecessary to make extra API calls from `to_dataframe` or `to_arrow` to the BigQuery Storage API. --- google/cloud/bigquery/table.py | 56 ++++++++++++++++++++--------- tests/unit/job/test_query_pandas.py | 28 ++++++++++++--- 2 files changed, 64 insertions(+), 20 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index c14a8adc4..1ee36c7ea 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1351,6 +1351,41 @@ def __init__( self._total_rows = total_rows self._first_page_response = first_page_response + def _is_completely_cached(self): + """Check if all results are completely cached. + + This is useful to know, because we can avoid alternative download + mechanisms. + """ + if self._first_page_response is None or self.next_page_token: + return False + + return self._first_page_response.get(self._next_token) is None + + def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): + """Returns if the BigQuery Storage API can be used. + + Returns: + bool + True if the BigQuery Storage client can be used or created. + """ + using_bqstorage_api = bqstorage_client or create_bqstorage_client + if not using_bqstorage_api: + return False + + if self._is_completely_cached(): + return False + + if self.max_results is not None: + warnings.warn( + "Cannot use bqstorage_client if max_results is set, " + "reverting to fetching data with the REST endpoint.", + stacklevel=2, + ) + return False + + return True + def _get_next_page_response(self): """Requests the next page from the path provided. 
@@ -1412,6 +1447,9 @@ def _get_progress_bar(self, progress_bar_type): def _to_page_iterable( self, bqstorage_download, tabledata_list_download, bqstorage_client=None ): + if not self._validate_bqstorage(bqstorage_client, False): + bqstorage_client = None + if bqstorage_client is not None: for item in bqstorage_download(): yield item @@ -1503,14 +1541,7 @@ def to_arrow( if pyarrow is None: raise ValueError(_NO_PYARROW_ERROR) - if ( - bqstorage_client or create_bqstorage_client - ) and self.max_results is not None: - warnings.warn( - "Cannot use bqstorage_client if max_results is set, " - "reverting to fetching data with the REST endpoint.", - stacklevel=2, - ) + if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): create_bqstorage_client = False bqstorage_client = None @@ -1687,14 +1718,7 @@ def to_dataframe( if dtypes is None: dtypes = {} - if ( - bqstorage_client or create_bqstorage_client - ) and self.max_results is not None: - warnings.warn( - "Cannot use bqstorage_client if max_results is set, " - "reverting to fetching data with the REST endpoint.", - stacklevel=2, - ) + if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): create_bqstorage_client = False bqstorage_client = None diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index b0a652b78..a481bff69 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -99,6 +99,7 @@ def test_to_dataframe_bqstorage_preserve_order(query): ] }, "totalRows": "4", + "pageToken": "next-page", } connection = _make_connection(get_query_results_resource, job_resource) client = _make_client(connection=connection) @@ -133,7 +134,16 @@ def test_to_dataframe_bqstorage_preserve_order(query): @pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") -def test_to_arrow(): +@pytest.mark.parametrize( + "method_kwargs", + [ + {"create_bqstorage_client": False}, + # Since all rows are contained in the first page of results, the BigQuery + # Storage API won't actually be used. + {"create_bqstorage_client": True}, + ], +) +def test_to_arrow(method_kwargs): from google.cloud.bigquery.job import QueryJob as target_class begun_resource = _make_job_resource(job_type="query") @@ -182,7 +192,7 @@ def test_to_arrow(): client = _make_client(connection=connection) job = target_class.from_api_repr(begun_resource, client) - tbl = job.to_arrow(create_bqstorage_client=False) + tbl = job.to_arrow(**method_kwargs) assert isinstance(tbl, pyarrow.Table) assert tbl.num_rows == 2 @@ -216,7 +226,16 @@ def test_to_arrow(): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -def test_to_dataframe(): +@pytest.mark.parametrize( + "method_kwargs", + [ + {"create_bqstorage_client": False}, + # Since all rows are contained in the first page of results, the BigQuery + # Storage API won't actually be used. 
+ {"create_bqstorage_client": True}, + ], +) +def test_to_dataframe(method_kwargs): from google.cloud.bigquery.job import QueryJob as target_class begun_resource = _make_job_resource(job_type="query") @@ -243,7 +262,7 @@ def test_to_dataframe(): client = _make_client(connection=connection) job = target_class.from_api_repr(begun_resource, client) - df = job.to_dataframe(create_bqstorage_client=False) + df = job.to_dataframe(**method_kwargs) assert isinstance(df, pandas.DataFrame) assert len(df) == 4 # verify the number of rows @@ -288,6 +307,7 @@ def test_to_dataframe_bqstorage(): {"name": "age", "type": "INTEGER", "mode": "NULLABLE"}, ] }, + "pageToken": "next-page", } connection = _make_connection(query_resource) client = _make_client(connection=connection)