Commit

Merge remote-tracking branch 'github/main' into garrettwu-transformer2

GarrettWu committed Apr 1, 2024
2 parents 0f77788 + 9ac4ed8 commit 6ad77d5
Showing 15 changed files with 913 additions and 619 deletions.
1 change: 1 addition & 0 deletions .kokoro/release-nightly.sh
@@ -106,6 +106,7 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \
 # write access to
 COVERAGE_TABLE=bigframes-metrics.coverage_report.bigframes_coverage_nightly
 python3.10 scripts/publish_api_coverage.py \
+  bigquery \
   --bigframes_version=$BIGFRAMES_VERSION \
   --release_version=$RELEASE_VERSION \
   --bigquery_table=$COVERAGE_TABLE
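The one-line change above inserts a `bigquery` positional argument ahead of the existing flags, which suggests publish_api_coverage.py now dispatches on a subcommand. As a minimal sketch of that CLI shape (the parser below is hypothetical and not the script's actual contents):

import argparse


def main() -> None:
    parser = argparse.ArgumentParser(description="Publish API coverage metrics.")
    # Hypothetical subcommand dispatch; the real script's parser may differ.
    subparsers = parser.add_subparsers(dest="command", required=True)

    # "bigquery" target: upload the coverage report to a BigQuery table.
    bq = subparsers.add_parser("bigquery")
    bq.add_argument("--bigframes_version")
    bq.add_argument("--release_version")
    bq.add_argument("--bigquery_table")

    args = parser.parse_args()
    if args.command == "bigquery":
        print(f"would publish coverage to {args.bigquery_table}")


if __name__ == "__main__":
    main()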
42 changes: 13 additions & 29 deletions bigframes/core/blocks.py
@@ -467,23 +467,6 @@ def to_pandas_batches(self):
             self._copy_index_to_pandas(df)
             yield df
 
-    def download_pandas_preview(
-        self, max_rows: int
-    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
-        """Download one page of results and return the query job."""
-        dtypes = dict(zip(self.index_columns, self.index.dtypes))
-        dtypes.update(zip(self.value_columns, self.dtypes))
-        results_iterator, query_job = self.session._execute(
-            self.expr, sorted=True, max_results=max_rows
-        )
-        arrow_results_iterator = results_iterator.to_arrow_iterable()
-        arrow_table = next(arrow_results_iterator)
-        downloaded_df = bigframes.session._io.pandas.arrow_to_pandas(
-            arrow_table, dtypes
-        )
-        self._copy_index_to_pandas(downloaded_df)
-        return downloaded_df, query_job
-
     def _copy_index_to_pandas(self, df: pd.DataFrame):
         """Set the index on pandas DataFrame to match this block.

@@ -1314,25 +1297,26 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
     # queries.
     @functools.cache
     def retrieve_repr_request_results(
-        self, max_results: int, max_columns: int
-    ) -> Tuple[pd.DataFrame, Tuple[int, int], bigquery.QueryJob]:
+        self, max_results: int
+    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
         """
         Retrieves a pandas dataframe containing only max_results many rows for use
         with printing methods.
 
-        Returns a tuple of the dataframe preview for printing and the overall number
-        of rows and columns of the table, as well as the query job used.
+        Returns a tuple of the dataframe and the overall number of rows of the query.
         """
-        pandas_df, query_job = self.download_pandas_preview(max_results)
-        row_count = self.session._get_table_row_count(query_job.destination)
-        column_count = len(self.value_columns)
-
-        formatted_df = pandas_df.set_axis(self.column_labels, axis=1)
+        # TODO(swast): Select a subset of columns if max_columns is less than the
+        # number of columns in the schema.
+        count = self.shape[0]
+        if count > max_results:
+            head_block = self.slice(0, max_results)
+        else:
+            head_block = self
+        computed_df, query_job = head_block.to_pandas()
+        formatted_df = computed_df.set_axis(self.column_labels, axis=1)
         # we reset the axis and substitute the bf index name for the default
         formatted_df.index.name = self.index.name
-        # limit column count
-        formatted_df = formatted_df.iloc[:, 0:max_columns]
-        return formatted_df, (row_count, column_count), query_job
+        return formatted_df, count, query_job
 
     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
         result_id = guid.generate_guid()
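The new retrieve_repr_request_results computes the total row count, slices the block to at most max_results rows, and only then downloads. A self-contained sketch of that head-then-relabel pattern using plain pandas (the DataFrame stands in for the result of Block.to_pandas; names are illustrative):

import pandas as pd


def preview(df: pd.DataFrame, labels: list, max_results: int):
    # Analogous to count = self.shape[0] followed by Block.slice(0, max_results).
    count = len(df)
    head = df.iloc[:max_results] if count > max_results else df
    # Analogous to set_axis(self.column_labels, axis=1) for display labels.
    formatted = head.set_axis(labels, axis=1)
    return formatted, count


frame = pd.DataFrame({"a": range(100), "b": range(100)})
preview_df, total_rows = preview(frame, ["col_a", "col_b"], max_results=25)
print(len(preview_df), total_rows)  # 25 100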
10 changes: 5 additions & 5 deletions bigframes/core/indexes/index.py
@@ -205,17 +205,17 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
         return self._query_job
 
     def __repr__(self) -> str:
         # TODO(swast): Add a timeout here? If the query is taking a long time,
         # maybe we just print the job metadata that we have so far?
+        # TODO(swast): Avoid downloading the whole series by using job
+        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
         self._query_job = query_job
 
         return repr(pandas_df.index)
 
     def copy(self, name: Optional[Hashable] = None):
66 changes: 40 additions & 26 deletions bigframes/dataframe.py
@@ -579,57 +579,71 @@ def __setattr__(self, key: str, value):
         object.__setattr__(self, key, value)
 
     def __repr__(self) -> str:
-        """Converts a DataFrame to a string using pandas dataframe __repr__.
+        """Converts a DataFrame to a string. Calls to_pandas.
 
-        Only represents the first `bigframes.options.display.max_rows`
-        and `bigframes.options.display.max_columns`.
+        Only represents the first `bigframes.options.display.max_rows`.
         """
-        if bigframes.options.display.repr_mode == "deferred":
+        opts = bigframes.options.display
+        max_results = opts.max_rows
+        if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
-        pandas_df, shape = self._perform_repr_request()
-        with display_options.pandas_repr(bigframes.options.display):
+        self._cached()
+        # TODO(swast): pass max_columns and get the true column count back. Maybe
+        # get 1 more column than we have requested so that pandas can add the
+        # ... for us?
+        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
+            max_results
+        )
+
+        self._set_internal_query_job(query_job)
+
+        column_count = len(pandas_df.columns)
+
+        with display_options.pandas_repr(opts):
             repr_string = repr(pandas_df)
 
         # Modify the end of the string to reflect count.
         lines = repr_string.split("\n")
         pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
         if pattern.match(lines[-1]):
             lines = lines[:-2]
-        if shape[0] > len(lines) - 1:
+
+        if row_count > len(lines) - 1:
             lines.append("...")
 
         lines.append("")
-        lines.append(f"[{shape[0]} rows x {shape[1]} columns]")
+        lines.append(f"[{row_count} rows x {column_count} columns]")
         return "\n".join(lines)
 
-    def _perform_repr_request(self) -> Tuple[pandas.DataFrame, Tuple[int, int]]:
-        max_results = bigframes.options.display.max_rows
-        max_columns = bigframes.options.display.max_columns
-        self._cached()
-        pandas_df, shape, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
-        self._set_internal_query_job(query_job)
-        return pandas_df, shape
-
     def _repr_html_(self) -> str:
         """
         Returns an html string primarily for use by notebooks for displaying
-        a representation of the DataFrame. Displays at most the number of rows
-        and columns given by `bigframes.options.display.max_rows` and
-        `bigframes.options.display.max_columns`.
+        a representation of the DataFrame. Displays 20 rows by default since
+        many notebooks are not configured for large tables.
         """
-        if bigframes.options.display.repr_mode == "deferred":
+        opts = bigframes.options.display
+        max_results = bigframes.options.display.max_rows
+        if opts.repr_mode == "deferred":
             return formatter.repr_query_job_html(self.query_job)
 
-        pandas_df, shape = self._perform_repr_request()
+        self._cached()
+        # TODO(swast): pass max_columns and get the true column count back. Maybe
+        # get 1 more column than we have requested so that pandas can add the
+        # ... for us?
+        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
+            max_results
+        )
+
+        self._set_internal_query_job(query_job)
+
+        column_count = len(pandas_df.columns)
 
-        with display_options.pandas_repr(bigframes.options.display):
+        with display_options.pandas_repr(opts):
             # _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy.
             html_string = pandas_df._repr_html_()  # type:ignore
 
-        html_string += f"[{shape[0]} rows x {shape[1]} columns in total]"
+        html_string += f"[{row_count} rows x {column_count} columns in total]"
        return html_string
 
     def __setitem__(self, key: str, value: SingleItemValue):
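The footer rewrite in __repr__ is self-contained enough to run on its own: pandas appends its own "[N rows x M columns]" trailer based on the downloaded preview, so the code strips it and substitutes the true totals. A standalone sketch with made-up totals:

import re

import pandas as pd

row_count, column_count = 1_000_000, 2  # pretend totals for the full query
pandas_df = pd.DataFrame({"a": range(100), "b": range(100)})  # downloaded preview

lines = repr(pandas_df).split("\n")
pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
if pattern.match(lines[-1]):
    lines = lines[:-2]  # drop pandas' trailer and the blank line before it

if row_count > len(lines) - 1:  # more rows exist than were printed
    lines.append("...")

lines.append("")
lines.append(f"[{row_count} rows x {column_count} columns]")
print("\n".join(lines))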
20 changes: 10 additions & 10 deletions bigframes/ml/metrics/_metrics.py
@@ -19,8 +19,8 @@
 import typing
 from typing import Tuple, Union
 
-import bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification
-import bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking
+import bigframes_vendored.sklearn.metrics._classification as vendored_metrics_classification
+import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking
 import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression
 import numpy as np
 import pandas as pd
@@ -79,7 +79,7 @@ def accuracy_score(
     return score.sum()
 
 
-accuracy_score.__doc__ = inspect.getdoc(vendored_mertics_classification.accuracy_score)
+accuracy_score.__doc__ = inspect.getdoc(vendored_metrics_classification.accuracy_score)
 
 
 def roc_curve(
@@ -149,7 +149,7 @@ def roc_curve(
     )
 
 
-roc_curve.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_curve)
+roc_curve.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_curve)
 
 
 def roc_auc_score(
@@ -171,7 +171,7 @@ def roc_auc_score(
     return (width_diff * height_avg).sum()
 
 
-roc_auc_score.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_auc_score)
+roc_auc_score.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_auc_score)
 
 
 def auc(
@@ -185,7 +185,7 @@ def auc(
     return auc
 
 
-auc.__doc__ = inspect.getdoc(vendored_mertics_ranking.auc)
+auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc)
 
 
 def confusion_matrix(
@@ -223,7 +223,7 @@ def confusion_matrix(
 
 
 confusion_matrix.__doc__ = inspect.getdoc(
-    vendored_mertics_classification.confusion_matrix
+    vendored_metrics_classification.confusion_matrix
 )
 
 
@@ -261,7 +261,7 @@ def recall_score(
     return recall_score
 
 
-recall_score.__doc__ = inspect.getdoc(vendored_mertics_classification.recall_score)
+recall_score.__doc__ = inspect.getdoc(vendored_metrics_classification.recall_score)
 
 
 def precision_score(
@@ -299,7 +299,7 @@ def precision_score(
 
 
 precision_score.__doc__ = inspect.getdoc(
-    vendored_mertics_classification.precision_score
+    vendored_metrics_classification.precision_score
 )
 
 
@@ -334,4 +334,4 @@ def f1_score(
     return f1_score
 
 
-f1_score.__doc__ = inspect.getdoc(vendored_mertics_classification.f1_score)
+f1_score.__doc__ = inspect.getdoc(vendored_metrics_classification.f1_score)
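Every change in this file is the same alias rename ("mertics" to "metrics"); the aliases exist only to borrow docstrings from the vendored sklearn modules. A toy sketch of that pattern, with a stand-in namespace in place of bigframes_vendored:

import inspect
import types


def _vendored_accuracy_score(y_true, y_pred):
    """Accuracy classification score."""


# Stand-in for bigframes_vendored.sklearn.metrics._classification.
vendored_metrics_classification = types.SimpleNamespace(
    accuracy_score=_vendored_accuracy_score
)


def accuracy_score(y_true, y_pred):
    # Local implementation; the documentation comes from the vendored reference.
    matches = sum(t == p for t, p in zip(y_true, y_pred))
    return matches / len(y_true)


# inspect.getdoc also normalizes indentation, unlike reading __doc__ directly.
accuracy_score.__doc__ = inspect.getdoc(vendored_metrics_classification.accuracy_score)

print(accuracy_score([1, 0, 1], [1, 1, 1]))  # 0.666...
print(accuracy_score.__doc__)  # Accuracy classification score.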
9 changes: 5 additions & 4 deletions bigframes/series.py
@@ -272,16 +272,17 @@ def reset_index(
         return bigframes.dataframe.DataFrame(block)
 
     def __repr__(self) -> str:
         # TODO(swast): Add a timeout here? If the query is taking a long time,
         # maybe we just print the job metadata that we have so far?
+        # TODO(swast): Avoid downloading the whole series by using job
+        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
         self._cached()
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
         self._set_internal_query_job(query_job)
 
         return repr(pandas_df.iloc[:, 0])
8 changes: 2 additions & 6 deletions bigframes/session/__init__.py
@@ -1832,7 +1832,6 @@ def _execute(
         sorted: bool = True,
         dry_run=False,
         col_id_overrides: Mapping[str, str] = {},
-        max_results: Optional[int] = None,
     ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         sql = self._to_sql(
             array_value, sorted=sorted, col_id_overrides=col_id_overrides
@@ -1842,7 +1841,8 @@
         else:
             job_config.dry_run = dry_run
         return self._start_query(
-            sql=sql, job_config=job_config, max_results=max_results
+            sql=sql,
+            job_config=job_config,
         )
 
     def _peek(
@@ -1887,10 +1887,6 @@ def _get_table_size(self, destination_table):
         table = self.bqclient.get_table(destination_table)
         return table.num_bytes
 
-    def _get_table_row_count(self, destination_table) -> int:
-        table = self.bqclient.get_table(destination_table)
-        return table.num_rows
-
     def _rows_to_dataframe(
         self, row_iterator: bigquery.table.RowIterator, dtypes: Dict
     ) -> pandas.DataFrame:
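The deleted _get_table_row_count read the row count from table metadata, mirroring _get_table_size just above it; that is a metadata-only lookup rather than a query. A hedged sketch of the same lookup with the google-cloud-bigquery client (the project and table IDs below are placeholders):

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project ID
# tables.get returns metadata only; no query job is created or billed.
table = client.get_table("my-project.my_dataset.my_table")  # placeholder table
print(table.num_rows, table.num_bytes)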
1 change: 1 addition & 0 deletions docs/index.rst
@@ -7,6 +7,7 @@ API reference
    :maxdepth: 3
 
    reference/index
+   supported_pandas_apis
 
 Changelog
 ---------
62 changes: 62 additions & 0 deletions docs/supported_pandas_apis.rst
@@ -0,0 +1,62 @@
Supported pandas APIs
=====================

The following tables show the pandas APIs that have been implemented (or not)
in BigQuery DataFrames.

* 'Y' means it implements all parameters.
* 'P' means it implements only some parameters.

DataFrame
---------

.. raw:: html
:file: supported_pandas_apis/bf_dataframe.html

DataFrameGroupBy
----------------

.. raw:: html
:file: supported_pandas_apis/bf_dataframegroupby.html

Index
-----

.. raw:: html
:file: supported_pandas_apis/bf_index.html

pandas module
-------------

.. raw:: html
:file: supported_pandas_apis/bf_pandas.html

Series
------

.. raw:: html
:file: supported_pandas_apis/bf_series.html

Series.dt methods
-----------------

.. raw:: html
:file: supported_pandas_apis/bf_datetimemethods.html

Series.str methods
------------------

.. raw:: html
:file: supported_pandas_apis/bf_stringmethods.html

SeriesGroupBy
-------------

.. raw:: html
:file: supported_pandas_apis/bf_seriesgroupby.html

Window
------

.. raw:: html
:file: supported_pandas_apis/bf_window.html
1 change: 1 addition & 0 deletions docs/supported_pandas_apis/.gitignore
@@ -0,0 +1 @@
*.html