Commit

Merge remote-tracking branch 'github/main' into garrettwu-transformer2

GarrettWu committed Apr 1, 2024
2 parents 0f77788 + 9ac4ed8 commit 6ad77d5
Showing 15 changed files with 913 additions and 619 deletions.
1 change: 1 addition & 0 deletions .kokoro/release-nightly.sh
@@ -106,6 +106,7 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \
 # write access to
 COVERAGE_TABLE=bigframes-metrics.coverage_report.bigframes_coverage_nightly
 python3.10 scripts/publish_api_coverage.py \
+  bigquery \
   --bigframes_version=$BIGFRAMES_VERSION \
   --release_version=$RELEASE_VERSION \
   --bigquery_table=$COVERAGE_TABLE
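The one-line change above inserts a `bigquery` positional argument ahead of the existing flags, which suggests publish_api_coverage.py now dispatches on a subcommand. As a minimal sketch of that CLI shape (the parser below is hypothetical and not the script's actual contents):

import argparse


def main() -> None:
    parser = argparse.ArgumentParser(description="Publish API coverage metrics.")
    # Hypothetical subcommand dispatch; the real script's parser may differ.
    subparsers = parser.add_subparsers(dest="command", required=True)

    # "bigquery" target: upload the coverage report to a BigQuery table.
    bq = subparsers.add_parser("bigquery")
    bq.add_argument("--bigframes_version")
    bq.add_argument("--release_version")
    bq.add_argument("--bigquery_table")

    args = parser.parse_args()
    if args.command == "bigquery":
        print(f"would publish coverage to {args.bigquery_table}")


if __name__ == "__main__":
    main()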
42 changes: 13 additions & 29 deletions bigframes/core/blocks.py
@@ -467,23 +467,6 @@ def to_pandas_batches(self):
             self._copy_index_to_pandas(df)
             yield df
 
-    def download_pandas_preview(
-        self, max_rows: int
-    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
-        """Download one page of results and return the query job."""
-        dtypes = dict(zip(self.index_columns, self.index.dtypes))
-        dtypes.update(zip(self.value_columns, self.dtypes))
-        results_iterator, query_job = self.session._execute(
-            self.expr, sorted=True, max_results=max_rows
-        )
-        arrow_results_iterator = results_iterator.to_arrow_iterable()
-        arrow_table = next(arrow_results_iterator)
-        downloaded_df = bigframes.session._io.pandas.arrow_to_pandas(
-            arrow_table, dtypes
-        )
-        self._copy_index_to_pandas(downloaded_df)
-        return downloaded_df, query_job
-
     def _copy_index_to_pandas(self, df: pd.DataFrame):
         """Set the index on pandas DataFrame to match this block.

@@ -1314,25 +1297,26 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
     # queries.
     @functools.cache
     def retrieve_repr_request_results(
-        self, max_results: int, max_columns: int
-    ) -> Tuple[pd.DataFrame, Tuple[int, int], bigquery.QueryJob]:
+        self, max_results: int
+    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
         """
         Retrieves a pandas dataframe containing only max_results many rows for use
         with printing methods.
 
-        Returns a tuple of the dataframe preview for printing and the overall number
-        of rows and columns of the table, as well as the query job used.
+        Returns a tuple of the dataframe and the overall number of rows of the query.
         """
-        pandas_df, query_job = self.download_pandas_preview(max_results)
-        row_count = self.session._get_table_row_count(query_job.destination)
-        column_count = len(self.value_columns)
-
-        formatted_df = pandas_df.set_axis(self.column_labels, axis=1)
+        # TODO(swast): Select a subset of columns if max_columns is less than the
+        # number of columns in the schema.
+        count = self.shape[0]
+        if count > max_results:
+            head_block = self.slice(0, max_results)
+        else:
+            head_block = self
+        computed_df, query_job = head_block.to_pandas()
+        formatted_df = computed_df.set_axis(self.column_labels, axis=1)
         # we reset the axis and substitute the bf index name for the default
         formatted_df.index.name = self.index.name
-        # limit column count
-        formatted_df = formatted_df.iloc[:, 0:max_columns]
-        return formatted_df, (row_count, column_count), query_job
+        return formatted_df, count, query_job
 
     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
         result_id = guid.generate_guid()
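The new retrieve_repr_request_results computes the total row count, slices the block to at most max_results rows, and only then downloads. A self-contained sketch of that head-then-relabel pattern using plain pandas (the DataFrame stands in for the result of Block.to_pandas; names are illustrative):

import pandas as pd


def preview(df: pd.DataFrame, labels: list, max_results: int):
    # Analogous to count = self.shape[0] followed by Block.slice(0, max_results).
    count = len(df)
    head = df.iloc[:max_results] if count > max_results else df
    # Analogous to set_axis(self.column_labels, axis=1) for display labels.
    formatted = head.set_axis(labels, axis=1)
    return formatted, count


frame = pd.DataFrame({"a": range(100), "b": range(100)})
preview_df, total_rows = preview(frame, ["col_a", "col_b"], max_results=25)
print(len(preview_df), total_rows)  # 25 100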
10 changes: 5 additions & 5 deletions bigframes/core/indexes/index.py
@@ -205,17 +205,17 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
         return self._query_job
 
     def __repr__(self) -> str:
         # TODO(swast): Add a timeout here? If the query is taking a long time,
         # maybe we just print the job metadata that we have so far?
+        # TODO(swast): Avoid downloading the whole series by using job
+        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
         self._query_job = query_job
 
         return repr(pandas_df.index)
 
     def copy(self, name: Optional[Hashable] = None):
66 changes: 40 additions & 26 deletions bigframes/dataframe.py
@@ -579,57 +579,71 @@ def __setattr__(self, key: str, value):
         object.__setattr__(self, key, value)
 
     def __repr__(self) -> str:
-        """Converts a DataFrame to a string using pandas dataframe __repr__.
+        """Converts a DataFrame to a string. Calls to_pandas.
 
-        Only represents the first `bigframes.options.display.max_rows`
-        and `bigframes.options.display.max_columns`.
+        Only represents the first `bigframes.options.display.max_rows`.
         """
-        if bigframes.options.display.repr_mode == "deferred":
+        opts = bigframes.options.display
+        max_results = opts.max_rows
+        if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
-        pandas_df, shape = self._perform_repr_request()
-        with display_options.pandas_repr(bigframes.options.display):
+        self._cached()
+        # TODO(swast): pass max_columns and get the true column count back. Maybe
+        # get 1 more column than we have requested so that pandas can add the
+        # ... for us?
+        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
+            max_results
+        )
+
+        self._set_internal_query_job(query_job)
+
+        column_count = len(pandas_df.columns)
+
+        with display_options.pandas_repr(opts):
             repr_string = repr(pandas_df)
 
         # Modify the end of the string to reflect count.
         lines = repr_string.split("\n")
         pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
         if pattern.match(lines[-1]):
             lines = lines[:-2]
-        if shape[0] > len(lines) - 1:
+
+        if row_count > len(lines) - 1:
             lines.append("...")
 
         lines.append("")
-        lines.append(f"[{shape[0]} rows x {shape[1]} columns]")
+        lines.append(f"[{row_count} rows x {column_count} columns]")
         return "\n".join(lines)
 
-    def _perform_repr_request(self) -> Tuple[pandas.DataFrame, Tuple[int, int]]:
-        max_results = bigframes.options.display.max_rows
-        max_columns = bigframes.options.display.max_columns
-        self._cached()
-        pandas_df, shape, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
-        self._set_internal_query_job(query_job)
-        return pandas_df, shape
-
     def _repr_html_(self) -> str:
         """
         Returns an html string primarily for use by notebooks for displaying
-        a representation of the DataFrame. Displays at most the number of rows
-        and columns given by `bigframes.options.display.max_rows` and
-        `bigframes.options.display.max_columns`.
+        a representation of the DataFrame. Displays 20 rows by default since
+        many notebooks are not configured for large tables.
         """
-        if bigframes.options.display.repr_mode == "deferred":
+        opts = bigframes.options.display
+        max_results = bigframes.options.display.max_rows
+        if opts.repr_mode == "deferred":
             return formatter.repr_query_job_html(self.query_job)
 
-        pandas_df, shape = self._perform_repr_request()
+        self._cached()
+        # TODO(swast): pass max_columns and get the true column count back. Maybe
+        # get 1 more column than we have requested so that pandas can add the
+        # ... for us?
+        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
+            max_results
+        )
+
+        self._set_internal_query_job(query_job)
+
+        column_count = len(pandas_df.columns)
 
-        with display_options.pandas_repr(bigframes.options.display):
+        with display_options.pandas_repr(opts):
             # _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy.
             html_string = pandas_df._repr_html_()  # type:ignore
 
-        html_string += f"[{shape[0]} rows x {shape[1]} columns in total]"
+        html_string += f"[{row_count} rows x {column_count} columns in total]"
        return html_string
 
     def __setitem__(self, key: str, value: SingleItemValue):
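The footer rewrite in __repr__ is self-contained enough to run on its own: pandas appends its own "[N rows x M columns]" trailer based on the downloaded preview, so the code strips it and substitutes the true totals. A standalone sketch with made-up totals:

import re

import pandas as pd

row_count, column_count = 1_000_000, 2  # pretend totals for the full query
pandas_df = pd.DataFrame({"a": range(100), "b": range(100)})  # downloaded preview

lines = repr(pandas_df).split("\n")
pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
if pattern.match(lines[-1]):
    lines = lines[:-2]  # drop pandas' trailer and the blank line before it

if row_count > len(lines) - 1:  # more rows exist than were printed
    lines.append("...")

lines.append("")
lines.append(f"[{row_count} rows x {column_count} columns]")
print("\n".join(lines))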
20 changes: 10 additions & 10 deletions bigframes/ml/metrics/_metrics.py
@@ -19,8 +19,8 @@
 import typing
 from typing import Tuple, Union
 
-import bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification
-import bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking
+import bigframes_vendored.sklearn.metrics._classification as vendored_metrics_classification
+import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking
 import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression
 import numpy as np
 import pandas as pd
@@ -79,7 +79,7 @@ def accuracy_score(
     return score.sum()
 
 
-accuracy_score.__doc__ = inspect.getdoc(vendored_mertics_classification.accuracy_score)
+accuracy_score.__doc__ = inspect.getdoc(vendored_metrics_classification.accuracy_score)
 
 
 def roc_curve(
@@ -149,7 +149,7 @@ def roc_curve(
     )
 
 
-roc_curve.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_curve)
+roc_curve.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_curve)
 
 
 def roc_auc_score(
@@ -171,7 +171,7 @@ def roc_auc_score(
     return (width_diff * height_avg).sum()
 
 
-roc_auc_score.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_auc_score)
+roc_auc_score.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_auc_score)
 
 
 def auc(
@@ -185,7 +185,7 @@ def auc(
     return auc
 
 
-auc.__doc__ = inspect.getdoc(vendored_mertics_ranking.auc)
+auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc)
 
 
 def confusion_matrix(
@@ -223,7 +223,7 @@ def confusion_matrix(
 
 
 confusion_matrix.__doc__ = inspect.getdoc(
-    vendored_mertics_classification.confusion_matrix
+    vendored_metrics_classification.confusion_matrix
 )
 
 
@@ -261,7 +261,7 @@ def recall_score(
     return recall_score
 
 
-recall_score.__doc__ = inspect.getdoc(vendored_mertics_classification.recall_score)
+recall_score.__doc__ = inspect.getdoc(vendored_metrics_classification.recall_score)
 
 
 def precision_score(
@@ -299,7 +299,7 @@ def precision_score(
 
 
 precision_score.__doc__ = inspect.getdoc(
-    vendored_mertics_classification.precision_score
+    vendored_metrics_classification.precision_score
 )
 
 
@@ -334,4 +334,4 @@ def f1_score(
     return f1_score
 
 
-f1_score.__doc__ = inspect.getdoc(vendored_mertics_classification.f1_score)
+f1_score.__doc__ = inspect.getdoc(vendored_metrics_classification.f1_score)
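Every change in this file is the same alias rename ("mertics" to "metrics"); the aliases exist only to borrow docstrings from the vendored sklearn modules. A toy sketch of that pattern, with a stand-in namespace in place of bigframes_vendored:

import inspect
import types


def _vendored_accuracy_score(y_true, y_pred):
    """Accuracy classification score."""


# Stand-in for bigframes_vendored.sklearn.metrics._classification.
vendored_metrics_classification = types.SimpleNamespace(
    accuracy_score=_vendored_accuracy_score
)


def accuracy_score(y_true, y_pred):
    # Local implementation; the documentation comes from the vendored reference.
    matches = sum(t == p for t, p in zip(y_true, y_pred))
    return matches / len(y_true)


# inspect.getdoc also normalizes indentation, unlike reading __doc__ directly.
accuracy_score.__doc__ = inspect.getdoc(vendored_metrics_classification.accuracy_score)

print(accuracy_score([1, 0, 1], [1, 1, 1]))  # 0.666...
print(accuracy_score.__doc__)  # Accuracy classification score.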
9 changes: 5 additions & 4 deletions bigframes/series.py
@@ -272,16 +272,17 @@ def reset_index(
         return bigframes.dataframe.DataFrame(block)
 
     def __repr__(self) -> str:
         # TODO(swast): Add a timeout here? If the query is taking a long time,
         # maybe we just print the job metadata that we have so far?
+        # TODO(swast): Avoid downloading the whole series by using job
+        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
         self._cached()
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
         self._set_internal_query_job(query_job)
 
         return repr(pandas_df.iloc[:, 0])
8 changes: 2 additions & 6 deletions bigframes/session/__init__.py
@@ -1832,7 +1832,6 @@ def _execute(
         sorted: bool = True,
         dry_run=False,
         col_id_overrides: Mapping[str, str] = {},
-        max_results: Optional[int] = None,
     ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         sql = self._to_sql(
             array_value, sorted=sorted, col_id_overrides=col_id_overrides
@@ -1842,7 +1841,8 @@
         else:
             job_config.dry_run = dry_run
         return self._start_query(
-            sql=sql, job_config=job_config, max_results=max_results
+            sql=sql,
+            job_config=job_config,
         )
 
     def _peek(
@@ -1887,10 +1887,6 @@ def _get_table_size(self, destination_table):
         table = self.bqclient.get_table(destination_table)
         return table.num_bytes
 
-    def _get_table_row_count(self, destination_table) -> int:
-        table = self.bqclient.get_table(destination_table)
-        return table.num_rows
-
     def _rows_to_dataframe(
         self, row_iterator: bigquery.table.RowIterator, dtypes: Dict
     ) -> pandas.DataFrame:
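The deleted _get_table_row_count read the row count from table metadata, mirroring _get_table_size just above it; that is a metadata-only lookup rather than a query. A hedged sketch of the same lookup with the google-cloud-bigquery client (the project and table IDs below are placeholders):

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project ID
# tables.get returns metadata only; no query job is created or billed.
table = client.get_table("my-project.my_dataset.my_table")  # placeholder table
print(table.num_rows, table.num_bytes)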
1 change: 1 addition & 0 deletions docs/index.rst
@@ -7,6 +7,7 @@ API reference
    :maxdepth: 3
 
    reference/index
+   supported_pandas_apis
 
 Changelog
 ---------
62 changes: 62 additions & 0 deletions docs/supported_pandas_apis.rst
@@ -0,0 +1,62 @@
Supported pandas APIs
=====================

The following tables show the pandas APIs that have been implemented (or not)
in BigQuery DataFrames.

* 'Y' means it implements all parameters.
* 'P' means it implements only some parameters.

DataFrame
---------

.. raw:: html
:file: supported_pandas_apis/bf_dataframe.html

DataFrameGroupBy
----------------

.. raw:: html
:file: supported_pandas_apis/bf_dataframegroupby.html

Index
-----

.. raw:: html
:file: supported_pandas_apis/bf_index.html

pandas module
-------------

.. raw:: html
:file: supported_pandas_apis/bf_pandas.html

Series
------

.. raw:: html
:file: supported_pandas_apis/bf_series.html

Series.dt methods
-----------------

.. raw:: html
:file: supported_pandas_apis/bf_datetimemethods.html

Series.str methods
------------------

.. raw:: html
:file: supported_pandas_apis/bf_stringmethods.html

SeriesGroupBy
-------------

.. raw:: html
:file: supported_pandas_apis/bf_seriesgroupby.html

Window
------

.. raw:: html
:file: supported_pandas_apis/bf_window.html
1 change: 1 addition & 0 deletions docs/supported_pandas_apis/.gitignore
@@ -0,0 +1 @@
*.html