From 264142e54498d50ec7427c8b3e45265f19fcd874 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 11 Aug 2023 14:50:04 -0500 Subject: [PATCH] chore: release 0.1.0 (#4) Syncs to internal commit 792794841b8ae0ca502723a2abead4f5fd5d11a0 Change-Id: Ib799a9c5e5a18d9b471756410aa5c87cb4932fe8 --- .kokoro/continuous/e2e.cfg | 2 +- .kokoro/continuous/nightly.cfg | 2 +- .kokoro/presubmit/e2e.cfg | 2 +- .kokoro/release-nightly.sh | 5 + .repo-metadata.json | 16 + CHANGELOG.md | 4 +- README.rst | 232 +- bigframes/_config/__init__.py | 11 + bigframes/_config/bigquery_options.py | 21 +- bigframes/_config/display_options.py | 3 +- bigframes/_config/sampling_options.py | 30 + bigframes/constants.py | 23 + bigframes/core/__init__.py | 372 ++- bigframes/core/blocks.py | 428 ++- bigframes/core/groupby/__init__.py | 169 +- bigframes/core/indexers.py | 62 +- bigframes/core/indexes/index.py | 75 +- bigframes/core/io.py | 90 + bigframes/core/joins/row_identity.py | 79 +- bigframes/core/joins/single_column.py | 235 +- bigframes/core/ordering.py | 201 +- bigframes/core/reshape/__init__.py | 105 +- bigframes/core/scalar.py | 29 +- bigframes/core/utils.py | 22 + bigframes/dataframe.py | 649 ++-- bigframes/dtypes.py | 69 +- bigframes/formatting_helpers.py | 207 +- bigframes/ml/base.py | 30 +- bigframes/ml/cluster.py | 69 +- bigframes/ml/compose.py | 72 +- bigframes/ml/core.py | 171 +- bigframes/ml/decomposition.py | 70 +- bigframes/ml/ensemble.py | 249 +- bigframes/ml/forecasting.py | 88 +- bigframes/ml/imported.py | 144 +- bigframes/ml/linear_model.py | 132 +- bigframes/ml/llm.py | 90 +- bigframes/ml/loader.py | 100 +- bigframes/ml/metrics.py | 220 +- bigframes/ml/model_selection.py | 46 +- bigframes/ml/pipeline.py | 144 +- bigframes/ml/preprocessing.py | 143 +- bigframes/ml/sql.py | 36 +- bigframes/ml/utils.py | 58 + bigframes/operations/__init__.py | 145 +- bigframes/operations/aggregations.py | 117 +- bigframes/operations/base.py | 15 +- bigframes/operations/strings.py | 109 +- bigframes/pandas/__init__.py | 99 +- bigframes/remote_function.py | 455 ++- bigframes/series.py | 354 ++- bigframes/session.py | 451 ++- docs/getting_started/index.rst | 27 - docs/index.rst | 4 +- docs/reference/bigframes.ml/cluster.rst | 2 + docs/reference/bigframes.ml/compose.rst | 2 + docs/reference/bigframes.ml/decomposition.rst | 2 + docs/reference/bigframes.ml/ensemble.rst | 7 + docs/reference/bigframes.ml/forecasting.rst | 2 + docs/reference/bigframes.ml/imported.rst | 2 + docs/reference/bigframes.ml/index.rst | 2 + docs/reference/bigframes.ml/linear_model.rst | 2 + docs/reference/bigframes.ml/llm.rst | 2 + docs/reference/bigframes.ml/metrics.rst | 2 + .../bigframes.ml/model_selection.rst | 2 + docs/reference/bigframes.ml/pipeline.rst | 2 + docs/reference/bigframes.ml/preprocessing.rst | 2 + docs/reference/bigframes/options.rst | 2 + .../bigframes.pandas/remote_functions.rst | 105 +- notebooks/01 - Getting Started.ipynb | 231 +- notebooks/02 - DataFrame.ipynb | 2 +- ...Using ML - SKLearn linear regression.ipynb | 1166 ++++--- ... 
- Using ML - Easy linear regression.ipynb | 1136 ++++--- ...6 - Using ML - Large Language Models.ipynb | 2 +- notebooks/10 - Regionalized.ipynb | 2800 +++++++++++++++++ noxfile.py | 54 +- samples/snippets/quickstart.py | 62 +- scripts/upload_to_google_drive.py | 71 + setup.py | 7 +- testing/constraints-3.9.txt | 4 +- tests/system/conftest.py | 24 + tests/system/large/ml/test_cluster.py | 35 +- tests/system/large/ml/test_compose.py | 8 +- tests/system/large/ml/test_core.py | 16 +- tests/system/large/ml/test_decomposition.py | 34 +- tests/system/large/ml/test_ensemble.py | 25 +- tests/system/large/ml/test_forecasting.py | 2 +- tests/system/large/ml/test_linear_model.py | 8 +- tests/system/large/ml/test_pipeline.py | 301 +- tests/system/large/test_remote_function.py | 20 +- tests/system/small/ml/conftest.py | 70 +- tests/system/small/ml/test_cluster.py | 117 +- tests/system/small/ml/test_core.py | 12 +- tests/system/small/ml/test_decomposition.py | 71 +- tests/system/small/ml/test_ensemble.py | 154 +- tests/system/small/ml/test_forecasting.py | 46 +- tests/system/small/ml/test_imported.py | 22 +- tests/system/small/ml/test_linear_model.py | 90 +- tests/system/small/ml/test_llm.py | 34 +- tests/system/small/ml/test_metrics.py | 278 +- tests/system/small/ml/test_model_selection.py | 43 +- tests/system/small/ml/test_preprocessing.py | 136 +- tests/system/small/ml/test_register.py | 8 +- .../system/small/operations/test_datetimes.py | 20 +- tests/system/small/operations/test_strings.py | 155 +- tests/system/small/test_dataframe.py | 418 ++- tests/system/small/test_dataframe_io.py | 100 +- tests/system/small/test_groupby.py | 54 +- tests/system/small/test_ibis.py | 39 + tests/system/small/test_index.py | 12 +- tests/system/small/test_multiindex.py | 224 +- tests/system/small/test_pandas.py | 57 +- tests/system/small/test_pandas_options.py | 194 ++ tests/system/small/test_progress_bar.py | 97 +- tests/system/small/test_remote_function.py | 313 +- tests/system/small/test_series.py | 622 +++- tests/system/small/test_session.py | 97 +- tests/system/small/test_window.py | 43 +- tests/unit/ml/test_sql.py | 28 +- tests/unit/test_core.py | 78 +- tests/unit/test_dtypes.py | 9 +- tests/unit/test_formatting_helper.py | 17 + tests/unit/test_formatting_helpers.py | 46 + tests/unit/test_pandas.py | 32 +- .../bigframes_vendored/ibis/LICENSE.txt | 202 ++ third_party/bigframes_vendored/ibis/README.md | 196 ++ .../bigframes_vendored/ibis/__init__.py | 0 .../ibis/backends/__init__.py | 0 .../ibis/backends/bigquery/__init__.py | 0 .../ibis/backends/bigquery/registry.py | 19 + .../bigframes_vendored/ibis/expr/__init__.py | 0 .../ibis/expr/operations/__init__.py | 4 + .../ibis/expr/operations/reductions.py | 23 + .../pandas/core/config_init.py | 46 +- .../bigframes_vendored/pandas/core/frame.py | 629 ++-- .../bigframes_vendored/pandas/core/generic.py | 146 +- .../pandas/core/groupby/__init__.py | 235 +- .../pandas/core/indexes/accessor.py | 13 +- .../pandas/core/indexes/base.py | 18 +- .../pandas/core/indexing.py | 30 +- .../pandas/core/reshape/__init__.py | 0 .../pandas/core/reshape/concat.py | 3 + .../pandas/core/reshape/tile.py | 65 + .../bigframes_vendored/pandas/core/series.py | 1185 +++---- .../pandas/core/strings/accessor.py | 183 +- .../bigframes_vendored/pandas/io/common.py | 5 +- .../bigframes_vendored/pandas/io/gbq.py | 12 +- .../bigframes_vendored/pandas/io/parquet.py | 15 +- .../pandas/io/parsers/readers.py | 53 +- .../bigframes_vendored/sklearn/base.py | 22 +- .../sklearn/cluster/_kmeans.py | 60 +- 
.../sklearn/compose/_column_transformer.py | 12 +- .../sklearn/decomposition/_pca.py | 48 +- .../sklearn/ensemble/_forest.py | 13 +- .../sklearn/linear_model/_base.py | 35 +- .../sklearn/linear_model/_logistic.py | 43 +- .../sklearn/metrics/_classification.py | 45 +- .../sklearn/metrics/_ranking.py | 18 +- .../sklearn/metrics/_regression.py | 7 +- .../bigframes_vendored/sklearn/pipeline.py | 37 +- .../sklearn/preprocessing/_data.py | 33 +- .../sklearn/preprocessing/_encoder.py | 49 +- .../bigframes_vendored/xgboost/sklearn.py | 99 +- 163 files changed, 15353 insertions(+), 4913 deletions(-) create mode 100644 .repo-metadata.json create mode 100644 bigframes/_config/sampling_options.py create mode 100644 bigframes/constants.py create mode 100644 bigframes/core/io.py create mode 100644 bigframes/core/utils.py create mode 100644 bigframes/ml/utils.py delete mode 100644 docs/getting_started/index.rst create mode 100644 docs/reference/bigframes.ml/ensemble.rst create mode 100644 notebooks/10 - Regionalized.ipynb create mode 100644 scripts/upload_to_google_drive.py create mode 100644 tests/system/small/test_ibis.py create mode 100644 tests/unit/test_formatting_helper.py create mode 100644 tests/unit/test_formatting_helpers.py create mode 100644 third_party/bigframes_vendored/ibis/LICENSE.txt create mode 100644 third_party/bigframes_vendored/ibis/README.md create mode 100644 third_party/bigframes_vendored/ibis/__init__.py create mode 100644 third_party/bigframes_vendored/ibis/backends/__init__.py create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/registry.py create mode 100644 third_party/bigframes_vendored/ibis/expr/__init__.py create mode 100644 third_party/bigframes_vendored/ibis/expr/operations/__init__.py create mode 100644 third_party/bigframes_vendored/ibis/expr/operations/reductions.py create mode 100644 third_party/bigframes_vendored/pandas/core/reshape/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/reshape/tile.py diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg index e2ca8bc78d..d875f36060 100644 --- a/.kokoro/continuous/e2e.cfg +++ b/.kokoro/continuous/e2e.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system_prerelease system_noextras e2e notebook samples" + value: "system_noextras e2e notebook samples" } diff --git a/.kokoro/continuous/nightly.cfg b/.kokoro/continuous/nightly.cfg index 5d1ec3a8f7..ac34c4b0c6 100644 --- a/.kokoro/continuous/nightly.cfg +++ b/.kokoro/continuous/nightly.cfg @@ -2,7 +2,7 @@ env_vars: { key: "NOX_SESSION" - value: "unit unit_prerelease system system_prerelease cover lint lint_setup_py mypy format docs e2e notebook" + value: "unit system cover lint lint_setup_py mypy format docs e2e notebook" } build_file: "bigframes/.kokoro/release-nightly.sh" diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg index e2ca8bc78d..d875f36060 100644 --- a/.kokoro/presubmit/e2e.cfg +++ b/.kokoro/presubmit/e2e.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system_prerelease system_noextras e2e notebook samples" + value: "system_noextras e2e notebook samples" } diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 8742a0fc79..e3b6b4d449 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -211,3 +211,8 @@ gcs_docs () { } gcs_docs + +if ! 
[ ${DRY_RUN} ]; then + # Copy docs and wheels to Google Drive + python3.10 scripts/upload_to_google_drive.py +fi diff --git a/.repo-metadata.json b/.repo-metadata.json new file mode 100644 index 0000000000..0efaa967d2 --- /dev/null +++ b/.repo-metadata.json @@ -0,0 +1,16 @@ +{ + "name": "bigframes", + "name_pretty": "A unified Python API in BigQuery", + "product_documentation": "https://cloud.google.com/bigquery", + "client_documentation": "https://cloud.google.com/python/docs/reference/bigframes/latest", + "issue_tracker": "https://github.com/googleapis/python-bigquery-dataframes/issues", + "release_level": "preview", + "language": "python", + "library_type": "INTEGRATION", + "repo": "googleapis/python-bigquery-dataframes", + "distribution_name": "bigframes", + "api_id": "bigquery.googleapis.com", + "default_version": "", + "codeowner_team": "@googleapis/api-bigquery-dataframe", + "api_shortname": "bigquery" +} diff --git a/CHANGELOG.md b/CHANGELOG.md index 39353a3f1b..9d966220bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,14 @@ [1]: https://pypi.org/project/bigframes/#history -## 0.1.0 (TBD) +## 0.1.0 (2023-08-11) ### Features * Add `bigframes.pandas` package with an API compatible with [pandas](https://pandas.pydata.org/). Supported data sources include: BigQuery SQL queries, BigQuery tables, CSV (local and GCS), Parquet (local - and GCS), and more. + and Cloud Storage), and more. * Add `bigframes.ml` package with an API inspired by [scikit-learn](https://scikit-learn.org/stable/). Train machine learning models and run batch predicition, powered by [BigQuery diff --git a/README.rst b/README.rst index 5f473e1189..6f51dfde23 100644 --- a/README.rst +++ b/README.rst @@ -4,5 +4,233 @@ BigQuery DataFrames BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. -* ``bigframes.pandas`` provides a pandas-like API for analytics. -* ``bigframes.ml`` provides a Scikit-Learn-like API for ML. +* ``bigframes.pandas`` provides a pandas-compatible API for analytics. +* ``bigframes.ml`` provides a scikit-learn-like API for ML. + +Documentation +------------- + +* `BigQuery DataFrames sample notebooks `_ +* `BigQuery DataFrames API reference `_ +* `BigQuery documentation `_ + + +Quickstart +---------- + +Prerequisites +^^^^^^^^^^^^^ + +* Install the ``bigframes`` package. +* Create a Google Cloud project and billing account. +* When running locally, authenticate with application default credentials. See + the `gcloud auth application-default login + `_ + reference. + +Code sample +^^^^^^^^^^^ + +Import ``bigframes.pandas`` for a pandas-like interface. The ``read_gbq`` +method accepts either a fully-qualified table ID or a SQL query. + +.. code-block:: python + + import bigframes.pandas as bpd + + df1 = bpd.read_gbq("project.dataset.table") + df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`") + +* `More code samples `_ + + +Locations +--------- +BigQuery DataFrames uses a +`BigQuery session `_ +internally to manage metadata on the service side. This session is tied to a +`location `_ . +BigQuery DataFrames uses the US multi-region as the default location, but you +can use ``session_options.location`` to set a different location. Every query +in a session is executed in the location where the session was created. + +If you want to reset the location of the created DataFrame or Series objects, +can reset the session by executing ``bigframes.pandas.reset_session()``. 
+After that, you can reuse ``bigframes.pandas.options.bigquery.location`` to +specify another location. + + +``read_gbq()`` requires you to specify a location if the dataset you are +querying is not in the US multi-region. If you try to read a table from another +location, you get a NotFound exception. + + +ML locations +------------ + +``bigframes.ml`` supports the same locations as BigQuery ML. BigQuery ML model +prediction and other ML functions are supported in all BigQuery regions. Support +for model training varies by region. For more information, see +`BigQuery ML locations `_. + + +Data types +---------- + +BigQuery DataFrames supports the following numpy and pandas dtypes: + +* ``numpy.dtype("O")`` +* ``pandas.BooleanDtype()`` +* ``pandas.Float64Dtype()`` +* ``pandas.Int64Dtype()`` +* ``pandas.StringDtype(storage="pyarrow")`` +* ``pandas.ArrowDtype(pa.date32())`` +* ``pandas.ArrowDtype(pa.time64("us"))`` +* ``pandas.ArrowDtype(pa.timestamp("us"))`` +* ``pandas.ArrowDtype(pa.timestamp("us", tz="UTC"))`` + +BigQuery DataFrames doesn’t support the following BigQuery data types: + +* ``ARRAY`` +* ``NUMERIC`` +* ``BIGNUMERIC`` +* ``INTERVAL`` +* ``STRUCT`` +* ``JSON`` + +All other BigQuery data types display as the object type. + + +Remote functions +---------------- + +BigQuery DataFrames gives you the ability to turn your custom scalar functions +into `BigQuery remote functions +`_ . Creating a remote +function in BigQuery DataFrames creates a BigQuery remote function, a `BigQuery +connection +`_ , +and a `Cloud Functions (2nd gen) function +`_ . + +BigQuery connections are created in the same location as the BigQuery +DataFrames session, using the name you provide in the custom function +definition. To view and manage connections, do the following: + +1. Go to `BigQuery Studio `__. +2. Select the project in which you created the remote function. +3. In the Explorer pane, expand that project and then expand External connections. + +BigQuery remote functions are created in the dataset you specify, or +in a dataset with the name ``bigframes_temp_location``, where location is +the location used by the BigQuery DataFrames session. For example, +``bigframes_temp_us_central1``. To view and manage remote functions, do +the following: + +1. Go to `BigQuery Studio `__. +2. Select the project in which you created the remote function. +3. In the Explorer pane, expand that project, expand the dataset in which you + created the remote function, and then expand Routines. + +To view and manage Cloud Functions functions, use the +`Functions `_ +page and use the project picker to select the project in which you +created the function. For easy identification, the names of the functions +created by BigQuery DataFrames are prefixed by ``bigframes-``. + +**Requirements** + +BigQuery DataFrames uses the ``gcloud`` command-line interface internally, +so you must run ``gcloud auth login`` before using remote functions. 
+ +To use BigQuery DataFrames remote functions, you must enable the following APIs: + +* The BigQuery API (bigquery.googleapis.com) +* The BigQuery Connection API (bigqueryconnection.googleapis.com) +* The Cloud Functions API (cloudfunctions.googleapis.com) +* The Cloud Run API (run.googleapis.com) +* The Artifact Registry API (artifactregistry.googleapis.com) +* The Cloud Build API (cloudbuild.googleapis.com ) +* The Cloud Resource Manager API (cloudresourcemanager.googleapis.com) + +To use BigQuery DataFrames remote functions, you must be granted the +following IAM roles: + +* BigQuery Data Editor (roles/bigquery.dataEditor) +* BigQuery Connection Admin (roles/bigquery.connectionAdmin) +* Cloud Functions Developer (roles/cloudfunctions.developer) +* Service Account User (roles/iam.serviceAccountUser) +* Storage Object Viewer (roles/storage.objectViewer) +* Project IAM Admin (roles/resourcemanager.projectIamAdmin) + +**Limitations** + +* Remote functions take about 90 seconds to become available when you first create them. +* Trivial changes in the notebook, such as inserting a new cell or renaming a variable, + might cause the remote function to be re-created, even if these changes are unrelated + to the remote function code. +* BigQuery DataFrames does not differentiate any personal data you include in the remote + function code. The remote function code is serialized as an opaque box to deploy it as a + Cloud Functions function. +* The Cloud Functions (2nd gen) functions, BigQuery connections, and BigQuery remote + functions created by BigQuery DataFrames persist in Google Cloud. If you don’t want to + keep these resources, you must delete them separately using an appropriate Cloud Functions + or BigQuery interface. +* A project can have up to 1000 Cloud Functions (2nd gen) functions at a time. See Cloud + Functions quotas for all the limits. + + +Quotas and limits +----------------- + +`BigQuery quotas `_ +including hardware, software, and network components. + + +Session termination +------------------- + +Each BigQuery DataFrames DataFrame or Series object is tied to a BigQuery +DataFrames session, which is in turn based on a BigQuery session. BigQuery +sessions +`auto-terminate `_ +; when this happens, you can’t use previously +created DataFrame or Series objects and must re-create them using a new +BigQuery DataFrames session. You can do this by running +``bigframes.pandas.reset_session()`` and then re-running the BigQuery +DataFrames expressions. + + +Data processing location +------------------------ + +BigQuery DataFrames is designed for scale, which it achieves by keeping data +and processing on the BigQuery service. However, you can bring data into the +memory of your client machine by calling ``.execute()`` on a DataFrame or Series +object. If you choose to do this, the memory limitation of your client machine +applies. + + +License +------- + +BigQuery DataFrames is distributed with the `Apache-2.0 license +`_. + +It also contains code derived from the following third-party packages: + +* `Ibis `_ +* `pandas `_ +* `Python `_ +* `scikit-learn `_ +* `XGBoost `_ + +For details, see the `third_party +`_ +directory. + + +Contact Us +---------- + +For further help and provide feedback, you can email us at `bigframes-feedback@google.com `_. 
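The Locations section of the README above names the relevant calls without showing them together. A minimal sketch, assuming a placeholder project and tables, of starting a session in one location and then switching to another with ``bigframes.pandas.reset_session()``:

.. code-block:: python

    import bigframes.pandas as bpd

    # The session location defaults to the US multi-region; set it before the
    # first query, because it cannot change once the session has started.
    bpd.options.bigquery.location = "asia-northeast1"
    df_tokyo = bpd.read_gbq("my-project.tokyo_dataset.my_table")  # placeholder table

    # To query data stored in a different location, reset the session first,
    # then set the new location before the next query.
    bpd.reset_session()
    bpd.options.bigquery.location = "europe-west1"
    df_eu = bpd.read_gbq("my-project.eu_dataset.my_table")  # placeholder table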
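The Remote functions section above lists the Google Cloud resources that get created but not the call that creates them. The sketch below assumes the ``bigframes.pandas.remote_function`` decorator takes a list of input types, an output type, and a BigQuery connection name; the ``bigquery_connection`` parameter name and the table are illustrative assumptions, not taken from this patch.

.. code-block:: python

    import bigframes.pandas as bpd

    # Assumed decorator signature: ([input types], output type, connection name).
    # Deploying creates a Cloud Functions (2nd gen) function, a BigQuery
    # connection, and a BigQuery remote function, as described above.
    @bpd.remote_function([float], float, bigquery_connection="bigframes-rf-conn")
    def fahrenheit_to_celsius(temp_f):
        return (temp_f - 32) * 5.0 / 9.0

    df = bpd.read_gbq("my-project.weather.readings")  # placeholder table
    temps_c = df["temp_f"].apply(fahrenheit_to_celsius)

The first use can take on the order of 90 seconds while the backing Cloud Functions function deploys, matching the figure quoted in the Limitations list above.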
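Several files added in this patch (``bigframes/_config/sampling_options.py`` and the ``to_pandas`` path in ``bigframes/core/blocks.py``) introduce a download-size cap with optional downsampling when results are pulled into client memory. The option paths below are the ones spelled out in the new error message in ``blocks.py``; the assumption that the global ``options`` object is reachable from the package root, and the table name, are illustrative.

.. code-block:: python

    import bigframes
    import bigframes.pandas as bpd

    # By default, results larger than max_download_size (in MB) raise an error.
    # Enabling downsampling samples the result down to the cap instead.
    bigframes.options.sampling.enable_downsampling = True
    bigframes.options.sampling.max_download_size = 250      # MB
    bigframes.options.sampling.sampling_method = "uniform"  # or "head"
    bigframes.options.sampling.random_state = 42            # reproducible sampling

    df = bpd.read_gbq("my-project.dataset.large_table")  # placeholder table
    pdf = df.to_pandas()  # downsampled if the result exceeds 250 MB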
diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index 54c529fb0c..e26eaf8800 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -19,6 +19,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes._config.display_options as display_options +import bigframes._config.sampling_options as sampling_options class Options: @@ -27,6 +28,7 @@ class Options: def __init__(self): self._bigquery_options = bigquery_options.BigQueryOptions() self._display_options = display_options.DisplayOptions() + self._sampling_options = sampling_options.SamplingOptions() @property def bigquery(self) -> bigquery_options.BigQueryOptions: @@ -38,6 +40,15 @@ def display(self) -> display_options.DisplayOptions: """Options controlling object representation.""" return self._display_options + @property + def sampling(self) -> sampling_options.SamplingOptions: + """Options controlling downsampling when downloading data + to memory. The data will be downloaded into memory explicitly + (e.g., to_pandas, to_numpy, values) or implicitly (e.g., + matplotlib plotting). This option can be overriden by + parameters in specific functions.""" + return self._sampling_options + options = Options() """Global options for default session.""" diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 5eb5ba7bc8..a103abe190 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -21,11 +21,14 @@ import google.api_core.exceptions import google.auth.credentials -SESSION_STARTED_MESSAGE = "Cannot change '{attribute}' once a session has started." +SESSION_STARTED_MESSAGE = ( + "Cannot change '{attribute}' once a session has started. " + "Call bigframes.pandas.reset_session() first, if you are using the bigframes.pandas API." +) class BigQueryOptions: - """Encapsulates configuration for working with an Session.""" + """Encapsulates configuration for working with a session.""" def __init__( self, @@ -55,7 +58,7 @@ def credentials(self, value: Optional[google.auth.credentials.Credentials]): @property def location(self) -> Optional[str]: - """Default location for jobs / datasets / tables. + """Default location for job, datasets, and tables. See: https://cloud.google.com/bigquery/docs/locations """ @@ -69,7 +72,7 @@ def location(self, value: Optional[str]): @property def project(self) -> Optional[str]: - """Google Cloud project ID to use for billing and default data project.""" + """Google Cloud project ID to use for billing and as the default project.""" return self._project @project.setter @@ -80,10 +83,12 @@ def project(self, value: Optional[str]): @property def remote_udf_connection(self) -> Optional[str]: - """Name of the BigQuery connection for the purpose of remote UDFs. + """Name of the BigQuery connection to use for remote functions. - It should be either pre created in `location`, or the user should have - privilege to create one. + You should either have the connection already created in the + location you have chosen, or you should have the Project IAM + Admin role to enable the service to create the connection for you if you + need it. """ return self._remote_udf_connection @@ -97,7 +102,7 @@ def remote_udf_connection(self, value: Optional[str]): @property def use_regional_endpoints(self) -> bool: - """In preview. Flag to connect to regional API endpoints. + """Flag to connect to regional API endpoints. Requires ``location`` to also be set. 
For example, set ``location='asia-northeast1'`` and ``use_regional_endpoints=True`` to diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index 43faad0e78..8bd2743f17 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -16,7 +16,7 @@ import contextlib import dataclasses -from typing import Optional +from typing import Literal, Optional import pandas as pd @@ -30,6 +30,7 @@ class DisplayOptions: max_columns: int = 20 max_rows: int = 25 progress_bar: Optional[str] = "auto" + repr_mode: Literal["head", "deferred"] = "head" @contextlib.contextmanager diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py new file mode 100644 index 0000000000..1742dabe17 --- /dev/null +++ b/bigframes/_config/sampling_options.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Options for downsampling.""" + +import dataclasses +from typing import Literal, Optional + +import third_party.bigframes_vendored.pandas.core.config_init as vendored_pandas_config + + +@dataclasses.dataclass +class SamplingOptions: + __doc__ = vendored_pandas_config.sampling_options_doc + + max_download_size: Optional[int] = 500 + enable_downsampling: bool = False + sampling_method: Literal["head", "uniform"] = "uniform" + random_state: Optional[int] = None diff --git a/bigframes/constants.py b/bigframes/constants.py new file mode 100644 index 0000000000..3f3f155733 --- /dev/null +++ b/bigframes/constants.py @@ -0,0 +1,23 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constants used across BigQuery DataFrames. + +This module should not depend on any others in the package. +""" + +FEEDBACK_LINK = ( + "Share your usecase with the BigQuery DataFrames team at the " + "https://bit.ly/bigframes-feedback survey." 
+) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 70565fddf5..075e27b0c2 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -25,11 +25,15 @@ import ibis.expr.types as ibis_types import pandas +import bigframes.constants as constants import bigframes.core.guid from bigframes.core.ordering import ( + encode_order_string, ExpressionOrdering, + IntegerEncoding, OrderingColumnReference, - stringify_order_id, + reencode_order_string, + StringEncoding, ) import bigframes.dtypes import bigframes.operations as ops @@ -85,27 +89,22 @@ def __init__( self, session: Session, table: ibis_types.Table, - columns: Optional[Sequence[ibis_types.Value]] = None, + columns: Sequence[ibis_types.Value], hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: Optional[ExpressionOrdering] = None, + ordering: ExpressionOrdering = ExpressionOrdering(), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): self._session = session self._table = table self._predicates = tuple(predicates) if predicates is not None else () # TODO: Validate ordering - self._ordering = ordering or ExpressionOrdering() + if not ordering.total_ordering_columns: + raise ValueError("Must have total ordering defined by one or more columns") + self._ordering = ordering # Allow creating a DataFrame directly from an Ibis table expression. - if columns is None: - self._columns = tuple( - table[key] - for key in table.columns - if ordering is None or key != ordering.ordering_id - ) - else: - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). - self._columns = tuple(columns) + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). 
+ self._columns = tuple(columns) # Meta columns store ordering, or other data that doesn't correspond to dataframe columns self._hidden_ordering_columns = ( @@ -120,6 +119,20 @@ def __init__( self._hidden_ordering_column_names = { column.get_name(): column for column in self._hidden_ordering_columns } + ### Validation + value_col_ids = self._column_names.keys() + hidden_col_ids = self._hidden_ordering_column_names.keys() + + all_columns = value_col_ids | hidden_col_ids + ordering_valid = all( + col.column_id in all_columns for col in ordering.all_ordering_columns + ) + if value_col_ids & hidden_col_ids: + raise ValueError( + f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" + ) + if not ordering_valid: + raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") @classmethod def mem_expr_from_pandas( @@ -146,7 +159,8 @@ def mem_expr_from_pandas( session, # type: ignore # Session cannot normally be none, see "caution" above keys_memtable, ordering=ExpressionOrdering( - ordering_id_column=OrderingColumnReference(ORDER_ID_COLUMN) + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), ), hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), ) @@ -155,10 +169,6 @@ def mem_expr_from_pandas( def table(self) -> ibis_types.Table: return self._table - @property - def predicates(self) -> typing.Tuple[ibis_types.BooleanValue, ...]: - return self._predicates - @property def reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" @@ -181,18 +191,12 @@ def hidden_ordering_columns(self) -> typing.Tuple[ibis_types.Value, ...]: return self._hidden_ordering_columns @property - def ordering(self) -> Sequence[ibis_types.Value]: + def _ibis_order(self) -> Sequence[ibis_types.Value]: """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" - if not self._ordering: - return [] - else: - # TODO(swast): When we assign literals / scalars, we might not - # have a true Column. Do we need to check this before trying to - # sort by such a column? 
- return _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - self._ordering.all_ordering_columns, - ) + return _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + self._ordering.all_ordering_columns, + ) def builder(self) -> ArrayValueBuilder: """Creates a mutable builder for expressions.""" @@ -202,17 +206,12 @@ def builder(self) -> ArrayValueBuilder: return ArrayValueBuilder( self._session, self._table, - self._columns, - self._hidden_ordering_columns, + columns=self._columns, + hidden_ordering_columns=self._hidden_ordering_columns, ordering=self._ordering, predicates=self._predicates, ) - def insert_column(self, index: int, column: ibis_types.Value) -> ArrayValue: - expr = self.builder() - expr.columns.insert(index, column) - return expr.build() - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: # Must generate offsets if we are dropping a column that ordering depends on expr = self @@ -229,7 +228,9 @@ def drop_columns(self, columns: Iterable[str]) -> ArrayValue: return expr_builder.build() def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ibis_type = typing.cast(bigframes.dtypes.IbisDtype, self.get_column(key).type()) + ibis_type = typing.cast( + bigframes.dtypes.IbisDtype, self.get_any_column(key).type() + ) return typing.cast( bigframes.dtypes.Dtype, bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), @@ -286,8 +287,7 @@ def apply_limit(self, max_results: int) -> ArrayValue: def filter(self, predicate: ibis_types.BooleanValue) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" expr = self.builder() - if expr.ordering: - expr.ordering = expr.ordering.with_is_sequential(False) + expr.ordering = expr.ordering.with_non_sequential() expr.predicates = [*self._predicates, predicate] return expr.build() @@ -303,13 +303,33 @@ def reversed(self) -> ArrayValue: expr_builder.ordering = self._ordering.with_reverse() return expr_builder.build() + def _uniform_sampling(self, fraction: float) -> ArrayValue: + table = self.to_ibis_expr( + ordering_mode="order_by", expose_hidden_cols=True, fraction=fraction + ) + columns = [table[column_name] for column_name in self._column_names] + hidden_ordering_columns = [ + table[column_name] for column_name in self._hidden_ordering_column_names + ] + return ArrayValue( + self._session, + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + @property def offsets(self): if not self._ordering.is_sequential: raise ValueError( "Expression does not have offsets. Generate them first using project_offsets." ) - return self._get_hidden_ordering_column(self._ordering.ordering_id) + if not self._ordering.total_order_col: + raise ValueError( + "Ordering is invalid. Marked as sequential but no total order columns." + ) + return self.get_any_column(self._ordering.total_order_col.column_id) def project_offsets(self) -> ArrayValue: """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" @@ -321,8 +341,9 @@ def project_offsets(self) -> ArrayValue: ) columns = [table[column_name] for column_name in self._column_names] ordering = ExpressionOrdering( - ordering_id_column=OrderingColumnReference(ORDER_ID_COLUMN), - is_sequential=True, + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(True, is_sequential=True), ) return ArrayValue( self._session, @@ -342,13 +363,7 @@ def _hide_column(self, column_id) -> ArrayValue: *self._hidden_ordering_columns, self.get_column(column_id).name(new_name), ] - - ordering_columns = [ - col if col.column_id != column_id else col.with_name(new_name) - for col in self._ordering.ordering_value_columns - ] - - expr_builder.ordering = self._ordering.with_ordering_columns(ordering_columns) + expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) return expr_builder.build() def promote_offsets(self) -> typing.Tuple[ArrayValue, str]: @@ -358,12 +373,12 @@ def promote_offsets(self) -> typing.Tuple[ArrayValue, str]: # Special case: offsets already exist ordering = self._ordering - if (not ordering.is_sequential) or (not ordering.ordering_id): + if (not ordering.is_sequential) or (not ordering.total_order_col): return self.project_offsets().promote_offsets() col_id = bigframes.core.guid.generate_guid() expr_builder = self.builder() expr_builder.columns = [ - self._get_hidden_ordering_column(ordering.ordering_id).name(col_id), + self.get_any_column(ordering.total_order_col.column_id).name(col_id), *self.columns, ] return expr_builder.build(), col_id @@ -409,13 +424,13 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) # Must normalize all ids to the same encoding size max_encoding_size = max( - self._ordering.ordering_encoding_size, - *[expression._ordering.ordering_encoding_size for expression in other], + self._ordering.string_encoding.length, + *[expression._ordering.string_encoding.length for expression in other], ) for i, expr in enumerate([self, *other]): ordering_prefix = str(i).zfill(prefix_size) table = expr.to_ibis_expr( - ordering_mode="ordered_col", order_col_name=ORDER_ID_COLUMN + ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN ) # Rename the value columns based on horizontal offset before applying union. 
table = table.select( @@ -424,7 +439,9 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: if col != ORDER_ID_COLUMN else ( ordering_prefix - + stringify_order_id(table[ORDER_ID_COLUMN], max_encoding_size) + + reencode_order_string( + table[ORDER_ID_COLUMN], max_encoding_size + ) ).name(ORDER_ID_COLUMN) for i, col in enumerate(table.columns) ] @@ -432,8 +449,9 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: tables.append(table) combined_table = ibis.union(*tables) ordering = ExpressionOrdering( - ordering_id_column=OrderingColumnReference(ORDER_ID_COLUMN), - ordering_encoding_size=prefix_size + max_encoding_size, + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + string_encoding=StringEncoding(True, prefix_size + max_encoding_size), ) return ArrayValue( self._session, @@ -498,7 +516,7 @@ def aggregate( aggregations: input_column_id, operation, output_column_id tuples dropna: whether null keys should be dropped """ - table = self.to_ibis_expr() + table = self.to_ibis_expr(ordering_mode="unordered") stats = { col_out: agg_op._as_ibis(table[col_in]) for col_in, agg_op, col_out in aggregations @@ -510,9 +528,11 @@ def aggregate( [ OrderingColumnReference(column_id=column_id) for column_id in by_column_ids - ] + ], + total_ordering_columns=frozenset(by_column_ids), ) - expr = ArrayValue(self._session, result, ordering=ordering) + columns = tuple(result[key] for key in result.columns) + expr = ArrayValue(self._session, result, columns=columns, ordering=ordering) if dropna: for column_id in by_column_ids: expr = expr.filter( @@ -525,8 +545,9 @@ def aggregate( result = table.aggregate(**aggregates) # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. ordering = ExpressionOrdering( - ordering_id_column=OrderingColumnReference(column_id=ORDER_ID_COLUMN), - is_sequential=True, + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), ) return ArrayValue( self._session, @@ -589,10 +610,12 @@ def project_window_op( def to_ibis_expr( self, ordering_mode: Literal[ - "order_by", "ordered_col", "offset_col", "unordered" + "order_by", "string_encoded", "offset_col", "unordered" ] = "order_by", order_col_name: Optional[str] = ORDER_ID_COLUMN, expose_hidden_cols: bool = False, + fraction: Optional[float] = None, + col_id_overrides: typing.Mapping[str, str] = {}, ): """ Creates an Ibis table expression representing the DataFrame. @@ -604,8 +627,7 @@ def to_ibis_expr( column, however there will be an order_by clause applied to the ouput. * "offset_col": Zero-based offsets are generated as a column, this will not sort the rows however. - * "ordered_col": An ordered column is provided in output table, without - guarantee that the values are sequential + * "string_encoded": An ordered string column is provided in output table. * "unordered": No ordering information will be provided in output. Only value columns are projected. @@ -624,12 +646,14 @@ def to_ibis_expr( If True, include the hidden ordering columns in the results. Only compatible with `order_by` and `unordered` ``ordering_mode``. + col_id_overrides: + overrides the column ids for the result Returns: An ibis expression representing the data help by the ArrayValue object. 
""" assert ordering_mode in ( "order_by", - "ordered_col", + "string_encoded", "offset_col", "unordered", ) @@ -638,87 +662,107 @@ def to_ibis_expr( f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" ) - table = self._table columns = list(self._columns) - hidden_ordering_columns = [ - col.column_id - for col in self._ordering.all_ordering_columns - if col.column_id not in self._column_names.keys() - ] + columns_to_drop: list[ + str + ] = [] # Ordering/Filtering columns that will be dropped at end if self.reduced_predicate is not None: columns.append(self.reduced_predicate) - if ordering_mode in ("offset_col", "ordered_col"): - # Generate offsets if current ordering id semantics are not sufficiently strict - if (ordering_mode == "offset_col" and not self._ordering.is_sequential) or ( - ordering_mode == "ordered_col" and not self._ordering.order_id_defined - ): - window = ibis.window(order_by=self.ordering) - if self._predicates: - window = window.group_by(self.reduced_predicate) - columns.append(ibis.row_number().name(order_col_name).over(window)) - elif self._ordering.ordering_id: - columns.append( - self._get_hidden_ordering_column(self._ordering.ordering_id).name( - order_col_name - ) - ) - else: - # Should not be possible. - raise ValueError( - "Expression does not have ordering id and none was generated." - ) - elif ordering_mode == "order_by": - columns.extend( - [ - self._get_hidden_ordering_column(name) - for name in hidden_ordering_columns - ] - ) + # Usually drop predicate as it is will be all TRUE after filtering + if not expose_hidden_cols: + columns_to_drop.append(self.reduced_predicate.get_name()) - # We already need to add the hidden ordering columns for "order_by" so - # we can order by them. - if expose_hidden_cols and ordering_mode != "order_by": - columns.extend( - [ - self._get_hidden_ordering_column(name) - for name in hidden_ordering_columns - ] - ) + order_columns = self._create_order_columns( + ordering_mode, order_col_name, expose_hidden_cols + ) + columns.extend(order_columns) + if (ordering_mode == "order_by") and not expose_hidden_cols: + columns_to_drop.extend(col.get_name() for col in order_columns) # Special case for empty tables, since we can't create an empty # projection. if not columns: return ibis.memtable([]) - table = table.select(columns) + # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. - table = bigframes.dtypes.ibis_table_to_canonical_types(table) - + table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table if self.reduced_predicate is not None: - table = table.filter(table[PREDICATE_COLUMN]) - # Drop predicate as it is will be all TRUE after filtering - table = table.drop(PREDICATE_COLUMN) + table = table.filter(base_table[PREDICATE_COLUMN]) if ordering_mode == "order_by": - # Some ordering columns are value columns, while other are used purely for ordering. - # We drop the non-value columns after the ordering table = table.order_by( _convert_ordering_to_table_values( - {col: table[col] for col in table.columns}, + {col: base_table[col] for col in table.columns}, self._ordering.all_ordering_columns, ) # type: ignore ) - # TODO(swast): We should be able to avoid this subquery by ordering - # by columns that don't have to be in the SELECT clause. 
- if not expose_hidden_cols: - table = table.drop(*hidden_ordering_columns) - + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) return table + def _create_order_columns( + self, + ordering_mode: str, + order_col_name: Optional[str], + expose_hidden_cols: bool, + ) -> typing.Sequence[ibis_types.Value]: + # Generate offsets if current ordering id semantics are not sufficiently strict + if ordering_mode == "offset_col": + return (self._create_offset_column().name(order_col_name),) + elif ordering_mode == "string_encoded": + return (self._create_string_ordering_column().name(order_col_name),) + elif ordering_mode == "order_by" or expose_hidden_cols: + return self.hidden_ordering_columns + return () + + def _create_offset_column(self) -> ibis_types.IntegerColumn: + if self._ordering.total_order_col and self._ordering.is_sequential: + offsets = self.get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, offsets) + else: + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self.reduced_predicate) + offsets = ibis.row_number().over(window) + return typing.cast(ibis_types.IntegerColumn, offsets) + + def _create_string_ordering_column(self) -> ibis_types.StringColumn: + if self._ordering.total_order_col and self._ordering.is_string_encoded: + string_order_ids = self.get_any_column( + self._ordering.total_order_col.column_id + ) + return typing.cast(ibis_types.StringColumn, string_order_ids) + if ( + self._ordering.total_order_col + and self._ordering.integer_encoding.is_encoded + ): + # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers + int_values = self.get_any_column(self._ordering.total_order_col.column_id) + return encode_order_string( + typing.cast(ibis_types.IntegerColumn, int_values), + ) + else: + # Have to build string from scratch + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self.reduced_predicate) + row_nums = typing.cast( + ibis_types.IntegerColumn, ibis.row_number().over(window) + ) + return encode_order_string(row_nums) + def start_query( self, job_config: Optional[bigquery.job.QueryJobConfig] = None, max_results: Optional[int] = None, + expose_extra_columns: bool = False, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """Execute a query and return metadata about the results.""" # TODO(swast): Cache the job ID so we can look it up again if they ask @@ -731,7 +775,7 @@ def start_query( # a LocalSession for unit testing. # TODO(swast): Add a timeout here? If the query is taking a long time, # maybe we just print the job metadata that we have so far? 
- table = self.to_ibis_expr() + table = self.to_ibis_expr(expose_hidden_cols=expose_extra_columns) sql = self._session.ibis_client.compile(table) # type:ignore return self._session._start_query( sql=sql, @@ -739,6 +783,9 @@ def start_query( max_results=max_results, ) + def _get_table_size(self, destination_table): + return self._session._get_table_size(destination_table) + def _reproject_to_table(self) -> ArrayValue: """ Internal operators that projects the internal representation into a @@ -749,12 +796,16 @@ def _reproject_to_table(self) -> ArrayValue: """ table = self.to_ibis_expr( ordering_mode="unordered", - order_col_name=self._ordering.ordering_id, expose_hidden_cols=True, ) columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] hidden_ordering_columns = [ - table[column_name] for column_name in self._hidden_ordering_column_names + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids ] return ArrayValue( self._session, @@ -782,10 +833,10 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal ) if not allow_ties: # Most operator need an unambiguous ordering, so the table's total ordering is appended - order_by = tuple([*order_by, *self.ordering]) + order_by = tuple([*order_by, *self._ibis_order]) elif (window_spec.following is not None) or (window_spec.preceding is not None): # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. - order_by = tuple(self.ordering) + order_by = tuple(self._ibis_order) else: # Unbound grouping window. Suitable for aggregations but not for analytic function application. order_by = None @@ -796,47 +847,56 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal group_by=group_by, ) - def transpose_single_row( + def unpivot_single_row( self, - labels, + row_labels: typing.Sequence[typing.Optional[str]], + unpivot_columns: typing.Sequence[typing.Tuple[str, typing.Sequence[str]]], *, index_col_id: str = "index", - value_col_id: str = "values", dtype=pandas.Float64Dtype(), ) -> ArrayValue: - """Pivot a single row into a 3 column expression with index, values and offsets. 
Only works if all values can be cast to a common type.""" + """Unpivot a single row.""" + # TODO: Generalize to multiple row input table = self.to_ibis_expr(ordering_mode="unordered") sub_expressions = [] - for i, col_id in enumerate(self._column_names.keys()): + + # TODO: validate all columns are equal length, as well as row labels + row_n = len(row_labels) + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + # Select each column + for i in range(row_n): + values = [] + for result_col, source_cols in unpivot_columns: + values.append( + ops.AsTypeOp(dtype)._as_ibis(table[source_cols[i]]).name(result_col) + ) + sub_expr = table.select( - ibis_types.literal(labels[i]).name(index_col_id), - ops.AsTypeOp(dtype)._as_ibis(table[col_id]).name(value_col_id), + ibis_types.literal(row_labels[i]).name(index_col_id), + *values, ibis_types.literal(i).name(ORDER_ID_COLUMN), ) sub_expressions.append(sub_expr) rotated_table = ibis.union(*sub_expressions) + + value_columns = [ + rotated_table[value_col_id] for value_col_id, _ in unpivot_columns + ] return ArrayValue( session=self._session, table=rotated_table, - columns=[rotated_table[index_col_id], rotated_table[value_col_id]], + columns=[rotated_table[index_col_id], *value_columns], hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]], ordering=ExpressionOrdering( - ordering_id_column=OrderingColumnReference(column_id=ORDER_ID_COLUMN), + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), ), ) - # TODO(b/282041134) Remove deprecate_rename_column once label/id separation in dataframe - def deprecated_rename_column(self, old_id, new_id) -> ArrayValue: - """ - Don't use this, temporary measure until dataframe supports sqlid!=dataframe col id. - In future, caller shouldn't need to control internal column id strings. - """ - if new_id == old_id: - return self - return self._set_or_replace_by_id(new_id, self.get_column(old_id)).drop_columns( - [old_id] - ) - def assign(self, source_id: str, destination_id: str) -> ArrayValue: return self._set_or_replace_by_id(destination_id, self.get_column(source_id)) @@ -850,12 +910,20 @@ def assign_constant( ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) if ibis_value is None: raise NotImplementedError( - f"Type not supported as scalar value {type(value)}" + f"Type not supported as scalar value {type(value)}. 
{constants.FEEDBACK_LINK}" ) expr = self._set_or_replace_by_id(destination_id, ibis_value) return expr._reproject_to_table() - def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value): + def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayValue: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + builder = self.builder() if id in self.column_names: builder.columns = [ @@ -937,9 +1005,9 @@ def __init__( self, session: Session, table: ibis_types.Table, + ordering: ExpressionOrdering, columns: Collection[ibis_types.Value] = (), hidden_ordering_columns: Collection[ibis_types.Value] = (), - ordering: Optional[ExpressionOrdering] = None, predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): self.session = session diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index fe7261522d..5ef9263072 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -23,8 +23,10 @@ import functools import itertools +import random import typing from typing import Iterable, List, Optional, Sequence, Tuple +import warnings import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery @@ -34,10 +36,12 @@ import pandas as pd import pyarrow as pa # type: ignore +import bigframes.constants as constants import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.ordering as ordering +import bigframes.core.utils import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -45,6 +49,15 @@ # Type constraint for wherever column labels are used Label = typing.Optional[str] +# Bytes to Megabyte Conversion +_BYTES_TO_KILOBYTES = 1024 +_BYTES_TO_MEGABYTES = _BYTES_TO_KILOBYTES * 1024 + +# All sampling method +_HEAD = "head" +_UNIFORM = "uniform" +_SAMPLING_METHODS = (_HEAD, _UNIFORM) + class BlockHolder(typing.Protocol): """Interface for mutable objects with state represented by a block value object.""" @@ -89,6 +102,11 @@ def __init__( raise ValueError( f"'value_columns' (size {len(self.value_columns)}) and 'column_labels' (size {len(self._column_labels)}) must have equal length" ) + # col_id -> [stat_name -> scalar] + # TODO: Preserve cache under safe transforms (eg. 
drop column, reorder) + self._stats_cache: dict[str, dict[str, typing.Any]] = { + col_id: {} for col_id in self.value_columns + } @property def index(self) -> indexes.IndexValue: @@ -241,7 +259,11 @@ def reset_index(self, drop: bool = True) -> Block: return block def set_index( - self, col_ids: typing.Sequence[str], drop: bool = True, append: bool = False + self, + col_ids: typing.Sequence[str], + drop: bool = True, + append: bool = False, + index_labels: typing.Sequence[Label] = (), ) -> Block: """Set the index of the block to @@ -249,6 +271,7 @@ def set_index( ids: columns to be converted to index columns drop: whether to drop the new index columns as value columns append: whether to discard the existing index or add on to it + index_labels: new index labels Returns: Block with new index @@ -269,6 +292,9 @@ def set_index( else: expr = expr.drop_columns(self.index_columns) + if index_labels: + new_index_labels = list(index_labels) + block = Block( expr, index_columns=new_index_columns, @@ -325,42 +351,222 @@ def _to_dataframe(self, result, schema: ibis_schema.Schema) -> pd.DataFrame: ) return df - def compute( - self, value_keys: Optional[Iterable[str]] = None, max_results=None + def to_pandas( + self, + value_keys: Optional[Iterable[str]] = None, + max_results: Optional[int] = None, + max_download_size: Optional[int] = None, + sampling_method: Optional[str] = None, + random_state: Optional[int] = None, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" + if max_download_size is None: + max_download_size = bigframes.options.sampling.max_download_size + if sampling_method is None: + sampling_method = ( + bigframes.options.sampling.sampling_method + if bigframes.options.sampling.sampling_method is not None + else _UNIFORM + ) + if random_state is None: + random_state = bigframes.options.sampling.random_state + + sampling_method = sampling_method.lower() + if sampling_method not in _SAMPLING_METHODS: + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) + df, _, query_job = self._compute_and_count( - value_keys=value_keys, max_results=max_results + value_keys=value_keys, + max_results=max_results, + max_download_size=max_download_size, + sampling_method=sampling_method, + random_state=random_state, ) return df, query_job def _compute_and_count( - self, value_keys: Optional[Iterable[str]] = None, max_results=None + self, + value_keys: Optional[Iterable[str]] = None, + max_results: Optional[int] = None, + max_download_size: Optional[int] = None, + sampling_method: Optional[str] = None, + random_state: Optional[int] = None, ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. 
- expr = self._expr + expr = self._apply_value_keys_to_expr(value_keys=value_keys) - value_column_names = value_keys or self.value_columns - if value_keys is not None: - index_columns = ( - expr.get_column(column_name) for column_name in self._index_columns + results_iterator, query_job = expr.start_query( + max_results=max_results, expose_extra_columns=True + ) + + table_size = expr._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + fraction = ( + max_download_size / table_size + if (max_download_size is not None) and (table_size != 0) + else 2 + ) + + if fraction < 1: + if not bigframes.options.sampling.enable_downsampling: + raise RuntimeError( + f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " + f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n" + "\t\t`bigframes.options.sampling.enable_downsampling = True`\n" + "\t* Update the global `max_download_size` option. Please make sure " + "there is enough memory available:\n" + "\t\t`bigframes.options.sampling.max_download_size = desired_size`" + " # Setting it to None will download all the data\n" + f"{constants.FEEDBACK_LINK}" + ) + + warnings.warn( + f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of" + f"({max_download_size} MB). It will be downsampled to {max_download_size} MB for download." + "\nPlease refer to the documentation for configuring the downloading limit.", + UserWarning, ) - value_columns = (expr.get_column(column_name) for column_name in value_keys) - expr = expr.projection(itertools.chain(index_columns, value_columns)) + if sampling_method == _HEAD: + total_rows = int(results_iterator.total_rows * fraction) + results_iterator.max_results = total_rows + df = self._to_dataframe(results_iterator, expr.to_ibis_expr().schema()) + + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + df.index.names = self.index.names # type: ignore + + df.drop( + [col for col in df.columns if col not in self.value_columns], + axis=1, + inplace=True, + ) + elif (sampling_method == _UNIFORM) and (random_state is None): + filtered_expr = self.expr._uniform_sampling(fraction) + block = Block( + filtered_expr, + index_columns=self.index_columns, + column_labels=self.column_labels, + index_labels=self.index.names, + ) + df, total_rows, _ = block._compute_and_count(max_download_size=None) + elif sampling_method == _UNIFORM: + block = self._split( + fracs=(max_download_size / table_size,), + random_state=random_state, + preserve_order=True, + )[0] + df, total_rows, _ = block._compute_and_count(max_download_size=None) + else: + # This part should never be called, just in case. + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." 
+ ) + else: + total_rows = results_iterator.total_rows + df = self._to_dataframe(results_iterator, expr.to_ibis_expr().schema()) + + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + df.index.names = self.index.names # type: ignore + + df.drop( + [col for col in df.columns if col not in self.value_columns], + axis=1, + inplace=True, + ) + + return df, total_rows, query_job - results_iterator, query_job = expr.start_query(max_results=max_results) - df = self._to_dataframe( - results_iterator, - expr.to_ibis_expr().schema(), + def _split( + self, + ns: Iterable[int] = (), + fracs: Iterable[float] = (), + *, + random_state: Optional[int] = None, + preserve_order: Optional[bool] = False, + ) -> List[Block]: + """Internal function to support splitting Block to multiple parts along index axis. + + At most one of ns and fracs can be passed in. If neither, default to ns = (1,). + Return a list of sampled Blocks. + """ + block = self + if ns and fracs: + raise ValueError("Only one of 'ns' or 'fracs' parameter must be specified.") + + if not ns and not fracs: + ns = (1,) + + if ns: + sample_sizes = ns + else: + total_rows = block.shape[0] + # Round to nearest integer. "round half to even" rule applies. + # At least to be 1. + sample_sizes = [round(frac * total_rows) or 1 for frac in fracs] + + if random_state is None: + random_state = random.randint(-(2**63), 2**63 - 1) + + # Create a new column with random_state value. + block, random_state_col = block.create_constant(str(random_state)) + + # Create an ordering col and convert to string + block, ordering_col = block.promote_offsets() + block, string_ordering_col = block.apply_unary_op( + ordering_col, ops.AsTypeOp("string[pyarrow]") + ) + + # Apply hash method to sum col and order by it. 
+ block, string_sum_col = block.apply_binary_op( + string_ordering_col, random_state_col, ops.concat_op ) + block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) + block = block.order_by([ordering.OrderingColumnReference(hash_string_sum_col)]) - df = df.loc[:, [*self.index_columns, *value_column_names]] - if self.index_columns: - df = df.set_index(list(self.index_columns)) - df.index.names = self.index.names # type: ignore + intervals = [] + cur = 0 - return df, results_iterator.total_rows, query_job + for sample_size in sample_sizes: + intervals.append((cur, cur + sample_size)) + cur += sample_size + + sliced_blocks = [ + typing.cast(Block, block.slice(start=lower, stop=upper)) + for lower, upper in intervals + ] + if preserve_order: + sliced_blocks = [ + sliced_block.order_by([ordering.OrderingColumnReference(ordering_col)]) + for sliced_block in sliced_blocks + ] + + drop_cols = [ + random_state_col, + ordering_col, + string_ordering_col, + string_sum_col, + hash_string_sum_col, + ] + return [sliced_block.drop_columns(drop_cols) for sliced_block in sliced_blocks] + + def _compute_dry_run( + self, value_keys: Optional[Iterable[str]] = None + ) -> bigquery.QueryJob: + expr = self._apply_value_keys_to_expr(value_keys=value_keys) + job_config = bigquery.QueryJobConfig(dry_run=True) + _, query_job = expr.start_query(job_config=job_config) + return query_job + + def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): + expr = self._expr + if value_keys is not None: + expr = expr.select_columns(itertools.chain(self._index_columns, value_keys)) + return expr def with_column_labels(self, value: typing.Iterable[Label]) -> Block: label_list = tuple(value) @@ -553,10 +759,17 @@ def assign_label(self, column_id: str, new_label: Label) -> Block: new_labels[col_index] = new_label return self.with_column_labels(new_labels) - def filter(self, column_name: str): + def filter(self, column_name: str, keep_null: bool = False): condition = typing.cast( ibis_types.BooleanValue, self._expr.get_column(column_name) ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) filtered_expr = self.expr.filter(condition) return Block( filtered_expr, @@ -576,10 +789,10 @@ def aggregate_all_and_pivot( aggregations = [(col_id, operation, col_id) for col_id in self.value_columns] result_expr = self.expr.aggregate( aggregations, dropna=dropna - ).transpose_single_row( - labels=self.column_labels, + ).unpivot_single_row( + row_labels=self.column_labels, index_col_id="index", - value_col_id=value_col_id, + unpivot_columns=[(value_col_id, self.value_columns)], dtype=dtype, ) return Block(result_expr, index_columns=["index"], column_labels=[None]) @@ -614,8 +827,8 @@ def rename(self, *, columns: typing.Mapping[Label, Label]): def aggregate( self, - by_column_ids: typing.Sequence[str], - aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp]], + by_column_ids: typing.Sequence[str] = (), + aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp]] = (), *, as_index: bool = True, dropna: bool = True, @@ -623,7 +836,7 @@ def aggregate( """ Apply aggregations to the block. Callers responsible for setting index column(s) after. Arguments: - by_column_id: column id of the aggregation key, this is preserved through the transform and used as index + by_column_id: column id of the aggregation key, this is preserved through the transform and used as index. 
aggregations: input_column_id, operation tuples as_index: if True, grouping keys will be index columns in result, otherwise they will be non-index columns. dropna: whether null keys should be dropped @@ -639,15 +852,12 @@ def aggregate( [agg[0] for agg in aggregations] ) if as_index: - # TODO: Generalize to multi-index names: typing.List[Label] = [] for by_col_id in by_column_ids: - if by_col_id in self.index_columns: - # Groupby level 0 case, keep index name - index_name = self.col_id_to_index_name[by_col_id] + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) else: - index_name = self.col_id_to_label[by_col_id] - names.append(index_name) + names.append(self.col_id_to_index_name[by_col_id]) return ( Block( result_expr, @@ -657,10 +867,89 @@ def aggregate( ), output_col_ids, ) - else: - by_column_labels = self._get_labels_for_columns(by_column_ids) + else: # as_index = False + # If as_index=False, drop grouping levels, but keep grouping value columns + by_value_columns = [ + col for col in by_column_ids if col in self.value_columns + ] + by_column_labels = self._get_labels_for_columns(by_value_columns) labels = (*by_column_labels, *aggregate_labels) - return Block(result_expr, column_labels=labels), output_col_ids + result_expr_pruned = result_expr.select_columns( + [*by_value_columns, *output_col_ids] + ) + return Block(result_expr_pruned, column_labels=labels), output_col_ids + + def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): + """Gets aggregates immediately, and caches it""" + if stat.name in self._stats_cache[column_id]: + return self._stats_cache[column_id][stat.name] + + # TODO: Convert nonstandard stats into standard stats where possible (popvar, etc.) + # if getting a standard stat, just go get the rest of them + standard_stats = self._standard_stats(column_id) + stats_to_fetch = standard_stats if stat in standard_stats else [stat] + + aggregations = [(column_id, stat, stat.name) for stat in stats_to_fetch] + expr = self.expr.aggregate(aggregations) + block = Block(expr, column_labels=[s.name for s in stats_to_fetch]) + df, _ = block.to_pandas() + + # Carefully extract stats such that they aren't coerced to a common type + stats_map = {stat_name: df.loc[0, stat_name] for stat_name in df.columns} + self._stats_cache[column_id].update(stats_map) + return stats_map[stat.name] + + def summarize( + self, + column_ids: typing.Sequence[str], + stats: typing.Sequence[agg_ops.AggregateOp], + ): + """Get a list of stats as a deferred block object.""" + label_col_id = guid.generate_guid() + labels = [stat.name for stat in stats] + aggregations = [ + (col_id, stat, f"{col_id}-{stat.name}") + for stat in stats + for col_id in column_ids + ] + columns = [ + (col_id, [f"{col_id}-{stat.name}" for stat in stats]) + for col_id in column_ids + ] + expr = self.expr.aggregate(aggregations).unpivot_single_row( + labels, + unpivot_columns=columns, + index_col_id=label_col_id, + ) + labels = self._get_labels_for_columns(column_ids) + return Block(expr, column_labels=labels, index_columns=[label_col_id]) + + def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.AggregateOp]: + """ + Gets a standard set of stats to preemptively fetch for a column if + any other stat is fetched. + Helps prevent repeat scanning of the same column to fetch statistics. + Standard stats should be: + - commonly used + - efficiently computable. 
+ """ + # TODO: annotate aggregations themself with this information + dtype = self.expr.get_column_type(column_id) + stats: list[agg_ops.AggregateOp] = [agg_ops.count_op] + if dtype not in bigframes.dtypes.UNORDERED_DTYPES: + stats += [agg_ops.min_op, agg_ops.max_op] + if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + # Notable exclusions: + # prod op tends to cause overflows + # Also, var_op is redundant as can be derived from std + stats += [ + agg_ops.std_op, + agg_ops.mean_op, + agg_ops.var_op, + agg_ops.sum_op, + ] + + return stats def _get_labels_for_columns(self, column_ids: typing.Sequence[str]): """Get column label for value columns, or index name for index columns""" @@ -699,6 +988,29 @@ def slice( ) return block + def retrieve_repr_request_results( + self, max_results: int + ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: + """ + Retrieves a pandas dataframe containing only max_results many rows for use + with printing methods. + + Returns a tuple of the dataframe and the overall number of rows of the query. + """ + # TODO(swast): Select a subset of columns if max_columns is less than the + # number of columns in the schema. + count = self.shape[0] + if count > max_results: + head_block = self.slice(0, max_results) + computed_df, query_job = head_block.to_pandas(max_results=max_results) + else: + head_block = self + computed_df, query_job = head_block.to_pandas() + formatted_df = computed_df.set_axis(self.column_labels, axis=1) + # we reset the axis and substitute the bf index name for the default + formatted_df.index.name = self.index.name + return formatted_df, count, query_job + def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: expr, result_id = self._expr.promote_offsets() return ( @@ -712,7 +1024,7 @@ def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: ) def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: - axis_number = _get_axis_number(axis) + axis_number = bigframes.core.utils.get_axis_number(axis) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -735,7 +1047,7 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: ) def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: - axis_number = _get_axis_number(axis) + axis_number = bigframes.core.utils.get_axis_number(axis) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -788,6 +1100,15 @@ def concat( result_block = result_block.reset_index() return result_block + def _force_reproject(self) -> Block: + """Forces a reprojection of the underlying tables expression. Used to force predicate/order application before subsequent operations.""" + return Block( + self._expr._reproject_to_table(), + index_columns=self.index_columns, + column_labels=self.column_labels, + index_labels=self.index.names, + ) + def block_from_local(data, session=None, use_index=True) -> Block: # TODO(tbergeron): Handle duplicate column labels @@ -795,14 +1116,20 @@ def block_from_local(data, session=None, use_index=True) -> Block: column_labels = list(pd_data.columns) if not all((label is None) or isinstance(label, str) for label in column_labels): - raise NotImplementedError("Only string column labels supported") + raise NotImplementedError( + f"Only string column labels supported. 
{constants.FEEDBACK_LINK}" + ) if use_index: if pd_data.index.nlevels > 1: - raise NotImplementedError("multi-indices not supported.") + raise NotImplementedError( + f"multi-indices not supported. {constants.FEEDBACK_LINK}" + ) index_label = pd_data.index.name if (index_label is not None) and (not isinstance(index_label, str)): - raise NotImplementedError("Only string index names supported") + raise NotImplementedError( + f"Only string index names supported. {constants.FEEDBACK_LINK}" + ) index_id = guid.generate_guid() pd_data = pd_data.reset_index(names=index_id) @@ -853,11 +1180,11 @@ def _align_indices(blocks: typing.Sequence[Block]) -> typing.Sequence[Label]: for block in blocks[1:]: if len(names) != block.index.nlevels: raise NotImplementedError( - "Cannot combine indices with different number of levels. Use 'ignore_index'=True." + f"Cannot combine indices with different number of levels. Use 'ignore_index'=True. {constants.FEEDBACK_LINK}" ) if block.index.dtypes != types: raise NotImplementedError( - "Cannot combine different index dtypes. Use 'ignore_index'=True." + f"Cannot combine different index dtypes. Use 'ignore_index'=True. {constants.FEEDBACK_LINK}" ) names = [ lname if lname == rname else None @@ -875,7 +1202,7 @@ def _combine_schema_inner( if label in right: if type != right[label]: raise ValueError( - f"Cannot concat rows with label {label} due to mismatched types" + f"Cannot concat rows with label {label} due to mismatched types. {constants.FEEDBACK_LINK}" ) result[label] = type return result @@ -889,7 +1216,7 @@ def _combine_schema_outer( for label, type in left.items(): if (label in right) and (type != right[label]): raise ValueError( - f"Cannot concat rows with label {label} due to mismatched types" + f"Cannot concat rows with label {label} due to mismatched types. 
{constants.FEEDBACK_LINK}" ) result[label] = type for label, type in right.items(): @@ -906,12 +1233,3 @@ def _get_block_schema( for label, dtype in zip(block.column_labels, block.dtypes): result[label] = typing.cast(bigframes.dtypes.Dtype, dtype) return result - - -def _get_axis_number(axis: str | int | None) -> typing.Literal[0, 1]: - if axis in {0, "index", "rows", None}: - return 0 - elif axis in {1, "columns"}: - return 1 - else: - raise ValueError(f"Not a valid axis: {axis}") diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index a8655116e9..288dcdd5b0 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -16,12 +16,16 @@ import typing +import pandas as pd +import typing_extensions + +import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.ordering as order import bigframes.core.window as windows import bigframes.dataframe as df -import bigframes.dtypes +import bigframes.dtypes as dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series as series @@ -36,6 +40,7 @@ def __init__( block: blocks.Block, by_col_ids: typing.Sequence[str], *, + selected_cols: typing.Optional[typing.Sequence[str]] = None, dropna: bool = True, as_index: bool = True, ): @@ -48,8 +53,52 @@ def __init__( ) } self._by_col_ids = by_col_ids - self._dropna = dropna # Applies to aggregations but not windowing + + self._dropna = dropna self._as_index = as_index + if selected_cols: + for col in selected_cols: + if col not in self._block.value_columns: + raise ValueError(f"Invalid column selection: {col}") + self._selected_cols = selected_cols + else: + self._selected_cols = [ + col_id + for col_id in self._block.value_columns + if col_id not in self._by_col_ids + ] + + def __getitem__( + self, + key: typing.Union[ + blocks.Label, + typing.Sequence[blocks.Label], + ], + ): + if _is_list_like(key): + keys = list(key) + else: + keys = [key] + columns = [ + col_id for col_id, label in self._col_id_labels.items() if label in keys + ] + + if len(columns) > 1 or (not self._as_index): + return DataFrameGroupBy( + self._block, + self._by_col_ids, + selected_cols=columns, + dropna=self._dropna, + as_index=self._as_index, + ) + else: + return SeriesGroupBy( + self._block, + columns[0], + self._by_col_ids, + value_name=self._col_id_labels[columns[0]], + dropna=self._dropna, + ) def sum(self, numeric_only: bool = False, *args) -> df.DataFrame: if not numeric_only: @@ -61,15 +110,22 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: self._raise_on_non_numeric("mean") return self._aggregate(agg_ops.mean_op, numeric_only=True) - def min(self, numeric_only: bool = False, *args) -> df.DataFrame: + def median( + self, numeric_only: bool = False, *, exact: bool = False + ) -> df.DataFrame: + if exact: + raise NotImplementedError( + f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" + ) if not numeric_only: - self._raise_on_non_numeric("min") - return self._aggregate(agg_ops.min_op, numeric_only=True) + self._raise_on_non_numeric("median") + return self._aggregate(agg_ops.median_op, numeric_only=True) + + def min(self, numeric_only: bool = False, *args) -> df.DataFrame: + return self._aggregate(agg_ops.min_op, numeric_only=numeric_only) def max(self, numeric_only: bool = False, *args) -> df.DataFrame: - if not numeric_only: - self._raise_on_non_numeric("max") - return self._aggregate(agg_ops.max_op, numeric_only=True) + return self._aggregate(agg_ops.max_op, numeric_only=numeric_only) def std( self, @@ -101,45 +157,40 @@ def count(self) -> df.DataFrame: def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("cumsum") - window = bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0) - return self._apply_window_op(agg_ops.sum_op, window, numeric_only=True) + return self._apply_window_op(agg_ops.sum_op, numeric_only=True) def cummin(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: - if not numeric_only: - self._raise_on_non_numeric("cummin") - window = bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0) - return self._apply_window_op(agg_ops.min_op, window, numeric_only=True) + return self._apply_window_op(agg_ops.min_op, numeric_only=numeric_only) def cummax(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: - if not numeric_only: - self._raise_on_non_numeric("cummax") - window = bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0) - return self._apply_window_op(agg_ops.max_op, window, numeric_only=True) + return self._apply_window_op(agg_ops.max_op, numeric_only=numeric_only) def cumprod(self, *args, **kwargs) -> df.DataFrame: - window = bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0) - return self._apply_window_op(agg_ops.product_op, window, numeric_only=True) + return self._apply_window_op(agg_ops.product_op, numeric_only=True) def _raise_on_non_numeric(self, op: str): if not all( - dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES - for dtype in self._block.dtypes + dtype in dtypes.NUMERIC_BIGFRAMES_TYPES for dtype in self._block.dtypes ): raise NotImplementedError( - f"'{op}' does not support non-numeric columns. Set 'numeric_only'=True to ignore non-numeric columns" + f"'{op}' does not support non-numeric columns. " + "Set 'numeric_only'=True to ignore non-numeric columns. 
" + f"{constants.FEEDBACK_LINK}" ) return self - def _aggregated_columns(self, numeric_only: bool = False): - return [ - col_id - for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if col_id not in self._by_col_ids - and ( - (not numeric_only) - or (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) - ) - ] + def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]: + valid_agg_cols: list[str] = [] + for col_id in self._selected_cols: + is_numeric = self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES + if is_numeric or not numeric_only: + valid_agg_cols.append(col_id) + return valid_agg_cols + + def _column_type(self, col_id: str) -> dtypes.Dtype: + col_offset = self._block.value_columns.index(col_id) + dtype = self._block.dtypes[col_offset] + return dtype def _aggregate( self, aggregate_op: agg_ops.AggregateOp, numeric_only: bool = False @@ -147,8 +198,8 @@ def _aggregate( aggregated_col_ids = self._aggregated_columns(numeric_only=numeric_only) aggregations = [(col_id, aggregate_op) for col_id in aggregated_col_ids] result_block, _ = self._block.aggregate( - self._by_col_ids, - aggregations, + by_column_ids=self._by_col_ids, + aggregations=aggregations, as_index=self._as_index, dropna=self._dropna, ) @@ -157,15 +208,16 @@ def _aggregate( def _apply_window_op( self, op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, + window: typing.Optional[core.WindowSpec] = None, numeric_only: bool = False, ): + """Apply window op to groupby. Defaults to grouped cumulative window.""" + window_spec = window or core.WindowSpec( + grouping_keys=self._by_col_ids, following=0 + ) columns = self._aggregated_columns(numeric_only=numeric_only) - block = self._block.select_columns([*columns, *window_spec.grouping_keys]) block = self._block.multi_apply_window_op( - columns, - op, - window_spec=window_spec, + columns, op, window_spec=window_spec, skip_null_groups=self._dropna ) block = block.select_columns(columns) return df.DataFrame(block) @@ -190,7 +242,7 @@ def __init__( self._dropna = dropna # Applies to aggregations but not windowing @property - def value(self): + def _value(self): return self._block.expr.get_column(self._value_column) def all(self) -> series.Series: @@ -199,16 +251,24 @@ def all(self) -> series.Series: def any(self) -> series.Series: return self._aggregate(agg_ops.any_op) + def min(self, *args) -> series.Series: + return self._aggregate(agg_ops.min_op) + + def max(self, *args) -> series.Series: + return self._aggregate(agg_ops.max_op) + def count(self) -> series.Series: return self._aggregate(agg_ops.count_op) def sum(self, *args) -> series.Series: - """Sums the numeric values for each group in the series. 
Ignores null/nan.""" return self._aggregate(agg_ops.sum_op) def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) + def median(self, *args, **kwargs) -> series.Series: + return self._aggregate(agg_ops.median_op) + def std(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.std_op) @@ -221,45 +281,39 @@ def prod(self, *args) -> series.Series: def cumsum(self, *args, **kwargs) -> series.Series: return self._apply_window_op( agg_ops.sum_op, - bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0), ) def cumprod(self, *args, **kwargs) -> series.Series: return self._apply_window_op( agg_ops.product_op, - bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0), ) def cummax(self, *args, **kwargs) -> series.Series: return self._apply_window_op( agg_ops.max_op, - bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0), ) def cummin(self, *args, **kwargs) -> series.Series: return self._apply_window_op( agg_ops.min_op, - bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0), ) def cumcount(self, *args, **kwargs) -> series.Series: return self._apply_window_op( agg_ops.rank_op, - bigframes.core.WindowSpec(grouping_keys=self._by_col_ids, following=0), discard_name=True, )._apply_unary_op(ops.partial_right(ops.sub_op, 1)) def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" - window = bigframes.core.WindowSpec( + window = core.WindowSpec( grouping_keys=self._by_col_ids, preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) - return self._apply_window_op(agg_ops.ShiftOp(periods), window) + return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) def diff(self) -> series.Series: - """Difference between each element and previous element.""" return self._ungroup() - self.shift(1) def rolling(self, window: int, min_periods=None) -> windows.Window: @@ -278,7 +332,9 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, following=0, min_periods=min_periods + grouping_keys=self._by_col_ids, + following=0, + min_periods=min_periods, ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], @@ -301,9 +357,14 @@ def _aggregate(self, aggregate_op: agg_ops.AggregateOp) -> series.Series: def _apply_window_op( self, op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, discard_name=False, + window: typing.Optional[core.WindowSpec] = None, ): + """Apply window op to groupby. 
Defaults to grouped cumulative window.""" + window_spec = window or core.WindowSpec( + grouping_keys=self._by_col_ids, following=0 + ) + label = self._value_name if not discard_name else None block, result_id = self._block.apply_window_op( self._value_column, @@ -313,3 +374,7 @@ def _apply_window_op( skip_null_groups=self._dropna, ) return series.Series(block.select_column(result_id)) + + +def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: + return pd.api.types.is_list_like(obj) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index f65cb02941..0aaf169bea 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -20,6 +20,7 @@ import ibis import pandas as pd +import bigframes.constants as constants import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes @@ -47,10 +48,14 @@ def __setitem__(self, key, value) -> None: # TODO(swast): support MultiIndex if isinstance(key, slice): # TODO(swast): Implement loc with slices. - raise NotImplementedError("loc does not yet support slices") + raise NotImplementedError( + f"loc does not yet support slices. {constants.FEEDBACK_LINK}" + ) elif isinstance(key, list): # TODO(tbergeron): Implement loc for index label list. - raise NotImplementedError("loc does not yet support index label lists") + raise NotImplementedError( + f"loc does not yet support index label lists. {constants.FEEDBACK_LINK}" + ) # Assume the key is for the index label. block = self._series._block @@ -149,6 +154,7 @@ def __setitem__( ): raise NotImplementedError( "Only setting a column by DataFrame.loc[:, 'column'] is supported." + f"{constants.FEEDBACK_LINK}" ) # TODO(swast): Support setting multiple columns with key[1] as a list @@ -213,19 +219,45 @@ def _loc_getitem_series_or_dataframe( typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) + + # We can't upload a DataFrame with None as the column name, so set it + # an arbitrary string. index_name = series_or_dataframe.index.name + index_name_is_none = index_name is None + if index_name_is_none: + index_name = "unnamed_col" + keys_df = bigframes.dataframe.DataFrame( {index_name: key}, session=series_or_dataframe._get_block().expr._session ) keys_df = keys_df.set_index(index_name, drop=True) + + if index_name_is_none: + keys_df.index.name = None return _perform_loc_list_join(series_or_dataframe, keys_df) elif isinstance(key, slice): - return series_or_dataframe._slice(key.start, key.stop, key.step) + if (key.start is None) and (key.stop is None) and (key.step is None): + return series_or_dataframe.copy() + raise NotImplementedError( + f"loc does not yet support indexing with a slice. {constants.FEEDBACK_LINK}" + ) elif callable(key): - raise NotImplementedError("loc does not yet support indexing with a callable") + raise NotImplementedError( + f"loc does not yet support indexing with a callable. {constants.FEEDBACK_LINK}" + ) + elif pd.api.types.is_scalar(key): + index_name = "unnamed_col" + keys_df = bigframes.dataframe.DataFrame( + {index_name: [key]}, session=series_or_dataframe._get_block().expr._session + ) + keys_df = keys_df.set_index(index_name, drop=True) + keys_df.index.name = None + return _perform_loc_list_join(series_or_dataframe, keys_df) else: raise TypeError( - "Invalid argument type. loc currently only supports indexing with a boolean bigframes Series or a list of index entries." + "Invalid argument type. 
loc currently only supports indexing with a " + "boolean bigframes Series, a list of index entries or a single index entry. " + f"{constants.FEEDBACK_LINK}" ) @@ -257,11 +289,13 @@ def _perform_loc_list_join( name = series_or_dataframe.name if series_or_dataframe.name is not None else "0" result = typing.cast( bigframes.series.Series, - series_or_dataframe.to_frame().join(keys_df, how="right")[name], + series_or_dataframe.to_frame()._perform_join_by_index(keys_df, how="right")[ + name + ], ) result = result.rename(original_name) else: - result = series_or_dataframe.join(keys_df, how="right") # type: ignore + result = series_or_dataframe._perform_join_by_index(keys_df, how="right") # type: ignore result = result.rename_axis(original_index_names) return result @@ -284,12 +318,8 @@ def _iloc_getitem_series_or_dataframe( series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, key ) -> bigframes.dataframe.DataFrame | bigframes.series.Series | bigframes.core.scalar.Scalar | pd.Series: if isinstance(key, int): - if key < 0: - raise NotImplementedError( - "iloc does not yet support negative single positional index" - ) internal_slice_result = series_or_dataframe._slice(key, key + 1, 1) - result_pd_df = internal_slice_result.compute() + result_pd_df = internal_slice_result.to_pandas() if result_pd_df.empty: raise IndexError("single positional indexer is out-of-bounds") return result_pd_df.iloc[0] @@ -329,9 +359,11 @@ def _iloc_getitem_series_or_dataframe( elif isinstance(key, tuple): raise NotImplementedError( - "iloc does not yet support indexing with a (row, column) tuple" + f"iloc does not yet support indexing with a (row, column) tuple. {constants.FEEDBACK_LINK}" ) elif callable(key): - raise NotImplementedError("iloc does not yet support indexing with a callable") + raise NotImplementedError( + f"iloc does not yet support indexing with a callable. {constants.FEEDBACK_LINK}" + ) else: - raise TypeError("Invalid argument type.") + raise TypeError(f"Invalid argument type. {constants.FEEDBACK_LINK}") diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index f7fa7ac119..bb1fcc4b17 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -20,8 +20,9 @@ from typing import Callable, Tuple import numpy as np -import pandas as pd +import pandas +import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.joins as joins @@ -45,6 +46,7 @@ def name(self, value: blocks.Label): @property def names(self) -> typing.Sequence[blocks.Label]: + """Returns the names of the Index.""" return self._data._get_block()._index_labels @names.setter @@ -57,21 +59,40 @@ def shape(self) -> typing.Tuple[int]: @property def size(self) -> int: + """Returns the size of the Index.""" return self.shape[0] @property def empty(self) -> bool: + """Returns True if the Index is empty, otherwise returns False.""" return self.shape[0] == 0 - def to_pandas(self) -> pd.Index: - """Get the Index as a pandas Index.""" - return IndexValue(self._data._get_block()).compute() + def __getitem__(self, key: int) -> typing.Any: + if isinstance(key, int): + result_pd_df, _ = self._data._get_block().slice(key, key + 1, 1).to_pandas() + if result_pd_df.empty: + raise IndexError("single positional indexer is out-of-bounds") + return result_pd_df.index[0] + else: + raise NotImplementedError(f"Index key not supported {key}") + + def to_pandas(self) -> pandas.Index: + """Gets the Index as a pandas Index. 
+ + Returns: + pandas.Index: + A pandas Index with all of the labels from this Index. + """ + return IndexValue(self._data._get_block()).to_pandas() + + def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: + return self.to_pandas().to_numpy(dtype, **kwargs) + + __array__ = to_numpy def __len__(self): return self.shape[0] - compute = to_pandas - class IndexValue: """An immutable index.""" @@ -107,10 +128,10 @@ def __repr__(self) -> str: # maybe we just print the job metadata that we have so far? # TODO(swast): Avoid downloading the whole index by using job # metadata, like we do with DataFrame. - preview = self.compute() + preview = self.to_pandas() return repr(preview) - def compute(self) -> pd.Index: + def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. index_column = self._block.index_columns[0] @@ -123,26 +144,35 @@ def compute(self) -> pd.Index: return index def join( - self, other: IndexValue, *, how="left", sort=False + self, + other: IndexValue, + *, + how="left", + sort=False, + block_identity_join: bool = False, ) -> Tuple[IndexValue, Tuple[Callable[[str], str], Callable[[str], str]],]: if not isinstance(other, IndexValue): # TODO(swast): We need to improve this error message to be more # actionable for the user. For example, it's possible they # could call set_index and try again to resolve this error. raise ValueError( - "Can't mixed objects with explicit Index and ImpliedJoiner" + f"Tried to join with an unexpected type: {type(other)}. {constants.FEEDBACK_LINK}" ) # TODO(swast): Support cross-joins (requires reindexing). if how not in {"outer", "left", "right", "inner"}: raise NotImplementedError( - "Only how='outer','left','right','inner' currently supported" + f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) if self.nlevels == other.nlevels == 1: - return join_mono_indexed(self, other, how=how, sort=sort) + return join_mono_indexed( + self, other, how=how, sort=sort, block_identity_join=block_identity_join + ) else: # Always sort mult-index join - return join_multi_indexed(self, other, how=how, sort=sort) + return join_multi_indexed( + self, other, how=how, sort=sort, block_identity_join=block_identity_join + ) def resolve_level_name(self: IndexValue, label: blocks.Label) -> str: matches = self._block.index_name_to_col_id.get(label, []) @@ -157,7 +187,12 @@ def is_uniquely_named(self: IndexValue): def join_mono_indexed( - left: IndexValue, right: IndexValue, *, how="left", sort=False + left: IndexValue, + right: IndexValue, + *, + how="left", + sort=False, + block_identity_join: bool = False, ) -> Tuple[IndexValue, Tuple[Callable[[str], str], Callable[[str], str]],]: ( combined_expr, @@ -170,6 +205,7 @@ def join_mono_indexed( right._block.index_columns, how=how, sort=sort, + allow_row_identity_join=(not block_identity_join), ) # Drop original indices from each side. and used the coalesced combination generated by the join. 
left_indices = [get_column_left(col_id) for col_id in left._block.index_columns] @@ -188,7 +224,12 @@ def join_mono_indexed( def join_multi_indexed( - left: IndexValue, right: IndexValue, *, how="left", sort=False + left: IndexValue, + right: IndexValue, + *, + how="left", + sort=False, + block_identity_join: bool = False, ) -> Tuple[IndexValue, Tuple[Callable[[str], str], Callable[[str], str]],]: if not (left.is_uniquely_named() and right.is_uniquely_named()): raise ValueError("Joins not supported on indices with non-unique level names") @@ -203,6 +244,7 @@ def join_multi_indexed( left_join_ids = [left.resolve_level_name(name) for name in common_names] right_join_ids = [right.resolve_level_name(name) for name in common_names] + names_fully_match = len(left_only_names) == 0 and len(right_only_names) == 0 ( combined_expr, joined_index_col_names, @@ -214,6 +256,9 @@ def join_multi_indexed( right_join_ids, how=how, sort=sort, + # If we're only joining on a subset of the index columns, we need to + # perform a true join. + allow_row_identity_join=names_fully_match and not block_identity_join, ) # Drop original indices from each side. and used the coalesced combination generated by the join. combined_expr = combined_expr.drop_columns( diff --git a/bigframes/core/io.py b/bigframes/core/io.py new file mode 100644 index 0000000000..6b7ed52488 --- /dev/null +++ b/bigframes/core/io.py @@ -0,0 +1,90 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Private module: Helpers for I/O operations.""" + +import datetime +import textwrap +from typing import Dict, Union + +import google.cloud.bigquery as bigquery + +IO_ORDERING_ID = "bqdf_row_nums" + + +def create_export_csv_statement( + table_id: str, uri: str, field_delimiter: str, header: bool +) -> str: + return create_export_data_statement( + table_id, + uri, + "CSV", + { + "field_delimiter": field_delimiter, + "header": header, + }, + ) + + +def create_export_data_statement( + table_id: str, uri: str, format: str, export_options: Dict[str, Union[bool, str]] +) -> str: + all_options: Dict[str, Union[bool, str]] = { + "uri": uri, + "format": format, + # TODO(swast): Does pandas have an option not to overwrite files? + "overwrite": True, + } + all_options.update(export_options) + export_options_str = ", ".join( + format_option(key, value) for key, value in all_options.items() + ) + # Manually generate ORDER BY statement since ibis will not always generate + # it in the top level statement. This causes BigQuery to then run + # non-distributed sort and run out of memory. + return textwrap.dedent( + f""" + EXPORT DATA + OPTIONS ( + {export_options_str} + ) AS + SELECT * EXCEPT ({IO_ORDERING_ID}) + FROM `{table_id}` + ORDER BY {IO_ORDERING_ID} + """ + ) + + +def create_snapshot_sql( + table_ref: bigquery.TableReference, current_timestamp: datetime.datetime +) -> str: + """Query a table via 'time travel' for consistent reads.""" + + # If we have a _SESSION table, assume that it's already a copy. Nothing to do here. 
+ if table_ref.dataset_id.upper() == "_SESSION": + return f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" + + return textwrap.dedent( + f""" + SELECT * + FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}` + FOR SYSTEM_TIME AS OF TIMESTAMP({repr(current_timestamp.isoformat())}) + """ + ) + + +def format_option(key: str, value: Union[bool, str]) -> str: + if isinstance(value, bool): + return f"{key}=true" if value else f"{key}=false" + return f"{key}={repr(value)}" diff --git a/bigframes/core/joins/row_identity.py b/bigframes/core/joins/row_identity.py index b77b5d80ad..66eb223990 100644 --- a/bigframes/core/joins/row_identity.py +++ b/bigframes/core/joins/row_identity.py @@ -23,6 +23,7 @@ import ibis import ibis.expr.types as ibis_types +import bigframes.constants as constants import bigframes.core as core SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} @@ -33,7 +34,9 @@ def join_by_row_identity( ) -> Tuple[core.ArrayValue, Tuple[Callable[[str], str], Callable[[str], str]],]: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: - raise NotImplementedError("Only how='outer','left','inner' currently supported") + raise NotImplementedError( + f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" + ) if not left.table.equals(right.table): raise ValueError( @@ -67,47 +70,41 @@ def join_by_row_identity( for key in right.column_names.keys() ] - hidden_ordering_columns = [] - new_ordering = core.ExpressionOrdering() - if left._ordering and right._ordering: - # These ordering columns will be present in the ArrayValue, as we - # haven't hidden any value / index column(s). Code that is aware of - # which columns are index columns / value columns columns will need to - # add the previous columns to hidden columns. 
- new_ordering = left._ordering.with_ordering_columns( - [ - col_ref.with_name(map_left_id(col_ref.column_id)) - for col_ref in left._ordering.ordering_value_columns - ] - + [ - col_ref.with_name(map_right_id(col_ref.column_id)) - for col_ref in right._ordering.ordering_value_columns - ] - ) - - hidden_ordering_columns = [ - left._get_hidden_ordering_column(key.column_id).name( - map_left_id(key.column_id) - ) - for key in left._ordering.ordering_value_columns - if key.column_id in left._hidden_ordering_column_names.keys() + # If left isn't being masked, can just use left ordering + if not left_mask: + col_mapping = { + order_ref.column_id: map_left_id(order_ref.column_id) + for order_ref in left._ordering.ordering_value_columns + } + new_ordering = left._ordering.with_column_remap(col_mapping) + else: + ordering_columns = [ + col_ref.with_name(map_left_id(col_ref.column_id)) + for col_ref in left._ordering.ordering_value_columns ] + [ - right._get_hidden_ordering_column(key.column_id).name( - map_right_id(key.column_id) - ) - for key in right._ordering.ordering_value_columns - if key.column_id in right._hidden_ordering_column_names.keys() + col_ref.with_name(map_right_id(col_ref.column_id)) + for col_ref in right._ordering.ordering_value_columns ] + left_total_order_cols = frozenset( + map_left_id(col) for col in left._ordering.total_ordering_columns + ) + # Assume that left ordering is sufficient since 1:1 join over same base table + join_total_order_cols = left_total_order_cols + new_ordering = core.ExpressionOrdering( + ordering_columns, total_ordering_columns=join_total_order_cols + ) - left_ordering_id = left._ordering.ordering_id - if left_ordering_id: - new_ordering = new_ordering.with_ordering_id(map_left_id(left_ordering_id)) - if left_ordering_id in left._hidden_ordering_column_names.keys(): - hidden_ordering_columns.append( - left._get_hidden_ordering_column(left_ordering_id).name( - map_left_id(left_ordering_id) - ) - ) + hidden_ordering_columns = [ + left._get_hidden_ordering_column(key.column_id).name(map_left_id(key.column_id)) + for key in left._ordering.ordering_value_columns + if key.column_id in left._hidden_ordering_column_names.keys() + ] + [ + right._get_hidden_ordering_column(key.column_id).name( + map_right_id(key.column_id) + ) + for key in right._ordering.ordering_value_columns + if key.column_id in right._hidden_ordering_column_names.keys() + ] joined_expr = core.ArrayValue( left._session, @@ -169,7 +166,9 @@ def _join_predicates( ) return (*left_predicates, *right_relative_predicates) else: - raise ValueError("Unsupported join_type: " + join_type) + raise ValueError( + f"Unsupported join_type: {join_type}. {constants.FEEDBACK_LINK}" + ) def _get_relative_predicates( diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 34873e6f7e..4c865fffdf 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -23,6 +23,7 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types +import bigframes.constants as constants import bigframes.core as core import bigframes.core.guid import bigframes.core.joins.row_identity @@ -42,7 +43,8 @@ def join_by_column( "right", ], sort: bool = False, - get_both_join_key_cols: bool = False, + coalesce_join_keys: bool = True, + allow_row_identity_join: bool = True, ) -> Tuple[ core.ArrayValue, typing.Sequence[str], @@ -56,8 +58,11 @@ def join_by_column( right: Expression for right table to join. 
right_column_ids: Column IDs (not label) to join by. how: The type of join to perform. - get_both_join_key_cols: if set to True, returned column ids will contain + coalesce_join_keys: if set to False, returned column ids will contain both left and right join key columns. + allow_row_identity_join (bool): + If True, allow matching by row identity. Set to False to always + perform a true JOIN in generated SQL. Returns: The joined expression and the objects needed to interpret it. @@ -66,19 +71,22 @@ def join_by_column( * Sequence[str]: Column IDs of the coalesced join columns. Sometimes either the left/right table will have missing rows. This column pulls the non-NULL value from either left/right. - If get_both_join_key_cols is True, will return uncombined left and + If coalesce_join_keys is False, will return uncombined left and right key columns. * Tuple[Callable, Callable]: For a given column ID from left or right, respectively, return the new column id from the combined expression. """ - if ( - how in bigframes.core.joins.row_identity.SUPPORTED_ROW_IDENTITY_HOW + allow_row_identity_join + and how in bigframes.core.joins.row_identity.SUPPORTED_ROW_IDENTITY_HOW and left.table.equals(right.table) - # Compare ibis expressions for left/right columns because its possible that - # they both have the same names but were modified in different ways. + # Make sure we're joining on exactly the same column(s), at least with + # regards to value its possible that they both have the same names but + # were modified in different ways. Ignore differences in the names. and all( - left.get_any_column(lcol).equals(right.get_any_column(rcol)) + left.get_any_column(lcol) + .name("index") + .equals(right.get_any_column(rcol).name("index")) for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): @@ -86,15 +94,42 @@ def join_by_column( get_column_left, get_column_right, ) = bigframes.core.joins.row_identity.join_by_row_identity(left, right, how=how) - original_ordering = combined_expr._ordering + left_join_keys = [ + combined_expr.get_column(get_column_left(col)) for col in left_column_ids + ] + right_join_keys = [ + combined_expr.get_column(get_column_right(col)) for col in right_column_ids + ] + join_key_cols = get_join_cols( + left_join_keys, right_join_keys, how, coalesce_join_keys + ) + join_key_ids = [col.get_name() for col in join_key_cols] + combined_expr = combined_expr.projection( + [*join_key_cols, *combined_expr.columns] + ) + if sort: + combined_expr = combined_expr.order_by( + [ + core.OrderingColumnReference(join_col_id) + for join_col_id in join_key_ids + ] + ) + return ( + combined_expr, + join_key_ids, + ( + get_column_left, + get_column_right, + ), + ) else: # Generate offsets if non-default ordering is applied # Assumption, both sides are totally ordered, otherwise offsets will be nondeterministic left_table = left.to_ibis_expr( - ordering_mode="ordered_col", order_col_name=core.ORDER_ID_COLUMN + ordering_mode="string_encoded", order_col_name=core.ORDER_ID_COLUMN ) right_table = right.to_ibis_expr( - ordering_mode="ordered_col", order_col_name=core.ORDER_ID_COLUMN + ordering_mode="string_encoded", order_col_name=core.ORDER_ID_COLUMN ) join_conditions = [ value_to_join_key(left_table[left_index]) @@ -144,66 +179,105 @@ def get_column_right(key: str) -> str: return key left_ordering_encoding_size = ( - left._ordering.ordering_encoding_size - or bigframes.core.ordering.DEFAULT_ORDERING_ID_LENGTH + left._ordering.string_encoding.length + if left._ordering.is_string_encoded + else 
bigframes.core.ordering.DEFAULT_ORDERING_ID_LENGTH ) right_ordering_encoding_size = ( - right._ordering.ordering_encoding_size - or bigframes.core.ordering.DEFAULT_ORDERING_ID_LENGTH + right._ordering.string_encoding.length + if right._ordering.is_string_encoded + else bigframes.core.ordering.DEFAULT_ORDERING_ID_LENGTH ) # Preserve original ordering accross joins. left_order_id = get_column_left(core.ORDER_ID_COLUMN) right_order_id = get_column_right(core.ORDER_ID_COLUMN) new_order_id_col = _merge_order_ids( - combined_table[left_order_id], + typing.cast(ibis_types.StringColumn, combined_table[left_order_id]), left_ordering_encoding_size, - combined_table[right_order_id], + typing.cast(ibis_types.StringColumn, combined_table[right_order_id]), right_ordering_encoding_size, how, ) new_order_id = new_order_id_col.get_name() if new_order_id is None: raise ValueError("new_order_id unexpectedly has no name") + hidden_columns = (new_order_id_col,) - original_ordering = core.ExpressionOrdering( - ordering_id_column=core.OrderingColumnReference(new_order_id) - if (new_order_id_col is not None) - else None, - ordering_encoding_size=left_ordering_encoding_size - + right_ordering_encoding_size, + ordering = core.ExpressionOrdering( + # Order id is non-nullable but na_last=False generates simpler sql with current impl + ordering_value_columns=[ + core.OrderingColumnReference(new_order_id, na_last=False) + ], + total_ordering_columns=frozenset([new_order_id]), + string_encoding=core.StringEncoding( + True, left_ordering_encoding_size + right_ordering_encoding_size + ), + ) + + left_join_keys = [ + combined_table[get_column_left(col)] for col in left_column_ids + ] + right_join_keys = [ + combined_table[get_column_right(col)] for col in right_column_ids + ] + join_key_cols = get_join_cols( + left_join_keys, right_join_keys, how, coalesce_join_keys + ) + # We could filter out the original join columns, but predicates/ordering + # might still reference them in implicit joins. 
+ columns = ( + join_key_cols + + [combined_table[get_column_left(col.get_name())] for col in left.columns] + + [ + combined_table[get_column_right(col.get_name())] + for col in right.columns + ] ) combined_expr = core.ArrayValue( left._session, combined_table, + columns=columns, hidden_ordering_columns=hidden_columns, + ordering=ordering, + ) + if sort: + combined_expr = combined_expr.order_by( + [ + core.OrderingColumnReference(join_key_col.get_name()) + for join_key_col in join_key_cols + ] + ) + return ( + combined_expr, + [key.get_name() for key in join_key_cols], + (get_column_left, get_column_right), ) + +def get_join_cols( + left_join_cols: typing.Iterable[ibis_types.Value], + right_join_cols: typing.Iterable[ibis_types.Value], + how: str, + coalesce_join_keys: bool = True, +) -> typing.List[ibis_types.Value]: join_key_cols: list[ibis_types.Value] = [] - for lcol, rcol in zip(left_column_ids, right_column_ids): - if get_both_join_key_cols: + for left_col, right_col in zip(left_join_cols, right_join_cols): + if not coalesce_join_keys: join_key_cols.append( - combined_expr.get_column(get_column_left(lcol)).name( - bigframes.core.guid.generate_guid(prefix="index_") - ) + left_col.name(bigframes.core.guid.generate_guid(prefix="index_")) ) join_key_cols.append( - combined_expr.get_column(get_column_right(rcol)).name( - bigframes.core.guid.generate_guid(prefix="index_") - ) + right_col.name(bigframes.core.guid.generate_guid(prefix="index_")) ) else: if how == "left" or how == "inner": join_key_cols.append( - combined_expr.get_column(get_column_left(lcol)).name( - bigframes.core.guid.generate_guid(prefix="index_") - ) + left_col.name(bigframes.core.guid.generate_guid(prefix="index_")) ) elif how == "right": join_key_cols.append( - combined_expr.get_column(get_column_right(rcol)).name( - bigframes.core.guid.generate_guid(prefix="index_") - ) + right_col.name(bigframes.core.guid.generate_guid(prefix="index_")) ) elif how == "outer": # The left index and the right index might contain null values, for @@ -211,48 +285,25 @@ def get_column_right(key: str) -> str: # these to take the index value from either column. # Use a random name in case the left index and the right index have the # same name. In such a case, _x and _y suffixes will already be used. - join_key_cols.append( - ibis.coalesce( - combined_expr.get_column(get_column_left(lcol)), - combined_expr.get_column(get_column_right(rcol)), - ).name(bigframes.core.guid.generate_guid(prefix="index_")) - ) + # Don't need to coalesce if they are exactly the same column. + if left_col.name("index").equals(right_col.name("index")): + join_key_cols.append( + left_col.name( + bigframes.core.guid.generate_guid(prefix="index_") + ) + ) + else: + join_key_cols.append( + ibis.coalesce( + left_col, + right_col, + ).name(bigframes.core.guid.generate_guid(prefix="index_")) + ) else: - raise ValueError(f"Unexpected join type: {how}") - - # We could filter out the original join columns, but predicates/ordering - # might still reference them in implicit joins. 
- columns = ( - join_key_cols - + [ - combined_expr.get_column(get_column_left(key)) - for key in left.column_names.keys() - ] - + [ - combined_expr.get_column(get_column_right(key)) - for key in right.column_names.keys() - ] - ) - - if sort: - ordering = original_ordering.with_ordering_columns( - [ - core.OrderingColumnReference(join_key_col.get_name()) - for join_key_col in join_key_cols - ] - ) - else: - ordering = original_ordering - - combined_expr_builder = combined_expr.builder() - combined_expr_builder.columns = columns - combined_expr_builder.ordering = ordering - combined_expr = combined_expr_builder.build() - return ( - combined_expr, - [key.get_name() for key in join_key_cols], - (get_column_left, get_column_right), - ) + raise ValueError( + f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}" + ) + return join_key_cols def value_to_join_key(value: ibis_types.Value): @@ -263,19 +314,31 @@ def value_to_join_key(value: ibis_types.Value): def _merge_order_ids( - left_id: ibis_types.Value, + left_id: ibis_types.StringColumn, left_encoding_size: int, - right_id: ibis_types.Value, + right_id: ibis_types.StringColumn, right_encoding_size: int, how: str, -) -> ibis_types.StringValue: +) -> ibis_types.StringColumn: if how == "right": return _merge_order_ids( right_id, right_encoding_size, left_id, left_encoding_size, "left" ) - return ( - ( - bigframes.core.ordering.stringify_order_id(left_id, left_encoding_size) - + bigframes.core.ordering.stringify_order_id(right_id, right_encoding_size) + + if how == "left": + right_id = typing.cast( + ibis_types.StringColumn, + right_id.fillna(ibis_types.literal(":" * right_encoding_size)), + ) + elif how != "inner": # outer join + left_id = typing.cast( + ibis_types.StringColumn, + left_id.fillna(ibis_types.literal(":" * left_encoding_size)), ) - ).name(bigframes.core.guid.generate_guid(prefix="bigframes_ordering_id_")) + right_id = typing.cast( + ibis_types.StringColumn, + right_id.fillna(ibis_types.literal(":" * right_encoding_size)), + ) + return (left_id + right_id).name( + bigframes.core.guid.generate_guid(prefix="bigframes_ordering_id_") + ) diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 6f8d35e52a..d5f07ecf91 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -14,7 +14,7 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum import math import typing @@ -28,6 +28,8 @@ # Sufficient to store any value up to 2^63 DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE)) +STABLE_SORTS = ["mergesort", "stable"] + class OrderingDirection(Enum): ASC = 1 @@ -61,121 +63,162 @@ def with_reverse(self): ) +# Encoding classes specify additional properties for some ordering representations @dataclass(frozen=True) -class ExpressionOrdering: - """Immutable object that holds information about the ordering of rows in a ArrayValue object.""" +class StringEncoding: + """String encoded order ids are fixed length and can be concat together in joins.""" - ordering_value_columns: Sequence[OrderingColumnReference] = () - ordering_id_column: Optional[OrderingColumnReference] = None - is_sequential: bool = False + is_encoded: bool = False # Encoding size must be tracked in order to know what how to combine ordering ids across tables (eg how much to pad when combining different length). 
# Also will be needed to determine when the length is too large and the ordering id needs to be compacted with a ROW_NUMBER operation. - ordering_encoding_size: int = DEFAULT_ORDERING_ID_LENGTH + length: int = DEFAULT_ORDERING_ID_LENGTH + + +@dataclass(frozen=True) +class IntegerEncoding: + """Integer encoded order ids are guaranteed non-negative.""" + + is_encoded: bool = False + is_sequential: bool = False + - def with_is_sequential(self, is_sequential: bool): +@dataclass(frozen=True) +class ExpressionOrdering: + """Immutable object that holds information about the ordering of rows in an ArrayValue object.""" + + ordering_value_columns: Sequence[OrderingColumnReference] = () + integer_encoding: IntegerEncoding = IntegerEncoding(False) + string_encoding: StringEncoding = StringEncoding(False) + # A table has a total ordering defined by the identities of a set of 1 or more columns. + # These columns must always be part of the ordering, in order to guarantee that the ordering is total. + # Therefore, any modifications (or drops) done to these columns must result in hidden copies being made. + total_ordering_columns: frozenset[str] = field(default_factory=frozenset) + + def with_non_sequential(self): """Create a copy that is marked as non-sequential. This is useful when filtering, but not sorting, an expression. """ - return ExpressionOrdering( - self.ordering_value_columns, - self.ordering_id_column, - is_sequential, - ordering_encoding_size=self.ordering_encoding_size, - ) + if self.integer_encoding.is_sequential: + return ExpressionOrdering( + self.ordering_value_columns, + integer_encoding=IntegerEncoding( + self.integer_encoding.is_encoded, is_sequential=False + ), + total_ordering_columns=self.total_ordering_columns, + ) + + return self def with_ordering_columns( self, ordering_value_columns: Sequence[OrderingColumnReference] = (), stable: bool = False, - ): - """Creates a new ordering that preserves ordering id, but replaces ordering value column list.""" + ) -> ExpressionOrdering: + """Creates a new ordering that reorders by the given columns. + + Args: + ordering_value_columns: + In decreasing precedence order, the values used to sort the ordering. + stable: + If True, a stable sort will be applied, using the old ordering where + the new ordering produces ties. Otherwise, ties will be resolved in + a performance-maximizing way. + + Returns: + Modified ExpressionOrdering + """ + col_ids_new = [ + ordering_ref.column_id for ordering_ref in ordering_value_columns + ] if stable: - col_ids_new = [ - ordering_ref.column_id for ordering_ref in ordering_value_columns - ] # Only reference each column once, so discard the old reference if there is a new reference old_ordering_keep = [ ordering_ref for ordering_ref in self.ordering_value_columns if ordering_ref.column_id not in col_ids_new ] - new_ordering = (*ordering_value_columns, *old_ordering_keep) - else: # Not stable, so discard old ordering completely - new_ordering = tuple(ordering_value_columns) + else: + # The new ordering needs to keep all total ordering columns no matter what. + # All other old ordering references can be discarded since the sort does not + # need to be stable. 
+ old_ordering_keep = [ + ordering_ref + for ordering_ref in self.ordering_value_columns + if (ordering_ref.column_id not in col_ids_new) + and (ordering_ref.column_id in self.total_ordering_columns) + ] + new_ordering = (*ordering_value_columns, *old_ordering_keep) return ExpressionOrdering( new_ordering, - self.ordering_id_column, - is_sequential=False, - ordering_encoding_size=self.ordering_encoding_size, - ) - - def with_ordering_id(self, ordering_id: str): - """Creates a new ordering that preserves other properties, but with a different ordering id. - - Useful when reprojecting ordering for implicit joins. - """ - return ExpressionOrdering( - self.ordering_value_columns, - OrderingColumnReference(ordering_id), - is_sequential=self.is_sequential, - ordering_encoding_size=self.ordering_encoding_size, + total_ordering_columns=self.total_ordering_columns, ) def with_reverse(self): """Reverses the ordering.""" return ExpressionOrdering( tuple([col.with_reverse() for col in self.ordering_value_columns]), - self.ordering_id_column.with_reverse() - if self.ordering_id_column is not None - else None, - is_sequential=False, - ordering_encoding_size=self.ordering_encoding_size, + total_ordering_columns=self.total_ordering_columns, + ) + + def with_column_remap(self, mapping: typing.Mapping[str, str]): + new_value_columns = [ + col.with_name(mapping.get(col.column_id, col.column_id)) + for col in self.ordering_value_columns + ] + new_total_order = frozenset( + mapping.get(col_id, col_id) for col_id in self.total_ordering_columns + ) + return ExpressionOrdering( + new_value_columns, + integer_encoding=self.integer_encoding, + string_encoding=self.string_encoding, + total_ordering_columns=new_total_order, ) @property - def ordering_id(self) -> Optional[str]: - return self.ordering_id_column.column_id if self.ordering_id_column else None + def total_order_col(self) -> Optional[OrderingColumnReference]: + """Returns column id of columns that defines total ordering, if such as column exists""" + if len(self.ordering_value_columns) != 1: + return None + order_ref = self.ordering_value_columns[0] + if order_ref.direction != OrderingDirection.ASC: + return None + return order_ref @property - def order_id_defined(self) -> bool: - """True if ordering is fully defined in ascending order by its ordering id.""" - return bool( - self.ordering_id_column - and (not self.ordering_value_columns) - and self.ordering_id_column.direction == OrderingDirection.ASC - ) + def is_string_encoded(self) -> bool: + """True if ordering is fully defined by a fixed length string column.""" + return self.string_encoding.is_encoded + + @property + def is_sequential(self) -> bool: + return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential @property def all_ordering_columns(self) -> Sequence[OrderingColumnReference]: - return ( - list(self.ordering_value_columns) - if self.ordering_id_column is None - else [*self.ordering_value_columns, self.ordering_id_column] - ) + return list(self.ordering_value_columns) -def stringify_order_id( - order_id: ibis_types.Value, length: int = DEFAULT_ORDERING_ID_LENGTH -) -> ibis_types.StringValue: +def encode_order_string( + order_id: ibis_types.IntegerColumn, length: int = DEFAULT_ORDERING_ID_LENGTH +) -> ibis_types.StringColumn: """Converts an order id value to string if it is not already a string. 
MUST produced fixed-length strings.""" - if order_id.type().is_int64(): - # This is very inefficient encoding base-10 string uses only 10 characters per byte(out of 256 bit combinations) - # Furthermore, if know tighter bounds on order id are known, can produce smaller strings. - # 19 characters chosen as it can represent any positive Int64 in base-10 - # For missing values, ":" * 19 is used as it is larger than any other value this function produces, so null values will be last. - string_order_id = ( - typing.cast( - ibis_types.StringValue, - typing.cast(ibis_types.IntegerValue, order_id).cast(ibis_dtypes.string), - ) - .lpad(length, "0") - .fillna(ibis_types.literal(":" * length)) - ) - else: - string_order_id = ( - typing.cast(ibis_types.StringValue, order_id) - .lpad(length, "0") - .fillna(ibis_types.literal(":" * length)) - ) - return typing.cast(ibis_types.StringValue, string_order_id) + # This is very inefficient encoding base-10 string uses only 10 characters per byte(out of 256 bit combinations) + # Furthermore, if know tighter bounds on order id are known, can produce smaller strings. + # 19 characters chosen as it can represent any positive Int64 in base-10 + # For missing values, ":" * 19 is used as it is larger than any other value this function produces, so null values will be last. + string_order_id = typing.cast( + ibis_types.StringValue, + order_id.cast(ibis_dtypes.string), + ).lpad(length, "0") + return typing.cast(ibis_types.StringColumn, string_order_id) + + +def reencode_order_string( + order_id: ibis_types.StringColumn, length: int +) -> ibis_types.StringColumn: + return typing.cast( + ibis_types.StringColumn, + (typing.cast(ibis_types.StringValue, order_id).lpad(length, "0")), + ) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 1443a8e08f..339ce7466a 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,48 +14,107 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Union +from typing import Iterable, Literal, Optional, Union +import bigframes.constants as constants +import bigframes.core as core +import bigframes.core.utils as utils import bigframes.dataframe +import bigframes.operations.aggregations as agg_ops import bigframes.series @typing.overload def concat( - objs: Iterable[bigframes.dataframe.DataFrame], *, join, ignore_index + objs: Iterable[bigframes.series.Series], + *, + axis: typing.Literal["index", 0] = ..., + join=..., + ignore_index=..., +) -> bigframes.series.Series: + ... + + +@typing.overload +def concat( + objs: Iterable[bigframes.dataframe.DataFrame], + *, + axis: typing.Literal["index", 0] = ..., + join=..., + ignore_index=..., ) -> bigframes.dataframe.DataFrame: ... @typing.overload def concat( - objs: Iterable[bigframes.series.Series], *, join, ignore_index -) -> bigframes.series.Series: + objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]], + *, + axis: typing.Literal["columns", 1], + join=..., + ignore_index=..., +) -> bigframes.dataframe.DataFrame: ... +@typing.overload def concat( - objs: Union[ - Iterable[bigframes.dataframe.DataFrame], Iterable[bigframes.series.Series] - ], + objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]], *, + axis=..., + join=..., + ignore_index=..., +) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]: + ... 
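+
+# NOTE (editor's illustrative sketch, not part of the original change): based on the
+# overloads above and the implementation below, the new ``axis`` argument is expected
+# to behave roughly as follows (``bpd`` is an assumed alias for bigframes.pandas):
+#
+#     import bigframes.pandas as bpd
+#     stacked = bpd.concat([df_a, df_b])             # axis=0: blocks stacked row-wise
+#     side_by_side = bpd.concat([df_a, s], axis=1)   # axis=1: objects joined on their indexes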
+ + +def concat( + objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]], + *, + axis: typing.Union[str, int] = 0, join: Literal["inner", "outer"] = "outer", ignore_index: bool = False, ) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]: - contains_dataframes = any( - isinstance(x, bigframes.dataframe.DataFrame) for x in objs - ) - if not contains_dataframes: - # Special case, all series, so align everything into single column even if labels don't match - series = typing.cast(typing.Iterable[bigframes.series.Series], objs) - names = {s.name for s in series} - # For series case, labels are stripped if they don't all match - if len(names) > 1: - blocks = [s._block.with_column_labels([None]) for s in series] - else: - blocks = [s._block for s in series] + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + contains_dataframes = any( + isinstance(x, bigframes.dataframe.DataFrame) for x in objs + ) + if not contains_dataframes: + # Special case, all series, so align everything into single column even if labels don't match + series = typing.cast(typing.Iterable[bigframes.series.Series], objs) + names = {s.name for s in series} + # For series case, labels are stripped if they don't all match + if len(names) > 1: + blocks = [s._block.with_column_labels([None]) for s in series] + else: + blocks = [s._block for s in series] + block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index) + return bigframes.series.Series(block) + blocks = [obj._block for obj in objs] block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index) - return bigframes.series.Series(block) - blocks = [obj._block for obj in objs] - block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index) - return bigframes.dataframe.DataFrame(block) + return bigframes.dataframe.DataFrame(block) + else: + # Note: does not validate inputs + block_list = [obj._block for obj in objs] + block = block_list[0] + for rblock in block_list[1:]: + combined_index, _ = block.index.join(rblock.index, how=join) + block = combined_index._block + return bigframes.dataframe.DataFrame(block) + + +def cut( + x: bigframes.series.Series, + bins: int, + *, + labels: Optional[bool] = None, +) -> bigframes.series.Series: + if bins <= 0: + raise ValueError("`bins` should be a positive integer.") + + if labels is not False: + raise NotImplementedError( + f"Only labels=False is supported in BigQuery DataFrames so far. 
{constants.FEEDBACK_LINK}" + ) + return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) diff --git a/bigframes/core/scalar.py b/bigframes/core/scalar.py index 67ced525b9..6dfbd31b77 100644 --- a/bigframes/core/scalar.py +++ b/bigframes/core/scalar.py @@ -15,10 +15,14 @@ from __future__ import annotations import typing -from typing import Any +from typing import Any, Optional +import google.cloud.bigquery as bigquery import ibis.expr.types as ibis_types +import bigframes +import bigframes.formatting_helpers as formatter + if typing.TYPE_CHECKING: import bigframes.session @@ -29,19 +33,36 @@ class DeferredScalar: def __init__(self, value: ibis_types.Scalar, session: bigframes.session.Session): self._value = value self._session = session + self._query_job: Optional[bigquery.QueryJob] = None + + @property + def query_job(self) -> Optional[bigquery.QueryJob]: + """BigQuery job metadata for the most recent query.""" + if self._query_job is None: + self._query_job = self._compute_dry_run() + return self._query_job def __repr__(self) -> str: """Converts a Series to a string.""" # TODO(swast): Add a timeout here? If the query is taking a long time, # maybe we just print the job metadata that we have so far? - return repr(self.compute()) + opts = bigframes.options.display + if opts.repr_mode == "deferred": + return formatter.repr_query_job(self.query_job) + else: + return repr(self.to_pandas()) - def compute(self) -> Any: + def to_pandas(self) -> Any: """Executes deferred operations and downloads the resulting scalar.""" - result, _ = self._session._start_query(self._value.compile()) + result, query_job = self._session._start_query(self._value.compile()) + self._query_job = query_job df = self._session._rows_to_dataframe(result) return df.iloc[0, 0] + def _compute_dry_run(self): + job_config = bigquery.QueryJobConfig(dry_run=True) + return self._session._start_query(self._value.compile(), job_config=job_config) + # All public APIs return Any at present # Later implementation may sometimes return a lazy scalar diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py new file mode 100644 index 0000000000..8ad58fb166 --- /dev/null +++ b/bigframes/core/utils.py @@ -0,0 +1,22 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
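+
+# Editor's note (illustrative, not part of the original change): this new module holds
+# small shared helpers. The get_axis_number function defined below normalizes
+# pandas-style axis arguments, e.g. 0, "index", "rows", and None map to 0, while
+# 1 and "columns" map to 1; anything else raises ValueError.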
+import typing + + +def get_axis_number(axis: typing.Union[str, int, None]) -> typing.Literal[0, 1]: + if axis in {0, "index", "rows", None}: + return 0 + elif axis in {1, "columns"}: + return 1 + raise ValueError(f"Not a valid axis: {axis}") diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0155aafa0b..cb062f0924 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -16,7 +16,6 @@ from __future__ import annotations -import random import re import textwrap import typing @@ -33,13 +32,13 @@ ) import google.cloud.bigquery as bigquery -import ibis.expr.datatypes as ibis_dtypes import numpy -import pandas as pd +import pandas import typing_extensions import bigframes import bigframes._config.display_options as display_options +import bigframes.constants as constants import bigframes.core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks @@ -47,9 +46,12 @@ import bigframes.core.guid import bigframes.core.indexers as indexers import bigframes.core.indexes as indexes +import bigframes.core.io import bigframes.core.joins as joins import bigframes.core.ordering as order +import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series @@ -64,11 +66,21 @@ # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. # TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_DF_SIZE = 5000 +# TODO(swast): Address issues with string escaping and empty tables before +# re-enabling inline data (ibis.memtable) feature. +MAX_INLINE_DF_SIZE = -1 -LevelsType = typing.Union[str, int, typing.Sequence[typing.Union[str, int]]] +LevelType = typing.Union[str, int] +LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] SingleItemValue = Union[bigframes.series.Series, int, float, Callable] +ERROR_IO_ONLY_GS_PATHS = f"Only Google Cloud Storage (gs://...) paths are supported. {constants.FEEDBACK_LINK}" +ERROR_IO_REQUIRES_WILDCARD = ( + "Google Cloud Storage path must contain a wildcard '*' character. See: " + "https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#export_data_statement" + f"{constants.FEEDBACK_LINK}" +) + # Inherits from pandas DataFrame so that we can use the same docstrings. class DataFrame(vendored_pandas_frame.DataFrame): @@ -87,7 +99,9 @@ def __init__( session: typing.Optional[bigframes.session.Session] = None, ): if copy is not None and not copy: - raise ValueError("DataFrame constructor only supports copy=True") + raise ValueError( + f"DataFrame constructor only supports copy=True. {constants.FEEDBACK_LINK}" + ) # Check to see if constructing from BigQuery-backed objects before # falling back to pandas constructor @@ -106,7 +120,9 @@ def __init__( ): if not all(isinstance(data[key], bf_series.Series) for key in data.keys()): # TODO(tbergeron): Support local list/series data by converting to memtable. - raise NotImplementedError("Cannot mix Series with other types.") + raise NotImplementedError( + f"Cannot mix Series with other types. 
{constants.FEEDBACK_LINK}" + ) keys = list(data.keys()) first_label, first_series = keys[0], data[keys[0]] block = ( @@ -128,7 +144,8 @@ def __init__( if block: if index: raise NotImplementedError( - "DataFrame 'index' constructor parameter not supported when passing BigQuery-backed objects" + "DataFrame 'index' constructor parameter not supported " + f"when passing BigQuery-backed objects. {constants.FEEDBACK_LINK}" ) if columns: block = block.select_columns(list(columns)) # type:ignore @@ -141,7 +158,7 @@ def __init__( else: import bigframes.pandas - pd_dataframe = pd.DataFrame( + pd_dataframe = pandas.DataFrame( data=data, index=index, # type:ignore columns=columns, # type:ignore @@ -182,12 +199,14 @@ def _find_indices( def _resolve_label_exact(self, label) -> str: matches = self._block.label_to_col_id.get(label, []) if len(matches) != 1: - raise ValueError("Index data must be 1-dimensional") + raise ValueError( + f"Index data must be 1-dimensional. {constants.FEEDBACK_LINK}" + ) return matches[0] def _sql_names( self, - columns: Union[blocks.Label, Sequence[blocks.Label], pd.Index], + columns: Union[blocks.Label, Sequence[blocks.Label], pandas.Index], tolerance: bool = False, ) -> Sequence[str]: """Retrieve sql name (column name in BQ schema) of column(s).""" @@ -215,11 +234,11 @@ def iloc(self) -> indexers.ILocDataFrameIndexer: return indexers.ILocDataFrameIndexer(self) @property - def dtypes(self) -> pd.Series: - return pd.Series(data=self._block.dtypes, index=self._block.column_labels) + def dtypes(self) -> pandas.Series: + return pandas.Series(data=self._block.dtypes, index=self._block.column_labels) @property - def columns(self) -> pd.Index: + def columns(self) -> pandas.Index: return self.dtypes.index @property @@ -253,19 +272,21 @@ def astype( ) -> DataFrame: return self._apply_to_rows(ops.AsTypeOp(dtype)) - def to_sql_query( + def _to_sql_query( self, always_include_index: bool ) -> Tuple[str, List[Tuple[str, bool]]]: - """Compiles this dataframe's expression tree to SQL, optionally - including unnamed index columns + """Compiles this DataFrame's expression tree to SQL, optionally + including unnamed index columns. Args: - always_include_index: whether to include unnamed index columns. - If False, only named indexes are included. + always_include_index (bool): + whether to include unnamed index columns. If False, only named + indexes are included. - Returns: a tuple of (sql_string, index_column_list). Each entry in the - index column list is a tuple of (column_name, named). If named is - is false, then the column name exists only in SQL""" + Returns: a tuple of (sql_string, index_column_list) + Each entry in the index column list is a tuple of (column_name, named). + If named is false, then the column name exists only in SQL + """ # Has to be unordered as it is impossible to order the sql without # including metadata columns in selection with ibis. ibis_expr = self._block.expr.to_ibis_expr(ordering_mode="unordered") @@ -325,15 +346,25 @@ def to_sql_query( @property def sql(self) -> str: - """Compiles this dataframe's expression tree to SQL""" - sql, _ = self.to_sql_query(always_include_index=False) + """Compiles this DataFrame's expression tree to SQL.""" + sql, _ = self._to_sql_query(always_include_index=False) return sql @property def query_job(self) -> Optional[bigquery.QueryJob]: - """BigQuery job metadata for the most recent query.""" + """BigQuery job metadata for the most recent query. + + Returns: + The most recent `QueryJob + `_. 
+ """ + if self._query_job is None: + self._set_internal_query_job(self._compute_dry_run()) return self._query_job + def _set_internal_query_job(self, query_job: bigquery.QueryJob): + self._query_job = query_job + @typing.overload def __getitem__(self, key: bigframes.series.Series) -> DataFrame: ... @@ -343,7 +374,7 @@ def __getitem__(self, key: Sequence[blocks.Label]) -> DataFrame: # type:ignore ... @typing.overload - def __getitem__(self, key: pd.Index) -> DataFrame: # type:ignore + def __getitem__(self, key: pandas.Index) -> DataFrame: # type:ignore ... @typing.overload @@ -356,7 +387,7 @@ def __getitem__( blocks.Label, Sequence[blocks.Label], # Index of column labels can be treated the same as a sequence of column labels. - pd.Index, + pandas.Index, bigframes.series.Series, ], ) -> Union[bigframes.series.Series, "DataFrame"]: @@ -397,8 +428,10 @@ def __getitem__( # Bool Series selects rows def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame: - if not key._to_ibis_expr().type() == ibis_dtypes.bool: - raise ValueError("Only boolean series currently supported for indexing.") + if not key.dtype == pandas.BooleanDtype(): + raise NotImplementedError( + f"Only boolean series currently supported for indexing. {constants.FEEDBACK_LINK}" + ) # TODO: enforce stricter alignment combined_index, ( get_column_left, @@ -413,16 +446,12 @@ def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame: def __getattr__(self, key: str): if key in self._block.column_labels: return self.__getitem__(key) - elif hasattr(pd.DataFrame, key): + elif hasattr(pandas.DataFrame, key): raise NotImplementedError( textwrap.dedent( f""" BigQuery DataFrames has not yet implemented an equivalent to - 'pandas.DataFrame.{key}'. Please check - https://github.com/googleapis/python-bigquery-dataframes/issues for - existing feature requests, or file your own. - Please include information about your use case, as well as - relevant code snippets. + 'pandas.DataFrame.{key}'. {constants.FEEDBACK_LINK} """ ) ) @@ -432,14 +461,21 @@ def __getattr__(self, key: str): def __repr__(self) -> str: """Converts a DataFrame to a string. Calls compute. - Only represents the first ``bigframes.options.display.max_rows``. + Only represents the first `bigframes.options.display.max_rows`. """ opts = bigframes.options.display max_results = opts.max_rows + if opts.repr_mode == "deferred": + return formatter.repr_query_job(self.query_job) # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? - pandas_df, row_count = self._retrieve_repr_request_results(max_results) + pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( + max_results + ) + + self._set_internal_query_job(query_job) + column_count = len(pandas_df.columns) with display_options.pandas_repr(opts): @@ -466,10 +502,17 @@ def _repr_html_(self) -> str: """ opts = bigframes.options.display max_results = bigframes.options.display.max_rows + if opts.repr_mode == "deferred": + return formatter.repr_query_job_html(self.query_job) # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? 
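+        # Editor's note (illustrative, based on this diff): when
+        # bigframes.options.display.repr_mode == "deferred", __repr__ returns the
+        # dry-run job summary from formatter.repr_query_job above instead of
+        # downloading any rows here.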
- pandas_df, row_count = self._retrieve_repr_request_results(max_results) + pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( + max_results + ) + + self._set_internal_query_job(query_job) + column_count = len(pandas_df.columns) with display_options.pandas_repr(opts): @@ -479,33 +522,6 @@ def _repr_html_(self) -> str: html_string += f"[{row_count} rows x {column_count} columns in total]" return html_string - def _retrieve_repr_request_results( - self, max_results: int - ) -> Tuple[pd.DataFrame, int]: - """ - Retrieves a pandas dataframe containing only max_results many rows for use - with printing methods. - - Returns a tuple of the dataframe and the overall number of rows of the query. - """ - # TODO(swast): Select a subset of columns if max_columns is less than the - # number of columns in the schema. - count = self.shape[0] - if count > max_results: - head_df = self.head(n=max_results) - computed_df, query_job = head_df._block.compute(max_results=max_results) - else: - head_df = self - computed_df, query_job = head_df._block.compute() - - formatted_df = computed_df.set_axis(self._block.column_labels, axis=1) - # don't update details when the cache is hit - if self.query_job is None or not query_job.cache_hit: - self._query_job = query_job - # we reset the axis and substitute the bf index name for the default - formatted_df.index.name = self.index.name - return formatted_df, count - def __setitem__(self, key: str, value: SingleItemValue): """Modify or insert a column into the DataFrame. @@ -527,6 +543,7 @@ def _apply_binop( return self._apply_series_binop(other, op, axis=axis) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." + f"{constants.FEEDBACK_LINK}" ) def _apply_scalar_binop(self, other: float | int, op: ops.BinaryOp) -> DataFrame: @@ -549,7 +566,9 @@ def _apply_series_binop( raise ValueError(f"Invalid input: axis {axis}.") if axis in ("columns", 1): - raise NotImplementedError("Row Series operations haven't been supported.") + raise NotImplementedError( + f"Row Series operations haven't been supported. {constants.FEEDBACK_LINK}" + ) joined_index, (get_column_left, get_column_right) = self._block.index.join( other._block.index, how="outer" @@ -573,6 +592,16 @@ def _apply_series_binop( block = block.with_index_labels(self.index.names) return DataFrame(block) + def eq(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: + return self._apply_binop(other, ops.eq_op, axis=axis) + + def ne(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: + return self._apply_binop(other, ops.ne_op, axis=axis) + + __eq__ = eq # type: ignore + + __ne__ = ne # type: ignore + def le(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.le_op, axis=axis) @@ -661,12 +690,47 @@ def rmod(self, other: int | bigframes.series.Series, axis: str | int = "columns" __rmod__ = rmod - def compute(self) -> pd.DataFrame: - """Executes deferred operations and downloads the results.""" + def to_pandas( + self, + max_download_size: Optional[int] = None, + sampling_method: Optional[str] = None, + random_state: Optional[int] = None, + ) -> pandas.DataFrame: + """Write DataFrame to pandas DataFrame. + + Args: + max_download_size (int, default None): + Download size threshold in MB. 
If max_download_size is exceeded when downloading data + (e.g., to_pandas()), the data will be downsampled if + bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be + raised. If set to a value other than None, this will supersede the global config. + sampling_method (str, default None): + Downsampling algorithms to be chosen from, the choices are: "head": This algorithm + returns a portion of the data from the beginning. It is fast and requires minimal + computations to perform the downsampling; "uniform": This algorithm returns uniform + random samples of the data. If set to a value other than None, this will supersede + the global config. + random_state (int, default None): + The seed for the uniform downsampling algorithm. If provided, the uniform method may + take longer to execute and require more computation. If set to a value other than + None, this will supersede the global config. + + Returns: + pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the + data_sampling_threshold_mb is not exceeded; otherwise, a pandas DataFrame with + downsampled rows and all columns of this DataFrame. + """ # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job - df, query_job = self._block.compute() - self._query_job = query_job - return df.set_axis(self._block.column_labels, axis=1) + df, query_job = self._block.to_pandas( + max_download_size=max_download_size, + sampling_method=sampling_method, + random_state=random_state, + ) + self._set_internal_query_job(query_job) + return df.set_axis(self._block.column_labels, axis=1, copy=False) + + def _compute_dry_run(self) -> bigquery.QueryJob: + return self._block._compute_dry_run() def copy(self) -> DataFrame: return DataFrame(self._block) @@ -677,12 +741,50 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) - def drop(self, *, columns: Union[str, Iterable[str]]) -> DataFrame: - if not _is_list_like(columns): - columns = [columns] # type:ignore - columns = list(columns) + def drop( + self, + labels: typing.Any = None, + *, + axis: typing.Union[int, str] = 0, + index: typing.Any = None, + columns: Union[blocks.Label, Iterable[blocks.Label]] = None, + level: typing.Optional[LevelType] = None, + ) -> DataFrame: + if labels: + if index or columns: + raise ValueError("Cannot specify both 'labels' and 'index'/'columns") + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + index = labels + else: + columns = labels + + block = self._block + if index: + level_id = self._resolve_levels(level or 0)[0] - block = self._block.drop_columns(self._sql_names(columns)) + if _is_list_like(index): + block, inverse_condition_id = block.apply_unary_op( + level_id, ops.IsInOp(index, match_nulls=True) + ) + block, condition_id = block.apply_unary_op( + inverse_condition_id, ops.invert_op + ) + else: + block, condition_id = block.apply_unary_op( + level_id, ops.partial_right(ops.ne_op, index) + ) + block = block.filter(condition_id, keep_null=True).select_columns( + self._block.value_columns + ) + if columns: + if not _is_list_like(columns): + columns = [columns] # type:ignore + columns = list(columns) + + block = block.drop_columns(self._sql_names(columns)) + if not index and not columns: + raise ValueError("Must specify 'labels' or 'index'/'columns") return DataFrame(block) def droplevel(self, level: LevelsType): @@ -722,7 +824,7 @@ def rename_axis( ) -> DataFrame: if 
len(kwargs) != 0: raise NotImplementedError( - "rename_axis does not currently support any keyword arguments." + f"rename_axis does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" ) # limited implementation: the new index name is simply the 'mapper' parameter if _is_list_like(mapper): @@ -795,7 +897,7 @@ def _assign_series_join_on_index( # Update case, remove after copying into columns block = block.drop_columns([source_column]) - return DataFrame(block.with_index_labels([self.index.name])) + return DataFrame(block.with_index_labels(self.index.names)) def reset_index(self, *, drop: bool = False) -> DataFrame: block = self._block.reset_index(drop) @@ -814,9 +916,20 @@ def set_index( col_ids = [self._resolve_label_exact(key) for key in keys] return DataFrame(self._block.set_index(col_ids, append=append, drop=drop)) - def sort_index(self) -> DataFrame: + def sort_index( + self, ascending: bool = True, na_position: Literal["first", "last"] = "last" + ) -> DataFrame: + if na_position not in ["first", "last"]: + raise ValueError("Param na_position must be one of 'first' or 'last'") + direction = ( + order.OrderingDirection.ASC if ascending else order.OrderingDirection.DESC + ) + na_last = na_position == "last" index_columns = self._block.index_columns - ordering = [order.OrderingColumnReference(column) for column in index_columns] + ordering = [ + order.OrderingColumnReference(column, direction=direction, na_last=na_last) + for column in index_columns + ] return DataFrame(self._block.order_by(ordering)) def sort_values( @@ -824,6 +937,7 @@ def sort_values( by: str | typing.Sequence[str], *, ascending: bool | typing.Sequence[bool] = True, + kind: str = "quicksort", na_position: typing.Literal["first", "last"] = "last", ) -> DataFrame: if na_position not in {"first", "last"}: @@ -854,8 +968,9 @@ def sort_values( column_id, direction=direction, na_last=na_last ) ) - - return DataFrame(self._block.order_by(ordering)) + return DataFrame( + self._block.order_by(ordering, stable=kind in order.STABLE_SORTS) + ) def value_counts( self, @@ -902,7 +1017,7 @@ def any( else: frame = self._drop_non_bool() block = frame._block.aggregate_all_and_pivot( - agg_ops.any_op, dtype=pd.BooleanDtype() + agg_ops.any_op, dtype=pandas.BooleanDtype() ) return bigframes.series.Series(block.select_column("values")) @@ -912,7 +1027,7 @@ def all(self, *, bool_only: bool = False) -> bigframes.series.Series: else: frame = self._drop_non_bool() block = frame._block.aggregate_all_and_pivot( - agg_ops.all_op, dtype=pd.BooleanDtype() + agg_ops.all_op, dtype=pandas.BooleanDtype() ) return bigframes.series.Series(block.select_column("values")) @@ -932,6 +1047,20 @@ def mean(self, *, numeric_only: bool = False) -> bigframes.series.Series: block = frame._block.aggregate_all_and_pivot(agg_ops.mean_op) return bigframes.series.Series(block.select_column("values")) + def median( + self, *, numeric_only: bool = False, exact: bool = False + ) -> bigframes.series.Series: + if exact: + raise NotImplementedError( + f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" + ) + if not numeric_only: + frame = self._raise_on_non_numeric("median") + else: + frame = self._drop_non_numeric() + block = frame._block.aggregate_all_and_pivot(agg_ops.median_op) + return bigframes.series.Series(block.select_column("values")) + def std(self, *, numeric_only: bool = False) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("std") @@ -986,11 +1115,52 @@ def nunique(self) -> bigframes.series.Series: block = self._block.aggregate_all_and_pivot(agg_ops.nunique_op) return bigframes.series.Series(block.select_column("values")) - def _drop_non_numeric(self) -> DataFrame: + def agg( + self, func: str | typing.Sequence[str] + ) -> DataFrame | bigframes.series.Series: + if _is_list_like(func): + if any( + dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + for dtype in self.dtypes + ): + raise NotImplementedError( + f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}" + ) + aggregations = [agg_ops.AGGREGATIONS_LOOKUP[f] for f in func] + return DataFrame( + self._block.summarize( + self._block.value_columns, + aggregations, + ) + ) + else: + return bigframes.series.Series( + self._block.aggregate_all_and_pivot( + agg_ops.AGGREGATIONS_LOOKUP[typing.cast(str, func)] + ) + ) + + aggregate = agg + + def describe(self) -> DataFrame: + df_numeric = self._drop_non_numeric(keep_bool=False) + if len(df_numeric.columns) == 0: + raise NotImplementedError( + f"df.describe() currently only supports numeric values. {constants.FEEDBACK_LINK}" + ) + result = df_numeric.agg( + ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] + ) + return typing.cast(DataFrame, result) + + def _drop_non_numeric(self, keep_bool=True) -> DataFrame: + types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + if not keep_bool: + types_to_keep -= set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES) non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + if dtype not in types_to_keep ] return DataFrame(self._block.drop_columns(non_numeric_cols)) @@ -1008,7 +1178,8 @@ def _raise_on_non_numeric(self, op: str): for dtype in self._block.dtypes ): raise NotImplementedError( - f"'{op}' does not support non-numeric columns. Set 'numeric_only'=True to ignore non-numeric columns" + f"'{op}' does not support non-numeric columns. " + f"Set 'numeric_only'=True to ignore non-numeric columns. {constants.FEEDBACK_LINK}" ) return self @@ -1018,7 +1189,8 @@ def _raise_on_non_boolean(self, op: str): for dtype in self._block.dtypes ): raise NotImplementedError( - f"'{op}' does not support non-bool columns. Set 'bool_only'=True to ignore non-bool columns" + f"'{op}' does not support non-bool columns. " + f"Set 'bool_only'=True to ignore non-bool columns. {constants.FEEDBACK_LINK}" ) return self @@ -1044,10 +1216,12 @@ def merge( ) -> DataFrame: if on is None: if left_on is None or right_on is None: - raise ValueError("Must specify either on or left_on + right_on.") + raise ValueError("Must specify `on` or `left_on` + `right_on`.") else: if left_on is not None or right_on is not None: - raise ValueError("Can not pass both on and left_on + right_on params.") + raise ValueError( + "Can not pass both `on` and `left_on` + `right_on` params." + ) left_on, right_on = on, on left = self @@ -1075,7 +1249,7 @@ def merge( sort=sort, # In merging on the same column, it only returns 1 key column from coalesced both. 
# While if 2 different columns, both will be presented in the result. - get_both_join_key_cols=(left_on != right_on), + coalesce_join_keys=(left_on == right_on), ) # TODO(swast): Add suffixes to the column labels instead of reusing the # column IDs as the new labels. @@ -1138,14 +1312,57 @@ def _get_merged_col_labels( return left_col_labels + right_col_labels - def join(self, other: DataFrame, *, how: str = "left") -> DataFrame: - if not self.columns.intersection(other.columns).empty: - raise NotImplementedError("Deduping column names is not implemented") + def join( + self, other: DataFrame, *, on: Optional[str] = None, how: str = "left" + ) -> DataFrame: + left, right = self, other + if not left.columns.intersection(right.columns).empty: + raise NotImplementedError( + f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}" + ) - left = self - right = other - combined_index, (get_column_left, get_column_right) = left._block.index.join( - right._block.index, how=how + # Join left columns with right index + if on is not None: + if other._block.index.nlevels != 1: + raise ValueError( + "Join on columns must match the index level of the other DataFrame. Join on column with multi-index haven't been supported." + ) + # Switch left index with on column + left_columns = left.columns + left_idx_original_names = left.index.names + left_idx_names_in_cols = [ + f"bigframes_left_idx_name_{i}" for i in range(len(left.index.names)) + ] + left.index.names = left_idx_names_in_cols + left = left.reset_index(drop=False) + left = left.set_index(on) + + # Join on index and switch back + combined_df = left._perform_join_by_index(right, how=how) + combined_df.index.name = on + combined_df = combined_df.reset_index(drop=False) + combined_df = combined_df.set_index(left_idx_names_in_cols) + + # To be consistent with Pandas + combined_df.index.names = ( + left_idx_original_names + if how in ("inner", "left") + else ([None] * len(combined_df.index.names)) + ) + + # Reorder columns + combined_df = combined_df[list(left_columns) + list(right.columns)] + return combined_df + + # Join left index with right index + if left._block.index.nlevels != right._block.index.nlevels: + raise ValueError("Index to join on must have the same number of levels.") + + return left._perform_join_by_index(right, how=how) + + def _perform_join_by_index(self, other: DataFrame, *, how: str = "left"): + combined_index, _ = self._block.index.join( + other._block.index, how=how, block_identity_join=True ) return DataFrame(combined_index._block) @@ -1178,7 +1395,7 @@ def _groupby_level( ): return groupby.DataFrameGroupBy( self._block, - self._resolve_levels(level), + by_col_ids=self._resolve_levels(level), as_index=as_index, dropna=dropna, ) @@ -1220,13 +1437,13 @@ def _groupby_series( matches = [*col_matches, *level_matches] if len(matches) != 1: raise ValueError( - f"GroupBy key {key} does not map to unambiguous column or index level" + f"GroupBy key {key} does not match a unique column or index level. BigQuery DataFrames only interprets lists of strings as column or index names, not directly as per-row group assignments." 
) col_ids = [*col_ids, matches[0]] return groupby.DataFrameGroupBy( block, - col_ids, + by_col_ids=col_ids, as_index=as_index, dropna=dropna, ) @@ -1311,8 +1528,9 @@ def sample( ns = (n,) if n is not None else () fracs = (frac,) if frac is not None else () - - return self._split(ns=ns, fracs=fracs, random_state=random_state)[0] + return DataFrame( + self._block._split(ns=ns, fracs=fracs, random_state=random_state)[0] + ) def _split( self, @@ -1326,88 +1544,33 @@ def _split( At most one of ns and fracs can be passed in. If neither, default to ns = (1,). Return a list of sampled DataFrames. """ - if ns and fracs: - raise ValueError("Only one of 'ns' or 'fracs' parameter must be specified.") - - block = self._block - if not ns and not fracs: - ns = (1,) - - if ns: - sample_sizes = ns - else: - total_rows = block.shape[0] - # Round to nearest integer. "round half to even" rule applies. - # At least to be 1. - sample_sizes = [round(frac * total_rows) or 1 for frac in fracs] - - # Set random_state if it is not provided - if random_state is None: - random_state = random.randint(-(2**30), 2**30) - - # Create a new column with random_state value. - block, random_state_col = block.create_constant(random_state) - - # Create an ordering col and a new sum col which is ordering+random_state. - block, ordering_col = block.promote_offsets() - block, sum_col = block.apply_binary_op( - ordering_col, random_state_col, ops.add_op - ) - - # Apply hash method to sum col and order by it. - block, string_sum_col = block.apply_unary_op( - sum_col, ops.AsTypeOp("string[pyarrow]") - ) - block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) - block = block.order_by([order.OrderingColumnReference(hash_string_sum_col)]) - - drop_cols = [ - random_state_col, - ordering_col, - sum_col, - string_sum_col, - hash_string_sum_col, - ] - block = block.drop_columns(drop_cols) - df = DataFrame(block) - - intervals = [] - cur = 0 - for sample_size in sample_sizes: - intervals.append((cur, cur + sample_size)) - cur += sample_size - - # DF.iloc[slice] always returns DF. - return [ - typing.cast(DataFrame, df.iloc[lower:upper]) for lower, upper in intervals - ] + blocks = self._block._split(ns=ns, fracs=fracs, random_state=random_state) + return [DataFrame(block) for block in blocks] - def to_pandas(self) -> pd.DataFrame: - """Writes DataFrame to Pandas DataFrame.""" - # TODO(chelsealin): Support block parameters. - # TODO(chelsealin): Add to_pandas_batches() API. - return self.compute() - - def to_csv(self, path_or_buf: str, *, index: bool = True) -> None: + def to_csv( + self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True + ) -> None: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. # TODO(swast): Some warning that wildcard is recommended for large # query results? See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not path_or_buf.startswith("gs://"): - raise NotImplementedError( - "Only Google Cloud Storage (gs://...) paths are supported." 
- ) + raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) + if "*" not in path_or_buf: + raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) - source_table = self._execute_query(index=index) - job_config = bigquery.ExtractJobConfig( - destination_format=bigquery.DestinationFormat.CSV + result_table = self._run_io_query( + index=index, ordering_id=bigframes.core.io.IO_ORDERING_ID ) - self._block.expr._session._extract_table( - source_table, - destination_uris=[path_or_buf], - job_config=job_config, + export_data_statement = bigframes.core.io.create_export_csv_statement( + f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", + uri=path_or_buf, + field_delimiter=sep, + header=header, ) + _, query_job = self._block.expr._session._start_query(export_data_statement) + self._set_internal_query_job(query_job) def to_json( self, @@ -1421,13 +1584,11 @@ def to_json( ) -> None: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. - # TODO(swast): Some warning that wildcard is recommended for large - # query results? See: - # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not path_or_buf.startswith("gs://"): - raise NotImplementedError( - "Only Google Cloud Storage (gs://...) paths are supported." - ) + raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) + + if "*" not in path_or_buf: + raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) if lines is True and orient != "records": raise ValueError( @@ -1438,18 +1599,20 @@ def to_json( # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#to_json_string if lines is False: raise NotImplementedError( - "Only newline delimited JSON format is supported." + f"Only newline delimited JSON format is supported. {constants.FEEDBACK_LINK}" ) - source_table = self._execute_query(index=index) - job_config = bigquery.ExtractJobConfig( - destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON + result_table = self._run_io_query( + index=index, ordering_id=bigframes.core.io.IO_ORDERING_ID ) - self._block.expr._session._extract_table( - source_table, - destination_uris=[path_or_buf], - job_config=job_config, + export_data_statement = bigframes.core.io.create_export_data_statement( + f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", + uri=path_or_buf, + format="JSON", + export_options={}, ) + _, query_job = self._block.expr._session._start_query(export_data_statement) + self._set_internal_query_job(query_job) def to_gbq( self, @@ -1457,6 +1620,7 @@ def to_gbq( *, if_exists: Optional[Literal["fail", "replace", "append"]] = "fail", index: bool = True, + ordering_id: Optional[str] = None, ) -> None: if "." not in destination_table: raise ValueError( @@ -1480,12 +1644,12 @@ def to_gbq( ), ) - self._execute_query(index=index, job_config=job_config) + self._run_io_query(index=index, ordering_id=ordering_id, job_config=job_config) def to_numpy( self, dtype=None, copy=False, na_value=None, **kwargs ) -> numpy.ndarray: - return self.compute().to_numpy(dtype, copy, na_value, **kwargs) + return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) __array__ = to_numpy @@ -1496,30 +1660,29 @@ def to_parquet(self, path: str, *, index: bool = True) -> None: # query results? See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size if not path.startswith("gs://"): - raise NotImplementedError( - "Only Google Cloud Storage (gs://...) paths are supported." 
- ) + raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) - source_table = self._execute_query(index=index) - job_config = bigquery.ExtractJobConfig( - destination_format=bigquery.DestinationFormat.PARQUET - ) + if "*" not in path: + raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) - self._block.expr._session._extract_table( - source_table, - destination_uris=[path], - job_config=job_config, + result_table = self._run_io_query( + index=index, ordering_id=bigframes.core.io.IO_ORDERING_ID ) + export_data_statement = bigframes.core.io.create_export_data_statement( + f"{result_table.project}.{result_table.dataset_id}.{result_table.table_id}", + uri=path, + format="PARQUET", + export_options={}, + ) + _, query_job = self._block.expr._session._start_query(export_data_statement) + self._set_internal_query_job(query_job) def _apply_to_rows(self, operation: ops.UnaryOp): block = self._block.multi_apply_unary_op(self._block.value_columns, operation) return DataFrame(block) - def _execute_query( - self, index: bool, job_config: Optional[bigquery.job.QueryJobConfig] = None - ): - """Executes a query job presenting this dataframe and returns the destination - table.""" + def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: + """Create query text representing this dataframe for I/O.""" expr = self._block.expr session = expr._session columns = list(self._block.value_columns) @@ -1531,26 +1694,53 @@ def _execute_query( if index and self.index.name is not None: columns.extend(self._block.index_columns) column_labels.extend(self.index.names) - # TODO(chelsealin): normalize the file formats if we needs, such as arbitrary - # unicode for column labels. - value_columns = (expr.get_column(column_name) for column_name in columns) - expr = expr.projection(value_columns) + else: + expr = expr.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. # TODO(swast): Handle duplicate and NULL labels. - ibis_expr = expr.to_ibis_expr() - renamed_columns = [ - ibis_expr[col_id].name(col_label) + id_overrides = { + col_id: col_label for col_id, col_label in zip(columns, column_labels) - ] - ibis_expr = ibis_expr.select(*renamed_columns) - sql = session.ibis_client.compile(ibis_expr) # type: ignore + if col_label + } + + if ordering_id is not None: + ibis_expr = expr.to_ibis_expr( + ordering_mode="offset_col", + col_id_overrides=id_overrides, + order_col_name=ordering_id, + ) + else: + ibis_expr = expr.to_ibis_expr( + ordering_mode="unordered", + col_id_overrides=id_overrides, + ) + + return session.ibis_client.compile(ibis_expr) # type: ignore + + def _run_io_query( + self, + index: bool, + ordering_id: Optional[str] = None, + job_config: Optional[bigquery.job.QueryJobConfig] = None, + ) -> bigquery.TableReference: + """Executes a query job presenting this dataframe and returns the destination + table.""" + expr = self._block.expr + session = expr._session + sql = self._create_io_query(index=index, ordering_id=ordering_id) _, query_job = session._start_query( sql=sql, job_config=job_config # type: ignore ) - return query_job.destination + self._set_internal_query_job(query_job) + + # The query job should have finished, so there should be always be a result table. 
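+        # Editor's note (based on this diff): _start_query waits for the job to finish,
+        # so query_job.destination is the temporary results table that the EXPORT DATA
+        # statements built in to_csv/to_json/to_parquet subsequently read from.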
+ result_table = query_job.destination + assert result_table is not None + return result_table def map(self, func, na_action: Optional[str] = None) -> DataFrame: if not callable(func): @@ -1560,18 +1750,29 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: raise ValueError(f"na_action={na_action} not supported") # TODO(shobs): Support **kwargs - - return self._apply_to_rows( + # Reproject as workaround to applying filter too late. This forces the filter + # to be applied before passing data to remote function, protecting from bad + # inputs causing errors. + reprojected_df = DataFrame(self._block._force_reproject()) + return reprojected_df._apply_to_rows( ops.RemoteFunctionOp(func, apply_on_null=(na_action is None)) ) - def drop_duplicates(self, subset=None, *, keep: str = "first") -> DataFrame: + def drop_duplicates( + self, + subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None, + *, + keep: str = "first", + ) -> DataFrame: if subset is None: column_ids = self._block.value_columns - else: + elif _is_list_like(subset): column_ids = [ id for label in subset for id in self._block.label_to_col_id[label] ] + else: + # interpret as single label + column_ids = self._block.label_to_col_id[typing.cast(blocks.Label, subset)] block = block_ops.drop_duplicates(self._block, column_ids, keep) return DataFrame(block) @@ -1619,8 +1820,8 @@ def _get_block(self) -> blocks.Block: def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: - return pd.api.types.is_list_like(obj) + return pandas.api.types.is_list_like(obj) def _is_dict_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Mapping]: - return pd.api.types.is_dict_like(obj) + return pandas.api.types.is_dict_like(obj) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index c257d6943e..95cf737b2e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -14,6 +14,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" +import textwrap import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -25,6 +26,8 @@ import pandas as pd import pyarrow as pa +import bigframes.constants as constants + # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ pd.BooleanDtype, @@ -37,6 +40,9 @@ # Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] +# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable +UNORDERED_DTYPES = [gpd.array.GeometryDtype()] + # Type hints for dtype strings supported by BigQuery DataFrame DtypeString = Literal[ "boolean", @@ -150,7 +156,9 @@ def ibis_dtype_to_bigframes_dtype( if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] else: - raise ValueError(f"Unexpected Ibis data type {type(ibis_dtype)}") + raise ValueError( + f"Unexpected Ibis data type {type(ibis_dtype)}. {constants.FEEDBACK_LINK}" + ) def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: @@ -182,14 +190,14 @@ def bigframes_dtype_to_ibis_dtype( """Converts a BigQuery DataFrames supported dtype to an Ibis dtype. Args: - bigframes_dtype: A dtype supported by BigQuery DataFrame + bigframes_dtype: + A dtype supported by BigQuery DataFrame Returns: - The corresponding Ibis type + IbisDtype: The corresponding Ibis type Raises: - ValueError: - If passed a dtype not supported by BigQuery DataFrames. 
+        ValueError: If passed a dtype not supported by BigQuery DataFrames.
     """
     type_string = str(bigframes_dtype)
     if type_string in BIGFRAMES_STRING_TO_BIGFRAMES:
@@ -197,7 +205,23 @@
             typing.cast(DtypeString, type_string)
         ]
     else:
-        raise ValueError(f"Unexpected data type {bigframes_dtype}")
+        raise ValueError(
+            textwrap.dedent(
+                f"""
+                Unexpected data type {bigframes_dtype}. The following
+                str dtypes are supported: 'boolean','Float64','Int64', 'string',
+                'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]',
+                'timestamp[us][pyarrow]','date32[day][pyarrow]',
+                'time64[us][pyarrow]'. The following pandas.ExtensionDtypes are
+                supported: pandas.BooleanDtype(), pandas.Float64Dtype(),
+                pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
+                pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
+                pd.ArrowDtype(pa.timestamp("us")),
+                pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
+                {constants.FEEDBACK_LINK}
+                """
+            )
+        )
 
     return BIGFRAMES_TO_IBIS[bigframes_dtype]
 
@@ -209,8 +233,10 @@ def literal_to_ibis_scalar(
     expression with a BigQuery DataFrames compatible data type
 
     Args:
-        literal: any value accepted by Ibis
-        force_dtype: force the value to a specific dtype
+        literal:
+            any value accepted by Ibis
+        force_dtype:
+            force the value to a specific dtype
         validate:
             If true, will raise ValueError if type cannot be stored in a
             BigQuery DataFrames object. If used as a subexpression, this should
@@ -227,7 +253,9 @@
 
     if pd.api.types.is_list_like(literal):
         if validate:
-            raise ValueError("List types can't be stored in BigQuery DataFrames")
+            raise ValueError(
+                f"List types can't be stored in BigQuery DataFrames. {constants.FEEDBACK_LINK}"
+            )
         # "correct" way would be to use ibis.array, but this produces invalid BQ SQL syntax
         return tuple(literal)
     if not pd.api.types.is_list_like(literal) and pd.isna(literal):
@@ -246,7 +274,9 @@
     # TODO(bmil): support other literals that can be coerced to compatible types
     if validate and (scalar_expr.type() not in BIGFRAMES_TO_IBIS.values()):
-        raise ValueError(f"Literal did not coerce to a supported data type: {literal}")
+        raise ValueError(
+            f"Literal did not coerce to a supported data type: {literal}. {constants.FEEDBACK_LINK}"
+        )
 
     return scalar_expr
 
@@ -255,9 +285,11 @@ def cast_ibis_value(value: ibis_types.Value, to_type: IbisDtype) -> ibis_types.V
     """Perform compatible type casts of ibis values
 
     Args:
-        value: Ibis value, which could be a literal, scalar, or column
+        value:
+            Ibis value, which could be a literal, scalar, or column
 
-        to_type: The Ibis type to cast to
+        to_type:
+            The Ibis type to cast to
 
     Returns:
         A new Ibis value of type to_type
@@ -275,7 +307,7 @@
             ibis_dtypes.float64,
             ibis_dtypes.string,
         ),
-        ibis_dtypes.float64: (ibis_dtypes.string,),
+        ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64),
         ibis_dtypes.string: (),
         ibis_dtypes.date: (),
         ibis_dtypes.time: (),
@@ -289,7 +321,9 @@
         return value.cast(to_type)
     else:
         # this should never happen
-        raise TypeError(f"Unexpected value type {value.type()}")
+        raise TypeError(
+            f"Unexpected value type {value.type()}. 
{constants.FEEDBACK_LINK}" + ) # casts that need some encouragement @@ -301,4 +335,9 @@ def cast_ibis_value(value: ibis_types.Value, to_type: IbisDtype) -> ibis_types.V if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64: return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) - raise TypeError(f"Unsupported cast {value.type()} to {to_type}") + if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: + return value != ibis_types.literal(0) + + raise TypeError( + f"Unsupported cast {value.type()} to {to_type}. {constants.FEEDBACK_LINK}" + ) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index fa0ed0c64d..2c3c50ba75 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -13,28 +13,53 @@ # limitations under the License. """Shared helper functions for formatting jobs related info.""" +# TODO(orrbradford): cleanup up typings and documenttion in this file import datetime -from typing import Optional +from typing import Any, Optional, Union +import google.api_core.exceptions as api_core_exceptions import google.cloud.bigquery as bigquery import humanize import IPython import IPython.display as display import ipywidgets as widgets +import bigframes.constants as constants + +GenericJob = Union[bigquery.LoadJob, bigquery.ExtractJob, bigquery.QueryJob] + query_job_prop_pairs = { "Job Id": "job_id", "Destination Table": "destination", "Slot Time": "slot_millis", - "Bytes Processed": "estimated_bytes_processed", + "Bytes Processed": "total_bytes_processed", "Cache hit": "cache_hit", } -def repr_query_job(query_job: Optional[bigquery.QueryJob]): +def add_feedback_link( + exception: Union[ + api_core_exceptions.RetryError, api_core_exceptions.GoogleAPICallError + ] +): + exception.message = exception.message + f" {constants.FEEDBACK_LINK}" + + +def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): + """Return query job in html format. + Args: + query_job: + The job representing the execution of the query on the server. + Returns: + Pywidget html table. + """ if query_job is None: return widgets.HTML("No job information available") + if query_job.dry_run: + return widgets.HTML( + f"Computation deferred. Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" + ) table_html = "" table_html += "" for key, value in query_job_prop_pairs.items(): @@ -54,7 +79,35 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): return widgets.HTML(table_html) -def wait_for_job( +def repr_query_job(query_job: Optional[bigquery.QueryJob]): + """Return query job as a formatted string. + Args: + query_job: + The job representing the execution of the query on the server. + Returns: + Pywidget html table. + """ + if query_job is None: + return "No job information available" + if query_job.dry_run: + return f"Computation deferred. 
Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" + res = "Query Job Info" + for key, value in query_job_prop_pairs.items(): + job_val = getattr(query_job, value) + if job_val is not None: + res += "\n" + if key == "Job Id": # add link to job + res += f"""Job url: {get_job_url(query_job)}""" + elif key == "Slot Time": + res += f"""{key}: {get_formatted_time(job_val)}""" + elif key == "Bytes Processed": + res += f"""{key}: {get_formatted_bytes(job_val)}""" + else: + res += f"""{key}: {job_val}""" + return res + + +def wait_for_query_job( query_job: bigquery.QueryJob, max_results: Optional[int] = None, progress_bar: Optional[str] = None, @@ -72,19 +125,75 @@ def wait_for_job( if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" - if progress_bar == "notebook": - display.display(loading_bar) - query_result = query_job.result(max_results=max_results) - query_job.reload() - loading_bar.close() - elif progress_bar == "terminal": - print(get_query_job_loading_string(query_job)) - query_result = query_job.result(max_results=max_results) - query_job.reload() - return query_result + try: + if progress_bar == "notebook": + display.display(loading_bar) + query_result = query_job.result(max_results=max_results) + query_job.reload() + loading_bar.value = get_query_job_loading_html(query_job) + elif progress_bar == "terminal": + initial_loading_bar = get_query_job_loading_string(query_job) + print(initial_loading_bar) + query_result = query_job.result(max_results=max_results) + query_job.reload() + if initial_loading_bar != get_query_job_loading_string(query_job): + print(get_query_job_loading_string(query_job)) + else: + # No progress bar. + query_result = query_job.result(max_results=max_results) + query_job.reload() + return query_result + except api_core_exceptions.RetryError as exc: + add_feedback_link(exc) + raise + except api_core_exceptions.GoogleAPICallError as exc: + add_feedback_link(exc) + raise + + +def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): + """Waits for job results. Displays a progress bar while the job is running + Args: + job: + The BigQuery job to be executed + """ + loading_bar = widgets.HTML(get_base_job_loading_html(job)) + if progress_bar == "auto": + progress_bar = "notebook" if in_ipython() else "terminal" + + try: + if progress_bar == "notebook": + display.display(loading_bar) + job.result() + job.reload() + loading_bar.value = get_base_job_loading_html(job) + elif progress_bar == "terminal": + initial_loading_bar = get_base_job_loading_string(job) + print(initial_loading_bar) + job.result() + job.reload() + if get_base_job_loading_string(job) != initial_loading_bar: + print(get_base_job_loading_string(job)) + else: + # No progress bar. + job.result() + job.reload() + except api_core_exceptions.RetryError as exc: + add_feedback_link(exc) + raise + except api_core_exceptions.GoogleAPICallError as exc: + add_feedback_link(exc) + raise -def get_job_url(query_job: bigquery.QueryJob): +def get_job_url(query_job: GenericJob): + """Return url to the query job in cloud console. + Args: + query_job: + The job representing the execution of the query on the server. + Returns: + String url. + """ if ( query_job.project is None or query_job.location is None @@ -95,16 +204,57 @@ def get_query_job_loading_html(query_job: bigquery.QueryJob): - return f"""Job {query_job.job_id} is {query_job.state}. 
Open Job""" + """Return progress bar html string + Args: + query_job: + The job representing the execution of the query on the server. + Returns: + Html string. + """ + return f"""Query job {query_job.job_id} is {query_job.state}. {get_bytes_processed_string(query_job.total_bytes_processed)}Open Job""" def get_query_job_loading_string(query_job: bigquery.QueryJob): - return ( - f"""Job {query_job.job_id} is {query_job.state}. \n{get_job_url(query_job)}""" - ) + """Return progress bar string + Args: + query_job: + The job representing the execution of the query on the server. + Returns: + String + """ + return f"""Query job {query_job.job_id} is {query_job.state}.{get_bytes_processed_string(query_job.total_bytes_processed)} \n{get_job_url(query_job)}""" + + +def get_base_job_loading_html(job: GenericJob): + """Return progress bar html string + Args: + job: + The job representing the execution of the query on the server. + Returns: + Html string. + """ + return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. Open Job""" + + +def get_base_job_loading_string(job: GenericJob): + """Return progress bar string + Args: + job: + The job representing the execution of the query on the server. + Returns: + String + """ + return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. \n{get_job_url(job)}""" def get_formatted_time(val): + """Try to format time + Args: + val: + Time in ms + Returns: + Duration string + """ try: return humanize.naturaldelta(datetime.timedelta(milliseconds=float(val))) except Exception: @@ -112,7 +262,24 @@ def get_formatted_time(val): def get_formatted_bytes(val): - return humanize.naturalsize(val) + """Try to format bytes + Args: + val (Any): + Bytes to format + Returns: + Size string + """ + if isinstance(val, int): + return humanize.naturalsize(val) + return "N/A" + + +def get_bytes_processed_string(val: Any): + """Try to get bytes processed string. Returns an empty string if passed a non-int value""" + bytes_processed_string = "" + if isinstance(val, int): + bytes_processed_string = f"""{get_formatted_bytes(val)} processed. """ + return bytes_processed_string def in_ipython(): diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index e0a1027dfe..e4c68eb17c 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,9 +22,9 @@ """ import abc -from typing import Optional, TypeVar +from typing import cast, Optional, TypeVar -from bigframes.ml.core import BqmlModel +from bigframes.ml import core import third_party.bigframes_vendored.sklearn.base @@ -92,7 +92,7 @@ class Predictor(BaseEstimator): """A BigQuery DataFrames ML Model base class that can be used to predict outputs.""" def __init__(self): - self._bqml_model: Optional[BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None @abc.abstractmethod def predict(self, X): @@ -102,21 +102,32 @@ def predict(self, X): def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: """Register the model to Vertex AI. + + After registering, go to https://pantheon.corp.google.com/vertex-ai/models to manage the model registries. + Refer to https://cloud.google.com/vertex-ai/docs/model-registry/introduction for more options. + Args: - vertex_ai_model_id: optional string id as model id in Vertex. If not set, will by default to 'bigframes_{bq_model_id}'. + vertex_ai_model_id (Optional[str], default None): + Optional string ID as the model ID in Vertex AI. If not set, defaults to 'bigframes_{bq_model_id}'. + The Vertex AI model ID will be truncated to 63 characters due to its length limit. 
Returns: BigQuery DataFrames Model after register. """ if not self._bqml_model: - raise RuntimeError("A model must be trained before register.") + # TODO(garrettwu): find a more elegant way to do this. + try: + self._bqml_model = self._create_bqml_model() # type: ignore + except AttributeError: + raise RuntimeError("A model must be trained before register.") + self._bqml_model = cast(core.BqmlModel, self._bqml_model) self._bqml_model.register(vertex_ai_model_id) return self class TrainablePredictor(Predictor): - """A BigQuery DataFrame ML Model base class that can be used to fit and predict outputs. + """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs. Also the predictor can be attached to a pipeline with transformers.""" @@ -124,10 +135,9 @@ class TrainablePredictor(Predictor): def fit(self, X, y, transforms): pass - # TODO(b/289280565): enable signatures after updating KMeans and PCA - # @abc.abstractmethod - # def score(self, X, y): - # pass + @abc.abstractmethod + def score(self, X, y): + pass # TODO(b/291812029): move to Predictor after implement in LLM and imported models @abc.abstractmethod diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 2dd95bfc1c..6f6efb9f37 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -13,35 +13,33 @@ # limitations under the License. """Clustering models. This module is styled after Scikit-Learn's cluster module: -https://scikit-learn.org/stable/modules/clustering.html""" +https://scikit-learn.org/stable/modules/clustering.html.""" from __future__ import annotations -from typing import cast, Dict, List, Optional, TYPE_CHECKING +from typing import cast, Dict, List, Optional, Union from google.cloud import bigquery -if TYPE_CHECKING: - import bigframes - -import bigframes.ml.base -import bigframes.ml.core +import bigframes +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.cluster._kmeans class KMeans( third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__ def __init__(self, n_clusters=8): self.n_clusters = n_clusters - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod - def _from_bq(session: bigframes.Session, model: bigquery.Model) -> KMeans: + @classmethod + def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans: assert model.model_type == "KMEANS" kwargs = {} @@ -51,8 +49,8 @@ def _from_bq(session: bigframes.Session, model: bigquery.Model) -> KMeans: if "numClusters" in last_fitting: kwargs["n_clusters"] = int(last_fitting["numClusters"]) - new_kmeans = KMeans(**kwargs) - new_kmeans._bqml_model = bigframes.ml.core.BqmlModel(session, model) + new_kmeans = cls(**kwargs) + new_kmeans._bqml_model = core.BqmlModel(session, model) return new_kmeans @property @@ -62,36 +60,55 @@ def _bqml_options(self) -> Dict[str, str | int | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y=None, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> KMeans: + (X,) = utils.convert_to_dataframe(X) + + self._bqml_model = core.create_bqml_model( train_X=X, transforms=transforms, options=self._bqml_options, ) + return self def predict( - 
self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - return cast( - bigframes.dataframe.DataFrame, self._bqml_model.predict(X)[["CENTROID_ID"]] - ) + (X,) = utils.convert_to_dataframe(X) + + return cast(bpd.DataFrame, self._bqml_model.predict(X)[["CENTROID_ID"]]) def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: + KMeans: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) + + def score( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("A model must be fitted before score") + + (X,) = utils.convert_to_dataframe(X) + + return self._bqml_model.evaluate(X) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 09645d4cf8..49b4899beb 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -12,33 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Build composite transformers on heterogenous data. This module is styled +"""Build composite transformers on heterogeneous data. 
This module is styled after Scikit-Learn's compose module: -https://scikit-learn.org/stable/modules/classes.html#module-sklearn.compose""" +https://scikit-learn.org/stable/modules/classes.html#module-sklearn.compose.""" from __future__ import annotations import typing -from typing import List, Optional, Tuple, TYPE_CHECKING, Union +from typing import List, Optional, Tuple, Union -if TYPE_CHECKING: - import bigframes - -import bigframes.ml.base -import bigframes.ml.compose -import bigframes.ml.core -import bigframes.ml.preprocessing +from bigframes import constants +from bigframes.ml import base, core, preprocessing, utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.compose._column_transformer CompilablePreprocessorType = Union[ - bigframes.ml.preprocessing.OneHotEncoder, - bigframes.ml.preprocessing.StandardScaler, + preprocessing.OneHotEncoder, + preprocessing.StandardScaler, ] class ColumnTransformer( third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, - bigframes.ml.base.BaseEstimator, + base.BaseEstimator, ): __doc__ = ( third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer.__doc__ @@ -54,14 +50,17 @@ def __init__( ] ], ): + # TODO: if any(transformers) has fitted raise warning self.transformers = transformers - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None + # call self.transformers_ to check chained transformers + self.transformers_ @property def transformers_( self, ) -> List[Tuple[str, CompilablePreprocessorType, str,]]: - """The collection of transformers as tuples of (name, transformer, column)""" + """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, @@ -69,13 +68,22 @@ def transformers_( str, ] ] = [] + + column_set: set[str] = set() for entry in self.transformers: name, transformer, column_or_columns = entry - if isinstance(column_or_columns, str): - result.append((name, transformer, column_or_columns)) - else: - for column in column_or_columns: - result.append((name, transformer, column)) + columns = ( + column_or_columns + if isinstance(column_or_columns, List) + else [column_or_columns] + ) + + for column in columns: + if column in column_set: + raise NotImplementedError( + f"Chained transformers on the same column isn't supported. 
{constants.FEEDBACK_LINK}" + ) + result.append((name, transformer, column)) return result @@ -84,9 +92,11 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns (List[str]): + a list of column names to transform - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: + a list of tuples of (sql_expression, output_name)""" return [ transformer._compile_to_sql([column])[0] for column in columns @@ -96,12 +106,15 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - ): + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> ColumnTransformer: + (X,) = utils.convert_to_dataframe(X) + compiled_transforms = self._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - self._bqml_model = bigframes.ml.core.create_bqml_model( + self._bqml_model = core.create_bqml_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, @@ -109,15 +122,16 @@ def fit( # The schema of TRANSFORM output is not available in the model API, so save it during fitting self._output_names = [name for _, name in compiled_transforms] + return self - def transform( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.transform(X) return typing.cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[self._output_names], ) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 2663a8e17e..9629ca0f4d 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -21,9 +21,10 @@ from google.cloud import bigquery -import bigframes.dataframe -import bigframes.ml.sql -import bigframes.session +import bigframes +import bigframes.constants as constants +from bigframes.ml import sql as ml_sql +import bigframes.pandas as bpd class BqmlModel: @@ -33,7 +34,7 @@ class BqmlModel: BigQuery DataFrames ML. """ - def __init__(self, session: bigframes.session.Session, model: bigquery.Model): + def __init__(self, session: bigframes.Session, model: bigquery.Model): self._session = session self._model = model @@ -52,29 +53,35 @@ def model(self) -> bigquery.Model: """Get the BQML model associated with this wrapper""" return self._model - @staticmethod + @classmethod def _apply_sql( + cls, session: bigframes.Session, - input_data: bigframes.dataframe.DataFrame, + input_data: bpd.DataFrame, func: Callable[[str], str], - ) -> bigframes.dataframe.DataFrame: + ) -> bpd.DataFrame: """Helper to wrap a dataframe in a SQL query, keeping the index intact. Args: - session: the active bigframes.Session + session (bigframes.Session): + the active bigframes.Session - input_data: the dataframe to be wrapped + input_data (bigframes.dataframe.DataFrame): + the dataframe to be wrapped - func: a function that will accept a SQL string and produce a new SQL + func (function): + a function that will accept a SQL string and produce a new SQL string from which to construct the output dataframe. It must include the index columns of the input SQL. 
""" - source_sql, tagged_index_cols = input_data.to_sql_query( + source_sql, tagged_index_cols = input_data._to_sql_query( always_include_index=True ) if len(tagged_index_cols) != 1: - raise NotImplementedError("Only exactly one index column is supported") + raise NotImplementedError( + f"Only exactly one index column is supported. {constants.FEEDBACK_LINK}" + ) index_col_name, is_named_index = tagged_index_cols[0] sql = func(source_sql) @@ -84,79 +91,75 @@ def _apply_sql( return df - def predict( - self, input_data: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( self._session, input_data, - lambda source_sql: bigframes.ml.sql.ml_predict( + lambda source_sql: ml_sql.ml_predict( model_name=self.model_name, source_sql=source_sql ), ) - def transform( - self, input_data: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( self._session, input_data, - lambda source_sql: bigframes.ml.sql.ml_transform( + lambda source_sql: ml_sql.ml_transform( model_name=self.model_name, source_sql=source_sql ), ) def generate_text( self, - input_data: bigframes.dataframe.DataFrame, + input_data: bpd.DataFrame, options: Mapping[str, int | float], - ) -> bigframes.dataframe.DataFrame: + ) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( self._session, input_data, - lambda source_sql: bigframes.ml.sql.ml_generate_text( + lambda source_sql: ml_sql.ml_generate_text( model_name=self.model_name, source_sql=source_sql, - struct_options=bigframes.ml.sql.struct_options(**options), + struct_options=ml_sql.struct_options(**options), ), ) - def embed_text( + def generate_text_embedding( self, - input_data: bigframes.dataframe.DataFrame, + input_data: bpd.DataFrame, options: Mapping[str, int | float], - ) -> bigframes.dataframe.DataFrame: + ) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( self._session, input_data, - lambda source_sql: bigframes.ml.sql.ml_embed_text( + lambda source_sql: ml_sql.ml_generate_text_embedding( model_name=self.model_name, source_sql=source_sql, - struct_options=bigframes.ml.sql.struct_options(**options), + struct_options=ml_sql.struct_options(**options), ), ) - def forecast(self) -> bigframes.dataframe.DataFrame: - sql = bigframes.ml.sql.ml_forecast(self.model_name) + def forecast(self) -> bpd.DataFrame: + sql = ml_sql.ml_forecast(self.model_name) return self._session.read_gbq(sql) - def evaluate(self, input_data: Union[bigframes.dataframe.DataFrame, None] = None): + def evaluate(self, input_data: Optional[bpd.DataFrame] = None): # TODO: validate input data schema # Note: don't need index as evaluate returns a new table source_sql, _ = ( - input_data.to_sql_query(always_include_index=False) + input_data._to_sql_query(always_include_index=False) if (input_data is not None) else (None, None) ) - sql = bigframes.ml.sql.ml_evaluate(self.model_name, source_sql) + sql = ml_sql.ml_evaluate(self.model_name, source_sql) return self._session.read_gbq(sql) - def copy(self, new_model_name, replace=False) -> BqmlModel: + def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: job_config = bigquery.job.CopyJobConfig() if replace: job_config.write_disposition = "WRITE_TRUNCATE" @@ -173,10 +176,11 @@ def register(self, vertex_ai_model_id: Optional[str] = 
None) -> BqmlModel: # vertex id needs to start with letters. https://cloud.google.com/vertex-ai/docs/general/resource-naming vertex_ai_model_id = "bigframes_" + cast(str, self._model.model_id) - options_sql = bigframes.ml.sql.options( - **{"vertex_ai_model_id": vertex_ai_model_id} - ) - sql = bigframes.ml.sql.alter_model(self.model_name, options_sql=options_sql) + # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models. + # The possibility of conflicts should be low. + vertex_ai_model_id = vertex_ai_model_id[:63] + options_sql = ml_sql.options(**{"vertex_ai_model_id": vertex_ai_model_id}) + sql = ml_sql.alter_model(self.model_name, options_sql=options_sql) # Register the model and wait it to finish self._session.bqclient.query(sql).result() @@ -185,8 +189,8 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: def create_bqml_model( - train_X: bigframes.dataframe.DataFrame, - train_y: Optional[bigframes.dataframe.DataFrame] = None, + train_X: bpd.DataFrame, + train_y: Optional[bpd.DataFrame] = None, transforms: Optional[Iterable[str]] = None, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: @@ -217,29 +221,22 @@ def create_bqml_model( # for now, drop index to avoid including the index in feature columns input_data = input_data.reset_index(drop=True) - model_name = f"{session._session_dataset_id}.{uuid.uuid4().hex}" source_sql = input_data.sql - options_sql = bigframes.ml.sql.options(**options) - transform_sql = ( - bigframes.ml.sql.transform(*transforms) if transforms is not None else None - ) - sql = bigframes.ml.sql.create_model( - model_name=model_name, + options_sql = ml_sql.options(**options) + transform_sql = ml_sql.transform(*transforms) if transforms is not None else None + sql = ml_sql.create_model( + model_name=_create_temp_model_name(), source_sql=source_sql, transform_sql=transform_sql, options_sql=options_sql, ) - # fit the model, synchronously - session.bqclient.query(sql).result() - - model = session.bqclient.get_model(model_name) - return BqmlModel(session, model) + return _create_bqml_model_with_sql(session=session, sql=sql) def create_bqml_time_series_model( - train_X: bigframes.dataframe.DataFrame, - train_y: bigframes.dataframe.DataFrame, + train_X: bpd.DataFrame, + train_y: bpd.DataFrame, transforms: Optional[Iterable[str]] = None, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: @@ -258,25 +255,18 @@ def create_bqml_time_series_model( # pickpocket session object from the dataframe session = train_X._get_block().expr._session - model_name = f"{session._session_dataset_id}.{uuid.uuid4().hex}" source_sql = input_data.sql - options_sql = bigframes.ml.sql.options(**options) + options_sql = ml_sql.options(**options) - transform_sql = ( - bigframes.ml.sql.transform(*transforms) if transforms is not None else None - ) - sql = bigframes.ml.sql.create_model( - model_name=model_name, + transform_sql = ml_sql.transform(*transforms) if transforms is not None else None + sql = ml_sql.create_model( + model_name=_create_temp_model_name(), source_sql=source_sql, transform_sql=transform_sql, options_sql=options_sql, ) - # fit the model, synchronously - session.bqclient.query(sql).result() - - model = session.bqclient.get_model(model_name) - return BqmlModel(session, model) + return _create_bqml_model_with_sql(session=session, sql=sql) def create_bqml_remote_model( @@ -287,25 +277,22 @@ def create_bqml_remote_model( """Create a session-temporary BQML remote 
model with the CREATE MODEL statement Args: - connection_name: a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection - options: a dict of options to configure the model. Generates a BQML OPTIONS - clause + connection_name: + a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection + options: + a dict of options to configure the model. Generates a BQML OPTIONS clause - Returns: a BqmlModel, wrapping a trained model in BigQuery + Returns: + BqmlModel: a BqmlModel wrapping a trained model in BigQuery """ - model_name = f"{session._session_dataset_id}.{uuid.uuid4().hex}" - options_sql = bigframes.ml.sql.options(**options) - sql = bigframes.ml.sql.create_remote_model( - model_name=model_name, + options_sql = ml_sql.options(**options) + sql = ml_sql.create_remote_model( + model_name=_create_temp_model_name(), connection_name=connection_name, options_sql=options_sql, ) - # create the model, synchronously - session.bqclient.query(sql).result() - - model = session.bqclient.get_model(model_name) - return BqmlModel(session, model) + return _create_bqml_model_with_sql(session=session, sql=sql) def create_bqml_imported_model( @@ -320,15 +307,25 @@ def create_bqml_imported_model( Returns: a BqmlModel, wrapping a trained model in BigQuery """ - model_name = f"{session._session_dataset_id}.{uuid.uuid4().hex}" - options_sql = bigframes.ml.sql.options(**options) - sql = bigframes.ml.sql.create_imported_model( - model_name=model_name, + options_sql = ml_sql.options(**options) + sql = ml_sql.create_imported_model( + model_name=_create_temp_model_name(), options_sql=options_sql, ) - # create the model, synchronously - session.bqclient.query(sql).result() + return _create_bqml_model_with_sql(session=session, sql=sql) + + +def _create_temp_model_name() -> str: + return uuid.uuid4().hex + + +def _create_bqml_model_with_sql(session: bigframes.Session, sql: str) -> BqmlModel: + # fit the model, synchronously + job = session.bqclient.query(sql) + job.result() - model = session.bqclient.get_model(model_name) + # real model path in the session specific hidden dataset and table prefix + model_name_full = f"{job.destination.dataset_id}.{job.destination.table_id}" + model = session.bqclient.get_model(model_name_full) return BqmlModel(session, model) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 08551d1cb8..76b4f9ced6 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -13,34 +13,32 @@ # limitations under the License. """Matrix Decomposition models. 
This module is styled after Scikit-Learn's decomposition module: -https://scikit-learn.org/stable/modules/decomposition.html""" +https://scikit-learn.org/stable/modules/decomposition.html.""" from __future__ import annotations -from typing import cast, List, Optional, TYPE_CHECKING +from typing import cast, List, Optional, Union from google.cloud import bigquery -if TYPE_CHECKING: - import bigframes - -import bigframes.ml.base -import bigframes.ml.core +import bigframes +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.decomposition._pca class PCA( third_party.bigframes_vendored.sklearn.decomposition._pca.PCA, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.sklearn.decomposition._pca.PCA.__doc__ def __init__(self, n_components=3): self.n_components = n_components - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod - def _from_bq(session: bigframes.Session, model: bigquery.Model) -> PCA: + @classmethod + def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: assert model.model_type == "PCA" kwargs = {} @@ -50,17 +48,19 @@ def _from_bq(session: bigframes.Session, model: bigquery.Model) -> PCA: if "numPrincipalComponents" in last_fitting: kwargs["n_components"] = int(last_fitting["numPrincipalComponents"]) - new_pca = PCA(**kwargs) - new_pca._bqml_model = bigframes.ml.core.BqmlModel(session, model) + new_pca = cls(**kwargs) + new_pca._bqml_model = core.BqmlModel(session, model) return new_pca def fit( self, - X: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], y=None, transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> PCA: + (X,) = utils.convert_to_dataframe(X) + + self._bqml_model = core.create_bqml_model( train_X=X, transforms=transforms, options={ @@ -68,37 +68,45 @@ def fit( "num_principal_components": self.n_components, }, ) + return self - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: - """Predict the closest cluster for each sample in X. - - Args: - X: a BigQuery DataFrame to predict. - y: ignored for API consistency. - - Returns: predicted BigQuery DataFrames.""" + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) + return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, self._bqml_model.predict(X)[ ["principal_component_" + str(i + 1) for i in range(self.n_components)] ], ) def to_gbq(self, model_name: str, replace: bool = False) -> PCA: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. 
- Returns: saved model.""" + Returns: + PCA: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) + + def score( + self, + X=None, + y=None, + ) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("A model must be fitted before score") + + # TODO(b/291973741): X param is ignored. Update when BQML supports input in ML.EVALUATE. + return self._bqml_model.evaluate() diff --git a/bigframes/ml/ensemble.py index 117759ca1c..56a0cc3d94 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -12,20 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Linear models. This module is styled after Scikit-Learn's linear_model module: -https://scikit-learn.org/stable/modules/linear_model.html""" +"""Ensemble models. This module is styled after Scikit-Learn's ensemble module: +https://scikit-learn.org/stable/modules/ensemble.html""" from __future__ import annotations -from typing import cast, Dict, List, Literal, Optional, TYPE_CHECKING +from typing import cast, Dict, List, Literal, Optional, Union from google.cloud import bigquery -if TYPE_CHECKING: - import bigframes - -import bigframes.ml.base -import bigframes.ml.core +import bigframes +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.ensemble._forest import third_party.bigframes_vendored.xgboost.sklearn @@ -51,7 +49,7 @@ class XGBRegressor( third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__ @@ -96,10 +94,12 @@ def __init__( self.min_rel_progress = min_rel_progress self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod - def _from_bq(session: bigframes.Session, model: bigquery.Model) -> XGBRegressor: + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> XGBRegressor: assert model.model_type == "BOOSTED_TREE_REGRESSOR" kwargs = {} @@ -107,14 +107,14 @@ def _from_bq(session: bigframes.Session, model: bigquery.Model) -> XGBRegressor: # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_regressor = XGBRegressor() + dummy_regressor = cls() for bf_param, bf_value in dummy_regressor.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) if bqml_param is not None: kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) - new_xgb_regressor = XGBRegressor(**kwargs) - new_xgb_regressor._bqml_model = bigframes.ml.core.BqmlModel(session, model) + new_xgb_regressor = cls(**kwargs) + new_xgb_regressor._bqml_model = core.BqmlModel(session, model) return new_xgb_regressor @property @@ -145,26 +145,31 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> 
XGBRegressor: + X, y = utils.convert_to_dataframe(X, y) + + self._bqml_model = core.create_bqml_model( X, y, transforms=transforms, options=self._bqml_options, ) + return self def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -175,9 +180,11 @@ def predict( def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], ): + X, y = utils.convert_to_dataframe(X, y) + if not self._bqml_model: raise RuntimeError("A model must be fitted before score") @@ -187,11 +194,13 @@ def score( return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. Returns: saved model.""" if not self._bqml_model: @@ -203,7 +212,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: class XGBClassifier( third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier.__doc__ @@ -249,10 +258,12 @@ def __init__( self.min_rel_progress = min_rel_progress self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod - def _from_bq(session: bigframes.Session, model: bigquery.Model) -> XGBClassifier: + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> XGBClassifier: assert model.model_type == "BOOSTED_TREE_CLASSIFIER" kwargs = {} @@ -266,8 +277,8 @@ def _from_bq(session: bigframes.Session, model: bigquery.Model) -> XGBClassifier if bqml_param is not None: kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) - new_xgb_classifier = XGBClassifier(**kwargs) - new_xgb_classifier._bqml_model = bigframes.ml.core.BqmlModel(session, model) + new_xgb_classifier = cls(**kwargs) + new_xgb_classifier._bqml_model = core.BqmlModel(session, model) return new_xgb_classifier @property @@ -298,26 +309,29 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> XGBClassifier: + X, y = utils.convert_to_dataframe(X, y) + + self._bqml_model = core.create_bqml_model( X, y, transforms=transforms, options=self._bqml_options, ) + return self - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise 
RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -328,25 +342,30 @@ def predict( def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], ): if not self._bqml_model: raise RuntimeError("A model must be fitted before score") + X, y = utils.convert_to_dataframe(X, y) + input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None ) return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: + XGBClassifier: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -356,7 +375,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: class RandomForestRegressor( third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = ( @@ -396,11 +415,11 @@ def __init__( self.min_rel_progress = min_rel_progress self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod + @classmethod def _from_bq( - session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, model: bigquery.Model ) -> RandomForestRegressor: assert model.model_type == "RANDOM_FOREST_REGRESSOR" @@ -409,16 +428,14 @@ def _from_bq( # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_model = RandomForestRegressor() + dummy_model = cls() for bf_param, bf_value in dummy_model.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) if bqml_param is not None: kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) - new_random_forest_regressor = RandomForestRegressor(**kwargs) - new_random_forest_regressor._bqml_model = bigframes.ml.core.BqmlModel( - session, model - ) + new_random_forest_regressor = cls(**kwargs) + new_random_forest_regressor._bqml_model = core.BqmlModel(session, model) return new_random_forest_regressor @property @@ -446,26 +463,32 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> RandomForestRegressor: + X, y = utils.convert_to_dataframe(X, y) + + self._bqml_model = core.create_bqml_model( X, y, transforms=transforms, options=self._bqml_options, ) + return self def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not 
self._bqml_model: raise RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -476,36 +499,41 @@ def predict( def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], ): """Calculate evaluation metrics of the model. Args: - X: a BigFrames DataFrame as evaluation data. - y: a BigFrames DataFrame as evaluation labels. - - Returns: a BigFrames DataFrame as evaluation result.""" + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame as evaluation data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame as evaluation labels. + + Returns: + bigframes.dataframe.DataFrame: The DataFrame as evaluation result. + """ if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - if (X is None) != (y is None): - raise ValueError( - "Either both or neither of test_X and test_y must be specified" - ) + X, y = utils.convert_to_dataframe(X, y) + input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None ) return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegressor: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: + RandomForestRegressor: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -515,7 +543,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso class RandomForestClassifier( third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = ( @@ -555,11 +583,11 @@ def __init__( self.min_rel_progress = min_rel_progress self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod + @classmethod def _from_bq( - session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, model: bigquery.Model ) -> RandomForestClassifier: assert model.model_type == "RANDOM_FOREST_CLASSIFIER" @@ -574,10 +602,8 @@ def _from_bq( if bqml_param is not None: kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) - new_random_forest_classifier = RandomForestClassifier(**kwargs) - new_random_forest_classifier._bqml_model = bigframes.ml.core.BqmlModel( - session, model - ) + new_random_forest_classifier = cls(**kwargs) + new_random_forest_classifier._bqml_model = core.BqmlModel(session, model) return new_random_forest_classifier @property @@ -605,26 +631,32 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, - ): - self._bqml_model = 
bigframes.ml.core.create_bqml_model( + ) -> RandomForestClassifier: + X, y = utils.convert_to_dataframe(X, y) + + self._bqml_model = core.create_bqml_model( X, y, transforms=transforms, options=self._bqml_options, ) + return self def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -635,36 +667,41 @@ def predict( def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], ): """Calculate evaluation metrics of the model. Args: - X: a BigFrames DataFrame as evaluation data. - y: a BigFrames DataFrame as evaluation labels. - - Returns: a BigFrames DataFrame as evaluation result.""" + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame as evaluation data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame as evaluation labels. + + Returns: + bigframes.dataframe.DataFrame: The DataFrame as evaluation result. + """ if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - if (X is None) != (y is None): - raise ValueError( - "Either both or neither of test_X and test_y must be specified" - ) + X, y = utils.convert_to_dataframe(X, y) + input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None ) return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestClassifier: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: + RandomForestClassifier: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 24afaad7f2..b7e0553ecb 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -12,35 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+"""Forcasting models.""" + from __future__ import annotations -from typing import cast, Dict, List, Optional, TYPE_CHECKING +from typing import cast, Dict, List, Optional, Union from google.cloud import bigquery -if TYPE_CHECKING: - import bigframes - -import bigframes.ml.base -import bigframes.ml.core +import bigframes +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd _PREDICT_OUTPUT_COLUMNS = ["forecast_timestamp", "forecast_value"] -class ARIMAPlus(bigframes.ml.base.TrainablePredictor): +class ARIMAPlus(base.TrainablePredictor): """Time Series ARIMA Plus model.""" def __init__(self): - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod - def _from_bq(session: bigframes.Session, model: bigquery.Model) -> ARIMAPlus: + @classmethod + def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ARIMAPlus: assert model.model_type == "ARIMA_PLUS" kwargs: Dict[str, str | int | bool | float | List[str]] = {} - new_arima_plus = ARIMAPlus(**kwargs) - new_arima_plus._bqml_model = bigframes.ml.core.BqmlModel(session, model) + new_arima_plus = cls(**kwargs) + new_arima_plus._bqml_model = core.BqmlModel(session, model) return new_arima_plus @property @@ -50,74 +50,90 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, ): - """Fit the model to training data + """Fit the model to training data. Args: - X: A dataframe of training timestamp. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A dataframe of training timestamp. + + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + Target values for training. + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. + + Returns: + ARIMAPlus: Fitted estimator. + """ + X, y = utils.convert_to_dataframe(X, y) - y: Target values for training.""" - self._bqml_model = bigframes.ml.core.create_bqml_time_series_model( + self._bqml_model = core.create_bqml_time_series_model( X, y, transforms=transforms, options=self._bqml_options, ) - def predict(self, X=None) -> bigframes.dataframe.DataFrame: + def predict(self, X=None) -> bpd.DataFrame: """Predict the closest cluster for each sample in X. Args: - X: ignored, to be compatible with other APIs. + X (default None): + ignored, to be compatible with other APIs. + Returns: - The predicted BigQuery DataFrames. Which contains 2 columns - "forecast_timestamp" and "forecast_value". + bigframes.dataframe.DataFrame: The predicted DataFrames. Which + contains 2 columns "forecast_timestamp" and "forecast_value". """ if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, self._bqml_model.forecast()[_PREDICT_OUTPUT_COLUMNS], ) - # Unlike regression models, time series forcasting can only evaluate with unseen data. X and y must be providee. def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, - ) -> bigframes.dataframe.DataFrame: + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. 
Args: - X: - A BigQuery DataFrames only contains 1 column as + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame only contains 1 column as evaluation timestamp. The timestamp must be within the horizon of the model, which by default is 1000 data points. - y: - A BigQuery DataFrames only contains 1 column as + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame only contains 1 column as evaluation numeric values. Returns: - A BigQuery DataFrames as evaluation result. + bigframes.dataframe.DataFrame: A DataFrame as evaluation result. """ if not self._bqml_model: raise RuntimeError("A model must be fitted before score") + X, y = utils.convert_to_dataframe(X, y) input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> ARIMAPlus: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. - Returns: saved model.""" + Returns: + ARIMAPlus: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 581ee2b1e2..89078f8267 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -12,47 +12,68 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Imported models.""" + from __future__ import annotations -from typing import cast, TYPE_CHECKING +from typing import cast, Optional, Union -if TYPE_CHECKING: - import bigframes +from google.cloud import bigquery -import bigframes.ml.base -import bigframes.ml.core +import bigframes +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd -class TensorFlowModel(bigframes.ml.base.Predictor): +class TensorFlowModel(base.Predictor): """Imported TensorFlow model. Args: - session: BQ session to create the model - model_path: GCS path that holds the model files.""" + session (BigQuery Session): + BQ session to create the model + model_path (str): + GCS path that holds the model files.""" - def __init__(self, session: bigframes.Session, model_path: str): + def __init__(self, session: bigframes.Session, model_path: Optional[str] = None): self.session = session self.model_path = model_path - self._bqml_model: bigframes.ml.core.BqmlModel = self._create_bqml_model() + self._bqml_model: Optional[core.BqmlModel] = None def _create_bqml_model(self): options = {"model_type": "TENSORFLOW", "model_path": self.model_path} - return bigframes.ml.core.create_bqml_imported_model( - session=self.session, options=options - ) + return core.create_bqml_imported_model(session=self.session, options=options) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> TensorFlowModel: + assert model.model_type == "TENSORFLOW" - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + tf_model = cls(session=session, model_path=None) + tf_model._bqml_model = core.BqmlModel(session, model) + return tf_model + + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X: Input DataFrame, schema is defined by the model. 
+ X (bigframes.dataframe.DataFrame): + Input DataFrame, schema is defined by the model. + + Returns: + bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + + if not self._bqml_model: + if self.model_path is None: + raise ValueError("Model GCS path must be provided.") + self._bqml_model = self._create_bqml_model() + self._bqml_model = cast(core.BqmlModel, self._bqml_model) + + (X,) = utils.convert_to_dataframe(X) - Returns: Output DataFrame, schema is defined by the model.""" df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -61,37 +82,74 @@ def predict( ], ) + def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: + """Save the model to BigQuery. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + TensorFlowModel: saved model.""" + if not self._bqml_model: + if self.model_path is None: + raise ValueError("Model GCS path must be provided.") + self._bqml_model = self._create_bqml_model() + self._bqml_model = cast(core.BqmlModel, self._bqml_model) + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) -class OnnxModel(bigframes.ml.base.BaseEstimator): + +class ONNXModel(base.Predictor): """Imported Open Neural Network Exchange (ONNX) model. Args: - session: BQ session to create the model - model_path: GCS path that holds the model files.""" + session (BigQuery Session): + BQ session to create the model + model_path (str): + Cloud Storage path that holds the model files.""" - def __init__(self, session: bigframes.Session, model_path: str): + def __init__(self, session: bigframes.Session, model_path: Optional[str] = None): self.session = session self.model_path = model_path - self._bqml_model: bigframes.ml.core.BqmlModel = self._create_bqml_model() + self._bqml_model: Optional[core.BqmlModel] = None def _create_bqml_model(self): options = {"model_type": "ONNX", "model_path": self.model_path} - return bigframes.ml.core.create_bqml_imported_model( - session=self.session, options=options - ) + return core.create_bqml_imported_model(session=self.session, options=options) + + @classmethod + def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ONNXModel: + assert model.model_type == "ONNX" - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + onnx_model = cls(session=session, model_path=None) + onnx_model._bqml_model = core.BqmlModel(session, model) + return onnx_model + + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X: Input DataFrame, schema is defined by the model. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame or Series, schema is defined by the model. 
+ + Returns: + bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.""" + + if not self._bqml_model: + if self.model_path is None: + raise ValueError("Model GCS path must be provided.") + self._bqml_model = self._create_bqml_model() + self._bqml_model = cast(core.BqmlModel, self._bqml_model) + + (X,) = utils.convert_to_dataframe(X) - Returns: Output DataFrame, schema is defined by the model.""" df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -99,3 +157,23 @@ def predict( ] ], ) + + def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: + """Save the model to BigQuery. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + ONNXModel: saved model.""" + if not self._bqml_model: + if self.model_path is None: + raise ValueError("Model GCS path must be provided.") + self._bqml_model = self._create_bqml_model() + self._bqml_model = cast(core.BqmlModel, self._bqml_model) + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 2b2ea5c2af..0b18db9315 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -12,27 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Linear models. This module is styled after Scikit-Learn's linear_model module: -https://scikit-learn.org/stable/modules/linear_model.html""" +"""Linear models. This module is styled after scikit-learn's linear_model module: +https://scikit-learn.org/stable/modules/linear_model.html.""" from __future__ import annotations -from typing import cast, Dict, List, Optional, TYPE_CHECKING +from typing import cast, Dict, List, Optional, Union from google.cloud import bigquery -if TYPE_CHECKING: - import bigframes - -import bigframes.ml.base -import bigframes.ml.core +import bigframes +import bigframes.constants as constants +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.linear_model._base import third_party.bigframes_vendored.sklearn.linear_model._logistic class LinearRegression( third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = ( third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__ @@ -43,10 +42,12 @@ def __init__( fit_intercept=True, ): self.fit_intercept = fit_intercept - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod - def _from_bq(session: bigframes.Session, model: bigquery.Model) -> LinearRegression: + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> LinearRegression: assert model.model_type == "LINEAR_REGRESSION" # TODO(bmil): construct a standard way to extract these properties @@ -57,8 +58,8 @@ def _from_bq(session: bigframes.Session, model: bigquery.Model) -> LinearRegress if "fitIntercept" in last_fitting: kwargs["fit_intercept"] = last_fitting["fitIntercept"] - new_linear_regression = LinearRegression(**kwargs) - new_linear_regression._bqml_model = bigframes.ml.core.BqmlModel(session, model) + new_linear_regression = cls(**kwargs) + 
new_linear_regression._bqml_model = core.BqmlModel(session, model) return new_linear_regression @property @@ -72,26 +73,29 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> LinearRegression: + X, y = utils.convert_to_dataframe(X, y) + + self._bqml_model = core.create_bqml_model( X, y, transforms=transforms, options=self._bqml_options, ) + return self - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -102,25 +106,28 @@ def predict( def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, - ) -> bigframes.dataframe.DataFrame: + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - input_data = ( - X.join(y, how="outer") if (X is not None) and (y is not None) else None - ) + X, y = utils.convert_to_dataframe(X, y) + + input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. 
- Returns: saved model.""" + Returns: + LinearRegression: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") @@ -130,7 +137,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: class LogisticRegression( third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, - bigframes.ml.base.TrainablePredictor, + base.TrainablePredictor, ): __doc__ = ( third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression.__doc__ @@ -144,11 +151,11 @@ def __init__( ): self.fit_intercept = fit_intercept self.auto_class_weights = auto_class_weights - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None - @staticmethod + @classmethod def _from_bq( - session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, model: bigquery.Model ) -> LogisticRegression: assert model.model_type == "LOGISTIC_REGRESSION" @@ -166,10 +173,8 @@ def _from_bq( # if "labelClassWeights" in last_fitting: # kwargs["class_weights"] = last_fitting["labelClassWeights"] - new_logistic_regression = LogisticRegression(**kwargs) - new_logistic_regression._bqml_model = bigframes.ml.core.BqmlModel( - session, model - ) + new_logistic_regression = cls(**kwargs) + new_logistic_regression._bqml_model = core.BqmlModel(session, model) return new_logistic_regression @property @@ -186,26 +191,32 @@ def _bqml_options(self) -> Dict[str, str | int | float | List[str]]: def fit( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], transforms: Optional[List[str]] = None, - ): - self._bqml_model = bigframes.ml.core.create_bqml_model( + ) -> LogisticRegression: + X, y = utils.convert_to_dataframe(X, y) + + self._bqml_model = core.create_bqml_model( X, y, transforms=transforms, options=self._bqml_options, ) + return self def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.predict(X) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[ [ cast(str, field.name) @@ -216,32 +227,37 @@ def predict( def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, - ) -> bigframes.dataframe.DataFrame: + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - input_data = ( - X.join(y, how="outer") if (X is not None) and (y is not None) else None - ) + X, y = utils.convert_to_dataframe(X, y) + + input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) def to_gbq(self, model_name: str, replace: bool = False) -> LogisticRegression: - """Save the model to Google Cloud BigQuey. + """Save the model to BigQuery. Args: - model_name: the name of the model. - replace: whether to replace if the model already exists. Default to False. + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. 
- Returns: saved model.""" + Returns: + LogisticRegression: saved model.""" if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") # TODO(ashleyxu): b/285162045 support auto_class_weights once the API is # fixed and enable the tests. if self.auto_class_weights is True: - raise NotImplementedError("auto_class_weight is not supported yet.") + raise NotImplementedError( + f"auto_class_weight is not supported yet. {constants.FEEDBACK_LINK}" + ) new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index ef4f28f1f7..def97b56ff 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -12,80 +12,92 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""LLM models.""" + from __future__ import annotations -from typing import cast +from typing import cast, Union import bigframes +import bigframes.constants as constants from bigframes.core import blocks -import bigframes.ml.base -import bigframes.ml.core +from bigframes.ml import base, core, utils +import bigframes.pandas as bpd _REMOTE_TEXT_GENERATOR_MODEL_CODE = "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1" _TEXT_GENERATE_RESULT_COLUMN = "ml_generate_text_llm_result" _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE = "CLOUD_AI_TEXT_EMBEDDING_MODEL_V1" -_EMBED_TEXT_RESULT_COLUMN = "ml_embed_text_embedding" +_EMBED_TEXT_RESULT_COLUMN = "text_embedding" -class PaLM2TextGenerator(bigframes.ml.base.Predictor): +class PaLM2TextGenerator(base.Predictor): """PaLM2 text generator LLM model. Args: - session: BQ session to create the model - connection_name: connection to connect with remote service. str of the format ..""" + session (BigQuery Session): + BQ session to create the model + connection_name (str): + connection to connect with remote service. str of the format ..""" def __init__(self, session: bigframes.Session, connection_name: str): self.session = session self.connection_name = connection_name - self._bqml_model: bigframes.ml.core.BqmlModel = self._create_bqml_model() + self._bqml_model: core.BqmlModel = self._create_bqml_model() def _create_bqml_model(self): options = { "remote_service_type": _REMOTE_TEXT_GENERATOR_MODEL_CODE, } - return bigframes.ml.core.create_bqml_remote_model( + return core.create_bqml_remote_model( session=self.session, connection_name=self.connection_name, options=options ) def predict( self, - X: bigframes.dataframe.DataFrame, + X: Union[bpd.DataFrame, bpd.Series], temperature: float = 0.0, max_output_tokens: int = 128, top_k: int = 40, top_p: float = 0.95, - ) -> bigframes.dataframe.DataFrame: + ) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X: Input DataFrame, which needs to contain a column with name "prompt". Only the column will be used as input. Prompts can include preamble, questions, suggestions, instructions, or examples. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame or Series, which needs to contain a column with name "prompt". Only the column will be used as input. + Prompts can include preamble, questions, suggestions, instructions, or examples. - temperature: The temperature is used for sampling during the response generation, which occurs when topP and topK are applied. + temperature (float, default 0.0): + The temperature is used for sampling during the response generation, which occurs when topP and topK are applied. 
Temperature controls the degree of randomness in token selection. Lower temperatures are good for prompts that expect a true or correct response, while higher temperatures can lead to more diverse or unexpected results. A temperature of 0 is deterministic: the highest probability token is always selected. For most use cases, try starting with a temperature of 0.2. Default 0. - max_output_tokens: Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. + max_output_tokens (int, default 128): + Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Default 128. - top_k: Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens + top_k (int, default 40): + Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens in the model’s vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature). For each token selection step, the top K tokens with the highest probabilities are sampled. Then tokens are further filtered based on topP with the final token selected using temperature sampling. Specify a lower value for less random responses and a higher value for more random responses. Default 40. - top_p: Top-p changes how the model selects tokens for output. Tokens are selected from most K (see topK parameter) probable to least until the sum of their probabilities equals the top-p value. + top_p (float, default 0.95):: + Top-p changes how the model selects tokens for output. Tokens are selected from most K (see topK parameter) probable to least until the sum of their probabilities equals the top-p value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-p value is 0.5, then the model will select either A or B as the next token (using temperature) and not consider C at all. Specify a lower value for less random responses and a higher value for more random responses. Default 0.95. - Returns: Output DataFrame with only 1 column as the output text results.""" + Returns: + bigframes.dataframe.DataFrame: Output DataFrame with only 1 column as the output text results.""" # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models if temperature < 0.0 or temperature > 1.0: @@ -98,8 +110,13 @@ def predict( raise ValueError(f"top_k must be [1, 40], but is {top_k}.") if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") + + (X,) = utils.convert_to_dataframe(X) + if len(X.columns) != 1: - raise ValueError("Only support one column as input.") + raise ValueError( + f"Only support one column as input. {constants.FEEDBACK_LINK}" + ) # BQML identified the column by name col_label = cast(blocks.Label, X.columns[0]) @@ -114,45 +131,52 @@ def predict( } df = self._bqml_model.generate_text(X, options) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[[_TEXT_GENERATE_RESULT_COLUMN]], ) -class PaLM2EmbeddingGenerator(bigframes.ml.base.Predictor): - """PaLM2 embedding generator LLM model. 
+class PaLM2TextEmbeddingGenerator(base.Predictor): + """PaLM2 text embedding generator LLM model. Args: - session: BQ session to create the model - connection_name: connection to connect with remote service. str of the format ..""" + session (BigQuery Session): + BQ session to create the model + connection_name (str): + connection to connect with remote service. str of the format ..""" def __init__(self, session: bigframes.Session, connection_name: str): self.session = session self.connection_name = connection_name - self._bqml_model: bigframes.ml.core.BqmlModel = self._create_bqml_model() + self._bqml_model: core.BqmlModel = self._create_bqml_model() def _create_bqml_model(self): options = { "remote_service_type": _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE, } - return bigframes.ml.core.create_bqml_remote_model( + return core.create_bqml_remote_model( session=self.session, connection_name=self.connection_name, options=options ) - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X: Input DataFrame, which needs to contain a column with name "content". Only the column will be used as input. Content can include preamble, questions, suggestions, instructions, or examples. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame, which needs to contain a column with name "content". Only the column will be used as input. Content can include preamble, questions, suggestions, instructions, or examples. - Returns: Output DataFrame with only 1 column as the output embedding results.""" + Returns: + bigframes.dataframe.DataFrame: Output DataFrame with only 1 column as the output embedding results + """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models + (X,) = utils.convert_to_dataframe(X) + if len(X.columns) != 1: - raise ValueError("Only support one column as input.") + raise ValueError( + f"Only support one column as input. 
{constants.FEEDBACK_LINK}" + ) # BQML identified the column by name col_label = cast(blocks.Label, X.columns[0]) @@ -161,8 +185,8 @@ def predict( options = { "flatten_json_output": True, } - df = self._bqml_model.embed_text(X, options) + df = self._bqml_model.generate_text_embedding(X, options) return cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[[_EMBED_TEXT_RESULT_COLUMN]], ) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index cc43b47698..805747c49b 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -14,61 +14,81 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union +from types import MappingProxyType +from typing import Union from google.cloud import bigquery -if TYPE_CHECKING: - import bigframes +import bigframes +import bigframes.constants as constants +from bigframes.ml import ( + cluster, + decomposition, + ensemble, + forecasting, + imported, + linear_model, + pipeline, +) -import bigframes.ml.cluster -import bigframes.ml.decomposition -import bigframes.ml.ensemble -import bigframes.ml.forecasting -import bigframes.ml.linear_model +_BQML_MODEL_TYPE_MAPPING = MappingProxyType( + { + "LINEAR_REGRESSION": linear_model.LinearRegression, + "LOGISTIC_REGRESSION": linear_model.LogisticRegression, + "KMEANS": cluster.KMeans, + "PCA": decomposition.PCA, + "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor, + "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier, + "ARIMA_PLUS": forecasting.ARIMAPlus, + "RANDOM_FOREST_REGRESSOR": ensemble.RandomForestRegressor, + "RANDOM_FOREST_CLASSIFIER": ensemble.RandomForestClassifier, + "TENSORFLOW": imported.TensorFlowModel, + "ONNX": imported.ONNXModel, + } +) def from_bq( - session: bigframes.Session, model: bigquery.Model + session: bigframes.Session, bq_model: bigquery.Model ) -> Union[ - bigframes.ml.decomposition.PCA, - bigframes.ml.cluster.KMeans, - bigframes.ml.linear_model.LinearRegression, - bigframes.ml.linear_model.LogisticRegression, - bigframes.ml.ensemble.XGBRegressor, - bigframes.ml.ensemble.XGBClassifier, - bigframes.ml.forecasting.ARIMAPlus, - bigframes.ml.ensemble.RandomForestRegressor, - bigframes.ml.ensemble.RandomForestClassifier, + decomposition.PCA, + cluster.KMeans, + linear_model.LinearRegression, + linear_model.LogisticRegression, + ensemble.XGBRegressor, + ensemble.XGBClassifier, + forecasting.ARIMAPlus, + ensemble.RandomForestRegressor, + ensemble.RandomForestClassifier, + imported.TensorFlowModel, + imported.ONNXModel, + pipeline.Pipeline, ]: """Load a BQML model to BigQuery DataFrames ML. Args: session: a BigQuery DataFrames session. - model: a BigQuery model. + bq_model: a BigQuery model. Returns: A BigQuery DataFrames ML model object. 
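        For illustration (the dataset and model names below are assumptions, not part of the patch): a BQ model whose model_type is "KMEANS" is returned as cluster.KMeans via _BQML_MODEL_TYPE_MAPPING below, while a model whose properties contain "transformColumns" is returned as a pipeline.Pipeline, e.g.

            model = session.read_gbq_model("my_dataset.my_kmeans_model")
            # expected to come back as a bigframes.ml.cluster.KMeans wrapper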
""" - if model.model_type == "LINEAR_REGRESSION": - return bigframes.ml.linear_model.LinearRegression._from_bq(session, model) - elif model.model_type == "KMEANS": - return bigframes.ml.cluster.KMeans._from_bq(session, model) - elif model.model_type == "PCA": - return bigframes.ml.decomposition.PCA._from_bq(session, model) - elif model.model_type == "LOGISTIC_REGRESSION": - return bigframes.ml.linear_model.LogisticRegression._from_bq(session, model) - elif model.model_type == "BOOSTED_TREE_REGRESSOR": - return bigframes.ml.ensemble.XGBRegressor._from_bq(session, model) - elif model.model_type == "BOOSTED_TREE_CLASSIFIER": - return bigframes.ml.ensemble.XGBClassifier._from_bq(session, model) - elif model.model_type == "ARIMA_PLUS": - return bigframes.ml.forecasting.ARIMAPlus._from_bq(session, model) - elif model.model_type == "RANDOM_FOREST_REGRESSOR": - return bigframes.ml.ensemble.RandomForestRegressor._from_bq(session, model) - elif model.model_type == "RANDOM_FOREST_CLASSIFIER": - return bigframes.ml.ensemble.RandomForestClassifier._from_bq(session, model) - else: - raise NotImplementedError( - f"Model type {model.model_type} is not yet supported by BigQuery DataFrames." + if _is_bq_model_pipeline(bq_model): + return pipeline.Pipeline._from_bq(session, bq_model) + + return _model_from_bq(session, bq_model) + + +def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): + if bq_model.model_type in _BQML_MODEL_TYPE_MAPPING: + return _BQML_MODEL_TYPE_MAPPING[bq_model.model_type]._from_bq( # type: ignore + session=session, model=bq_model ) + + raise NotImplementedError( + f"Model type {bq_model.model_type} is not yet supported by BigQuery DataFrames. {constants.FEEDBACK_LINK}" + ) + + +def _is_bq_model_pipeline(bq_model: bigquery.Model) -> bool: + return "transformColumns" in bq_model._properties diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics.py index 861e1f02d1..3bcb621f74 100644 --- a/bigframes/ml/metrics.py +++ b/bigframes/ml/metrics.py @@ -13,17 +13,18 @@ # limitations under the License. """Metrics functions for evaluating models. 
This module is styled after -Scikit-Learn's metrics module: https://scikit-learn.org/stable/modules/metrics.html""" +Scikit-Learn's metrics module: https://scikit-learn.org/stable/modules/metrics.html.""" import inspect import typing -from typing import Tuple +from typing import Tuple, Union import numpy as np import pandas as pd import sklearn.metrics as sklearn_metrics # type: ignore -import bigframes.core.blocks as blocks +import bigframes.constants as constants +from bigframes.ml import utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification import third_party.bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking @@ -31,33 +32,20 @@ def r2_score( - y_true: bpd.DataFrame, - y_pred: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], force_finite=True, ) -> float: - # TODO(bmil): support multioutput - if len(y_true.columns) > 1 or len(y_pred.columns) > 1: - raise NotImplementedError( - "Only one labels column, one predictions column is supported" - ) - - y_true_series = typing.cast( - bpd.Series, y_true[typing.cast(str, y_true.columns.tolist()[0])] - ) - y_pred_series = typing.cast( - bpd.Series, y_pred[typing.cast(str, y_pred.columns.tolist()[0])] - ) + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) # total sum of squares # (dataframe, scalar) binops - # TODO(bmil): remove multiply by self when bigframes supports pow() # TODO(tbergeron): These stats are eagerly evaluated. Move to lazy representation once scalar subqueries supported. delta_from_mean = y_true_series - y_true_series.mean() ss_total = (delta_from_mean * delta_from_mean).sum() # residual sum of squares # (scalar, scalar) binops - # TODO(bmil): remove multiply by self when bigframes supports pow() delta_from_pred = y_true_series - y_pred_series ss_res = (delta_from_pred * delta_from_pred).sum() @@ -71,22 +59,12 @@ def r2_score( def accuracy_score( - y_true: bpd.DataFrame, - y_pred: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], normalize=True, ) -> float: # TODO(ashleyxu): support sample_weight as the parameter - if len(y_true.columns) != 1 or len(y_pred.columns) != 1: - raise NotImplementedError( - "Only one labels column, one predictions column is supported" - ) - - y_true_series = typing.cast( - bpd.Series, y_true[typing.cast(str, y_true.columns.tolist()[0])] - ) - y_pred_series = typing.cast( - bpd.Series, y_pred[typing.cast(str, y_pred.columns.tolist()[0])] - ) + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) # Compute accuracy for each possible representation # TODO(ashleyxu): add multilabel classification support where y_type @@ -103,73 +81,86 @@ def accuracy_score( def roc_curve( - y_true: bpd.DataFrame, - y_score: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_score: Union[bpd.DataFrame, bpd.Series], drop_intermediate: bool = True, ) -> Tuple[bpd.Series, bpd.Series, bpd.Series]: # TODO(bmil): Add multi-class support # TODO(bmil): Add multi-label support - if len(y_true.columns) > 1 or len(y_score.columns) > 1: - raise NotImplementedError("Only binary classification is supported") # TODO(bmil): Implement drop_intermediate if drop_intermediate: - raise NotImplementedError("drop_intermediate is not yet implemented") + raise NotImplementedError( + f"drop_intermediate is not yet implemented. 
{constants.FEEDBACK_LINK}" + ) - # TODO(bmil): remove this once bigframes supports the necessary operations - session = y_true._block.expr._session - pd_y_true = y_true.to_pandas() - pd_y_score = y_score.to_pandas() + y_true_series, y_score_series = utils.convert_to_series(y_true, y_score) + + session = y_true_series._block.expr._session # We operate on rows, so, remove the index if there is one # TODO(bmil): check that the indexes are equivalent before removing - pd_y_true = pd_y_true.reset_index(drop=True) - pd_y_score = pd_y_score.reset_index(drop=True) - pd_df = pd.DataFrame( + y_true_series = typing.cast(bpd.Series, y_true_series.reset_index(drop=True)) + y_score_series = typing.cast(bpd.Series, y_score_series.reset_index(drop=True)) + + df = bpd.DataFrame( { - "y_true": pd_y_true[pd_y_true.columns[0]], - "y_score": pd_y_score[pd_y_score.columns[0]], + "y_true": y_true_series, + "y_score": y_score_series, } ) - total_positives = pd_df.y_true.sum() - total_negatives = len(pd_df) - total_positives + total_positives = y_true_series.sum() + total_negatives = y_true_series.count() - total_positives - pd_df = pd_df.sort_values(by="y_score", ascending=False) - pd_df["cum_tp"] = pd_df.y_true.cumsum() - pd_df["cum_fp"] = (~pd_df.y_true.astype(bool)).cumsum() + df = df.sort_values(by="y_score", ascending=False) + df["cum_tp"] = df["y_true"].cumsum() + # have to astype("Int64") as not supported boolean cumsum yet. + df["cum_fp"] = ( + (~typing.cast(bpd.Series, df["y_true"].astype("boolean"))) + .astype("Int64") + .cumsum() + ) # produce just one data point per y_score - pd_df = pd_df.groupby("y_score", as_index=False).last() - pd_df = pd_df.sort_values(by="y_score", ascending=False) + df = df.drop_duplicates(subset="y_score", keep="last") + df = df.sort_values(by="y_score", ascending=False) - pd_df["tpr"] = pd_df.cum_tp / total_positives - pd_df["fpr"] = pd_df.cum_fp / total_negatives - pd_df["thresholds"] = pd_df.y_score + df["tpr"] = typing.cast(bpd.Series, df["cum_tp"]) / total_positives + df["fpr"] = typing.cast(bpd.Series, df["cum_fp"]) / total_negatives + df["thresholds"] = typing.cast(bpd.Series, df["y_score"].astype("Float64")) # sklearn includes an extra datapoint for the origin with threshold np.inf - pd_origin = pd.DataFrame({"tpr": [0.0], "fpr": [0.0], "thresholds": np.inf}) - pd_df = pd.concat([pd_origin, pd_df]) + # having problems with concating inline + df_origin = session.read_pandas( + pd.DataFrame({"tpr": [0.0], "fpr": [0.0], "thresholds": np.inf}) + ) + df = typing.cast(bpd.DataFrame, bpd.concat([df_origin, df], ignore_index=True)) + df = df.reset_index(drop=True) - df = session.read_pandas(pd_df) - return df.fpr, df.tpr, df.thresholds + return ( + typing.cast(bpd.Series, df["fpr"]), + typing.cast(bpd.Series, df["tpr"]), + typing.cast(bpd.Series, df["thresholds"]), + ) roc_curve.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_curve) -def roc_auc_score(y_true: bpd.DataFrame, y_score: bpd.DataFrame) -> float: +def roc_auc_score( + y_true: Union[bpd.DataFrame, bpd.Series], y_score: Union[bpd.DataFrame, bpd.Series] +) -> float: # TODO(bmil): Add multi-class support # TODO(bmil): Add multi-label support - if len(y_true.columns) > 1 or len(y_score.columns) > 1: - raise NotImplementedError("Only binary classification is supported") + y_true_series, y_score_series = utils.convert_to_series(y_true, y_score) - fpr, tpr, _ = roc_curve(y_true, y_score, drop_intermediate=False) + fpr, tpr, _ = roc_curve(y_true_series, y_score_series, drop_intermediate=False) # TODO(bmil): remove 
this once bigframes supports the necessary operations - pd_fpr = fpr.compute() - pd_tpr = tpr.compute() + pd_fpr = fpr.to_pandas() + pd_tpr = tpr.to_pandas() # Use the trapezoid rule to compute the area under the ROC curve width_diff = pd_fpr.diff().iloc[1:].reset_index(drop=True) @@ -181,14 +172,13 @@ def roc_auc_score(y_true: bpd.DataFrame, y_score: bpd.DataFrame) -> float: def auc( - x: bpd.DataFrame, - y: bpd.DataFrame, + x: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], ) -> float: - if len(x.columns) != 1 or len(y.columns) != 1: - raise ValueError("Only 1-D data structure is supported") + x_series, y_series = utils.convert_to_series(x, y) # TODO(b/286410053) Support ML exceptions and error handling. - auc = sklearn_metrics.auc(x.to_pandas(), y.to_pandas()) + auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) return auc @@ -196,30 +186,24 @@ def auc( def confusion_matrix( - y_true: bpd.DataFrame, - y_pred: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], ) -> pd.DataFrame: # TODO(ashleyxu): support labels and sample_weight parameters - # TODO(ashleyxu): support bpd.Series as input type - if len(y_true.columns) != 1 or len(y_pred.columns) != 1: - raise NotImplementedError( - "Only one labels column, one predictions column is supported" - ) + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) - y_true_column = typing.cast(blocks.Label, y_true.columns[0]) - y_pred_series = typing.cast( - bpd.Series, - y_pred[typing.cast(blocks.Label, y_pred.columns.tolist()[0])], - ) - confusion_df = y_true.assign(y_pred=y_pred_series) + y_true_series = y_true_series.rename("y_true") + confusion_df = y_true_series.to_frame().assign(y_pred=y_pred_series) confusion_df = confusion_df.assign(dummy=0) groupby_count = ( - confusion_df.groupby(by=[y_true_column, "y_pred"], as_index=False) + confusion_df.groupby(by=["y_true", "y_pred"], as_index=False) .count() .to_pandas() ) - unique_values = sorted(set(groupby_count.y_true).union(set(groupby_count.y_pred))) + unique_values = sorted( + set(groupby_count["y_true"]).union(set(groupby_count["y_pred"])) + ) confusion_matrix = pd.DataFrame( 0, index=pd.Index(unique_values), columns=pd.Index(unique_values), dtype=int @@ -241,28 +225,17 @@ def confusion_matrix( def recall_score( - y_true: bpd.DataFrame, - y_pred: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], average: str = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" - # TODO(ashleyxu): support bpd.Series as input type - if len(y_true.columns) != 1 or len(y_pred.columns) != 1: + if average is not None: raise NotImplementedError( - "Only one labels column, one predictions column is supported" + f"Only average=None is supported. 
{constants.FEEDBACK_LINK}" ) - if average is not None: - raise NotImplementedError("Only average=None is supported") - - y_true_series = typing.cast( - bpd.Series, - y_true[typing.cast(blocks.Label, y_true.columns.tolist()[0])], - ) - y_pred_series = typing.cast( - bpd.Series, - y_pred[typing.cast(blocks.Label, y_pred.columns.tolist()[0])], - ) + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) is_accurate = y_true_series == y_pred_series unique_labels = ( @@ -275,7 +248,7 @@ def recall_score( recall = ( is_accurate.groupby(y_true_series).sum() / is_accurate.groupby(y_true_series).count() - ).compute() + ).to_pandas() recall_score = pd.Series(0, index=index) for i in recall_score.index: @@ -288,28 +261,17 @@ def recall_score( def precision_score( - y_true: bpd.DataFrame, - y_pred: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], average: str = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" - # TODO(ashleyxu): support bpd.Series as input type - if len(y_true.columns) != 1 or len(y_pred.columns) != 1: + if average is not None: raise NotImplementedError( - "Only one labels column, one predictions column is supported" + f"Only average=None is supported. {constants.FEEDBACK_LINK}" ) - if average is not None: - raise NotImplementedError("Only average=None is supported") - - y_true_series = typing.cast( - bpd.Series, - y_true[typing.cast(blocks.Label, y_true.columns.tolist()[0])], - ) - y_pred_series = typing.cast( - bpd.Series, - y_pred[typing.cast(blocks.Label, y_pred.columns.tolist()[0])], - ) + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) is_accurate = y_true_series == y_pred_series unique_labels = ( @@ -322,7 +284,7 @@ def precision_score( precision = ( is_accurate.groupby(y_pred_series).sum() / is_accurate.groupby(y_pred_series).count() - ).compute() + ).to_pandas() precision_score = pd.Series(0, index=index) for i in precision.index: @@ -337,22 +299,20 @@ def precision_score( def f1_score( - y_true: bpd.DataFrame, - y_pred: bpd.DataFrame, + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], average: str = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" - # TODO(ashleyxu): support bpd.Series as input type - if len(y_true.columns) != 1 or len(y_pred.columns) != 1: - raise NotImplementedError( - "Only one labels column, one predictions column is supported" - ) + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) if average is not None: - raise NotImplementedError("Only average=None is supported") + raise NotImplementedError( + f"Only average=None is supported. {constants.FEEDBACK_LINK}" + ) - recall = recall_score(y_true, y_pred, average=None) - precision = precision_score(y_true, y_pred, average=None) + recall = recall_score(y_true_series, y_pred_series, average=None) + precision = precision_score(y_true_series, y_pred_series, average=None) f1_score = pd.Series(0, index=recall.index) for index in recall.index: diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 73a2fd76a9..110cbcf493 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -14,40 +14,40 @@ """Functions for test/train split and model tuning. 
This module is styled after Scikit-Learn's model_selection module: -https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection""" +https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection.""" from typing import List, Union -import bigframes -import bigframes.dataframe +from bigframes.ml import utils +import bigframes.pandas as bpd def train_test_split( - *dataframes: bigframes.dataframe.DataFrame, + *arrays: Union[bpd.DataFrame, bpd.Series], test_size: Union[float, None] = None, train_size: Union[float, None] = None, random_state: Union[int, None] = None, -) -> List[bigframes.dataframe.DataFrame]: - """Splits dataframes into random train and test subsets +) -> List[Union[bpd.DataFrame, bpd.Series]]: + """Splits dataframes or series into random train and test subsets. Args: - *dataframes: - A sequence of BigQuery DataFrames that can be joined on + *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series): + A sequence of BigQuery DataFrames or Series that can be joined on their indexes - test_size: + test_size (default None): The proportion of the dataset to include in the test split. If None, this will default to the complement of train_size. If both are none, it will be set to 0.25. - train_size: + train_size (default None): The proportion of the dataset to include in the train split. If None, this will default to the complement of test_size. - random_state: + random_state (default None): A seed to use for randomly choosing the rows of the split. If not set, a random split will be generated each time. Returns: - A list of BigQuery DataFrames. + List[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]]: A list of BigQuery DataFrames or Series. """ # TODO(garrettwu): Scikit-Learn throws an error when the dataframes don't have the same @@ -73,14 +73,22 @@ def train_test_split( f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}" ) - results = dataframes[0]._split( - fracs=(train_size, test_size), random_state=random_state - ) - train_index = results[0].index - test_index = results[1].index + dfs = list(utils.convert_to_dataframe(*arrays)) - results += [ - df.loc[index] for df in dataframes[1:] for index in (train_index, test_index) + split_dfs = dfs[0]._split(fracs=(train_size, test_size), random_state=random_state) + train_index = split_dfs[0].index + test_index = split_dfs[1].index + + split_dfs += [ + df.loc[index] for df in dfs[1:] for index in (train_index, test_index) ] + # convert back to Series. + results: List[Union[bpd.DataFrame, bpd.Series]] = [] + for i, array in enumerate(arrays): + if isinstance(array, bpd.Series): + results += utils.convert_to_series(split_dfs[2 * i], split_dfs[2 * i + 1]) + else: + results += (split_dfs[2 * i], split_dfs[2 * i + 1]) + return results diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 022ba0148d..bfd0392526 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -13,15 +13,19 @@ # limitations under the License. """For composing estimators together. 
This module is styled after Scikit-Learn's -pipeline module: https://scikit-learn.org/stable/modules/pipeline.html""" +pipeline module: https://scikit-learn.org/stable/modules/pipeline.html.""" from __future__ import annotations -from typing import List, Optional, Tuple +from typing import cast, List, Optional, Tuple, Union + +from google.cloud import bigquery import bigframes -from bigframes.ml import base, cluster, compose, decomposition, preprocessing +import bigframes.constants as constants +from bigframes.ml import base, compose, loader, preprocessing, utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.pipeline @@ -36,7 +40,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): if len(steps) != 2: raise NotImplementedError( - "Currently only two step (transform, estimator) pipelines are supported" + f"Currently only two step (transform, estimator) pipelines are supported. {constants.FEEDBACK_LINK}" ) transform, estimator = steps[0][1], steps[1][1] @@ -51,7 +55,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): self._transform = transform else: raise NotImplementedError( - f"Transform {transform} is not yet supported by Pipeline" + f"Transform {transform} is not yet supported by Pipeline. {constants.FEEDBACK_LINK}" ) if not isinstance( @@ -59,44 +63,134 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): base.TrainablePredictor, ): raise NotImplementedError( - f"Estimator {estimator} is not supported by Pipeline" + f"Estimator {estimator} is not supported by Pipeline. {constants.FEEDBACK_LINK}" ) self._transform = transform self._estimator = estimator + @classmethod + def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> Pipeline: + col_transformer = _extract_as_column_transformer(bq_model) + transform = _merge_column_transformer(bq_model, col_transformer) + + estimator = loader._model_from_bq(session, bq_model) + return cls([("transform", transform), ("estimator", estimator)]) + def fit( self, - X: bigframes.dataframe.DataFrame, - y: Optional[bigframes.dataframe.DataFrame] = None, - ): + X: Union[bpd.DataFrame, bpd.Series], + y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + ) -> Pipeline: + (X,) = utils.convert_to_dataframe(X) + compiled_transforms = self._transform._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] if y is not None: # If labels columns are present, they should pass through un-transformed + (y,) = utils.convert_to_dataframe(y) transform_sqls.extend(y.columns.tolist()) self._estimator.fit(X=X, y=y, transforms=transform_sqls) + return self - def predict( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: return self._estimator.predict(X) def score( self, - X: bigframes.dataframe.DataFrame, - y: bigframes.dataframe.DataFrame, - ): - if isinstance(self._estimator, (cluster.KMeans, decomposition.PCA)): - raise NotImplementedError("KMeans/PCA haven't supported score method.") - - # TODO(b/289280565): remove type ignore after updating KMeans and PCA - return self._estimator.score(X=X, y=y) # type: ignore - - def to_gbq(self, model_name: str, replace: bool = False): - self._estimator.to_gbq(model_name, replace) + X: Union[bpd.DataFrame, bpd.Series], + y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + ) -> bpd.DataFrame: + (X,) = utils.convert_to_dataframe(X) + if y is not None: + (y,) = 
utils.convert_to_dataframe(y) + + return self._estimator.score(X=X, y=y) + + def to_gbq(self, model_name: str, replace: bool = False) -> Pipeline: + """Save the pipeline to BigQuery. + + Args: + model_name (str): + the name of the model(pipeline). + replace (bool, default False): + whether to replace if the model(pipeline) already exists. Default to False. + + Returns: + Pipeline: saved model(pipeline).""" + if not self._estimator._bqml_model: + raise RuntimeError("A model must be fitted before it can be saved") + + new_model = self._estimator._bqml_model.copy(model_name, replace) + + return new_model.session.read_gbq_model(model_name) + + +def _extract_as_column_transformer( + bq_model: bigquery.Model, +) -> compose.ColumnTransformer: + """Extract transformers as ColumnTransformer obj from a BQ Model.""" + assert "transformColumns" in bq_model._properties + + transformers: List[ + Tuple[ + str, + Union[preprocessing.OneHotEncoder, preprocessing.StandardScaler], + Union[str, List[str]], + ] + ] = [] + for transform_col in bq_model._properties["transformColumns"]: + # pass the columns that are not transformed + if "transformSql" not in transform_col: + continue + + transform_sql: str = cast(dict, transform_col)["transformSql"] + if transform_sql.startswith("ML.STANDARD_SCALER"): + transformers.append( + ( + "standard_scaler", + *preprocessing.StandardScaler._parse_from_sql(transform_sql), + ) + ) + elif transform_sql.startswith("ML.ONE_HOT_ENCODER"): + transformers.append( + ( + "ont_hot_encoder", + *preprocessing.OneHotEncoder._parse_from_sql(transform_sql), + ) + ) + else: + raise NotImplementedError( + f"Unsupported transformer type. {constants.FEEDBACK_LINK}" + ) - # TODO: should instead load from GBQ, but loading pipelines is not implemented yet - return self + return compose.ColumnTransformer(transformers=transformers) + + +def _merge_column_transformer( + bq_model: bigquery.Model, column_transformer: compose.ColumnTransformer +) -> Union[ + compose.ColumnTransformer, + preprocessing.StandardScaler, + preprocessing.OneHotEncoder, +]: + """Try to merge the column transformer to a simple transformer.""" + transformers = column_transformer.transformers_ + + assert len(transformers) > 0 + _, transformer_0, column_0 = transformers[0] + columns = [column_0] + for _, transformer, column in transformers[1:]: + # all transformers are the same + if transformer != transformer_0: + return column_transformer + columns.append(column) + # all feature columns are transformed + if sorted( + [cast(str, feature_column.name) for feature_column in bq_model.feature_columns] + ) == sorted(columns): + return transformer_0 + + return column_transformer diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 59d2cc2ae9..500a9fcb24 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -13,29 +13,35 @@ # limitations under the License. """Transformers that prepare data for other estimators. 
This module is styled after -Scikit-Learn's preprocessing module: https://scikit-learn.org/stable/modules/preprocessing.html""" +Scikit-Learn's preprocessing module: https://scikit-learn.org/stable/modules/preprocessing.html.""" +from __future__ import annotations import typing -from typing import List, Optional, Tuple +from typing import Any, cast, List, Literal, Optional, Tuple, Union -import bigframes -import bigframes.ml -import bigframes.ml.sql +from bigframes.ml import base, core +from bigframes.ml import sql as ml_sql +from bigframes.ml import utils +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data import third_party.bigframes_vendored.sklearn.preprocessing._encoder class StandardScaler( third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, - bigframes.ml.base.BaseEstimator, + base.BaseEstimator, ): __doc__ = ( third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler.__doc__ ) def __init__(self): - self._bqml_model: Optional[bigframes.ml.core.BqmlModel] = None + self._bqml_model: Optional[core.BqmlModel] = None + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return type(other) is StandardScaler and self._bqml_model == other._bqml_model def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in @@ -47,20 +53,35 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: Returns: a list of tuples of (sql_expression, output_name)""" return [ ( - bigframes.ml.sql.ml_standard_scaler(column, f"scaled_{column}"), + ml_sql.ml_standard_scaler(column, f"scaled_{column}"), f"scaled_{column}", ) for column in columns ] + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[StandardScaler, str]: + """Parse SQL to tuple(StandardScaler, column_label). 
+ + Args: + sql: SQL string of format "ML.STANDARD_SCALER({col_label}) OVER()" + + Returns: + tuple(StandardScaler, column_label)""" + col_label = sql[sql.find("(") + 1 : sql.find(")")] + return cls(), col_label + def fit( self, - X: bigframes.dataframe.DataFrame, - ): + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> StandardScaler: + (X,) = utils.convert_to_dataframe(X) + compiled_transforms = self._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - self._bqml_model = bigframes.ml.core.create_bqml_model( + self._bqml_model = core.create_bqml_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, @@ -68,57 +89,124 @@ def fit( # The schema of TRANSFORM output is not available in the model API, so save it during fitting self._output_names = [name for _, name in compiled_transforms] + return self - def transform( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.transform(X) return typing.cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[self._output_names], ) class OneHotEncoder( third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, - bigframes.ml.base.BaseEstimator, + base.BaseEstimator, ): + # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax + TOP_K_DEFAULT = 1000000 + FREQUENCY_THRESHOLD_DEFAULT = 0 + __doc__ = ( third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder.__doc__ ) # All estimators must implement __init__ to document their parameters, even # if they don't have any - def __init__(self): - pass + def __init__( + self, + drop: Optional[Literal["most_frequent"]] = None, + min_frequency: Optional[int] = None, + max_categories: Optional[int] = None, + ): + if max_categories is not None and max_categories < 2: + raise ValueError( + f"max_categories has to be larger than or equal to 2, input is {max_categories}." + ) + self.drop = drop + self.min_frequency = min_frequency + self.max_categories = max_categories + self._bqml_model: Optional[core.BqmlModel] = None + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return ( + type(other) is OneHotEncoder + and self._bqml_model == other._bqml_model + and self.drop == other.drop + and self.min_frequency == other.min_frequency + and self.max_categories == other.max_categories + ) def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform Returns: a list of tuples of (sql_expression, output_name)""" + + drop = self.drop if self.drop is not None else "none" + # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. 
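        # A minimal illustration of the mapping computed below (the example values are
        # assumptions, not defaults): OneHotEncoder(max_categories=11, min_frequency=5)
        # applied to a column "col" compiles to
        #   ML.ONE_HOT_ENCODER(col, 'none', 10, 5) OVER() AS onehotencoded_col
        # i.e. top_k = max_categories - 1 = 10 and frequency_threshold = min_frequency = 5.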
+ top_k = ( + (self.max_categories - 1) + if self.max_categories is not None + else OneHotEncoder.TOP_K_DEFAULT + ) + frequency_threshold = ( + self.min_frequency + if self.min_frequency is not None + else OneHotEncoder.FREQUENCY_THRESHOLD_DEFAULT + ) return [ ( - bigframes.ml.sql.ml_one_hot_encoder(column, f"onehotencoded_{column}"), + ml_sql.ml_one_hot_encoder( + column, drop, top_k, frequency_threshold, f"onehotencoded_{column}" + ), f"onehotencoded_{column}", ) for column in columns ] + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[OneHotEncoder, str]: + """Parse SQL to tuple(OneHotEncoder, column_label). + + Args: + sql: SQL string of format "ML.ONE_HOT_ENCODER({col_label}, '{drop}', {top_k}, {frequency_threshold}) OVER() " + + Returns: + tuple(OneHotEncoder, column_label)""" + s = sql[sql.find("(") + 1 : sql.find(")")] + col_label, drop_str, top_k, frequency_threshold = s.split(", ") + drop = ( + cast(Literal["most_frequent"], "most_frequent") + if drop_str.lower() == "'most_frequent'" + else None + ) + max_categories = int(top_k) + 1 + min_frequency = int(frequency_threshold) + + return cls(drop, min_frequency, max_categories), col_label + def fit( self, - X: bigframes.dataframe.DataFrame, - ): + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> OneHotEncoder: + (X,) = utils.convert_to_dataframe(X) + compiled_transforms = self._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - self._bqml_model = bigframes.ml.core.create_bqml_model( + self._bqml_model = core.create_bqml_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, @@ -126,15 +214,16 @@ def fit( # The schema of TRANSFORM output is not available in the model API, so save it during fitting self._output_names = [name for _, name in compiled_transforms] + return self - def transform( - self, X: bigframes.dataframe.DataFrame - ) -> bigframes.dataframe.DataFrame: + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") + (X,) = utils.convert_to_dataframe(X) + df = self._bqml_model.transform(X) return typing.cast( - bigframes.dataframe.DataFrame, + bpd.DataFrame, df[self._output_names], ) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 7017b2a4cd..b8d9e2c673 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -18,6 +18,8 @@ from typing import Iterable, Optional, Union +import bigframes.constants as constants + def _encode_value(v: Union[str, int, float, Iterable[str]]) -> str: """Encode a parameter value for SQL""" @@ -29,7 +31,7 @@ def _encode_value(v: Union[str, int, float, Iterable[str]]) -> str: inner = ", ".join([_encode_value(x) for x in v]) return f"[{inner}]" else: - raise ValueError("Unexpected value type") + raise ValueError(f"Unexpected value type. {constants.FEEDBACK_LINK}") def _build_param_Iterable(**kwargs: Union[str, int, float, Iterable[str]]) -> str: @@ -77,9 +79,12 @@ def ml_standard_scaler(numeric_expr_sql: str, name: str) -> str: return f"""ML.STANDARD_SCALER({numeric_expr_sql}) OVER() AS {name}""" -def ml_one_hot_encoder(numeric_expr_sql: str, name: str) -> str: - """Encode ML.ONE_HOT_ENCODER for BQML""" - return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}) OVER() AS {name}""" +def ml_one_hot_encoder( + numeric_expr_sql: str, drop: str, top_k: int, frequency_threshold: int, name: str +) -> str: + """Encode ML.ONE_HOT_ENCODER for BQML. 
+ https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params.""" + return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {name}""" def create_model( @@ -88,9 +93,8 @@ def create_model( transform_sql: Optional[str] = None, options_sql: Optional[str] = None, ) -> str: - """Encode the CREATE MODEL statement for BQML""" - # TODO(garrettwu): This should be CREATE TEMP MODEL after b/145824779 is fixed - parts = [f"CREATE MODEL `{model_name}`"] + """Encode the CREATE TEMP MODEL statement for BQML""" + parts = [f"CREATE TEMP MODEL `{model_name}`"] if transform_sql: parts.append(transform_sql) if options_sql: @@ -104,9 +108,8 @@ def create_remote_model( connection_name: str, options_sql: Optional[str] = None, ) -> str: - """Encode the CREATE MODEL statement for BQML""" - # TODO(garrettwu): This should be CREATE TEMP MODEL after b/145824779 is fixed - parts = [f"CREATE MODEL `{model_name}`"] + """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + parts = [f"CREATE TEMP MODEL `{model_name}`"] parts.append(connection(connection_name)) if options_sql: parts.append(options_sql) @@ -117,9 +120,8 @@ def create_imported_model( model_name: str, options_sql: Optional[str] = None, ) -> str: - """Encode the CREATE MODEL statement for BQML""" - # TODO(garrettwu): This should be CREATE TEMP MODEL after b/145824779 is fixed - parts = [f"CREATE MODEL `{model_name}`"] + """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + parts = [f"CREATE TEMP MODEL `{model_name}`"] if options_sql: parts.append(options_sql) return "\n".join(parts) @@ -162,9 +164,11 @@ def ml_generate_text(model_name: str, source_sql: str, struct_options: str) -> s ({source_sql}), {struct_options})""" -def ml_embed_text(model_name: str, source_sql: str, struct_options: str) -> str: - """Encode ML.EMBED_TEXT for BQML""" - return f"""SELECT * FROM ML.EMBED_TEXT(MODEL `{model_name}`, +def ml_generate_text_embedding( + model_name: str, source_sql: str, struct_options: str +) -> str: + """Encode ML.GENERATE_TEXT_EMBEDDING for BQML""" + return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{model_name}`, ({source_sql}), {struct_options})""" diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py new file mode 100644 index 0000000000..299282d333 --- /dev/null +++ b/bigframes/ml/utils.py @@ -0,0 +1,58 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
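# A minimal usage sketch of the helpers defined below (the variable names are
# illustrative only):
#
#     (X_df,) = convert_to_dataframe(a_series)          # Series -> one-column DataFrame
#     y_true, y_pred = convert_to_series(df_a, df_b)    # one-column DataFrames -> Series
#
# Both helpers raise ValueError (with the feedback link) for unsupported input types,
# and convert_to_series rejects DataFrames with more than one column.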
+ +import typing +from typing import Iterable, Union + +import bigframes.constants as constants +from bigframes.core import blocks +import bigframes.pandas as bpd + +# Internal type alias +ArrayType = Union[bpd.DataFrame, bpd.Series] + + +def convert_to_dataframe(*input: ArrayType) -> Iterable[bpd.DataFrame]: + return (_convert_to_dataframe(frame) for frame in input) + + +def _convert_to_dataframe(frame: ArrayType) -> bpd.DataFrame: + if isinstance(frame, bpd.DataFrame): + return frame + if isinstance(frame, bpd.Series): + return frame.to_frame() + raise ValueError( + f"Unsupported type {type(frame)} to convert to DataFrame. {constants.FEEDBACK_LINK}" + ) + + +def convert_to_series(*input: ArrayType) -> Iterable[bpd.Series]: + return (_convert_to_series(frame) for frame in input) + + +def _convert_to_series(frame: ArrayType) -> bpd.Series: + if isinstance(frame, bpd.DataFrame): + if len(frame.columns) != 1: + raise ValueError( + "To convert into Series, DataFrames can only contain one column. " + f"Try input with only one column. {constants.FEEDBACK_LINK}" + ) + + label = typing.cast(blocks.Label, frame.columns.tolist()[0]) + return typing.cast(bpd.Series, frame[label]) + if isinstance(frame, bpd.Series): + return frame + raise ValueError( + f"Unsupported type {type(frame)} to convert to Series. {constants.FEEDBACK_LINK}" + ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 781cc49339..13063af75f 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -23,11 +23,14 @@ import ibis.expr.operations.generic import ibis.expr.types as ibis_types import numpy as np +import pandas as pd +import bigframes.constants as constants import bigframes.dtypes import bigframes.dtypes as dtypes _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) +_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) _INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) BinaryOp = typing.Callable[[ibis_types.Value, ibis_types.Value], ibis_types.Value] @@ -39,7 +42,9 @@ ### Unary Ops class UnaryOp: def _as_ibis(self, x): - raise NotImplementedError("Base class UnaryOp has no implementation.") + raise NotImplementedError( + f"Base class UnaryOp has no implementation. 
{constants.FEEDBACK_LINK}" + ) @property def is_windowed(self): @@ -113,6 +118,79 @@ def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.StringValue, x).capitalize() +class ContainsStringOp(UnaryOp): + def __init__(self, pat: str, case: bool = True): + self._pat = pat + + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).contains(self._pat) + + +class ContainsRegexOp(UnaryOp): + def __init__(self, pat: str): + self._pat = pat + + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search(self._pat) + + +class ReplaceStringOp(UnaryOp): + def __init__(self, pat: str, repl: str): + self._pat = pat + self._repl = repl + + def _as_ibis(self, x: ibis_types.Value): + pat_str_value = typing.cast( + ibis_types.StringValue, ibis_types.literal(self._pat) + ) + repl_str_value = typing.cast( + ibis_types.StringValue, ibis_types.literal(self._pat) + ) + + return typing.cast(ibis_types.StringValue, x).replace( + pat_str_value, repl_str_value + ) + + +class ReplaceRegexOp(UnaryOp): + def __init__(self, pat: str, repl: str): + self._pat = pat + self._repl = repl + + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_replace(self._pat, self._repl) + + +class StartsWithOp(UnaryOp): + def __init__(self, pat: typing.Sequence[str]): + self._pat = pat + + def _as_ibis(self, x: ibis_types.Value): + any_match = None + for pat in self._pat: + pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + +class EndsWithOp(UnaryOp): + def __init__(self, pat: typing.Sequence[str]): + self._pat = pat + + def _as_ibis(self, x: ibis_types.Value): + any_match = None + for pat in self._pat: + pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + class HashOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.IntegerValue, x).hash() @@ -192,6 +270,15 @@ def _as_ibis(self, x: ibis_types.Value): ) +class ExtractOp(UnaryOp): + def __init__(self, pat: str, n: int = 1): + self._pat = pat + self._n = n + + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_extract(self._pat, self._n) + + class SliceOp(UnaryOp): def __init__(self, start, stop): self._start = start @@ -201,6 +288,20 @@ def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.StringValue, x)[self._start : self._stop] +class IsInOp(UnaryOp): + def __init__(self, values, match_nulls: bool = True): + self._values = values + self._match_nulls = match_nulls + + def _as_ibis(self, x: ibis_types.Value): + if self._match_nulls and any(is_null(value) for value in self._values): + return x.isnull() | x.isin( + [val for val in self._values if not is_null(val)] + ) + else: + return x.isin(self._values) + + class BinopPartialRight(UnaryOp): def __init__(self, binop: BinaryOp, right_scalar: typing.Any): self._binop = binop @@ -231,7 +332,7 @@ class RemoteFunctionOp(UnaryOp): def __init__(self, func: typing.Callable, apply_on_null=True): if not hasattr(func, "bigframes_remote_function"): raise TypeError( - "only a bigframes remote function is supported as a callable" + f"only a bigframes remote 
function is supported as a callable. {constants.FEEDBACK_LINK}" ) self._func = func @@ -330,13 +431,6 @@ def or_op( ) -def isin_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x.isin(y) - - @short_circuit_nulls() def add_op( x: ibis_types.Value, @@ -431,27 +525,43 @@ def floordiv_op( ) +def _is_float(x: ibis_types.Value): + return isinstance(x, (ibis_types.FloatingColumn, ibis_types.FloatingScalar)) + + @short_circuit_nulls() def mod_op( x: ibis_types.Value, y: ibis_types.Value, ): - # TODO(tbergeron): fully support floats, including when mixed with integer - # Pandas has inconsitency about whether N mod 0. Most conventions have this be NAN. - # For some dtypes, the result is 0 instead. This implementation results in NA always. - x_numeric = typing.cast(ibis_types.NumericValue, x) - y_numeric = typing.cast(ibis_types.NumericValue, y) + is_result_float = _is_float(x) | _is_float(y) + x_numeric = typing.cast( + ibis_types.NumericValue, + x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)) + if is_result_float + else x, + ) + y_numeric = typing.cast( + ibis_types.NumericValue, + y.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)) + if is_result_float + else y, + ) # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. op = y.op() if isinstance(op, ibis.expr.operations.generic.Literal) and op.value == 0: return ibis_types.null().cast(x.type()) bq_mod = x_numeric % y_numeric # Bigquery will maintain x sign here + if is_result_float: + bq_mod = typing.cast(ibis_types.NumericValue, bq_mod.cast(ibis_dtypes.float64)) + # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) return ( ibis.case() .when( - y_numeric == _ZERO, _ZERO * x_numeric + y_numeric == _ZERO, + _NAN * x_numeric if is_result_float else _ZERO * x_numeric, ) # Dummy op to propogate nulls and type from x arg .when( (y_numeric < _ZERO) & (bq_mod > _ZERO), (y_numeric + bq_mod) @@ -544,3 +654,8 @@ def clip_op( .else_(original) .end() ) + + +def is_null(value) -> bool: + # float NaN/inf should be treated as distinct from 'true' null values + return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 028ed4b606..1687f705a1 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -20,6 +20,9 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types +import bigframes.constants as constants +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops + class WindowOp: def _as_ibis(self, value: ibis_types.Column, window=None): @@ -37,6 +40,8 @@ def handles_ties(self): class AggregateOp(WindowOp): + name = "abstract_aggregate" + def _as_ibis(self, value: ibis_types.Column, window=None): raise NotImplementedError("Base class AggregateOp has no implementaiton.") @@ -51,13 +56,15 @@ def constrained_op(op, column: ibis_types.Column, window=None): return operation(op, column, window) else: raise ValueError( - f"Numeric operation cannot be applied to type {column.type()}" + f"Numeric operation cannot be applied to type {column.type()}. 
{constants.FEEDBACK_LINK}" ) return constrained_op class SumOp(AggregateOp): + name = "sum" + @numeric_op def _as_ibis( self, column: ibis_types.NumericColumn, window=None @@ -69,7 +76,50 @@ def _as_ibis( ) +class MedianOp(AggregateOp): + name = "median" + + @numeric_op + def _as_ibis( + self, column: ibis_types.NumericColumn, window=None + ) -> ibis_types.NumericValue: + # PERCENTILE_CONT has very few allowed windows. For example, "window + # framing clause is not allowed for analytic function percentile_cont". + if window is not None: + raise NotImplementedError( + f"Median with windowing is not supported. {constants.FEEDBACK_LINK}" + ) + + # TODO(swast): Allow switching between exact and approximate median. + # For now, the best we can do is an approximate median when we're doing + # an aggregation, as PERCENTILE_CONT is only an analytic function. + return typing.cast(ibis_types.NumericValue, column.approx_median()) + + +class ApproxQuartilesOp(AggregateOp): + def __init__(self, quartile: int): + self.name = f"{quartile*25}%" + self._quartile = quartile + + @numeric_op + def _as_ibis( + self, column: ibis_types.NumericColumn, window=None + ) -> ibis_types.NumericValue: + # PERCENTILE_CONT has very few allowed windows. For example, "window + # framing clause is not allowed for analytic function percentile_cont". + if window is not None: + raise NotImplementedError( + f"Approx Quartiles with windowing is not supported. {constants.FEEDBACK_LINK}" + ) + value = vendored_ibis_ops.ApproximateMultiQuantile( + column, num_bins=4 # type: ignore + ).to_expr()[self._quartile] + return typing.cast(ibis_types.NumericValue, value) + + class MeanOp(AggregateOp): + name = "mean" + @numeric_op def _as_ibis( self, column: ibis_types.NumericColumn, window=None @@ -78,6 +128,8 @@ def _as_ibis( class ProductOp(AggregateOp): + name = "product" + @numeric_op def _as_ibis( self, column: ibis_types.NumericColumn, window=None @@ -117,16 +169,22 @@ def _as_ibis( class MaxOp(AggregateOp): + name = "max" + def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: return _apply_window_if_present(column.max(), window) class MinOp(AggregateOp): + name = "min" + def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: return _apply_window_if_present(column.min(), window) class StdOp(AggregateOp): + name = "std" + @numeric_op def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: return _apply_window_if_present( @@ -135,6 +193,8 @@ def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: class VarOp(AggregateOp): + name = "var" + @numeric_op def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: return _apply_window_if_present( @@ -143,6 +203,8 @@ def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: class CountOp(AggregateOp): + name = "count" + def _as_ibis( self, column: ibis_types.Column, window=None ) -> ibis_types.IntegerValue: @@ -153,7 +215,32 @@ def skips_nulls(self): return False +class CutOp(WindowOp): + def __init__(self, bins: int): + self._bins = bins + + def _as_ibis(self, x: ibis_types.Column, window=None): + col_min = _apply_window_if_present(x.min(), window) + col_max = _apply_window_if_present(x.max(), window) + bin_width = (col_max - col_min) / self._bins + out = ibis.case() + for bin in range(self._bins - 1): + out = out.when(x <= (col_min + (bin + 1) * bin_width), bin) + out = out.when(x.notnull(), self._bins - 1) + return out.end() + + @property + def skips_nulls(self): + 
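        # Worked example for the CASE expression built in _as_ibis above
        # (illustrative numbers): with bins=4 and column values spanning
        # col_min=0 .. col_max=8, bin_width = 2, so x <= 2 -> bin 0,
        # x <= 4 -> bin 1, x <= 6 -> bin 2, any remaining non-NULL value -> bin 3,
        # and NULL inputs fall through the CASE and stay NULL.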
return False + + @property + def handles_ties(self): + return True + + class NuniqueOp(AggregateOp): + name = "nunique" + def _as_ibis( self, column: ibis_types.Column, window=None ) -> ibis_types.IntegerValue: @@ -165,6 +252,8 @@ def skips_nulls(self): class RankOp(WindowOp): + name = "rank" + def _as_ibis( self, column: ibis_types.Column, window=None ) -> ibis_types.IntegerValue: @@ -230,6 +319,8 @@ def _as_ibis( class AnyOp(AggregateOp): + name = "any" + def _as_ibis( self, column: ibis_types.Column, window=None ) -> ibis_types.BooleanValue: @@ -274,6 +365,7 @@ def _map_to_literal( sum_op = SumOp() mean_op = MeanOp() +median_op = MedianOp() product_op = ProductOp() max_op = MaxOp() min_op = MinOp() @@ -286,3 +378,26 @@ def _map_to_literal( all_op = AllOp() any_op = AnyOp() first_op = FirstOp() + + +# TODO: Alternative names and lookup from numpy function objects +AGGREGATIONS_LOOKUP: dict[str, AggregateOp] = { + op.name: op + for op in [ + sum_op, + mean_op, + median_op, + product_op, + max_op, + min_op, + std_op, + var_op, + count_op, + all_op, + any_op, + nunique_op, + ApproxQuartilesOp(1), + ApproxQuartilesOp(2), + ApproxQuartilesOp(3), + ] +} diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 29017d2bbe..caef33919b 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -19,6 +19,7 @@ import ibis.expr.types as ibis_types import pandas as pd +import bigframes.constants as constants import bigframes.core.blocks as blocks import bigframes.core.scalar as scalars import bigframes.dtypes @@ -29,7 +30,9 @@ # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. # TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_SERIES_SIZE = 5000 +# TODO(swast): Address issues with string escaping and empty tables before +# re-enabling inline data (ibis.memtable) feature. +MAX_INLINE_SERIES_SIZE = -1 class SeriesMethods: @@ -47,7 +50,9 @@ def __init__( ): block = None if copy is not None and not copy: - raise ValueError("Series constructor only supports copy=True") + raise ValueError( + f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}" + ) if isinstance(data, blocks.Block): assert len(data.value_columns) == 1 assert len(data.column_labels) == 1 @@ -60,12 +65,12 @@ def __init__( if name: if not isinstance(name, str): raise NotImplementedError( - "BigQuery DataFrames only supports string series names." + f"BigQuery DataFrames only supports string series names. {constants.FEEDBACK_LINK}" ) block = block.with_column_labels([name]) if index: raise NotImplementedError( - "Series 'index' constructor parameter not supported when passing BigQuery-backed objects" + f"Series 'index' constructor parameter not supported when passing BigQuery-backed objects. {constants.FEEDBACK_LINK}" ) if dtype: block = block.multi_apply_unary_op( @@ -138,7 +143,7 @@ def _apply_binary_op( if isinstance(other, pd.Series): # TODO: Convert to BigQuery DataFrames series raise NotImplementedError( - "Pandas series not supported supported as operand." + f"Pandas series not supported supported as operand. 
{constants.FEEDBACK_LINK}" ) if isinstance(other, series.Series): (left, right, block) = self._align(other, how=alignment) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 57029ede41..a16ecb0d32 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -14,13 +14,23 @@ from __future__ import annotations -from typing import Literal, Optional, Union +import re +from typing import cast, Literal, Optional, Union +import bigframes.constants as constants +import bigframes.dataframe as df import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series import third_party.bigframes_vendored.pandas.core.strings.accessor as vendorstr +# Maps from python to re2 +REGEXP_FLAGS = { + re.IGNORECASE: "i", + re.MULTILINE: "m", + re.DOTALL: "s", +} + class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMethods): __doc__ = vendorstr.StringMethods.__doc__ @@ -72,6 +82,84 @@ def repeat(self, repeats: int) -> series.Series: def capitalize(self) -> series.Series: return self._apply_unary_op(ops.capitalize_op) + def contains( + self, pat, case: bool = True, flags: int = 0, *, regex: bool = True + ) -> series.Series: + if not case: + return self.contains(pat, flags=flags | re.IGNORECASE, regex=True) + if regex: + re2flags = _parse_flags(flags) + if re2flags: + pat = re2flags + pat + return self._apply_unary_op(ops.ContainsRegexOp(pat)) + else: + return self._apply_unary_op(ops.ContainsStringOp(pat)) + + def extract(self, pat: str, flags: int = 0): + re2flags = _parse_flags(flags) + if re2flags: + pat = re2flags + pat + compiled = re.compile(pat) + if compiled.groups == 0: + raise ValueError("No capture groups in 'pat'") + + results: list[str] = [] + block = self._block + for i in range(compiled.groups): + labels = [ + label + for label, groupn in compiled.groupindex.items() + if i + 1 == groupn + ] + label = labels[0] if labels else str(i) + block, id = block.apply_unary_op( + self._value_column, ops.ExtractOp(pat, i + 1), result_label=label + ) + results.append(id) + block = block.select_columns(results) + return df.DataFrame(block) + + def replace( + self, + pat: Union[str, re.Pattern], + repl: str, + *, + case: Optional[bool] = None, + flags: int = 0, + regex: bool = False, + ) -> series.Series: + is_compiled = isinstance(pat, re.Pattern) + patstr = cast(str, pat.pattern if is_compiled else pat) # type: ignore + if case is False: + return self.replace(pat, repl, flags=flags | re.IGNORECASE, regex=True) + if regex: + re2flags = _parse_flags(flags) + if re2flags: + patstr = re2flags + patstr + return self._apply_unary_op(ops.ReplaceRegexOp(patstr, repl)) + else: + if is_compiled: + raise ValueError( + "Must set 'regex'=True if using compiled regex pattern." 
+ ) + return self._apply_unary_op(ops.ReplaceStringOp(patstr, repl)) + + def startswith( + self, + pat: Union[str, tuple[str, ...]], + ) -> series.Series: + if not isinstance(pat, tuple): + pat = (pat,) + return self._apply_unary_op(ops.StartsWithOp(pat)) + + def endswith( + self, + pat: Union[str, tuple[str, ...]], + ) -> series.Series: + if not isinstance(pat, tuple): + pat = (pat,) + return self._apply_unary_op(ops.EndsWithOp(pat)) + def cat( self, others: Union[str, series.Series], @@ -79,3 +167,22 @@ def cat( join: Literal["outer", "left"] = "left", ) -> series.Series: return self._apply_binary_op(others, ops.concat_op, alignment=join) + + +def _parse_flags(flags: int) -> Optional[str]: + re2flags = [] + for reflag, re2flag in REGEXP_FLAGS.items(): + if flags & flags: + re2flags.append(re2flag) + flags = flags ^ reflag + + # Remaining flags couldn't be mapped to re2 engine + if flags: + raise NotImplementedError( + f"Could not handle RegexFlag: {flags}. {constants.FEEDBACK_LINK}" + ) + + if re2flags: + return "(?" + "".join(re2flags) + ")" + else: + return None diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index e7c1435151..cc8b4e5cc4 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -46,40 +46,94 @@ import bigframes.series import bigframes.session import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat +import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile + +# Support pandas dtype attribute +NA = pandas.NA +BooleanDtype = pandas.BooleanDtype +Float64Dtype = pandas.Float64Dtype +Int64Dtype = pandas.Int64Dtype +StringDtype = pandas.StringDtype +ArrowDtype = pandas.ArrowDtype # Include method definition so that the method appears in our docs for # bigframes.pandas general functions. @typing.overload def concat( - objs: Iterable[bigframes.dataframe.DataFrame], *, join, ignore_index + objs: Iterable[bigframes.series.Series], + *, + axis: typing.Literal["index", 0] = ..., + join=..., + ignore_index=..., +) -> bigframes.series.Series: + ... + + +@typing.overload +def concat( + objs: Iterable[bigframes.dataframe.DataFrame], + *, + axis: typing.Literal["index", 0] = ..., + join=..., + ignore_index=..., ) -> bigframes.dataframe.DataFrame: ... @typing.overload def concat( - objs: Iterable[bigframes.series.Series], *, join, ignore_index -) -> bigframes.series.Series: + objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]], + *, + axis: typing.Literal["columns", 1], + join=..., + ignore_index=..., +) -> bigframes.dataframe.DataFrame: ... +@typing.overload def concat( - objs: Union[ - Iterable[bigframes.dataframe.DataFrame], Iterable[bigframes.series.Series] - ], + objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]], *, + axis=..., + join=..., + ignore_index=..., +) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]: + ... 
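# A short usage sketch (illustrative only; frame names are hypothetical) for
# the axis parameter accepted by concat below, mirroring pandas semantics:
#
#     import bigframes.pandas as bpd
#     stacked = bpd.concat([df_a, df_b])                # default axis=0: append rows
#     side_by_side = bpd.concat([df_a, df_b], axis=1)   # align on the index, add columns
#     side_by_side = bpd.concat([df_a, df_b], axis="columns")  # equivalent spelling per the overloads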
+ + +def concat( + objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]], + *, + axis: typing.Union[str, int] = 0, join: Literal["inner", "outer"] = "outer", ignore_index: bool = False, ) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]: return bigframes.core.reshape.concat( - objs=objs, join=join, ignore_index=ignore_index + objs=objs, axis=axis, join=join, ignore_index=ignore_index ) concat.__doc__ = vendored_pandas_concat.concat.__doc__ +def cut( + x: bigframes.series.Series, + bins: int, + *, + labels: Optional[bool] = None, +) -> bigframes.series.Series: + return bigframes.core.reshape.cut( + x, + bins, + labels=labels, + ) + + +cut.__doc__ = vendored_pandas_tile.cut.__doc__ + + options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" @@ -88,9 +142,12 @@ def concat( def reset_session() -> None: - """Start a fresh session next time a function requires a session. + """Start a fresh session the next time a function requires a session. Closes the current session if it was already started. + + Returns: + None """ global _global_session @@ -124,15 +181,22 @@ def _with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T: def _set_default_session_location_if_possible(query): - # If the default session has not started yet and this is the first API user - # is calling, then set the default location as per the query. + # Set the location as per the query if this is the first query the user is + # running and: + # (1) Default session has not started yet, and + # (2) Location is not set yet, and + # (3) Use of regional endpoints is not set. # If query is a table name, then it would be the location of the table. # If query is a SQL with a table, then it would be table's location. # If query is a SQL with no table, then it would be the BQ default location. 
- if options.bigquery._session_started or options.bigquery.use_regional_endpoints: + if ( + options.bigquery._session_started + or options.bigquery.location + or options.bigquery.use_regional_endpoints + ): return - bqclient, _, _ = bigframes.session._create_bq_clients( + bqclient, _, _, _ = bigframes.session._create_cloud_clients( project=options.bigquery.project, location=options.bigquery.location, use_regional_endpoints=options.bigquery.use_regional_endpoints, @@ -320,6 +384,16 @@ def remote_function( remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function) +def read_gbq_function(function_name: str): + return _with_default_session( + bigframes.session.Session.read_gbq_function, + function_name=function_name, + ) + + +read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) + + # Other aliases DataFrame = bigframes.dataframe.DataFrame Index = bigframes.core.indexes.Index @@ -332,6 +406,7 @@ def remote_function( "options", "read_csv", "read_gbq", + "read_gbq_function", "read_gbq_model", "read_pandas", "remote_function", diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 3fe4e38d78..5924941cc5 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -27,20 +27,28 @@ import tempfile import textwrap import time -import typing +from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from bigframes.session import Session import cloudpickle import google.api_core.exceptions from google.cloud import bigquery, bigquery_connection_v1, functions_v2 +from google.cloud.bigquery.routine import Routine +from google.cloud.bigquery.standard_sql import StandardSqlTypeNames from ibis.backends.bigquery.compiler import compiles from ibis.backends.bigquery.datatypes import BigQueryType +from ibis.expr.datatypes.core import boolean +from ibis.expr.datatypes.core import DataType as IbisDataType from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type +from ibis.expr.datatypes.core import float64, int64 +from ibis.expr.datatypes.core import string as ibis_string import ibis.expr.operations as ops import ibis.expr.rules as rlz +import bigframes.constants as constants + # TODO(shobs): Change the min log level to INFO after the development stabilizes # before June 2023 logging.basicConfig( @@ -52,10 +60,14 @@ # https://docs.python.org/3/library/pickle.html#data-stream-format _pickle_protocol_version = 4 -# Input and output python types supported by BigQuery DataFrames remote functions. +# Input and output types supported by BigQuery DataFrames remote functions. # TODO(shobs): Extend the support to all types supported by BQ remote functions # https://cloud.google.com/bigquery/docs/remote-functions#limitations -_supported_io_types = set((bool, float, int, str)) +_supported_io_ibis_types = {boolean, float64, int64, ibis_string} +TYPE_ERROR_MESSAGE_FORMAT = ( + f"Type {{}} not supported, supported types are {_supported_io_ibis_types}. 
" + f"{constants.FEEDBACK_LINK}" +) def get_remote_function_locations(bq_location): @@ -94,9 +106,20 @@ def _run_system_command(command): if exit_code: raise RuntimeError( f"Command: {command}\nOutput: {stdout.decode()}\nError: {stderr.decode()}" + f"{constants.FEEDBACK_LINK}" ) +def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: + return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" + + +class IbisSignature(NamedTuple): + parameter_names: List[str] + input_types: List[IbisDataType] + output_type: IbisDataType + + def get_cloud_function_name(def_, uniq_suffix=None): """Get the name of the cloud function.""" cf_name = _get_hash(def_) @@ -123,6 +146,7 @@ def __init__( self, gcp_project_id, cloud_function_region, + cloud_functions_client, bq_location, bq_dataset, bq_client, @@ -131,6 +155,7 @@ def __init__( ): self._gcp_project_id = gcp_project_id self._cloud_function_region = cloud_function_region + self._cloud_functions_client = cloud_functions_client self._bq_location = bq_location self._bq_dataset = bq_dataset self._bq_client = bq_client @@ -184,21 +209,18 @@ def create_bq_remote_function( f"{name} {BigQueryType.from_ibis(input_types[idx])}" ) create_function_ddl = f""" - CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) - RETURNS {bq_function_return_type} - REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` - OPTIONS ( - endpoint = "{endpoint}" - )""" + CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) + RETURNS {bq_function_return_type} + REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` + OPTIONS ( + endpoint = "{endpoint}" + )""" logger.info(f"Creating BQ remote function: {create_function_ddl}") + # TODO: Use session._start_query() so we get progress bar query_job = self._bq_client.query(create_function_ddl) # Make an API request. query_job.result() # Wait for the job to complete. logger.info(f"Created remote function {query_job.ddl_target_routine}") - def get_remote_function_fully_qualified_name(self, name): - "Get the fully qualilfied name for a BQ remote function." - return "{}.{}.{}".format(self._gcp_project_id, self._bq_dataset, name) - def get_cloud_function_fully_qualified_name(self, name): "Get the fully qualilfied name for a cloud function." 
return "projects/{}/locations/{}/functions/{}".format( @@ -207,10 +229,11 @@ def get_cloud_function_fully_qualified_name(self, name): def get_cloud_function_endpoint(self, name): """Get the http endpoint of a cloud function if it exists.""" - client = functions_v2.FunctionServiceClient() fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) try: - response = client.get_function(name=fully_qualified_name) + response = self._cloud_functions_client.get_function( + name=fully_qualified_name + ) return response.service_config.uri except google.api_core.exceptions.NotFound: pass @@ -233,6 +256,34 @@ def create_bq_connection(self): def check_bq_connection_exists(self): """Check if the BigQuery Connection exists.""" client = self._bq_connection_client + if self._bq_connection_id.count(".") == 1: + bq_location, bq_connection_id = self._bq_connection_id.split(".") + if bq_location != self._bq_location: + logger.info( + f"Reset location {self._bq_location} to match the" + + f"location in connection name: {bq_location}" + ) + self._bq_location = bq_location + self._bq_connection_id = bq_connection_id + elif self._bq_connection_id.count(".") == 2: + ( + gcp_project_id, + bq_location, + bq_connection_id, + ) = self._bq_connection_id.split(".") + if gcp_project_id != self._gcp_project_id: + raise ValueError( + "The project_id does not match BigQuery connection gcp_project_id: " + f"{self._gcp_project_id}." + ) + if bq_location != self._bq_location: + logger.info( + f"Reset location {self._bq_location} to match the" + + f"location in connection name: {bq_location}" + ) + self._gcp_project_id = gcp_project_id + self._bq_location = bq_location + self._bq_connection_id = bq_connection_id request = bigquery_connection_v1.GetConnectionRequest( name=client.connection_path( self._gcp_project_id, self._bq_location, self._bq_connection_id @@ -299,7 +350,6 @@ def generate_cloud_function_main_code(self, def_, dir): def {handler_func_name}(request): request_json = request.get_json(silent=True) - print("[debug] received json request: " + str(request_json)) calls = request_json["calls"] replies = [] for call in calls: @@ -402,7 +452,9 @@ def create_cloud_function(self, def_, cf_name): # Fetch the endpoint of the just created function endpoint = self.get_cloud_function_endpoint(cf_name) if not endpoint: - raise ValueError("Couldn't fetch the http endpoint") + raise ValueError( + f"Couldn't fetch the http endpoint. {constants.FEEDBACK_LINK}" + ) logger.info( f"Successfully created cloud function {cf_name} with uri ({endpoint})" @@ -466,7 +518,8 @@ def check_cloud_function_tools_and_permissions(self): # cloud function and BigQuery remote function respectively if not shutil.which("gcloud"): raise ValueError( - "gcloud tool not installed, install it from https://cloud.google.com/sdk/docs/install" + "gcloud tool not installed, install it from https://cloud.google.com/sdk/docs/install. 
" + f"{constants.FEEDBACK_LINK}" ) # TODO(shobs): Check for permissions too @@ -481,110 +534,231 @@ def check_cloud_function_tools_and_permissions(self): # `cloudasset.googleapis.com` +def remote_function_node( + routine_ref: bigquery.RoutineReference, ibis_signature: IbisSignature +): + """Creates an Ibis node representing a remote function call.""" + + fields = { + name: rlz.value(type_) + for name, type_ in zip( + ibis_signature.parameter_names, ibis_signature.input_types + ) + } + + try: + fields["output_type"] = rlz.shape_like("args", dtype=ibis_signature.output_type) # type: ignore + except TypeError: + fields["output_dtype"] = property(lambda _: ibis_signature.output_type) + fields["output_shape"] = rlz.shape_like("args") + + node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore + + @compiles(node) + def compile_node(t, op): + return "{}({})".format(node.__name__, ", ".join(map(t.translate, op.args))) + + def f(*args, **kwargs): + return node(*args, **kwargs).to_expr() + + f.bigframes_remote_function = str(routine_ref) # type: ignore + + return f + + +def ibis_type_from_python_type(t: type) -> IbisDataType: + ibis_type = python_type_to_bigquery_type(t) + assert ibis_type in _supported_io_ibis_types, TYPE_ERROR_MESSAGE_FORMAT.format( + ibis_type + ) + return ibis_type + + +def ibis_type_from_type_kind(tk: StandardSqlTypeNames) -> IbisDataType: + ibis_type = BigQueryType.to_ibis(tk) + assert ibis_type in _supported_io_ibis_types, TYPE_ERROR_MESSAGE_FORMAT.format( + ibis_type + ) + return ibis_type + + +def ibis_signature_from_python_signature( + signature: inspect.Signature, + input_types: Sequence[type], + output_type: type, +) -> IbisSignature: + return IbisSignature( + parameter_names=list(signature.parameters.keys()), + input_types=[ibis_type_from_python_type(t) for t in input_types], + output_type=ibis_type_from_python_type(output_type), + ) + + +def ibis_signature_from_routine( + routine: Routine, +) -> IbisSignature: + return IbisSignature( + parameter_names=[arg.name for arg in routine.arguments], + input_types=[ + ibis_type_from_type_kind(arg.data_type.type_kind) + for arg in routine.arguments + ], + output_type=ibis_type_from_type_kind(routine.return_type.type_kind), + ) + + +class DatasetMissingError(ValueError): + pass + + +def get_routine_reference( + routine_ref_str: str, + bigquery_client: bigquery.Client, + session: Optional[Session], +) -> bigquery.RoutineReference: + try: + # Handle cases ".." and + # ".". + return bigquery.RoutineReference.from_string( + routine_ref_str, + default_project=bigquery_client.project, + ) + except ValueError: + # Handle case of "". 
+ if not session: + raise DatasetMissingError + + dataset_ref = bigquery.DatasetReference( + bigquery_client.project, session._session_dataset_id + ) + return dataset_ref.routine(routine_ref_str) + + # Inspired by @udf decorator implemented in ibis-bigquery package # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py # which has moved as @js to the ibis package # https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py def remote_function( - input_types: typing.Sequence[type], + input_types: Sequence[type], output_type: type, - session: typing.Optional[Session] = None, - bigquery_client: typing.Optional[bigquery.Client] = None, - bigquery_connection_client: typing.Optional[ + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, + bigquery_connection_client: Optional[ bigquery_connection_v1.ConnectionServiceClient ] = None, - dataset: typing.Optional[str] = None, - bigquery_connection: typing.Optional[str] = None, + cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, reuse: bool = True, ): """Decorator to turn a user defined function into a BigQuery remote function. .. deprecated:: 0.0.1 - Use :func:`bigframes.pandas.remote_function` instead. + This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. + + .. note:: + Please make sure following is setup before using this API: + + 1. Have the below APIs enabled for your project: + + * BigQuery Connection API + * Cloud Functions API + * Cloud Run API + * Cloud Build API + * Artifact Registry API + * Cloud Resource Manager API + + This can be done from the cloud console (change `PROJECT_ID` to yours): + https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID + + Or from the gcloud CLI: + + `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` + + 2. Have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + * BigQuery Connection Admin (roles/bigquery.connectionAdmin) + * Cloud Functions Developer (roles/cloudfunctions.developer) + * Service Account User (roles/iam.serviceAccountUser) + * Storage Object Viewer (roles/storage.objectViewer) + * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) + + 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: + + 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection + 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + + Alternatively, the IAM could also be setup via the gcloud CLI: + + `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. Args: - input_types : list(type). + input_types list(type): List of input data types in the user defined function. 
- output_type : type. + output_type type: Data type of the output in the user defined function. - session : bigframes.Session, Optional + session (bigframes.Session, Optional): BigQuery DataFrames session to use for getting default project, dataset and BigQuery connection. - bigquery_client : google.cloud.bigquery.Client, Optional + bigquery_client (google.cloud.bigquery.Client, Optional): Client to use for BigQuery operations. If this param is not provided then bigquery client from the session would be used. - bigquery_connection_client : google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional + bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): + Client to use for cloud functions operations. If this param is not + provided then functions client from the session would be used. + cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): Client to use for BigQuery connection operations. If this param is not provided then bigquery connection client from the session would be used. - dataset : str, Optional. - Dataset to use to create a BigQuery function. It should be in + dataset (str, Optional.): + Dataset in which to create a BigQuery remote function. It should be in `.` or `` format. If this - param is not provided then session dataset id would be used. - bigquery_connection : str, Optional. - Name of the BigQuery connection. If this param is not provided then - the bigquery connection from the session would be used. If it is pre - created in the same location as the `bigquery_client.location` then - it would be used, otherwise it would be created dynamically using - the `bigquery_connection_client` assuming the user has necessary - priviliges. - reuse : bool, Optional. - Reuse the remote function if already exists. - `True` by default, which will result in reusing an existing remote + parameter is not provided then session dataset id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection in the form of `CONNECTION_ID` or + `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. + If this param is not provided then the bigquery connection from the session + would be used. If it is pre created in the same location as the + `bigquery_client.location` then it would be used, otherwise it is created + dynamically using the `bigquery_connection_client` assuming the user has necessary + priviliges. The PROJECT_ID should be the same as the BigQuery connection project. + reuse (bool, Optional): + Reuse the remote function if is already exists. + `True` by default, which results in reusing an existing remote function (if any) that was previously created for the same udf. - Setting it to false would force creating a unique remote function. + Setting it to false forces the creation of creating a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. - Notes: - Please make sure following is setup before using this API: - - 1. Have the below APIs enabled for your project: - a. BigQuery Connection API - b. Cloud Functions API - c. Cloud Run API - d. Cloud Build API - e. Artifact Registry API - f. 
Cloud Resource Manager API - - This can be done from the cloud console (change PROJECT_ID to yours): - https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - Or from the gcloud CLI: - $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com - - 2. Have following IAM roles enabled for you: - a. BigQuery Data Editor (roles/bigquery.dataEditor) - b. BigQuery Connection Admin (roles/bigquery.connectionAdmin) - c. Cloud Functions Developer (roles/cloudfunctions.developer) - d. Service Account User (roles/iam.serviceAccountUser) - e. Storage Object Viewer (roles/storage.objectViewer) - f. Project IAM Admin (roles/resourcemanager.projectIamAdmin) - (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) - - 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection - b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - Alternatively, the IAM could also be setup via the gcloud CLI: - $ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker" - """ # A BigQuery client is required to perform BQ operations - if not bigquery_client: - if session: - bigquery_client = session.bqclient + if not bigquery_client and session: + bigquery_client = session.bqclient if not bigquery_client: raise ValueError( - "A bigquery client must be provided, either directly or via session" + "A bigquery client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" ) # A BigQuery connection client is required to perform BQ connection operations + if not bigquery_connection_client and session: + bigquery_connection_client = session.bqconnectionclient if not bigquery_connection_client: + raise ValueError( + "A bigquery connection client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A cloud functions client is required to perform cloud functions operations + if not cloud_functions_client: if session: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: + cloud_functions_client = session.cloudfunctionsclient + if not cloud_functions_client: raise ValueError( - "A bigquery connection client must be provided, either directly or via session" + "A functions connection client must be provided, either directly or via session. 
" + f"{constants.FEEDBACK_LINK}" ) # BQ remote function must be persisted, for which we need a dataset @@ -593,16 +767,15 @@ def remote_function( dataset_ref = bigquery.DatasetReference.from_string( dataset, default_project=bigquery_client.project ) - gcp_project_id = dataset_ref.project - bq_dataset = dataset_ref.dataset_id + elif session: + dataset_ref = bigquery.DatasetReference.from_string( + session._session_dataset_id, default_project=bigquery_client.project + ) else: - gcp_project_id = bigquery_client.project - if session: - bq_dataset = session._session_dataset_id - if not gcp_project_id: - raise ValueError("Project must be provided, either directly or via session") - if not bq_dataset: - raise ValueError("Dataset must be provided, either directly or via session") + raise ValueError( + "Project and dataset must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) bq_location, cloud_function_region = get_remote_function_locations( bigquery_client.location @@ -614,7 +787,8 @@ def remote_function( bigquery_connection = session._remote_udf_connection # type: ignore if not bigquery_connection: raise ValueError( - "BigQuery connection must be provided, either directly or via session" + "BigQuery connection must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" ) uniq_suffix = None @@ -628,70 +802,65 @@ def wrapper(f): raise TypeError("f must be callable, got {}".format(f)) signature = inspect.signature(f) - parameter_names = signature.parameters.keys() - - # Check supported python datatypes and convert to ibis datatypes - type_error_message_format = ( - "type {{}} not supported, supported types are {}.".format( - ", ".join([type_.__name__ for type_ in _supported_io_types]) - ) - ) - for type_ in input_types: - assert type_ in _supported_io_types, type_error_message_format.format(type_) - assert output_type in _supported_io_types, type_error_message_format.format( - output_type + ibis_signature = ibis_signature_from_python_signature( + signature, input_types, output_type ) - input_types_ibis = [ - python_type_to_bigquery_type(type_) for type_ in input_types - ] - output_type_ibis = python_type_to_bigquery_type(output_type) - - rf_node_fields = { - name: rlz.value(type) - for name, type in zip(parameter_names, input_types_ibis) - } - - try: - rf_node_fields["output_type"] = rlz.shape_like( - "args", dtype=output_type_ibis - ) - except TypeError: - rf_node_fields["output_dtype"] = property(lambda _: output_type_ibis) - rf_node_fields["output_shape"] = rlz.shape_like("args") remote_function_client = RemoteFunctionClient( - gcp_project_id, + dataset_ref.project, cloud_function_region, + cloud_functions_client, bq_location, - bq_dataset, + dataset_ref.dataset_id, bigquery_client, bigquery_connection_client, bigquery_connection, ) rf_name, cf_name = remote_function_client.provision_bq_remote_function( - f, input_types_ibis, output_type_ibis, uniq_suffix + f, ibis_signature.input_types, ibis_signature.output_type, uniq_suffix ) - rf_fully_qualified_name = f"`{gcp_project_id}.{bq_dataset}`.{rf_name}" - rf_node = type(rf_fully_qualified_name, (ops.ValueOp,), rf_node_fields) - - @compiles(rf_node) - def compiles_rf_node(t, op): - return "{}({})".format( - rf_node.__name__, ", ".join(map(t.translate, op.args)) - ) - @functools.wraps(f) - def wrapped(*args, **kwargs): - node = rf_node(*args, **kwargs) - return node.to_expr() + node = remote_function_node(dataset_ref.routine(rf_name), ibis_signature) - wrapped.__signature__ = signature - 
wrapped.bigframes_remote_function = ( - remote_function_client.get_remote_function_fully_qualified_name(rf_name) - ) - wrapped.bigframes_cloud_function = ( + node = functools.wraps(f)(node) + node.__signature__ = signature + node.bigframes_cloud_function = ( remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) - return wrapped + + return node return wrapper + + +def read_gbq_function( + function_name: str, + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, +): + """ + Read an existing BigQuery function and prepare it for use in future queries. + """ + + # A BigQuery client is required to perform BQ operations + if not bigquery_client and session: + bigquery_client = session.bqclient + if not bigquery_client: + raise ValueError( + "A bigquery client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + try: + routine_ref = get_routine_reference(function_name, bigquery_client, session) + except DatasetMissingError: + raise ValueError( + "Project and dataset must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # Find the routine and get its arguments. + routine = bigquery_client.get_routine(routine_ref) + ibis_signature = ibis_signature_from_routine(routine) + + return remote_function_node(routine_ref, ibis_signature) diff --git a/bigframes/series.py b/bigframes/series.py index f008e0fb06..537991ed00 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -16,9 +16,10 @@ from __future__ import annotations +import numbers import textwrap import typing -from typing import Any, Optional, Union +from typing import Any, Mapping, Optional, Tuple, Union import google.cloud.bigquery as bigquery import ibis.expr.types as ibis_types @@ -27,6 +28,7 @@ import pandas.core.dtypes.common import typing_extensions +import bigframes.constants as constants import bigframes.core from bigframes.core import WindowSpec import bigframes.core.block_transforms as block_ops @@ -34,11 +36,16 @@ import bigframes.core.groupby as groupby import bigframes.core.indexers import bigframes.core.indexes as indexes -from bigframes.core.ordering import OrderingColumnReference, OrderingDirection +from bigframes.core.ordering import ( + OrderingColumnReference, + OrderingDirection, + STABLE_SORTS, +) import bigframes.core.scalar as scalars import bigframes.core.window import bigframes.dataframe import bigframes.dtypes +import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.base @@ -46,7 +53,8 @@ import bigframes.operations.strings as strings import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series -LevelsType = typing.Union[str, int, typing.Sequence[typing.Union[str, int]]] +LevelType = typing.Union[str, int] +LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): @@ -100,21 +108,68 @@ def values(self) -> numpy.ndarray: @property def query_job(self) -> Optional[bigquery.QueryJob]: + """BigQuery job metadata for the most recent query. + + Returns: + The most recent `QueryJob + `_. 
+ """ + if self._query_job is None: + self._set_internal_query_job(self._compute_dry_run()) return self._query_job + def _set_internal_query_job(self, query_job: bigquery.QueryJob): + self._query_job = query_job + def __len__(self): return self.shape[0] def copy(self) -> Series: return Series(self._block) - def rename(self, index: Optional[str], **kwargs) -> Series: + def rename( + self, index: Union[blocks.Label, Mapping[Any, Any]] = None, **kwargs + ) -> Series: if len(kwargs) != 0: raise NotImplementedError( - "rename does not currently support any keyword arguments." + f"rename does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" ) - block = self._block.with_column_labels([index]) - return Series(block) + + # rename the Series name + if index is None or isinstance( + index, str + ): # Python 3.9 doesn't allow isinstance of Optional + index = typing.cast(Optional[str], index) + block = self._block.with_column_labels([index]) + return Series(block) + + # rename the index + if isinstance(index, Mapping): + index = typing.cast(Mapping[Any, Any], index) + block = self._block + for k, v in index.items(): + new_idx_ids = [] + for idx_id, idx_dtype in zip(block.index_columns, block.index_dtypes): + # Will throw if key type isn't compatible with index type, which leads to invalid SQL. + block.create_constant(k, dtype=idx_dtype) + + # Will throw if value type isn't compatible with index type. + block, const_id = block.create_constant(v, dtype=idx_dtype) + block, cond_id = block.apply_unary_op( + idx_id, ops.BinopPartialRight(ops.ne_op, k) + ) + block, new_idx_id = block.apply_ternary_op( + idx_id, cond_id, const_id, ops.where_op + ) + + new_idx_ids.append(new_idx_id) + block = block.drop_columns([const_id, cond_id]) + + block = block.set_index(new_idx_ids, index_labels=block.index_labels) + + return Series(block) + + raise ValueError(f"Unsupported type of parameter index: {type(index)}") def rename_axis( self, @@ -123,7 +178,7 @@ def rename_axis( ) -> Series: if len(kwargs) != 0: raise NotImplementedError( - "rename_axis does not currently support any keyword arguments." + f"rename_axis does not currently support any keyword arguments. {constants.FEEDBACK_LINK}" ) # limited implementation: the new index name is simply the 'mapper' parameter if _is_list_like(mapper): @@ -151,8 +206,15 @@ def __repr__(self) -> str: # maybe we just print the job metadata that we have so far? # TODO(swast): Avoid downloading the whole series by using job # metadata, like we do with DataFrame. - preview = self.compute() - return repr(preview) + opts = bigframes.options.display + max_results = opts.max_rows + if opts.repr_mode == "deferred": + return formatter.repr_query_job(self.query_job) + + pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) + self._set_internal_query_job(query_job) + + return repr(pandas_df.iloc[:, 0]) def _to_ibis_expr(self): """Creates an Ibis table expression representing the Series.""" @@ -168,31 +230,77 @@ def astype( ) -> Series: return self._apply_unary_op(bigframes.operations.AsTypeOp(dtype)) - def compute(self) -> pandas.Series: - """Executes deferred operations and downloads the results.""" - df, query_job = self._block.compute((self._value_column,)) - self._query_job = query_job + def to_pandas( + self, + max_download_size: Optional[int] = None, + sampling_method: Optional[str] = None, + random_state: Optional[int] = None, + ) -> pandas.Series: + """Writes Series to pandas Series. 
+ + Args: + max_download_size (int, default None): + Download size threshold in MB. If max_download_size is exceeded when downloading data + (e.g., to_pandas()), the data will be downsampled if + bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be + raised. If set to a value other than None, this will supersede the global config. + sampling_method (str, default None): + Downsampling algorithms to be chosen from, the choices are: "head": This algorithm + returns a portion of the data from the beginning. It is fast and requires minimal + computations to perform the downsampling; "uniform": This algorithm returns uniform + random samples of the data. If set to a value other than None, this will supersede + the global config. + random_state (int, default None): + The seed for the uniform downsampling algorithm. If provided, the uniform method may + take longer to execute and require more computation. If set to a value other than + None, this will supersede the global config. + + Returns: + pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb + is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. + """ + df, query_job = self._block.to_pandas( + (self._value_column,), + max_download_size=max_download_size, + sampling_method=sampling_method, + random_state=random_state, + ) + self._set_internal_query_job(query_job) series = df[self._value_column] series.name = self._name return series - def drop(self, labels: blocks.Label | typing.Sequence[blocks.Label] = None): - block = self._block - index_column = block.index_columns[0] + def _compute_dry_run(self) -> bigquery.QueryJob: + return self._block._compute_dry_run((self._value_column,)) + + def drop( + self, + labels: typing.Any = None, + *, + axis: typing.Union[int, str] = 0, + index: typing.Any = None, + columns: Union[blocks.Label, typing.Iterable[blocks.Label]] = None, + level: typing.Optional[LevelType] = None, + ) -> Series: + if labels and index: + raise ValueError("Must specify exacly one of 'labels' or 'index'") + index = labels or index + # ignore axis, columns params + block = self._block + level_id = self._resolve_levels(level or 0)[0] if _is_list_like(labels): block, inverse_condition_id = block.apply_unary_op( - index_column, ops.partial_right(ops.isin_op, labels) + level_id, ops.IsInOp(index, match_nulls=True) ) block, condition_id = block.apply_unary_op( inverse_condition_id, ops.invert_op ) - else: block, condition_id = block.apply_unary_op( - index_column, ops.partial_right(ops.ne_op, labels) + level_id, ops.partial_right(ops.ne_op, labels) ) - block = block.filter(condition_id) + block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) return Series(block.select_column(self._value_column)) @@ -248,6 +356,11 @@ def cummin(self) -> Series: agg_ops.min_op, bigframes.core.WindowSpec(following=0) ) + def cumprod(self) -> Series: + return self._apply_window_op( + agg_ops.product_op, bigframes.core.WindowSpec(following=0) + ) + def shift(self, periods: int = 1) -> Series: window = bigframes.core.WindowSpec( preceding=periods if periods > 0 else None, @@ -255,8 +368,8 @@ def shift(self, periods: int = 1) -> Series: ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) - def diff(self) -> Series: - return self - self.shift(1) + def diff(self, periods: int = 1) -> Series: + return self - self.shift(periods=periods) def rank( self, @@ -337,64 +450,66 @@ def notna(self) -> "Series": notnull = 
notna - def __and__(self, other: bool | int | Series | pandas.Series) -> Series: + def __and__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.and_op) __rand__ = __and__ - def __or__(self, other: bool | int | Series | pandas.Series) -> Series: + def __or__(self, other: bool | int | Series) -> Series: return self._apply_binary_op(other, ops.or_op) __ror__ = __or__ - def __add__(self, other: float | int | Series | pandas.Series) -> Series: + def __add__(self, other: float | int | Series) -> Series: return self.add(other) - def __radd__(self, other: float | int | Series | pandas.Series) -> Series: + def __radd__(self, other: float | int | Series) -> Series: return self.radd(other) - def add(self, other: float | int | Series | pandas.Series) -> Series: + def add(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.add_op) - def radd(self, other: float | int | Series | pandas.Series) -> Series: + def radd(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.reverse(ops.add_op)) - def __sub__(self, other: float | int | Series | pandas.Series) -> Series: + def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) - def __rsub__(self, other: float | int | Series | pandas.Series) -> Series: + def __rsub__(self, other: float | int | Series) -> Series: return self.rsub(other) - def sub(self, other: float | int | Series | pandas.Series) -> Series: + def sub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op) - def rsub(self, other: float | int | Series | pandas.Series) -> Series: + def rsub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.reverse(ops.sub_op)) - def __mul__(self, other: float | int | Series | pandas.Series) -> Series: + subtract = sub + + def __mul__(self, other: float | int | Series) -> Series: return self.mul(other) - def __rmul__(self, other: float | int | Series | pandas.Series) -> Series: + def __rmul__(self, other: float | int | Series) -> Series: return self.rmul(other) - def mul(self, other: float | int | Series | pandas.Series) -> Series: + def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op) - def rmul(self, other: float | int | Series | pandas.Series) -> Series: + def rmul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.reverse(ops.mul_op)) multiply = mul - def __truediv__(self, other: float | int | Series | pandas.Series) -> Series: + def __truediv__(self, other: float | int | Series) -> Series: return self.truediv(other) - def __rtruediv__(self, other: float | int | Series | pandas.Series) -> Series: + def __rtruediv__(self, other: float | int | Series) -> Series: return self.rtruediv(other) - def truediv(self, other: float | int | Series | pandas.Series) -> Series: + def truediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op) - def rtruediv(self, other: float | int | Series | pandas.Series) -> Series: + def rtruediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.reverse(ops.div_op)) div = truediv @@ -403,22 +518,22 @@ def rtruediv(self, other: float | int | Series | pandas.Series) -> Series: rdiv = rtruediv - def __floordiv__(self, other: float | int | Series | pandas.Series) -> Series: + def __floordiv__(self, other: float | int | Series) -> Series: return self.floordiv(other) - def 
__rfloordiv__(self, other: float | int | Series | pandas.Series) -> Series: + def __rfloordiv__(self, other: float | int | Series) -> Series: return self.rfloordiv(other) - def floordiv(self, other: float | int | Series | pandas.Series) -> Series: + def floordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) - def rfloordiv(self, other: float | int | Series | pandas.Series) -> Series: + def rfloordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.reverse(ops.floordiv_op)) - def __lt__(self, other: float | int | Series | pandas.Series) -> Series: # type: ignore + def __lt__(self, other: float | int | Series) -> Series: # type: ignore return self.lt(other) - def __le__(self, other: float | int | Series | pandas.Series) -> Series: # type: ignore + def __le__(self, other: float | int | Series) -> Series: # type: ignore return self.le(other) def lt(self, other) -> Series: @@ -427,10 +542,10 @@ def lt(self, other) -> Series: def le(self, other) -> Series: return self._apply_binary_op(other, ops.le_op) - def __gt__(self, other: float | int | Series | pandas.Series) -> Series: # type: ignore + def __gt__(self, other: float | int | Series) -> Series: # type: ignore return self.gt(other) - def __ge__(self, other: float | int | Series | pandas.Series) -> Series: # type: ignore + def __ge__(self, other: float | int | Series) -> Series: # type: ignore return self.ge(other) def gt(self, other) -> Series: @@ -451,6 +566,16 @@ def mod(self, other) -> Series: # type: ignore def rmod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.reverse(ops.mod_op)) + def divmod(self, other) -> Tuple[Series, Series]: # type: ignore + # TODO(huanc): when self and other both has dtype int and other contains zeros, + # the output should be dtype float, both floordiv and mod returns dtype int in this case. + return (self.floordiv(other), self.mod(other)) + + def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore + # TODO(huanc): when self and other both has dtype int and self contains zeros, + # the output should be dtype float, both floordiv and mod returns dtype int in this case. + return (self.rfloordiv(other), self.rmod(other)) + def __matmul__(self, other): return (self * other).sum() @@ -503,11 +628,48 @@ def _central_moment(self, n: int) -> float: delta_power = delta_power * mean_deltas return delta_power.mean() - def kurt(self) -> float: - # TODO(tbergeron): Cache intermediate count/moment/etc. statistics at block level + def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: + if _is_list_like(func): + if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + raise NotImplementedError( + f"Multiple aggregations only supported on numeric series. 
{constants.FEEDBACK_LINK}" + ) + aggregations = [agg_ops.AGGREGATIONS_LOOKUP[f] for f in func] + return Series( + self._block.summarize( + [self._value_column], + aggregations, + ) + ) + else: + + return self._apply_aggregation( + agg_ops.AGGREGATIONS_LOOKUP[typing.cast(str, func)] + ) + + def skew(self): + count = self.count() + if count < 3: + return pandas.NA + + moment3 = self._central_moment(3) + moment2 = self.var() * (count - 1) / count # Convert sample var to pop var + + # See G1 estimator: + # https://en.wikipedia.org/wiki/Skewness#Sample_skewness + numerator = moment3 + denominator = moment2 ** (3 / 2) + adjustment = (count * (count - 1)) ** 0.5 / (count - 2) + + return (numerator / denominator) * adjustment + + def kurt(self): count = self.count() + if count < 4: + return pandas.NA + moment4 = self._central_moment(4) - moment2 = self._central_moment(2) # AKA: Population Variance + moment2 = self.var() * (count - 1) / count # Convert sample var to pop var # Kurtosis is often defined as the second standardize moment: moment(4)/moment(2)**2 # Pandas however uses Fisher’s estimator, implemented below @@ -523,8 +685,8 @@ def mode(self) -> Series: block = self._block # Approach: Count each value, return each value for which count(x) == max(counts)) block, agg_ids = block.aggregate( - [self._value_column], - ((self._value_column, agg_ops.count_op),), + by_column_ids=[self._value_column], + aggregations=((self._value_column, agg_ops.count_op),), as_index=False, ) value_count_col_id = agg_ids[0] @@ -551,6 +713,13 @@ def mode(self) -> Series: def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) + def median(self, *, exact: bool = False) -> float: + if exact: + raise NotImplementedError( + f"Only approximate median is supported. {constants.FEEDBACK_LINK}" + ) + return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + def sum(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.sum_op)) @@ -622,12 +791,26 @@ def argmin(self) -> scalars.Scalar: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) - def __getitem__(self, indexer: Series): + def __getitem__(self, indexer): # TODO: enforce stricter alignment, should fail if indexer is missing any keys. - (left, right, block) = self._align(indexer, "left") - block = block.filter(right) - block = block.select_column(left) - return Series(block) + use_iloc = ( + isinstance(indexer, slice) + and all( + isinstance(x, numbers.Integral) or (x is None) + for x in [indexer.start, indexer.stop, indexer.step] + ) + ) or ( + isinstance(indexer, numbers.Integral) + and not isinstance(self._block.index.dtypes[0], pandas.Int64Dtype) + ) + if use_iloc: + return self.iloc[indexer] + if isinstance(indexer, Series): + (left, right, block) = self._align(indexer, "left") + block = block.filter(right) + block = block.select_column(left) + return Series(block) + return self.loc[indexer] def __getattr__(self, key: str): if hasattr(pandas.Series, key): @@ -635,11 +818,7 @@ def __getattr__(self, key: str): textwrap.dedent( f""" BigQuery DataFrames has not yet implemented an equivalent to - 'pandas.Series.{key}'. Please check - https://github.com/googleapis/python-bigquery-dataframes/issues for - existing feature requests, or file your own. - Please include information about your use case, as well as - relevant code snippets. + 'pandas.Series.{key}'. 
{constants.FEEDBACK_LINK} """ ) ) @@ -652,12 +831,7 @@ def _align3(self, other1: Series | scalars.Scalar, other2: Series | scalars.Scal return (values[0], values[1], values[2], index) def _apply_aggregation(self, op: agg_ops.AggregateOp) -> Any: - aggregation_result = typing.cast( - ibis_types.Scalar, op._as_ibis(self[self.notnull()]._to_ibis_expr()) - ) - return bigframes.core.scalar.DeferredScalar( - aggregation_result, self._block._expr._session - ).compute() + return self._block.get_stat(self._value_column, op) def _apply_window_op( self, @@ -687,7 +861,9 @@ def value_counts( ) return Series(block) - def sort_values(self, *, axis=0, ascending=True, na_position="last") -> Series: + def sort_values( + self, *, axis=0, ascending=True, kind: str = "quicksort", na_position="last" + ) -> Series: if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") direction = OrderingDirection.ASC if ascending else OrderingDirection.DESC @@ -698,7 +874,8 @@ def sort_values(self, *, axis=0, ascending=True, na_position="last") -> Series: direction=direction, na_last=(na_position == "last"), ) - ] + ], + stable=kind in STABLE_SORTS, ) return Series(block) @@ -750,6 +927,8 @@ def groupby( raise ValueError("as_index=False only valid with DataFrame") if axis: raise ValueError("No axis named {} for object type Series".format(level)) + if not as_index: + raise ValueError("'as_index'=False only applies to DataFrame") if by is not None: return self._groupby_values(by, dropna) if level is not None: @@ -765,7 +944,7 @@ def _groupby_level( return groupby.SeriesGroupBy( self._block, self._value_column, - self._resolve_levels(level), + by_col_ids=self._resolve_levels(level), value_name=self.name, dropna=dropna, ) @@ -805,14 +984,14 @@ def _groupby_values( matches = block.index_name_to_col_id.get(key, []) if len(matches) != 1: raise ValueError( - f"GroupBy key {key} does not map to unambiguous index level" + f"GroupBy key {key} does not match a unique index level. BigQuery DataFrames only interprets lists of strings as index level names, not directly as per-row group assignments." ) grouping_cols = [*grouping_cols, matches[0]] return groupby.SeriesGroupBy( block, value_col, - grouping_cols, + by_col_ids=grouping_cols, value_name=self.name, dropna=dropna, ) @@ -851,7 +1030,8 @@ def mask(self, cond, other=None) -> Series: if not isinstance(cond, Series): raise TypeError( - f"Only bigframes series condition is supported, received {type(cond).__name__}" + f"Only bigframes series condition is supported, received {type(cond).__name__}. " + f"{constants.FEEDBACK_LINK}" ) return self.where(~cond, other) @@ -862,13 +1042,13 @@ def to_frame(self) -> bigframes.dataframe.DataFrame: def to_csv(self, path_or_buf=None, **kwargs) -> typing.Optional[str]: # TODO(b/280651142): Implement version that leverages bq export native csv support to bypass local pandas step. 
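# A small assumed sketch (the argument values are illustrative, not part of this
# change): this exporter and the ones following it first materialize the Series
# locally with ``to_pandas()``, which honors the sampling options documented on
# that method, for example
#
#     s.to_pandas(max_download_size=500, sampling_method="uniform", random_state=42)
#
# and then hand the resulting pandas object to the matching pandas writer.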
- return self.compute().to_csv(path_or_buf, **kwargs) + return self.to_pandas().to_csv(path_or_buf, **kwargs) def to_dict(self, into: type[dict] = dict) -> typing.Mapping: - return typing.cast(dict, self.compute().to_dict(into)) + return typing.cast(dict, self.to_pandas().to_dict(into)) def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: - return self.compute().to_excel(excel_writer, sheet_name, **kwargs) + return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) def to_json( self, @@ -879,17 +1059,17 @@ def to_json( **kwargs, ) -> typing.Optional[str]: # TODO(b/280651142): Implement version that leverages bq export native csv support to bypass local pandas step. - return self.compute().to_json(path_or_buf, **kwargs) + return self.to_pandas().to_json(path_or_buf, **kwargs) def to_latex( self, buf=None, columns=None, header=True, index=True, **kwargs ) -> typing.Optional[str]: - return self.compute().to_latex( + return self.to_pandas().to_latex( buf, columns=columns, header=header, index=index, **kwargs ) def tolist(self) -> list: - return self.compute().to_list() + return self.to_pandas().to_list() to_list = tolist @@ -900,17 +1080,17 @@ def to_markdown( index: bool = True, **kwargs, ) -> typing.Optional[str]: - return self.compute().to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore + return self.to_pandas().to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore def to_numpy( self, dtype=None, copy=False, na_value=None, **kwargs ) -> numpy.ndarray: - return self.compute().to_numpy(dtype, copy, na_value, **kwargs) + return self.to_pandas().to_numpy(dtype, copy, na_value, **kwargs) __array__ = to_numpy def to_pickle(self, path, **kwargs) -> None: - return self.compute().to_pickle(path, **kwargs) + return self.to_pandas().to_pickle(path, **kwargs) def to_string( self, @@ -925,7 +1105,7 @@ def to_string( max_rows=None, min_rows=None, ) -> typing.Optional[str]: - return self.compute().to_string( + return self.to_pandas().to_string( buf, na_rep, float_format, @@ -939,7 +1119,7 @@ def to_string( ) def to_xarray(self): - return self.compute().to_xarray() + return self.to_pandas().to_xarray() # Keep this at the bottom of the Series class to avoid # confusing type checker by overriding str diff --git a/bigframes/session.py b/bigframes/session.py index 9590ecb8c7..28a38f9307 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -46,6 +46,7 @@ import google.cloud.bigquery as bigquery import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 +import google.cloud.functions_v2 import google.cloud.storage as storage # type: ignore import ibis import ibis.backends.bigquery as ibis_bigquery @@ -56,15 +57,21 @@ import pydata_google_auth import bigframes._config.bigquery_options as bigquery_options +import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.guid as guid -from bigframes.core.ordering import OrderingColumnReference +import bigframes.core.io as bigframes_io +from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers -import bigframes.ml.loader +from bigframes.remote_function import read_gbq_function as bigframes_rgf from bigframes.remote_function import remote_function as bigframes_rf import bigframes.version + +# Even though the ibis.backends.bigquery.registry import is unused, it's needed +# to register new and 
replacement ops with the Ibis BigQuery backend. +import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet import third_party.bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers @@ -81,13 +88,10 @@ _BIGQUERYCONNECTION_REGIONAL_ENDPOINT = "{location}-bigqueryconnection.googleapis.com" _BIGQUERYSTORAGE_REGIONAL_ENDPOINT = "{location}-bigquerystorage.googleapis.com" -# TODO(swast): Need to connect to regional endpoints when performing remote -# functions operations (BQ Connection API, Cloud Run / Cloud Functions). +_MAX_CLUSTER_COLUMNS = 4 -# pydata-google-auth credentials in case auth credentials are not available -# otherwise -_pydata_google_auth_credentials: Optional[google.auth.credentials.Credentials] = None -_pydata_google_auth_project: Optional[str] = None +# TODO(swast): Need to connect to regional endpoints when performing remote +# functions operations (BQ Connection IAM, Cloud Run / Cloud Functions). logger = logging.getLogger(__name__) @@ -97,53 +101,11 @@ def _is_query(query_or_table: str) -> bool: return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None -# TODO(shobs): Remove it after the same is available via pydata-google-auth -# after https://github.com/pydata/pydata-google-auth/pull/71 is merged, released -# and upgraded in the google colab image. -def _ensure_application_default_credentials_in_colab_environment(): - # This is a special handling for google colab environment where we want to - # use the colab specific authentication flow - # https://github.com/googlecolab/colabtools/blob/3c8772efd332289e1c6d1204826b0915d22b5b95/google/colab/auth.py#L209 - try: - from google.colab import auth - - auth.authenticate_user() - except Exception: - # We are catching a broad exception class here because we want to be - # agnostic to anything that could internally go wrong in the google - # colab auth. Some of the known exception we want to pass on are: - # - # ModuleNotFoundError: No module named 'google.colab' - # ImportError: cannot import name 'auth' from 'google.cloud' - # MessageError: Error: credential propagation was unsuccessful - # - # The MessageError happens on Vertex Colab when it fails to resolve auth - # from the Compute Engine Metadata server. - pass - - -pydata_google_auth.auth._ensure_application_default_credentials_in_colab_environment = ( - _ensure_application_default_credentials_in_colab_environment -) - - def _get_default_credentials_with_project(): - global _pydata_google_auth_credentials, _pydata_google_auth_project - if not _pydata_google_auth_credentials or not _pydata_google_auth_credentials.valid: - # We want to initiate auth via a non-local web server which - # particularly helps in a cloud notebook environment where the - # machine running the notebook UI and the VM running the notebook - # runtime are not the same. 
- # TODO(shobs, b/278903498): Use BigQuery DataFrames's own client id - # and secret - ( - _pydata_google_auth_credentials, - _pydata_google_auth_project, - ) = pydata_google_auth.default(_SCOPES, use_local_webserver=False) - return _pydata_google_auth_credentials, _pydata_google_auth_project + return pydata_google_auth.default(scopes=_SCOPES, use_local_webserver=False) -def _create_bq_clients( +def _create_cloud_clients( project: Optional[str], location: Optional[str], use_regional_endpoints: Optional[bool], @@ -152,6 +114,7 @@ def _create_bq_clients( bigquery.Client, google.cloud.bigquery_connection_v1.ConnectionServiceClient, google.cloud.bigquery_storage_v1.BigQueryReadClient, + google.cloud.functions_v2.FunctionServiceClient, ]: """Create and initialize BigQuery client objects.""" @@ -170,7 +133,10 @@ def _create_bq_clients( ) if not project: - raise ValueError("Project must be set to initialize BigQuery client.") + raise ValueError( + "Project must be set to initialize BigQuery client. " + "Try setting `bigframes.options.bigquery.project` first." + ) if use_regional_endpoints: bq_options = google.api_core.client_options.ClientOptions( @@ -193,6 +159,7 @@ def _create_bq_clients( client_options=bq_options, credentials=credentials, project=project, + location=location, ) bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( @@ -213,7 +180,15 @@ def _create_bq_clients( credentials=credentials, ) - return bqclient, bqconnectionclient, bqstorageclient + functions_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + cloudfunctionsclient = google.cloud.functions_v2.FunctionServiceClient( + client_info=functions_info, + credentials=credentials, + ) + + return bqclient, bqconnectionclient, bqstorageclient, cloudfunctionsclient class Session( @@ -242,7 +217,8 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): self.bqclient, self.bqconnectionclient, self.bqstorageclient, - ) = _create_bq_clients( + self.cloudfunctionsclient, + ) = _create_cloud_clients( project=context.project, location=self._location, use_regional_endpoints=context.use_regional_endpoints, @@ -310,8 +286,23 @@ def close(self): 24 hours of inactivity or after 7 days.""" if self._session_id is not None and self.bqclient is not None: abort_session_query = "CALL BQ.ABORT_SESSION('{}')".format(self._session_id) - query_job = self.bqclient.query(abort_session_query) - query_job.result() # blocks until finished + try: + query_job = self.bqclient.query(abort_session_query) + query_job.result() # blocks until finished + except google.api_core.exceptions.BadRequest as e: + # Ignore the exception when the BQ session itself has expired + # https://cloud.google.com/bigquery/docs/sessions-terminating#auto-terminate_a_session + if not e.message.startswith( + f"Session {self._session_id} has expired and is no longer available." + ): + raise + except google.auth.exceptions.RefreshError: + # The refresh token may itself have been invalidated or expired + # https://developers.google.com/identity/protocols/oauth2#expiration + # Don't raise the exception in this case while closing the + # BigFrames session, so that the end user has a path for getting + # out of a bad session due to unusable credentials. + pass self._session_id = None def read_gbq( @@ -367,16 +358,10 @@ def read_gbq_query( else: index_cols = list(index_col) - # Make sure we cluster by the index column so that subsequent - # operations are as speedy as they can be. 
- if index_cols: - destination: bigquery.Table | bigquery.TableReference = ( - self._query_to_session_table(query, index_cols) - ) - else: - _, query_job = self._start_query(query) - query_job.result() # Wait for job to finish. - destination = query_job.destination + # Can't cluster since don't know if index_cols are clusterable data types + # TODO(tbergeron): Maybe use dryrun to determine types of index_cols to see if can cluster + _, query_job = self._start_query(query) + destination = query_job.destination # If there was no destination table, that means the query must have # been DDL or DML. Return some job metadata, instead. @@ -411,6 +396,9 @@ def read_gbq_table( See also: :meth:`Session.read_gbq`. """ + if max_results and max_results <= 0: + raise ValueError("`max_results` should be a positive number.") + # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. # TODO(swast): Can we re-use the temp table from other reads in the @@ -425,7 +413,6 @@ def read_gbq_table( f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" ) else: - # TODO(swast): Read from a table snapshot so that reads are consistent. table_expression = self.ibis_client.table( table_ref.table_id, database=f"{table_ref.project}.{table_ref.dataset_id}", @@ -442,6 +429,8 @@ def read_gbq_table( else: index_cols = list(index_col) + hidden_cols: typing.Sequence[str] = () + for key in index_cols: if key not in table_expression.columns: raise ValueError( @@ -466,7 +455,7 @@ def read_gbq_table( SELECT (SELECT COUNT(*) FROM full_table) AS total_count, (SELECT COUNT(*) FROM distinct_table) AS distinct_count """ - results, _ = self._start_query(is_unique_sql) + results, query_job = self._start_query(is_unique_sql) row = next(iter(results)) total_count = row["total_count"] @@ -476,9 +465,24 @@ def read_gbq_table( ordering_value_columns=[ core.OrderingColumnReference(column_id) for column_id in index_cols ], + total_ordering_columns=frozenset(index_cols), ) - if not is_total_ordering: + # We have a total ordering, so query via "time travel" so that + # the underlying data doesn't mutate. + if is_total_ordering: + + # Get the timestamp from the job metadata rather than the query + # text so that the query for determining uniqueness of the ID + # columns can be cached. + current_timestamp = query_job.started + + # The job finished, so we should have a start time. + assert current_timestamp is not None + table_expression = self.ibis_client.sql( + bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + ) + else: # Make sure when we generate an ordering, the row_number() # coresponds to the index columns. table_expression = table_expression.order_by(index_cols) @@ -491,26 +495,37 @@ def read_gbq_table( """, ) ) + + # When ordering by index columns, apply limit after ordering to + # make limit more predictable. 
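# For example (an assumed sketch; the table name and its unique ``id`` column
# are placeholders, not part of this change):
#
#     df = session.read_gbq_table("my_project.my_dataset.my_table", index_col="id", max_results=100)
#
# Because ``id`` supplies a total ordering, the limit is applied only after the
# row order over the index is established, so repeated calls return the same
# first 100 rows.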
+ if max_results is not None: + table_expression = table_expression.limit(max_results) else: + if max_results is not None: + # Apply limit before generating rownums and creating temp table + # This makes sure the offsets are valid and limits the number of + # rows for which row numbers must be generated + table_expression = table_expression.limit(max_results) table_expression, ordering = self._create_sequential_ordering( table_expression ) - ordering_id_column = ordering.ordering_id - assert ordering_id_column is not None + hidden_cols = ( + (ordering.total_order_col.column_id,) + if ordering.total_order_col + else () + ) + assert len(ordering.ordering_value_columns) > 0 is_total_ordering = True - index_cols = [ordering_id_column] - index_labels = [None] - - if max_results is not None: - if max_results <= 0: - raise ValueError("`max_results` should be a positive number.") - table_expression = table_expression.limit(max_results) + # Block constructor will generate default index if passed empty + index_cols = [] + index_labels = [] return self._read_gbq_with_ordering( table_expression=table_expression, col_order=col_order, index_cols=index_cols, index_labels=index_labels, + hidden_cols=hidden_cols, ordering=ordering, is_total_ordering=is_total_ordering, ) @@ -522,16 +537,23 @@ def _read_gbq_with_ordering( col_order: Iterable[str] = (), index_cols: Sequence[str] = (), index_labels: Sequence[Optional[str]] = (), + hidden_cols: Sequence[str] = (), ordering: core.ExpressionOrdering, is_total_ordering: bool = False, ) -> dataframe.DataFrame: """Internal helper method that loads DataFrame from Google BigQuery given an ordering column. Args: - table_expression: an ibis table expression to be executed in BigQuery. - col_order: List of BigQuery column names in the desired order for results DataFrame. - index_cols: List of column names to use as the index or multi-index. - ordering: Column name to be used for ordering. If not supplied, a default ordering is generated. + table_expression: + an ibis table expression to be executed in BigQuery. + col_order: + List of BigQuery column names in the desired order for results DataFrame. + index_cols: + List of column names to use as the index or multi-index. + hidden_cols: + Columns that should be hidden. Ordering columns may (not always) be hidden + ordering: + Column name to be used for ordering. If not supplied, a default ordering is generated. Returns: A DataFrame representing results of the query or table. @@ -542,30 +564,23 @@ def _read_gbq_with_ordering( f"Got {len(index_labels)}, expected {len(index_cols)}." ) - if not index_cols: - raise ValueError("Need at least 1 index column.") - # Logic: # no total ordering, index -> create sequential order, ordered by index, use for both ordering and index # total ordering, index -> use ordering as ordering, index as index # This code block ensures the existence of a total ordering. 
+ column_keys = list(col_order) + if len(column_keys) == 0: + non_value_columns = set([*index_cols, *hidden_cols]) + column_keys = [ + key for key in table_expression.columns if key not in non_value_columns + ] if not is_total_ordering: # Rows are not ordered, we need to generate a default ordering and materialize it table_expression, ordering = self._create_sequential_ordering( table_expression, index_cols ) - index_col_values = [table_expression[index_id] for index_id in index_cols] - - column_keys = list(col_order) - if len(column_keys) == 0: - non_columns = set(index_cols) - if ordering.ordering_id is not None: - non_columns.add(ordering.ordering_id) - column_keys = [ - key for key in table_expression.columns if key not in non_columns - ] return self._read_ibis( table_expression, index_col_values, @@ -589,7 +604,7 @@ def _read_bigquery_load_job( index_cols = list(index_col) if not job_config.clustering_fields and index_cols: - job_config.clustering_fields = index_cols + job_config.clustering_fields = index_cols[:_MAX_CLUSTER_COLUMNS] if isinstance(filepath_or_buffer, str): if filepath_or_buffer.startswith("gs://"): @@ -606,7 +621,7 @@ def _read_bigquery_load_job( filepath_or_buffer, table, job_config=job_config ) - load_job.result() # Wait for the job to complete + self._start_generic_job(load_job) # The BigQuery REST API for tables.get doesn't take a session ID, so we # can't get the schema for a temp table that way. @@ -622,12 +637,9 @@ def _read_ibis( index_cols: Sequence[ibis_types.Value], index_labels: Sequence[Optional[str]], column_keys: Sequence[str], - ordering: Optional[core.ExpressionOrdering] = None, - ): + ordering: core.ExpressionOrdering, + ) -> dataframe.DataFrame: """Turns a table expression (plus index column) into a DataFrame.""" - hidden_ordering_columns = None - if ordering is not None and ordering.ordering_id is not None: - hidden_ordering_columns = (table_expression[ordering.ordering_id],) columns = list(index_cols) for key in column_keys: @@ -635,6 +647,12 @@ def _read_ibis( raise ValueError(f"Column '{key}' not found in this table.") columns.append(table_expression[key]) + non_hidden_ids = [col.get_name() for col in columns] + hidden_ordering_columns = [] + for ref in ordering.all_ordering_columns: + if ref.column_id not in non_hidden_ids: + hidden_ordering_columns.append(table_expression[ref.column_id]) + block = blocks.Block( core.ArrayValue( self, table_expression, columns, hidden_ordering_columns, ordering @@ -646,16 +664,19 @@ def _read_ibis( return dataframe.DataFrame(block) def read_gbq_model(self, model_name: str): - """Loads a BQML model from Google BigQuery. + """Loads a BigQuery ML model from BigQuery. Args: - model_name : the model's name in BigQuery in the format - `project_id.dataset_id.model_id`, or just `dataset_id.model_id` - to load from the default project. + model_name (str): + the model's name in BigQuery in the format + `project_id.dataset_id.model_id`, or just `dataset_id.model_id` + to load from the default project. Returns: A bigframes.ml Model wrapping the model. """ + import bigframes.ml.loader + model_ref = bigquery.ModelReference.from_string( model_name, default_project=self.bqclient.project ) @@ -663,16 +684,17 @@ def read_gbq_model(self, model_name: str): return bigframes.ml.loader.from_bq(self, model) def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame: - """Loads DataFrame from a Pandas DataFrame. + """Loads DataFrame from a pandas DataFrame. 
- The Pandas DataFrame will be persisted as a temporary BigQuery table, which can be + The pandas DataFrame will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. Args: - pandas_dataframe: a Pandas DataFrame object to be loaded. + pandas_dataframe (pandas.DataFrame): + a pandas DataFrame object to be loaded. Returns: - A BigQuery DataFrames. + bigframes.dataframe.DataFrame: The BigQuery DataFrame. """ # Add order column to pandas DataFrame to preserve order in BigQuery ordering_col = "rowid" @@ -686,7 +708,7 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) # Specify the datetime dtypes, which is auto-detected as timestamp types. - schema = [] + schema: list[bigquery.SchemaField] = [] for column, dtype in zip(pandas_dataframe.columns, pandas_dataframe.dtypes): if dtype == "timestamp[us][pyarrow]": schema.append( @@ -699,11 +721,13 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame filter(lambda name: name is not None, pandas_dataframe_copy.index.names) ) index_labels = typing.cast(List[Optional[str]], index_cols) - cluster_cols = index_cols + [ordering_col] + + # Clustering probably not needed anyways as pandas tables are small + cluster_cols = [ordering_col] if len(index_cols) == 0: - index_cols = [ordering_col] - index_labels = [None] + # Block constructor will implicitly build default index + pass job_config = bigquery.LoadJobConfig(schema=schema) job_config.clustering_fields = cluster_cols @@ -718,10 +742,12 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame load_table_destination, job_config=job_config, ) - load_job.result() # Wait for the job to complete + self._start_generic_job(load_job) ordering = core.ExpressionOrdering( - ordering_id_column=OrderingColumnReference(ordering_col), is_sequential=True + ordering_value_columns=[OrderingColumnReference(ordering_col)], + total_ordering_columns=frozenset([ordering_col]), + integer_encoding=IntegerEncoding(True, is_sequential=True), ) table_expression = self.ibis_client.sql( f"SELECT * FROM `{load_table_destination.table_id}`" @@ -731,6 +757,7 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame table_expression=table_expression, index_cols=index_cols, index_labels=index_labels, + hidden_cols=(ordering_col,), ordering=ordering, is_total_ordering=True, ) @@ -771,14 +798,16 @@ def read_csv( if any(param is not None for param in (dtype, names)): not_supported = ("dtype", "names") raise NotImplementedError( - f"BigQuery engine does not support these arguments: {not_supported}" + f"BigQuery engine does not support these arguments: {not_supported}. " + f"{constants.FEEDBACK_LINK}" ) if index_col is not None and ( not index_col or not isinstance(index_col, str) ): raise NotImplementedError( - "BigQuery engine only supports a single column name for `index_col`." + "BigQuery engine only supports a single column name for `index_col`. " + f"{constants.FEEDBACK_LINK}" ) # None value for index_col cannot be passed to read_gbq @@ -794,13 +823,15 @@ def read_csv( col_order = tuple(col for col in usecols) else: raise NotImplementedError( - "BigQuery engine only supports an iterable of strings for `usecols`." + "BigQuery engine only supports an iterable of strings for `usecols`. 
" + f"{constants.FEEDBACK_LINK}" ) valid_encodings = {"UTF-8", "ISO-8859-1"} if encoding is not None and encoding not in valid_encodings: raise NotImplementedError( - f"BigQuery engine only supports the following encodings: {valid_encodings}" + f"BigQuery engine only supports the following encodings: {valid_encodings}. " + f"{constants.FEEDBACK_LINK}" ) job_config = bigquery.LoadJobConfig() @@ -830,7 +861,8 @@ def read_csv( else: if any(arg in kwargs for arg in ("chunksize", "iterator")): raise NotImplementedError( - "'chunksize' and 'iterator' arguments are not supported." + "'chunksize' and 'iterator' arguments are not supported. " + f"{constants.FEEDBACK_LINK}" ) if isinstance(filepath_or_buffer, str): @@ -904,20 +936,32 @@ def _create_sequential_ordering( ibis.row_number().cast(ibis_dtypes.int64).name(default_ordering_name) ) table = table.mutate(**{default_ordering_name: default_ordering_col}) + clusterable_index_cols = [ + col for col in index_cols if _can_cluster(table[col].type()) + ] + cluster_cols = (clusterable_index_cols + [default_ordering_name])[ + :_MAX_CLUSTER_COLUMNS + ] table_ref = self._query_to_session_table( self.ibis_client.compile(table), - cluster_cols=list(index_cols) + [default_ordering_name], + cluster_cols=cluster_cols, ) table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`") ordering_reference = core.OrderingColumnReference(default_ordering_name) ordering = core.ExpressionOrdering( - ordering_id_column=ordering_reference, is_sequential=True + ordering_value_columns=[ordering_reference], + total_ordering_columns=frozenset([default_ordering_name]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), ) return table, ordering def _query_to_session_table( self, query_text: str, cluster_cols: Iterable[str] ) -> bigquery.TableReference: + if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS: + raise ValueError( + f"Too many cluster columns: {list(cluster_cols)}, max {_MAX_CLUSTER_COLUMNS} allowed." + ) # Can't set a table in _SESSION as destination via query job API, so we # run DDL, instead. table = self._create_session_table() @@ -929,9 +973,8 @@ def _query_to_session_table( CLUSTER BY {cluster_cols_sql} AS {query_text} """ - query_job = self.bqclient.query(ddl_text) try: - query_job.result() # Wait for the job to complete + self._start_query(ddl_text) # Wait for the job to complete except google.api_core.exceptions.Conflict: # Allow query retry to succeed. pass @@ -947,21 +990,58 @@ def remote_function( ): """Decorator to turn a user defined function into a BigQuery remote function. + .. note:: + Please make sure following is setup before using this API: + + 1. Have the below APIs enabled for your project: + + * BigQuery Connection API + * Cloud Functions API + * Cloud Run API + * Cloud Build API + * Artifact Registry API + * Cloud Resource Manager API + + This can be done from the cloud console (change `PROJECT_ID` to yours): + https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID + + Or from the gcloud CLI: + + `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` + + 2. 
Have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + * BigQuery Connection Admin (roles/bigquery.connectionAdmin) + * Cloud Functions Developer (roles/cloudfunctions.developer) + * Service Account User (roles/iam.serviceAccountUser) + * Storage Object Viewer (roles/storage.objectViewer) + * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) + + 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: + + 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection + 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + + Alternatively, the IAM could also be setup via the gcloud CLI: + + `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. + Args: input_types (list(type)): List of input data types in the user defined function. output_type (type): Data type of the output in the user defined function. dataset (str, Optional): - Dataset to use to create a BigQuery function. It should be in + Dataset in which to create a BigQuery remote function. It should be in `.` or `` format. If this - param is not provided then session dataset id would be used. + parameter is not provided then session dataset id is used. bigquery_connection (str, Optional): - Name of the BigQuery connection. If it is pre created in the same - location as the `bigquery_client.location` then it would be used, - otherwise it would be created dynamically assuming the user has - necessary priviliges. If this param is not provided then the - bigquery connection from the session would be used. + Name of the BigQuery connection. You should either have the + connection already created in the `location` you have chosen, or + you should have the Project IAM Admin role to enable the service + to create the connection for you if you need it.If this parameter is + not provided then the BigQuery connection from the session is used. reuse (bool, Optional): Reuse the remote function if already exists. `True` by default, which will result in reusing an existing remote @@ -969,38 +1049,14 @@ def remote_function( Setting it to false would force creating a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. + Returns: + callable: A remote function object pointing to the cloud assets created + in the background to support the remote execution. The cloud assets can be + located through the following properties set in the object: - Notes: - Please make sure following is setup before using this API: - - 1. Have the below APIs enabled for your project: - a. BigQuery Connection API - b. Cloud Functions API - c. Cloud Run API - d. Cloud Build API - e. Artifact Registry API - f. 
Cloud Resource Manager API - - This can be done from the cloud console (change PROJECT_ID to yours): - https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - Or from the gcloud CLI: - $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com - - 2. Have following IAM roles enabled for you: - a. BigQuery Data Editor (roles/bigquery.dataEditor) - b. BigQuery Connection Admin (roles/bigquery.connectionAdmin) - c. Cloud Functions Developer (roles/cloudfunctions.developer) - d. Service Account User (roles/iam.serviceAccountUser) - e. Storage Object Viewer (roles/storage.objectViewer) - f. Project IAM Admin (roles/resourcemanager.projectIamAdmin) - (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) - - 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection - b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - Alternatively, the IAM could also be setup via the gcloud CLI: - $ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker" + `bigframes_cloud_function` - The google cloud function deployed for the user defined code. + `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ return bigframes_rf( input_types, @@ -1011,34 +1067,62 @@ def remote_function( reuse=reuse, ) + def read_gbq_function( + self, + function_name: str, + ): + """Loads a BigQuery function from BigQuery. + + Then it can be applied to a DataFrame or Series. + + Args: + function_name (str): + the function's name in BigQuery in the format + `project_id.dataset_id.function_name`, or + `dataset_id.function_name` to load from the default project, or + `function_name` to load from the default project and the dataset + associated with the current session. + + Returns: + callable: A function object pointing to the BigQuery function read + from BigQuery. + + The object is similar to the one created by the `remote_function` + decorator, including the `bigframes_remote_function` property, but + not including the `bigframes_cloud_function` property. 
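A minimal usage sketch (assumed; ``my_dataset.my_scalar_func`` stands for a
routine that already exists in your project, and ``session`` / ``df`` for an
active Session and a BigQuery DataFrames DataFrame)::

    func = session.read_gbq_function("my_dataset.my_scalar_func")
    df = df.assign(new_col=df["existing_col"].apply(func))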
+ """ + + return bigframes_rgf( + function_name=function_name, + session=self, + ) + def _start_query( self, sql: str, job_config: Optional[bigquery.job.QueryJobConfig] = None, max_results: Optional[int] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """ + Starts query job and waits for results + """ if job_config is not None: query_job = self.bqclient.query(sql, job_config=job_config) else: query_job = self.bqclient.query(sql) opts = bigframes.options.display - if opts.progress_bar is not None: - results_iterator = formatting_helpers.wait_for_job( + if opts.progress_bar is not None and not query_job.configuration.dry_run: + results_iterator = formatting_helpers.wait_for_query_job( query_job, max_results, opts.progress_bar ) else: results_iterator = query_job.result(max_results=max_results) return results_iterator, query_job - def _extract_table(self, source_table, destination_uris, job_config): - extract_job = self.bqclient.extract_table( - source=source_table, - destination_uris=destination_uris, - job_config=job_config, - ) - extract_job.result() - return extract_job + def _get_table_size(self, destination_table): + table = self.bqclient.get_table(destination_table) + return table.num_bytes def _rows_to_dataframe( self, row_iterator: bigquery.table.RowIterator @@ -1050,6 +1134,27 @@ def _rows_to_dataframe( string_dtype=pandas.StringDtype(storage="pyarrow"), ) + def _start_generic_job(self, job: formatting_helpers.GenericJob): + if bigframes.options.display.progress_bar is not None: + formatting_helpers.wait_for_job( + job, bigframes.options.display.progress_bar + ) # Wait for the job to complete + else: + job.result() + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) + + +def _can_cluster(ibis_type: ibis_dtypes.DataType): + # https://cloud.google.com/bigquery/docs/clustered-tables + # Notably, float is excluded + return ( + ibis_type.is_integer() + or ibis_type.is_string() + or ibis_type.is_decimal() + or ibis_type.is_date() + or ibis_type.is_timestamp() + or ibis_type.is_boolean() + ) diff --git a/docs/getting_started/index.rst b/docs/getting_started/index.rst deleted file mode 100644 index cde2a70a3f..0000000000 --- a/docs/getting_started/index.rst +++ /dev/null @@ -1,27 +0,0 @@ -Getting started -=============== - -Prerequisites --------------- - -* Install the ``bigframes`` package. -* Create a GCP project and billing account. -* When running locally, authenticate with application default credentials. See - the `gcloud auth application-default login - `_ - reference. - -Code sample ------------ - -.. literalinclude:: ../samples/snippets/quickstart.py - :language: python - :dedent: 4 - :start-after: [START bigquery_bigframes_quickstart] - :end-before: [END bigquery_bigframes_quickstart] - -Learn more ----------- - -* See :ref:`user_guide` for more samples. -* See :ref:`bigframes_ml` for more ML samples. diff --git a/docs/index.rst b/docs/index.rst index eb721fbf29..ff1cd09eb7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,9 +1,11 @@ .. include:: README.rst +API reference +------------- + .. toctree:: :maxdepth: 3 - getting_started/index user_guide/index reference/index diff --git a/docs/reference/bigframes.ml/cluster.rst b/docs/reference/bigframes.ml/cluster.rst index 5456b728b6..e91a28c051 100644 --- a/docs/reference/bigframes.ml/cluster.rst +++ b/docs/reference/bigframes.ml/cluster.rst @@ -3,3 +3,5 @@ bigframes.ml.cluster .. 
automodule:: bigframes.ml.cluster :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/compose.rst b/docs/reference/bigframes.ml/compose.rst index e41f17a33b..9992728362 100644 --- a/docs/reference/bigframes.ml/compose.rst +++ b/docs/reference/bigframes.ml/compose.rst @@ -3,3 +3,5 @@ bigframes.ml.compose .. automodule:: bigframes.ml.compose :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/decomposition.rst b/docs/reference/bigframes.ml/decomposition.rst index 122dbc2b16..ec804ac8cd 100644 --- a/docs/reference/bigframes.ml/decomposition.rst +++ b/docs/reference/bigframes.ml/decomposition.rst @@ -3,3 +3,5 @@ bigframes.ml.decomposition .. automodule:: bigframes.ml.decomposition :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/ensemble.rst b/docs/reference/bigframes.ml/ensemble.rst new file mode 100644 index 0000000000..2652ab5aa4 --- /dev/null +++ b/docs/reference/bigframes.ml/ensemble.rst @@ -0,0 +1,7 @@ +bigframes.ml.ensemble +===================== + +.. automodule:: bigframes.ml.ensemble + :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/forecasting.rst b/docs/reference/bigframes.ml/forecasting.rst index 037bd7e82e..04015c9911 100644 --- a/docs/reference/bigframes.ml/forecasting.rst +++ b/docs/reference/bigframes.ml/forecasting.rst @@ -3,3 +3,5 @@ bigframes.ml.forecasting .. automodule:: bigframes.ml.forecasting :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/imported.rst b/docs/reference/bigframes.ml/imported.rst index 54120534c0..c151cbda6f 100644 --- a/docs/reference/bigframes.ml/imported.rst +++ b/docs/reference/bigframes.ml/imported.rst @@ -3,3 +3,5 @@ bigframes.ml.imported .. automodule:: bigframes.ml.imported :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/index.rst b/docs/reference/bigframes.ml/index.rst index e17edc2ee9..f3cbe1174a 100644 --- a/docs/reference/bigframes.ml/index.rst +++ b/docs/reference/bigframes.ml/index.rst @@ -13,6 +13,8 @@ API Reference decomposition + ensemble + forecasting imported diff --git a/docs/reference/bigframes.ml/linear_model.rst b/docs/reference/bigframes.ml/linear_model.rst index 5936858e64..8c6c2765b1 100644 --- a/docs/reference/bigframes.ml/linear_model.rst +++ b/docs/reference/bigframes.ml/linear_model.rst @@ -3,3 +3,5 @@ bigframes.ml.linear_model .. automodule:: bigframes.ml.linear_model :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/llm.rst b/docs/reference/bigframes.ml/llm.rst index 4debef74ff..20ae7793e7 100644 --- a/docs/reference/bigframes.ml/llm.rst +++ b/docs/reference/bigframes.ml/llm.rst @@ -3,3 +3,5 @@ bigframes.ml.llm .. automodule:: bigframes.ml.llm :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/metrics.rst b/docs/reference/bigframes.ml/metrics.rst index dc951ebed4..aca11f7e9f 100644 --- a/docs/reference/bigframes.ml/metrics.rst +++ b/docs/reference/bigframes.ml/metrics.rst @@ -3,3 +3,5 @@ bigframes.ml.metrics .. automodule:: bigframes.ml.metrics :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/model_selection.rst b/docs/reference/bigframes.ml/model_selection.rst index b0d4e5ed3e..d662285f99 100644 --- a/docs/reference/bigframes.ml/model_selection.rst +++ b/docs/reference/bigframes.ml/model_selection.rst @@ -3,3 +3,5 @@ bigframes.ml.model_selection .. 
automodule:: bigframes.ml.model_selection :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/pipeline.rst b/docs/reference/bigframes.ml/pipeline.rst index 239ee59c43..22e877dc5b 100644 --- a/docs/reference/bigframes.ml/pipeline.rst +++ b/docs/reference/bigframes.ml/pipeline.rst @@ -3,3 +3,5 @@ bigframes.ml.pipeline .. automodule:: bigframes.ml.pipeline :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.ml/preprocessing.rst b/docs/reference/bigframes.ml/preprocessing.rst index 4d2279fc8e..eac72da173 100644 --- a/docs/reference/bigframes.ml/preprocessing.rst +++ b/docs/reference/bigframes.ml/preprocessing.rst @@ -3,3 +3,5 @@ bigframes.ml.preprocessing .. automodule:: bigframes.ml.preprocessing :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes/options.rst b/docs/reference/bigframes/options.rst index 914e4976d5..d831a519fe 100644 --- a/docs/reference/bigframes/options.rst +++ b/docs/reference/bigframes/options.rst @@ -10,3 +10,5 @@ Options and settings .. autoclass:: bigframes._config.bigquery_options.BigQueryOptions .. autoclass:: bigframes._config.display_options.DisplayOptions + +.. autoclass:: bigframes._config.sampling_options.SamplingOptions diff --git a/docs/user_guide/bigframes.pandas/remote_functions.rst b/docs/user_guide/bigframes.pandas/remote_functions.rst index 3a1a9d342f..abfe6a10ca 100644 --- a/docs/user_guide/bigframes.pandas/remote_functions.rst +++ b/docs/user_guide/bigframes.pandas/remote_functions.rst @@ -9,32 +9,125 @@ find more details on it via `help` command. .. code-block:: python - import bigframes.pandas as pd - help(pd.remote_function) + import bigframes.pandas as bpd + help(bpd.remote_function) Read a table and inspect the column of interest. .. code-block:: python - df = pd.read_gbq("bigquery-public-data.ml_datasets.penguins") + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") df["body_mass_g"].head(10) Define a custom function, and specify the intent to turn it into a remote function. It requires a BigQuery connection. If the connection is not already created, BigQuery DataFrames will attempt to create one assuming the necessary -APIs and IAM permissions are setup in the project. +APIs and IAM permissions are set up in the project. In these examples we will be +using a pre-created connection named `bigframes-rf-conn`. Let's try a +`pandas`-like use case in which we want to apply a user defined scalar function +to every value in a `Series`, more specifically, bucketize the `body_mass_g` value +of the penguins, which is a real number, into a category, which is a string. .. code-block:: python - @pd.remote_function([float], str, bigquery_connection='bigframes-rf-conn') + @bpd.remote_function([float], str, bigquery_connection='bigframes-rf-conn') def get_bucket(num): if not num: return "NA" boundary = 4000 return "at_or_above_4000" if num >= boundary else "below_4000" -Run the custom function on the column of interest to create a new column. +Then we can apply the remote function to the `Series` of interest via the `apply` +API and store the result in a new column in the DataFrame. .. code-block:: python df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket)) + +This will add a new column `body_mass_bucket` to the DataFrame. You can preview +the original value and the bucketized value side by side. + +.. 
code-block:: python + df[['body_mass_g', 'body_mass_bucket']].head(10) + +This operation was possible by doing all the computation in the cloud. For that, +a Google Cloud Function is deployed by serializing the user code. + +.. warning:: + The deployed cloud function may be visible to other users with sufficient + privilege in the project. The user should be careful about having any + sensitive data in the code that will be deployed as a remote function. + +The cloud function can be located from a property set in the remote function object. + +.. code-block:: python + + get_bucket.bigframes_cloud_function + +and then a BigQuery remote function is created and configured to call into the +cloud function via the BigQuery connection. That can also be located from +another property set in the remote function object. + +.. code-block:: python + + get_bucket.bigframes_remote_function + +The cloud assets created are persistent and the user can manage them directly +from the Google Cloud Console. + +Let's continue trying other potential use cases of remote functions. Let's say +we consider the `species`, `island` and `sex` of the penguins sensitive +information and want to redact that by replacing it with a hash code instead. +Let's define another scalar custom function and decorate it as a remote function: + +.. code-block:: python + + @bpd.remote_function([str], str, bigquery_connection='bigframes-rf-conn') + def get_hash(input): + import hashlib + # handle missing value + if input is None: + input = "" + encoded_input = input.encode() + hash = hashlib.md5(encoded_input) + return hash.hexdigest() + +We can use this remote function with `map`, another `pandas`-like API that can be +applied to a DataFrame: + +.. code-block:: python + + df_redacted = df[["species", "island", "sex"]].map(get_hash) + df_redacted.head(10) + +Using Existing Functions +======================== + +If you have already defined a custom function in BigQuery, either in the +BigQuery Studio or with the `remote_function` decorator above or otherwise, you +may use it with BigQuery DataFrames with the `read_gbq_function` method. + +More details are available via the `help` command: + +.. code-block:: python + + import bigframes.pandas as bpd + help(bpd.read_gbq_function) + +Here is an example of using `read_gbq_function` to load an existing function +named `get_bucket`: + +.. code-block:: python + + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + get_bucket = bpd.read_gbq_function("get_bucket") + + df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket)) + df.head(10) + +Note: As mentioned above, if a function is created using the `remote_function` +decorator, its generated name (including project and dataset) is accessible +immediately afterward in the function's `bigframes_remote_function` attribute. +The same string can be passed to `read_gbq_function` later in another context.
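As an illustration, a sketch of that cross-session reuse (assuming the generated
routine still exists and the same project is used):

.. code-block:: python

    # In the session that created the remote function:
    routine_name = get_bucket.bigframes_remote_function

    # Later, possibly from a different session or notebook:
    import bigframes.pandas as bpd

    get_bucket = bpd.read_gbq_function(routine_name)
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))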
body_mass_g\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
sex
tag_number
11720Dream32.115.5188.03050.036.618.4184.03475.0FEMALE
1371Biscoe37.716.0183.03075.0FEMALE1Dream39.819.1184.04650.0MALE
1417Torgersen38.617.0188.02900.02Dream40.918.9184.03900.0MALE
4Dream37.316.8192.03000.0FEMALE
12045Dream43.218.5192.04100.0MALE
9Dream40.220.1200.03975.0MALE
10Dream40.717.0190.03725.040.818.9208.04300.0MALE
1251Biscoe37.617.011Dream39.018.7185.03600.03650.0MALE
12Dream37.016.9185.03000.0FEMALE
1422Torgersen35.717.0189.03350.014Dream34.017.1185.03400.0FEMALE
1394Torgersen40.217.0176.03450.015Dream37.016.5185.03400.0FEMALE
116318Dream39.717.9193.04250.0MALE
19Dream37.818.1193.03750.0MALE
22Dream36.417.0195.03325.040.217.1193.03400.0FEMALE
1329Biscoe38.117.0181.03175.023Dream36.818.5193.03500.0FEMALE
1406Torgersen44.118.0210.026Dream41.518.5201.04000.0MALE
119631Dream36.518.0182.03150.033.116.1178.02900.0FEMALE
1228Biscoe41.618.0192.03950.032Dream37.218.1178.03900.0MALE
1412Torgersen40.318.0195.033Dream39.516.7178.03250.0FEMALE
114235Dream35.718.0202.03550.036.018.5186.03100.0FEMALE
1430Torgersen33.519.0190.03600.0FEMALE36Dream39.618.1186.04450.0MALE
1333Biscoe43.219.0197.04775.038Dream41.320.3194.03550.0MALE
1414Torgersen38.719.0195.03450.041Dream35.718.0202.03550.0FEMALE
119751Dream41.119.0182.038.117.6187.03425.0MALE
1443Torgersen40.619.0199.04000.0MALEFEMALE
1295Biscoe41.020.0203.04725.0MALE53Dream36.017.1187.03700.0FEMALE
\n", + "

25 rows × 6 columns

\n", "[146 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", - "tag_number \n", - "1172 Dream 32.1 15.5 188.0 \n", - "1371 Biscoe 37.7 16.0 183.0 \n", - "1417 Torgersen 38.6 17.0 188.0 \n", - "1204 Dream 40.7 17.0 190.0 \n", - "1251 Biscoe 37.6 17.0 185.0 \n", - "1422 Torgersen 35.7 17.0 189.0 \n", - "1394 Torgersen 40.2 17.0 176.0 \n", - "1163 Dream 36.4 17.0 195.0 \n", - "1329 Biscoe 38.1 17.0 181.0 \n", - "1406 Torgersen 44.1 18.0 210.0 \n", - "1196 Dream 36.5 18.0 182.0 \n", - "1228 Biscoe 41.6 18.0 192.0 \n", - "1412 Torgersen 40.3 18.0 195.0 \n", - "1142 Dream 35.7 18.0 202.0 \n", - "1430 Torgersen 33.5 19.0 190.0 \n", - "1333 Biscoe 43.2 19.0 197.0 \n", - "1414 Torgersen 38.7 19.0 195.0 \n", - "1197 Dream 41.1 19.0 182.0 \n", - "1443 Torgersen 40.6 19.0 199.0 \n", - "1295 Biscoe 41.0 20.0 203.0 \n", - "1207 Dream 38.8 20.0 190.0 \n", - "1349 Biscoe 38.2 20.0 190.0 \n", - "1350 Biscoe 37.8 20.0 190.0 \n", - "1351 Biscoe 38.1 16.5 198.0 \n", - "1116 Dream 37.0 16.5 185.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", + "0 Dream 36.6 18.4 184.0 3475.0 \n", + "1 Dream 39.8 19.1 184.0 4650.0 \n", + "2 Dream 40.9 18.9 184.0 3900.0 \n", + "4 Dream 37.3 16.8 192.0 3000.0 \n", + "5 Dream 43.2 18.5 192.0 4100.0 \n", + "9 Dream 40.2 20.1 200.0 3975.0 \n", + "10 Dream 40.8 18.9 208.0 4300.0 \n", + "11 Dream 39.0 18.7 185.0 3650.0 \n", + "12 Dream 37.0 16.9 185.0 3000.0 \n", + "14 Dream 34.0 17.1 185.0 3400.0 \n", + "15 Dream 37.0 16.5 185.0 3400.0 \n", + "18 Dream 39.7 17.9 193.0 4250.0 \n", + "19 Dream 37.8 18.1 193.0 3750.0 \n", + "22 Dream 40.2 17.1 193.0 3400.0 \n", + "23 Dream 36.8 18.5 193.0 3500.0 \n", + "26 Dream 41.5 18.5 201.0 4000.0 \n", + "31 Dream 33.1 16.1 178.0 2900.0 \n", + "32 Dream 37.2 18.1 178.0 3900.0 \n", + "33 Dream 39.5 16.7 178.0 3250.0 \n", + "35 Dream 36.0 18.5 186.0 3100.0 \n", + "36 Dream 39.6 18.1 186.0 4450.0 \n", + "38 Dream 41.3 20.3 194.0 3550.0 \n", + "41 Dream 35.7 18.0 202.0 3550.0 \n", + "51 Dream 38.1 17.6 187.0 3425.0 \n", + "53 Dream 36.0 17.1 187.0 3700.0 \n", "\n", - " body_mass_g sex \n", - "tag_number \n", - "1172 3050.0 FEMALE \n", - "1371 3075.0 FEMALE \n", - "1417 2900.0 FEMALE \n", - "1204 3725.0 MALE \n", - "1251 3600.0 FEMALE \n", - "1422 3350.0 FEMALE \n", - "1394 3450.0 FEMALE \n", - "1163 3325.0 FEMALE \n", - "1329 3175.0 FEMALE \n", - "1406 4000.0 MALE \n", - "1196 3150.0 FEMALE \n", - "1228 3950.0 MALE \n", - "1412 3250.0 FEMALE \n", - "1142 3550.0 FEMALE \n", - "1430 3600.0 FEMALE \n", - "1333 4775.0 MALE \n", - "1414 3450.0 FEMALE \n", - "1197 3425.0 MALE \n", - "1443 4000.0 MALE \n", - "1295 4725.0 MALE \n", - "1207 3950.0 MALE \n", - "1349 3900.0 MALE \n", - "1350 4250.0 MALE \n", - "1351 3825.0 FEMALE \n", - "1116 3400.0 FEMALE \n", + " sex \n", + "0 FEMALE \n", + "1 MALE \n", + "2 MALE \n", + "4 FEMALE \n", + "5 MALE \n", + "9 MALE \n", + "10 MALE \n", + "11 MALE \n", + "12 FEMALE \n", + "14 FEMALE \n", + "15 FEMALE \n", + "18 MALE \n", + "19 MALE \n", + "22 FEMALE \n", + "23 FEMALE \n", + "26 MALE \n", + "31 FEMALE \n", + "32 MALE \n", + "33 FEMALE \n", + "35 FEMALE \n", + "36 MALE \n", + "38 MALE \n", + "41 FEMALE \n", + "51 FEMALE \n", + "53 FEMALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, - "execution_count": 2, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# set a friendlier index to uniquely identify the rows\n", - "df = df.set_index(\"tag_number\")\n", - "\n", "# filter 
down to the data we want to analyze\n", "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", "\n", @@ -683,9 +843,24 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "81f9aa34c7234bd88b6b7a4bc77d4b4e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 288f0daa-a51e-45b4-86bf-d054467c4a99 is DONE. 28.9 kB processed.
\n", " \n", " \n", - " tag_number\n", " species\n", " island\n", " culmen_length_mm\n", @@ -65,288 +120,319 @@ " \n", " \n", " 0\n", - " 1225\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", - " <NA>\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 36.6\n", + " 18.4\n", + " 184.0\n", + " 3475.0\n", + " FEMALE\n", " \n", " \n", " 1\n", - " 1278\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 42.0\n", - " 13.5\n", - " 210.0\n", - " 4150.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 39.8\n", + " 19.1\n", + " 184.0\n", + " 4650.0\n", + " MALE\n", " \n", " \n", " 2\n", - " 1275\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 46.5\n", - " 13.5\n", - " 210.0\n", - " 4550.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 40.9\n", + " 18.9\n", + " 184.0\n", + " 3900.0\n", + " MALE\n", " \n", " \n", " 3\n", - " 1233\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 43.3\n", - " 14.0\n", - " 208.0\n", - " 4575.0\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 46.5\n", + " 17.9\n", + " 192.0\n", + " 3500.0\n", " FEMALE\n", " \n", " \n", " 4\n", - " 1311\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 47.5\n", - " 14.0\n", - " 212.0\n", - " 4875.0\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 37.3\n", + " 16.8\n", + " 192.0\n", + " 3000.0\n", " FEMALE\n", " \n", " \n", " 5\n", - " 1316\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 49.1\n", - " 14.5\n", - " 212.0\n", - " 4625.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 43.2\n", + " 18.5\n", + " 192.0\n", + " 4100.0\n", + " MALE\n", " \n", " \n", " 6\n", - " 1313\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 45.5\n", - " 14.5\n", - " 212.0\n", - " 4750.0\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 46.9\n", + " 16.6\n", + " 192.0\n", + " 2700.0\n", " FEMALE\n", " \n", " \n", " 7\n", - " 1381\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 47.6\n", - " 14.5\n", - " 215.0\n", - " 5400.0\n", - " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 50.5\n", + " 18.4\n", + " 200.0\n", + " 3400.0\n", + " FEMALE\n", " \n", " \n", " 8\n", - " 1377\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 45.1\n", - " 14.5\n", - " 207.0\n", - " 5050.0\n", - " FEMALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 49.5\n", + " 19.0\n", + " 200.0\n", + " 3800.0\n", + " MALE\n", " \n", " \n", " 9\n", - " 1380\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 45.1\n", - " 14.5\n", - " 215.0\n", - " 5000.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 40.2\n", + " 20.1\n", + " 200.0\n", + " 3975.0\n", + " MALE\n", " \n", " \n", " 10\n", - " 1257\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 46.2\n", - " 14.5\n", - " 209.0\n", - " 4800.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 40.8\n", + " 18.9\n", + " 208.0\n", + " 4300.0\n", + " MALE\n", " \n", " \n", " 11\n", - " 1336\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 46.5\n", - " 14.5\n", - " 213.0\n", - " 4400.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 39.0\n", + " 18.7\n", + " 185.0\n", + " 3650.0\n", + " MALE\n", " \n", " 
\n", " 12\n", - " 1237\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 43.2\n", - " 14.5\n", - " 208.0\n", - " 4450.0\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 37.0\n", + " 16.9\n", + " 185.0\n", + " 3000.0\n", " FEMALE\n", " \n", " \n", " 13\n", - " 1302\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 48.5\n", - " 15.0\n", - " 219.0\n", - " 4850.0\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 47.0\n", + " 17.3\n", + " 185.0\n", + " 3700.0\n", " FEMALE\n", " \n", " \n", " 14\n", - " 1325\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 49.1\n", - " 15.0\n", - " 228.0\n", - " 5500.0\n", - " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 34.0\n", + " 17.1\n", + " 185.0\n", + " 3400.0\n", + " FEMALE\n", " \n", " \n", " 15\n", - " 1285\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 47.5\n", - " 15.0\n", - " 218.0\n", - " 4950.0\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 37.0\n", + " 16.5\n", + " 185.0\n", + " 3400.0\n", " FEMALE\n", " \n", " \n", " 16\n", - " 1242\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 49.6\n", - " 15.0\n", - " 216.0\n", - " 4750.0\n", - " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 45.7\n", + " 17.3\n", + " 193.0\n", + " 3600.0\n", + " FEMALE\n", " \n", " \n", " 17\n", - " 1246\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 47.7\n", - " 15.0\n", - " 216.0\n", - " 4750.0\n", - " FEMALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 50.6\n", + " 19.4\n", + " 193.0\n", + " 3800.0\n", + " MALE\n", " \n", " \n", " 18\n", - " 1320\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 45.5\n", - " 15.0\n", - " 220.0\n", - " 5000.0\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 39.7\n", + " 17.9\n", + " 193.0\n", + " 4250.0\n", " MALE\n", " \n", " \n", " 19\n", - " 1244\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 46.4\n", - " 15.0\n", - " 216.0\n", - " 4700.0\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 37.8\n", + " 18.1\n", + " 193.0\n", + " 3750.0\n", + " MALE\n", + " \n", + " \n", + " 20\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 46.6\n", + " 17.8\n", + " 193.0\n", + " 3800.0\n", + " FEMALE\n", + " \n", + " \n", + " 21\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 51.3\n", + " 19.2\n", + " 193.0\n", + " 3650.0\n", + " MALE\n", + " \n", + " \n", + " 22\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 40.2\n", + " 17.1\n", + " 193.0\n", + " 3400.0\n", + " FEMALE\n", + " \n", + " \n", + " 23\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 36.8\n", + " 18.5\n", + " 193.0\n", + " 3500.0\n", " FEMALE\n", " \n", + " \n", + " 24\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 49.6\n", + " 18.2\n", + " 193.0\n", + " 3775.0\n", + " MALE\n", + " \n", " \n", "\n", - "[347 rows x 8 columns in total]" + "

25 rows × 7 columns

\n", + "[344 rows x 7 columns in total]" ], "text/plain": [ - " tag_number species island culmen_length_mm \\\n", - "0 1225 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "1 1278 Gentoo penguin (Pygoscelis papua) Biscoe 42.0 \n", - "2 1275 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "3 1233 Gentoo penguin (Pygoscelis papua) Biscoe 43.3 \n", - "4 1311 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", - "5 1316 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "6 1313 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "7 1381 Gentoo penguin (Pygoscelis papua) Biscoe 47.6 \n", - "8 1377 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "9 1380 Gentoo penguin (Pygoscelis papua) Biscoe 45.1 \n", - "10 1257 Gentoo penguin (Pygoscelis papua) Biscoe 46.2 \n", - "11 1336 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", - "12 1237 Gentoo penguin (Pygoscelis papua) Biscoe 43.2 \n", - "13 1302 Gentoo penguin (Pygoscelis papua) Biscoe 48.5 \n", - "14 1325 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "15 1285 Gentoo penguin (Pygoscelis papua) Biscoe 47.5 \n", - "16 1242 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", - "17 1246 Gentoo penguin (Pygoscelis papua) Biscoe 47.7 \n", - "18 1320 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", - "19 1244 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 \n", - "20 1390 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", - "21 1379 Gentoo penguin (Pygoscelis papua) Biscoe 47.8 \n", - "22 1267 Gentoo penguin (Pygoscelis papua) Biscoe 50.1 \n", - "23 1389 Gentoo penguin (Pygoscelis papua) Biscoe 47.2 \n", - "24 1269 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", + " species island culmen_length_mm \\\n", + "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", + "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", + "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", + "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", + "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", + "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", + "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", + "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", + "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", + "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", + "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", + "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", + "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", + "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", + "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", + "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", + "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", + "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", + "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", + "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", + "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", + "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", + "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", + "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 \n", - "1 13.5 210.0 4150.0 FEMALE \n", - "2 13.5 210.0 4550.0 FEMALE \n", - "3 14.0 208.0 4575.0 FEMALE \n", - "4 14.0 212.0 4875.0 FEMALE \n", - "5 14.5 212.0 4625.0 FEMALE \n", - "6 14.5 212.0 4750.0 FEMALE \n", - "7 14.5 215.0 5400.0 MALE \n", - "8 14.5 207.0 5050.0 FEMALE \n", - "9 14.5 
215.0 5000.0 FEMALE \n", - "10 14.5 209.0 4800.0 FEMALE \n", - "11 14.5 213.0 4400.0 FEMALE \n", - "12 14.5 208.0 4450.0 FEMALE \n", - "13 15.0 219.0 4850.0 FEMALE \n", - "14 15.0 228.0 5500.0 MALE \n", - "15 15.0 218.0 4950.0 FEMALE \n", - "16 15.0 216.0 4750.0 MALE \n", - "17 15.0 216.0 4750.0 FEMALE \n", - "18 15.0 220.0 5000.0 MALE \n", - "19 15.0 216.0 4700.0 FEMALE \n", - "20 15.0 223.0 5550.0 MALE \n", - "21 15.0 215.0 5650.0 MALE \n", - "22 15.0 225.0 5000.0 MALE \n", - "23 15.5 215.0 4975.0 FEMALE \n", - "24 16.0 225.0 5700.0 MALE \n", + "0 18.4 184.0 3475.0 FEMALE \n", + "1 19.1 184.0 4650.0 MALE \n", + "2 18.9 184.0 3900.0 MALE \n", + "3 17.9 192.0 3500.0 FEMALE \n", + "4 16.8 192.0 3000.0 FEMALE \n", + "5 18.5 192.0 4100.0 MALE \n", + "6 16.6 192.0 2700.0 FEMALE \n", + "7 18.4 200.0 3400.0 FEMALE \n", + "8 19.0 200.0 3800.0 MALE \n", + "9 20.1 200.0 3975.0 MALE \n", + "10 18.9 208.0 4300.0 MALE \n", + "11 18.7 185.0 3650.0 MALE \n", + "12 16.9 185.0 3000.0 FEMALE \n", + "13 17.3 185.0 3700.0 FEMALE \n", + "14 17.1 185.0 3400.0 FEMALE \n", + "15 16.5 185.0 3400.0 FEMALE \n", + "16 17.3 193.0 3600.0 FEMALE \n", + "17 19.4 193.0 3800.0 MALE \n", + "18 17.9 193.0 4250.0 MALE \n", + "19 18.1 193.0 3750.0 MALE \n", + "20 17.8 193.0 3800.0 FEMALE \n", + "21 19.2 193.0 3650.0 MALE \n", + "22 17.1 193.0 3400.0 FEMALE \n", + "23 18.5 193.0 3500.0 FEMALE \n", + "24 18.2 193.0 3775.0 MALE \n", "...\n", "\n", - "[347 rows x 8 columns]" + "[344 rows x 7 columns]" ] }, - "execution_count": 1, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -371,9 +457,51 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "782462924bc84b9281d6d66f7f4acbe0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 6f0e1877-369d-4e9f-81a8-9c00ab1b57b3 is DONE. 28.9 kB processed.
body_mass_g\n", " sex\n", " \n", - " \n", - " tag_number\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " 1172\n", + " 0\n", " Dream\n", - " 32.1\n", - " 15.5\n", - " 188.0\n", - " 3050.0\n", + " 36.6\n", + " 18.4\n", + " 184.0\n", + " 3475.0\n", " FEMALE\n", " \n", " \n", - " 1371\n", - " Biscoe\n", - " 37.7\n", - " 16.0\n", - " 183.0\n", - " 3075.0\n", - " FEMALE\n", + " 1\n", + " Dream\n", + " 39.8\n", + " 19.1\n", + " 184.0\n", + " 4650.0\n", + " MALE\n", " \n", " \n", - " 1417\n", - " Torgersen\n", - " 38.6\n", - " 17.0\n", - " 188.0\n", - " 2900.0\n", + " 2\n", + " Dream\n", + " 40.9\n", + " 18.9\n", + " 184.0\n", + " 3900.0\n", + " MALE\n", + " \n", + " \n", + " 4\n", + " Dream\n", + " 37.3\n", + " 16.8\n", + " 192.0\n", + " 3000.0\n", " FEMALE\n", " \n", " \n", - " 1204\n", + " 5\n", " Dream\n", - " 40.7\n", - " 17.0\n", - " 190.0\n", - " 3725.0\n", + " 43.2\n", + " 18.5\n", + " 192.0\n", + " 4100.0\n", " MALE\n", " \n", " \n", - " 1251\n", - " Biscoe\n", - " 37.6\n", - " 17.0\n", + " 9\n", + " Dream\n", + " 40.2\n", + " 20.1\n", + " 200.0\n", + " 3975.0\n", + " MALE\n", + " \n", + " \n", + " 10\n", + " Dream\n", + " 40.8\n", + " 18.9\n", + " 208.0\n", + " 4300.0\n", + " MALE\n", + " \n", + " \n", + " 11\n", + " Dream\n", + " 39.0\n", + " 18.7\n", " 185.0\n", - " 3600.0\n", + " 3650.0\n", + " MALE\n", + " \n", + " \n", + " 12\n", + " Dream\n", + " 37.0\n", + " 16.9\n", + " 185.0\n", + " 3000.0\n", " FEMALE\n", " \n", " \n", - " 1422\n", - " Torgersen\n", - " 35.7\n", - " 17.0\n", - " 189.0\n", - " 3350.0\n", + " 14\n", + " Dream\n", + " 34.0\n", + " 17.1\n", + " 185.0\n", + " 3400.0\n", " FEMALE\n", " \n", " \n", - " 1394\n", - " Torgersen\n", - " 40.2\n", - " 17.0\n", - " 176.0\n", - " 3450.0\n", + " 15\n", + " Dream\n", + " 37.0\n", + " 16.5\n", + " 185.0\n", + " 3400.0\n", " FEMALE\n", " \n", " \n", - " 1163\n", + " 18\n", + " Dream\n", + " 39.7\n", + " 17.9\n", + " 193.0\n", + " 4250.0\n", + " MALE\n", + " \n", + " \n", + " 19\n", + " Dream\n", + " 37.8\n", + " 18.1\n", + " 193.0\n", + " 3750.0\n", + " MALE\n", + " \n", + " \n", + " 22\n", " Dream\n", - " 36.4\n", - " 17.0\n", - " 195.0\n", - " 3325.0\n", + " 40.2\n", + " 17.1\n", + " 193.0\n", + " 3400.0\n", " FEMALE\n", " \n", " \n", - " 1329\n", - " Biscoe\n", - " 38.1\n", - " 17.0\n", - " 181.0\n", - " 3175.0\n", + " 23\n", + " Dream\n", + " 36.8\n", + " 18.5\n", + " 193.0\n", + " 3500.0\n", " FEMALE\n", " \n", " \n", - " 1406\n", - " Torgersen\n", - " 44.1\n", - " 18.0\n", - " 210.0\n", + " 26\n", + " Dream\n", + " 41.5\n", + " 18.5\n", + " 201.0\n", " 4000.0\n", " MALE\n", " \n", " \n", - " 1196\n", + " 31\n", " Dream\n", - " 36.5\n", - " 18.0\n", - " 182.0\n", - " 3150.0\n", + " 33.1\n", + " 16.1\n", + " 178.0\n", + " 2900.0\n", " FEMALE\n", " \n", " \n", - " 1228\n", - " Biscoe\n", - " 41.6\n", - " 18.0\n", - " 192.0\n", - " 3950.0\n", + " 32\n", + " Dream\n", + " 37.2\n", + " 18.1\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", - " 1412\n", - " Torgersen\n", - " 40.3\n", - " 18.0\n", - " 195.0\n", + " 33\n", + " Dream\n", + " 39.5\n", + " 16.7\n", + " 178.0\n", " 3250.0\n", " FEMALE\n", " \n", " \n", - " 1142\n", + " 35\n", " Dream\n", - " 35.7\n", - " 18.0\n", - " 202.0\n", - " 3550.0\n", + " 36.0\n", + " 18.5\n", + " 186.0\n", + " 3100.0\n", " FEMALE\n", " \n", " \n", - " 1430\n", - " Torgersen\n", - " 33.5\n", - " 19.0\n", - " 190.0\n", - " 3600.0\n", - " FEMALE\n", + " 36\n", + " Dream\n", + " 39.6\n", + " 18.1\n", + " 186.0\n", + " 4450.0\n", + " MALE\n", " \n", 
" \n", - " 1333\n", - " Biscoe\n", - " 43.2\n", - " 19.0\n", - " 197.0\n", - " 4775.0\n", + " 38\n", + " Dream\n", + " 41.3\n", + " 20.3\n", + " 194.0\n", + " 3550.0\n", " MALE\n", " \n", " \n", - " 1414\n", - " Torgersen\n", - " 38.7\n", - " 19.0\n", - " 195.0\n", - " 3450.0\n", + " 41\n", + " Dream\n", + " 35.7\n", + " 18.0\n", + " 202.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", - " 1197\n", + " 51\n", " Dream\n", - " 41.1\n", - " 19.0\n", - " 182.0\n", + " 38.1\n", + " 17.6\n", + " 187.0\n", " 3425.0\n", - " MALE\n", - " \n", - " \n", - " 1443\n", - " Torgersen\n", - " 40.6\n", - " 19.0\n", - " 199.0\n", - " 4000.0\n", - " MALE\n", + " FEMALE\n", " \n", " \n", - " 1295\n", - " Biscoe\n", - " 41.0\n", - " 20.0\n", - " 203.0\n", - " 4725.0\n", - " MALE\n", + " 53\n", + " Dream\n", + " 36.0\n", + " 17.1\n", + " 187.0\n", + " 3700.0\n", + " FEMALE\n", " \n", " \n", "\n", + "

25 rows × 6 columns

\n", "[146 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", - "tag_number \n", - "1172 Dream 32.1 15.5 188.0 \n", - "1371 Biscoe 37.7 16.0 183.0 \n", - "1417 Torgersen 38.6 17.0 188.0 \n", - "1204 Dream 40.7 17.0 190.0 \n", - "1251 Biscoe 37.6 17.0 185.0 \n", - "1422 Torgersen 35.7 17.0 189.0 \n", - "1394 Torgersen 40.2 17.0 176.0 \n", - "1163 Dream 36.4 17.0 195.0 \n", - "1329 Biscoe 38.1 17.0 181.0 \n", - "1406 Torgersen 44.1 18.0 210.0 \n", - "1196 Dream 36.5 18.0 182.0 \n", - "1228 Biscoe 41.6 18.0 192.0 \n", - "1412 Torgersen 40.3 18.0 195.0 \n", - "1142 Dream 35.7 18.0 202.0 \n", - "1430 Torgersen 33.5 19.0 190.0 \n", - "1333 Biscoe 43.2 19.0 197.0 \n", - "1414 Torgersen 38.7 19.0 195.0 \n", - "1197 Dream 41.1 19.0 182.0 \n", - "1443 Torgersen 40.6 19.0 199.0 \n", - "1295 Biscoe 41.0 20.0 203.0 \n", - "1207 Dream 38.8 20.0 190.0 \n", - "1349 Biscoe 38.2 20.0 190.0 \n", - "1350 Biscoe 37.8 20.0 190.0 \n", - "1351 Biscoe 38.1 16.5 198.0 \n", - "1116 Dream 37.0 16.5 185.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", + "0 Dream 36.6 18.4 184.0 3475.0 \n", + "1 Dream 39.8 19.1 184.0 4650.0 \n", + "2 Dream 40.9 18.9 184.0 3900.0 \n", + "4 Dream 37.3 16.8 192.0 3000.0 \n", + "5 Dream 43.2 18.5 192.0 4100.0 \n", + "9 Dream 40.2 20.1 200.0 3975.0 \n", + "10 Dream 40.8 18.9 208.0 4300.0 \n", + "11 Dream 39.0 18.7 185.0 3650.0 \n", + "12 Dream 37.0 16.9 185.0 3000.0 \n", + "14 Dream 34.0 17.1 185.0 3400.0 \n", + "15 Dream 37.0 16.5 185.0 3400.0 \n", + "18 Dream 39.7 17.9 193.0 4250.0 \n", + "19 Dream 37.8 18.1 193.0 3750.0 \n", + "22 Dream 40.2 17.1 193.0 3400.0 \n", + "23 Dream 36.8 18.5 193.0 3500.0 \n", + "26 Dream 41.5 18.5 201.0 4000.0 \n", + "31 Dream 33.1 16.1 178.0 2900.0 \n", + "32 Dream 37.2 18.1 178.0 3900.0 \n", + "33 Dream 39.5 16.7 178.0 3250.0 \n", + "35 Dream 36.0 18.5 186.0 3100.0 \n", + "36 Dream 39.6 18.1 186.0 4450.0 \n", + "38 Dream 41.3 20.3 194.0 3550.0 \n", + "41 Dream 35.7 18.0 202.0 3550.0 \n", + "51 Dream 38.1 17.6 187.0 3425.0 \n", + "53 Dream 36.0 17.1 187.0 3700.0 \n", "\n", - " body_mass_g sex \n", - "tag_number \n", - "1172 3050.0 FEMALE \n", - "1371 3075.0 FEMALE \n", - "1417 2900.0 FEMALE \n", - "1204 3725.0 MALE \n", - "1251 3600.0 FEMALE \n", - "1422 3350.0 FEMALE \n", - "1394 3450.0 FEMALE \n", - "1163 3325.0 FEMALE \n", - "1329 3175.0 FEMALE \n", - "1406 4000.0 MALE \n", - "1196 3150.0 FEMALE \n", - "1228 3950.0 MALE \n", - "1412 3250.0 FEMALE \n", - "1142 3550.0 FEMALE \n", - "1430 3600.0 FEMALE \n", - "1333 4775.0 MALE \n", - "1414 3450.0 FEMALE \n", - "1197 3425.0 MALE \n", - "1443 4000.0 MALE \n", - "1295 4725.0 MALE \n", - "1207 3950.0 MALE \n", - "1349 3900.0 MALE \n", - "1350 4250.0 MALE \n", - "1351 3825.0 FEMALE \n", - "1116 3400.0 FEMALE \n", + " sex \n", + "0 FEMALE \n", + "1 MALE \n", + "2 MALE \n", + "4 FEMALE \n", + "5 MALE \n", + "9 MALE \n", + "10 MALE \n", + "11 MALE \n", + "12 FEMALE \n", + "14 FEMALE \n", + "15 FEMALE \n", + "18 MALE \n", + "19 MALE \n", + "22 FEMALE \n", + "23 FEMALE \n", + "26 MALE \n", + "31 FEMALE \n", + "32 MALE \n", + "33 FEMALE \n", + "35 FEMALE \n", + "36 MALE \n", + "38 MALE \n", + "41 FEMALE \n", + "51 FEMALE \n", + "53 FEMALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# set a friendlier index to uniquely identify the rows\n", - "df = df.set_index(\"tag_number\")\n", - "\n", "# filter 
down to the data we want to analyze\n", "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", "\n", @@ -682,7 +842,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -704,7 +864,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -719,9 +879,79 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "367359e2069c4d198a820d1ced057b81", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 56778fb7-779c-4e44-b2a3-04d2e174c562 is DONE. 31.9 kB processed.
\n", " \n", "\n", + "

1 rows × 6 columns

\n", "[1 rows x 6 columns in total]" ], "text/plain": [ @@ -775,7 +1006,7 @@ "[1 rows x 6 columns]" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -787,9 +1018,79 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "025941ee47864dae956ef25bfe815da4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 1eabd729-6c75-4087-9c87-0d95327b615c is RUNNING.
\n", " predicted_body_mass_g\n", " \n", - " \n", - " tag_number\n", - " \n", - " \n", " \n", " \n", " \n", - " 1393\n", + " 292\n", " 3459.735118\n", " \n", - " \n", - " 1525\n", - " 3947.881639\n", - " \n", - " \n", - " 1524\n", - " 4304.175638\n", - " \n", - " \n", - " 1523\n", - " 3471.668379\n", - " \n", " \n", "\n", - "[4 rows x 1 columns in total]" + "

1 rows × 1 columns

\n", + "[1 rows x 1 columns in total]" ], "text/plain": [ - " predicted_body_mass_g\n", - "tag_number \n", - "1393 3459.735118\n", - "1525 3947.881639\n", - "1524 4304.175638\n", - "1523 3471.668379\n", + " predicted_body_mass_g\n", + "292 3459.735118\n", "\n", - "[4 rows x 1 columns]" + "[1 rows x 1 columns]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -870,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -879,7 +1161,7 @@ "LinearRegression()" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -906,7 +1188,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.12" }, "orig_nbformat": 4, "vscode": { diff --git a/notebooks/06 - Using ML - Large Language Models.ipynb b/notebooks/06 - Using ML - Large Language Models.ipynb index c2c0b83bef..45a46c44af 100644 --- a/notebooks/06 - Using ML - Large Language Models.ipynb +++ b/notebooks/06 - Using ML - Large Language Models.ipynb @@ -137,7 +137,7 @@ } ], "source": [ - "pred = model.predict(bf_df).compute()\n", + "pred = model.predict(bf_df).to_pandas()\n", "pred" ] }, diff --git a/notebooks/10 - Regionalized.ipynb b/notebooks/10 - Regionalized.ipynb new file mode 100644 index 0000000000..a7ff5db84e --- /dev/null +++ b/notebooks/10 - Regionalized.ipynb @@ -0,0 +1,2800 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# README\n", + "\n", + "This Notebook runs differently depending on the following environent variable:\n", + "1. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. `us`, `asia-east1`" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Infer location and set up data in that location if needed" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BigQuery project: bigframes-dev\n", + "BigQuery location: us\n", + "Penguins Table: bigquery-public-data.ml_datasets.penguins\n", + "ML Model Dataset: bigframes-dev.bigframes_testing\n" + ] + } + ], + "source": [ + "# Take multi-region US as the default BQ location, where most of the BQ data lies including the BQ public datasets\n", + "BQ_LOCATION = \"us\"\n", + "PROJECT = \"bigframes-dev\"\n", + "DATASET = \"bigframes_testing\"\n", + "PENGUINS_TABLE = \"bigquery-public-data.ml_datasets.penguins\"\n", + "\n", + "\n", + "# Check for a location set in the environment and do location-specific setup if needed\n", + "\n", + "import os\n", + "import google.api_core.exceptions\n", + "from google.cloud import bigquery\n", + "import bigframes\n", + " \n", + "env_bq_location = os.getenv(\"BIGQUERY_LOCATION\")\n", + "if env_bq_location and env_bq_location != BQ_LOCATION:\n", + " BQ_LOCATION = env_bq_location.lower()\n", + "\n", + "client = bigquery.Client()\n", + "\n", + "if BQ_LOCATION != \"us\":\n", + " bq_location_normalized = BQ_LOCATION.replace('-', '_')\n", + "\n", + " # Nominate a local penguins table\n", + " penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)\n", + " penguins_local_dataset_name = f\"{DATASET}_{bq_location_normalized}\"\n", + " penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT, dataset_id=penguins_local_dataset_name)\n", + " penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)\n", + " penguins_local_dataset.location = BQ_LOCATION\n", + " penguins_local_table_ref= bigquery.TableReference(penguins_local_dataset, penguins_table_ref.table_id)\n", + " penguins_local_table = str(penguins_local_table_ref)\n", + " try:\n", + " client.get_table(penguins_local_table_ref)\n", + " except google.api_core.exceptions.NotFound:\n", + " client.create_dataset(penguins_local_dataset, exists_ok=True)\n", + "\n", + " # Read the public table as an in-memory dataframe and then write to the local table\n", + " session_us = bigframes.connect()\n", + " df = session_us.read_gbq(PENGUINS_TABLE).to_pandas()\n", + " df.to_gbq(penguins_local_table)\n", + "\n", + " # Finally point the penguins table to the local table\n", + " PENGUINS_TABLE=penguins_local_table\n", + "\n", + " # Also update the dataset name used for test artifacts\n", + " DATASET = f\"{DATASET}_{bq_location_normalized}\"\n", + "\n", + "# Create the dataset to store the model if it doesn't exist \n", + "model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT, dataset_id=DATASET))\n", + "model_local_dataset.location = BQ_LOCATION\n", + "model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)\n", + "\n", + "# Finally log the variables driving the core notebook execution\n", + "log = ('\\n'.join(f\"{name}: {str(value)}\" for name, value in {\n", + " \"BigQuery project\" : PROJECT,\n", + " \"BigQuery location\" : BQ_LOCATION,\n", + " \"Penguins Table\" : PENGUINS_TABLE,\n", + " \"ML Model Dataset\" : model_dataset.reference\n", + "}.items())) \n", + "print(log)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using the BigQuery DataFrames API" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set BigQuery DataFrames options" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": 
[], + "source": [ + "import bigframes.pandas\n", + "\n", + "bigframes.pandas.options.bigquery.project = PROJECT\n", + "bigframes.pandas.options.bigquery.location = BQ_LOCATION" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize a dataframe for a BigQuery table" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "02af1103261a4e63a4c15efd26b1bc9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job e1a62d56-8cab-4bc1-9ad3-457f48b71d9c is RUNNING.
= 4000.0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using the Remote Functions" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.\n", + "\n", + "It requires the GCP project to be set up appropriately and the user having sufficient privileges to use them. One can find more details on it via `help` command." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function remote_function in module bigframes.pandas:\n", + "\n", + "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", + " Decorator to turn a user defined function into a BigQuery remote function.\n", + " \n", + " .. note::\n", + " Please make sure following is setup before using this API:\n", + " \n", + " 1. Have the below APIs enabled for your project:\n", + " a. BigQuery Connection API\n", + " b. Cloud Functions API\n", + " c. Cloud Run API\n", + " d. Cloud Build API\n", + " e. Artifact Registry API\n", + " f. Cloud Resource Manager API\n", + " \n", + " This can be done from the cloud console (change PROJECT_ID to yours):\n", + " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n", + " Or from the gcloud CLI:\n", + " $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com\n", + " \n", + " 2. Have following IAM roles enabled for you:\n", + " a. BigQuery Data Editor (roles/bigquery.dataEditor)\n", + " b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", + " c. Cloud Functions Developer (roles/cloudfunctions.developer)\n", + " d. Service Account User (roles/iam.serviceAccountUser)\n", + " e. Storage Object Viewer (roles/storage.objectViewer)\n", + " f. Project IAM Admin (roles/resourcemanager.projectIamAdmin)\n", + " (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", + " \n", + " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n", + " a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n", + " b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n", + " Alternatively, the IAM could also be setup via the gcloud CLI:\n", + " $ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"\n", + " \n", + " Args:\n", + " input_types (list(type)):\n", + " List of input data types in the user defined function.\n", + " output_type (type):\n", + " Data type of the output in the user defined function.\n", + " dataset (str, Optional):\n", + " Dataset to use to create a BigQuery function. It should be in\n", + " `.` or `` format. 
If this\n", + " param is not provided then session dataset id would be used.\n", + " bigquery_connection (str, Optional):\n", + " Name of the BigQuery connection. If it is pre created in the same\n", + " location as the `bigquery_client.location` then it would be used,\n", + " otherwise it would be created dynamically assuming the user has\n", + " necessary priviliges. If this param is not provided then the\n", + " bigquery connection from the session would be used.\n", + " reuse (bool, Optional):\n", + " Reuse the remote function if already exists.\n", + " `True` by default, which will result in reusing an existing remote\n", + " function (if any) that was previously created for the same udf.\n", + " Setting it to false would force creating a unique remote function.\n", + " If the required remote function does not exist then it would be\n", + " created irrespective of this param.\n", + "\n" + ] + } + ], + "source": [ + "import bigframes.pandas as pd\n", + "help(pd.remote_function)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a custom function, and specify the intent to turn it into a remote function.\n", + "\n", + "It requires a BigQuery connection. If the connection is not already created,\n", + "the BigQuery DataFrames package attempts to create one assuming the necessary\n", + "APIs and IAM permissions are setup in the project." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO][2023-08-05 23:12:12,870][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-f9320ad496b5aeca2d7f343cbab03e2f --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmps5m0qu4z --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", + "[INFO][2023-08-05 23:13:20,660][bigframes.remote_function] Successfully created cloud function bigframes-f9320ad496b5aeca2d7f343cbab03e2f with uri (https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app)\n", + "[INFO][2023-08-05 23:13:32,717][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", + "[INFO][2023-08-05 23:13:32,719][bigframes.remote_function] Creating BQ remote function: \n", + " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_f9320ad496b5aeca2d7f343cbab03e2f(num FLOAT64)\n", + " RETURNS STRING\n", + " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", + " OPTIONS (\n", + " endpoint = \"https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app\"\n", + " )\n", + "[INFO][2023-08-05 23:13:33,697][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_f9320ad496b5aeca2d7f343cbab03e2f\n" + ] + } + ], + "source": [ + "@pd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", + "def get_bucket(num):\n", + " if not num: return \"NA\"\n", + " boundary = 4000\n", + " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the custom function on the BigQuery-backed dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0dba87f5bcb74dca9efebe8f522beeff", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + 
"HTML(value='Query job 5f30816f-f4d0-4063-bb9e-2905b89f717d is DONE. 2.8 kB processed. \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
body_mass_gbody_mass_bucket
03475.0below_4000
14650.0at_or_above_4000
23900.0below_4000
33500.0below_4000
43000.0below_4000
54100.0at_or_above_4000
62700.0below_4000
73400.0below_4000
83800.0below_4000
93975.0below_4000
\n", + "

10 rows × 2 columns

\n", + "[10 rows x 2 columns in total]" + ], + "text/plain": [ + " body_mass_g body_mass_bucket\n", + "0 3475.0 below_4000\n", + "1 4650.0 at_or_above_4000\n", + "2 3900.0 below_4000\n", + "3 3500.0 below_4000\n", + "4 3000.0 below_4000\n", + "5 4100.0 at_or_above_4000\n", + "6 2700.0 below_4000\n", + "7 3400.0 below_4000\n", + "8 3800.0 below_4000\n", + "9 3975.0 below_4000\n", + "\n", + "[10 rows x 2 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))\n", + "df[['body_mass_g', 'body_mass_bucket']].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using the ML API" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize a DataFrame from a BigQuery table" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "89bf7ae3fd8641bcbdc5a4614a82f48a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job acd770bb-5ccb-463f-beec-2386132ded6b is RUNNING.
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
islandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Dream36.618.4184.03475.0FEMALE
1Dream39.819.1184.04650.0MALE
2Dream40.918.9184.03900.0MALE
4Dream37.316.8192.03000.0FEMALE
5Dream43.218.5192.04100.0MALE
9Dream40.220.1200.03975.0MALE
10Dream40.818.9208.04300.0MALE
11Dream39.018.7185.03650.0MALE
12Dream37.016.9185.03000.0FEMALE
14Dream34.017.1185.03400.0FEMALE
15Dream37.016.5185.03400.0FEMALE
18Dream39.717.9193.04250.0MALE
19Dream37.818.1193.03750.0MALE
22Dream40.217.1193.03400.0FEMALE
23Dream36.818.5193.03500.0FEMALE
26Dream41.518.5201.04000.0MALE
31Dream33.116.1178.02900.0FEMALE
32Dream37.218.1178.03900.0MALE
33Dream39.516.7178.03250.0FEMALE
35Dream36.018.5186.03100.0FEMALE
36Dream39.618.1186.04450.0MALE
38Dream41.320.3194.03550.0MALE
41Dream35.718.0202.03550.0FEMALE
51Dream38.117.6187.03425.0FEMALE
53Dream36.017.1187.03700.0FEMALE
\n", + "

25 rows × 6 columns

\n", + "[146 rows x 6 columns in total]" + ], + "text/plain": [ + " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", + "0 Dream 36.6 18.4 184.0 3475.0 \n", + "1 Dream 39.8 19.1 184.0 4650.0 \n", + "2 Dream 40.9 18.9 184.0 3900.0 \n", + "4 Dream 37.3 16.8 192.0 3000.0 \n", + "5 Dream 43.2 18.5 192.0 4100.0 \n", + "9 Dream 40.2 20.1 200.0 3975.0 \n", + "10 Dream 40.8 18.9 208.0 4300.0 \n", + "11 Dream 39.0 18.7 185.0 3650.0 \n", + "12 Dream 37.0 16.9 185.0 3000.0 \n", + "14 Dream 34.0 17.1 185.0 3400.0 \n", + "15 Dream 37.0 16.5 185.0 3400.0 \n", + "18 Dream 39.7 17.9 193.0 4250.0 \n", + "19 Dream 37.8 18.1 193.0 3750.0 \n", + "22 Dream 40.2 17.1 193.0 3400.0 \n", + "23 Dream 36.8 18.5 193.0 3500.0 \n", + "26 Dream 41.5 18.5 201.0 4000.0 \n", + "31 Dream 33.1 16.1 178.0 2900.0 \n", + "32 Dream 37.2 18.1 178.0 3900.0 \n", + "33 Dream 39.5 16.7 178.0 3250.0 \n", + "35 Dream 36.0 18.5 186.0 3100.0 \n", + "36 Dream 39.6 18.1 186.0 4450.0 \n", + "38 Dream 41.3 20.3 194.0 3550.0 \n", + "41 Dream 35.7 18.0 202.0 3550.0 \n", + "51 Dream 38.1 17.6 187.0 3425.0 \n", + "53 Dream 36.0 17.1 187.0 3700.0 \n", + "\n", + " sex \n", + "0 FEMALE \n", + "1 MALE \n", + "2 MALE \n", + "4 FEMALE \n", + "5 MALE \n", + "9 MALE \n", + "10 MALE \n", + "11 MALE \n", + "12 FEMALE \n", + "14 FEMALE \n", + "15 FEMALE \n", + "18 MALE \n", + "19 MALE \n", + "22 FEMALE \n", + "23 FEMALE \n", + "26 MALE \n", + "31 FEMALE \n", + "32 MALE \n", + "33 FEMALE \n", + "35 FEMALE \n", + "36 MALE \n", + "38 MALE \n", + "41 FEMALE \n", + "51 FEMALE \n", + "53 FEMALE \n", + "...\n", + "\n", + "[146 rows x 6 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filter down to the data we want to analyze\n", + "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", + "\n", + "# drop the columns we don't care about\n", + "adelie_data = adelie_data.drop(columns=[\"species\"])\n", + "\n", + "# drop rows with nulls to get our training data\n", + "training_data = adelie_data.dropna()\n", + "\n", + "# take a peek at the training data\n", + "training_data" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# pick feature columns and label column\n", + "feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", + "label_columns = training_data[['body_mass_g']]\n", + "\n", + "# also get the rows that we want to make predictions for (i.e. where the feature column is null)\n", + "missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and evaluate a linear regression model using the ML API" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "865f6bb75c5b48e4a52a3183fe3c2582", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 4b0c58e4-4752-4b96-b490-a95e3ae326c0 is DONE. 31.9 kB processed.
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0223.87876378553.6016340.005614181.3309110.6239510.623951
\n", + "

1 rows × 6 columns

\n", + "[1 rows x 6 columns in total]" + ], + "text/plain": [ + " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", + "0 223.878763 78553.601634 0.005614 \n", + "\n", + " median_absolute_error r2_score explained_variance \n", + "0 181.330911 0.623951 0.623951 \n", + "\n", + "[1 rows x 6 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bigframes.ml.linear_model import LinearRegression\n", + "\n", + "# as in scikit-learn, a newly created model is just a bundle of parameters\n", + "# default parameters are fine here\n", + "model = LinearRegression()\n", + "\n", + "# this will train a temporary model in BigQuery Machine Learning\n", + "model.fit(feature_columns, label_columns)\n", + "\n", + "# check how the model performed\n", + "model.score(feature_columns, label_columns)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make predictions using the model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "caf8cefe06a14da2a02f31aa1e12c23a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job fe2fe252-8433-4d20-861c-681a8dfbf2c4 is RUNNING.
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predicted_body_mass_g
2923459.735118
\n", + "

1 rows × 1 columns

\n", + "[1 rows x 1 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g\n", + "292 3459.735118\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(missing_body_mass)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the trained model to BigQuery, so we can load it later" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.to_gbq(f\"{DATASET}.penguins_model\", replace=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/noxfile.py b/noxfile.py index 5e54c12fb2..d4d97ca2c4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -16,6 +16,7 @@ from __future__ import absolute_import +from multiprocessing import Process import os import pathlib import re @@ -82,7 +83,6 @@ "docfx", "unit", "unit_noextras", - "unit_prerelease", "system", "doctest", "cover", @@ -603,24 +603,62 @@ def notebook(session): ] notebooks = [os.path.join("notebooks", nb) for nb in notebooks] + # Regionalized notebooks + notebooks_reg = { + "10 - Regionalized.ipynb": [ + "asia-southeast1", + "eu", + "europe-west4", + "southamerica-west1", + "us", + "us-central1", + ] + } + notebooks_reg = { + os.path.join("notebooks", nb): regions for nb, regions in notebooks_reg.items() + } + # For some reason nbmake exits silently with "no tests ran" message if # one of the notebook paths supplied does not exist. Let's make sure that # each path exists - for nb in notebooks: + for nb in notebooks + list(notebooks_reg): assert os.path.exists(nb), nb - # Use retries because sometimes parallel runs of the same notebook can try - # to create the same artifacts and may run into resoure conflict at the GCP - # level. - session.run( + # TODO(shobs): For some reason --retries arg masks exceptions occurred in + # notebook failures, and shows unhelpful INTERNALERROR. Investigate that + # and enable retries if we can find a way to surface the real exception + # bacause the notebook is running against real GCP and something may fail + # due to transient issues. + pytest_command = [ "py.test", - "-nauto", "--nbmake", "--nbmake-timeout=600", - "--retries=3", + ] + + # Run self-contained notebooks in single session.run + # achieve parallelization via -n + session.run( + *pytest_command, + "-nauto", *notebooks, ) + # Run regionalized notebooks in parallel session.run's, since each notebook + # takes a different region via env param. 
+ processes = [] + for notebook, regions in notebooks_reg.items(): + for region in regions: + process = Process( + target=session.run, + args=(*pytest_command, notebook), + kwargs={"env": {"BIGQUERY_LOCATION": region}}, + ) + process.start() + processes.append(process) + + for process in processes: + process.join() + @nox.session(python="3.10") def release_dry_run(session): diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index dc8ab7ba2d..a15ea16853 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -14,64 +14,46 @@ def run_quickstart(project_id: str): - # [START bigquery_bigframes_quickstart] - import bigframes.pandas as pd + import bigframes + + session_options = bigframes.BigQueryOptions() + session = bigframes.connect(session_options) + + your_gcp_project_id = project_id + query_or_table = "bigquery-public-data.ml_datasets.penguins" + df_session = session.read_gbq(query_or_table) + average_body_mass = df_session["body_mass_g"].mean() + print(f"average_body_mass (df_session): {average_body_mass}") - # TODO: (Optional) Setup your session with the configuration. Some of these - # settings cannot be changed once a session has started. - pd.options.bigquery.project = "your-gcp-project-id" - pd.options.bigquery.location = "us" + # [START bigquery_bigframes_quickstart] + import bigframes.pandas as bpd - # [START_EXCLUDE silent] - # Ignore the lines between "EXCLUDE" comments. They replace values so this - # sample runs in our test suite. - pd.options.bigquery.project = project_id - # [END_EXCLUDE] + # Set BigQuery DataFrames options + bpd.options.bigquery.project = your_gcp_project_id + bpd.options.bigquery.location = "us" + # Create a DataFrame from a BigQuery table query_or_table = "bigquery-public-data.ml_datasets.penguins" - df = pd.read_gbq(query_or_table) + df = bpd.read_gbq(query_or_table) # Use the DataFrame just as you would a pandas DataFrame, but calculations # happen in the BigQuery query engine instead of the local system. average_body_mass = df["body_mass_g"].mean() - print(f"average_body_mass: {average_body_mass}") - # IMPORTANT: The `bigframes.pandas` package creates a BigQuery session for - # queries and temporary tables. A BigQuery session has a limited lifetime - # (https://cloud.google.com/bigquery/docs/sessions-intro#limitations) and - # does not support concurrent queries. For long lived applications, create - # session objects as needed, instead. - - import bigframes - - session_options = bigframes.BigQueryOptions() - session_options.project = "your-gcp-project-id" - session_options.location = "us" - - # [START_EXCLUDE silent] - # Ignore the lines between "EXCLUDE" comments. They replace values so this - # sample runs in our test suite. 
- session_options.project = project_id - # [END_EXCLUDE] - - session = bigframes.connect(session_options) - df_session = session.read_gbq(query_or_table) - average_body_mass = df_session["body_mass_g"].mean() - print(f"average_body_mass (df_session): {average_body_mass}") - + # Create the Linear Regression model from bigframes.ml.linear_model import LinearRegression - # filter down to the data we want to analyze + # Filter down to the data we want to analyze adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"] - # drop the columns we don't care about + # Drop the columns we don't care about adelie_data = adelie_data.drop(columns=["species"]) - # drop rows with nulls to get our training data + # Drop rows with nulls to get our training data training_data = adelie_data.dropna() - # pick feature columns and label column + # Pick feature columns and label column X = training_data[ [ "island", diff --git a/scripts/upload_to_google_drive.py b/scripts/upload_to_google_drive.py new file mode 100644 index 0000000000..e579151359 --- /dev/null +++ b/scripts/upload_to_google_drive.py @@ -0,0 +1,71 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Upload latest wheel to Google Drive. + +Based on +https://github.com/googleapis/google-resumable-media-python/blob/main/google/resumable_media/requests/__init__.py + +Before running, execute the following to make sure you can use the Google Drive API: + +gcloud auth application-default login --scopes=openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/drive +""" + +import pathlib + +import google.auth +import google.auth.transport.requests +import google.resumable_media._upload +import google.resumable_media.requests as resumable_requests + +repo_root = pathlib.Path(__file__).parent.parent + +# Use PATCH instead of POST to replace existing files. 
+google.resumable_media._upload._POST = "PATCH" + +credentials, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/drive"]) +transport = google.auth.transport.requests.AuthorizedSession(credentials) + +wheel_id = "15fZ1DkrFDk4ibMNTzms4akpxmf2pzeAR" +wheel_path = next(iter((repo_root / "dist").glob("bigframes-*.whl"))) + +pdf_id = "1agYjxmPLrxelsaHI-lc41QHcgnQYemcX" +pdf_path = repo_root / "docs" / "_build" / "latex" / "bigframes-latest.pdf" + +uploads = ( + (wheel_id, wheel_path, "application/octet-stream"), + (pdf_id, pdf_path, "application/pdf"), +) + +upload_template = ( + "https://www.googleapis.com/upload/drive/v3/files/{file_id}?uploadType=resumable" +) +chunk_size = 1024 * 1024 # 1MB + +for file_id, file_path, content_type in uploads: + print(f"Uploading {file_path}") + transport = google.auth.transport.requests.AuthorizedSession(credentials) + upload = resumable_requests.ResumableUpload( + upload_template.format(file_id=file_id), chunk_size + ) + + with open(file_path, "rb") as stream: + response = upload.initiate( + transport, stream, metadata={}, content_type=content_type + ) + print(response) + while not upload.finished: + response = upload.transmit_next_chunk(transport) + print(response) diff --git a/setup.py b/setup.py index 311471f262..3e36cd3ecf 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ # 'Development Status :: 5 - Production/Stable' release_status = "Development Status :: 3 - Alpha" dependencies = [ - "cloudpickle >= 2.2.1", + "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", "gcsfs >=2023.3.0", "geopandas >=0.12.2", @@ -39,9 +39,10 @@ "google-cloud-functions >=1.10.1", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=6.0.0", + # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
+ "ibis-framework[bigquery] >=6.0.0,<=6.1.0", "pandas >=1.5.0", - "pydata-google-auth >=1.5.0", + "pydata-google-auth >=1.8.2", "scikit-learn >=1.2.2", "sqlalchemy >=1.4,<3.0", "ipywidgets >=7.7.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index aa72e5426b..fe3d49ef20 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -11,7 +11,7 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cloudpickle==2.2.1 +cloudpickle==2.0.0 colorlog==6.7.0 coverage==7.2.2 cryptography==40.0.1 @@ -81,7 +81,7 @@ pyarrow==11.0.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 pycparser==2.21 -pydata-google-auth==1.5.0 +pydata-google-auth==1.8.2 Pygments==2.14.0 PyJWT==2.6.0 pyperclip==1.8.2 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 0363869896..b6b02e4c27 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -22,6 +22,7 @@ import google.cloud.bigquery as bigquery import google.cloud.bigquery_connection_v1 as bigquery_connection_v1 import google.cloud.exceptions +import google.cloud.functions_v2 as functions_v2 import google.cloud.storage as storage # type: ignore import ibis.backends.base import pandas as pd @@ -93,6 +94,13 @@ def bigqueryconnection_client( return session.bqconnectionclient +@pytest.fixture(scope="session") +def cloudfunctions_client( + session: bigframes.Session, +) -> functions_v2.FunctionServiceClient: + return session.cloudfunctionsclient + + @pytest.fixture(scope="session") def session() -> bigframes.Session: return bigframes.Session() @@ -665,3 +673,19 @@ def penguins_randomforest_classifier_model_name( session.bqclient.query(sql).result() finally: return model_name + + +@pytest.fixture() +def deferred_repr(): + bigframes.options.display.repr_mode = "deferred" + yield + bigframes.options.display.repr_mode = "head" + + +@pytest.fixture() +def restore_sampling_settings(): + enable_downsampling = bigframes.options.sampling.enable_downsampling + max_download_size = bigframes.options.sampling.max_download_size + yield + bigframes.options.sampling.enable_downsampling = enable_downsampling + bigframes.options.sampling.max_download_size = max_download_size diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index bb64b19076..eae6896669 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas +import pandas as pd +import pytest -import bigframes.ml.cluster +from bigframes.ml import cluster from tests.system.utils import assert_pandas_df_equal_ignore_ordering -def test_cluster_configure_fit_predict(session, penguins_df_default_index, dataset_id): - model = bigframes.ml.cluster.KMeans(n_clusters=3) +@pytest.mark.flaky(retries=2, delay=120) +def test_cluster_configure_fit_score_predict( + session, penguins_df_default_index, dataset_id +): + model = cluster.KMeans(n_clusters=3) df = penguins_df_default_index.dropna()[ [ @@ -37,7 +41,7 @@ def test_cluster_configure_fit_predict(session, penguins_df_default_index, datas model.fit(df) - pd_new_penguins = pandas.DataFrame.from_dict( + pd_new_penguins = pd.DataFrame.from_dict( { "test1": { "species": "Adelie Penguin (Pygoscelis adeliae)", @@ -81,13 +85,24 @@ def test_cluster_configure_fit_predict(session, penguins_df_default_index, datas pd_new_penguins.index.name = "observation" new_penguins = session.read_pandas(pd_new_penguins) - result = model.predict(new_penguins).compute() - expected = pandas.DataFrame( + + # Check score to ensure the model was fitted + score_result = model.score(new_penguins).to_pandas() + score_expected = pd.DataFrame( + {"davies_bouldin_index": [1.502182], "mean_squared_distance": [1.953408]}, + dtype="Float64", + ) + score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + pd.testing.assert_frame_equal( + score_result, score_expected, check_exact=False, rtol=0.1 + ) + + result = model.predict(new_penguins).to_pandas() + expected = pd.DataFrame( {"CENTROID_ID": [2, 3, 1, 2]}, dtype="Int64", - index=pandas.Index( - ["test1", "test2", "test3", "test4"], dtype="string[pyarrow]" - ), + index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) expected.index.name = "observation" assert_pandas_df_equal_ignore_ordering(result, expected) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 52f347151e..0c2744819d 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -56,8 +56,12 @@ def test_columntransformer_standalone_fit_transform( [{"index": 1, "value": 1.0}], [{"index": 2, "value": 1.0}], ], - "scaled_culmen_length_mm": [-0.8099, -0.9931, -1.103], - "scaled_flipper_length_mm": [-0.3495, -1.416, -0.9185], + "scaled_culmen_length_mm": [ + -0.811119671289163, + -0.9945520581113803, + -1.104611490204711, + ], + "scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index cd829f0ae2..ab33e5d718 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -37,7 +37,7 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d ) # no data - report evaluation from the automatic data split - evaluate_result = model.evaluate().compute() + evaluate_result = model.evaluate().to_pandas() evaluate_expected = pandas.DataFrame( { "mean_absolute_error": [225.817334], @@ -57,13 +57,13 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d ) # evaluate on all training data - evaluate_result = model.evaluate(df).compute() + evaluate_result = model.evaluate(df).to_pandas() pandas.testing.assert_frame_equal( evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 ) # predict new labels - predictions = 
model.predict(new_penguins_df).compute() + predictions = model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, dtype="Float64", @@ -104,7 +104,7 @@ def test_bqml_manual_preprocessing_e2e( ) # no data - report evaluation from the automatic data split - evaluate_result = model.evaluate().compute() + evaluate_result = model.evaluate().to_pandas() evaluate_expected = pandas.DataFrame( { "mean_absolute_error": [309.477334], @@ -125,13 +125,13 @@ def test_bqml_manual_preprocessing_e2e( ) # evaluate on all training data - evaluate_result = model.evaluate(df).compute() + evaluate_result = model.evaluate(df).to_pandas() pandas.testing.assert_frame_equal( evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 ) # predict new labels - predictions = model.predict(new_penguins_df).compute() + predictions = model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]}, dtype="Float64", @@ -156,11 +156,11 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): options={"model_type": "transform_only"}, transforms=[ "ML.STANDARD_SCALER(culmen_length_mm) OVER() AS scaled_culmen_length_mm", - "ML.ONE_HOT_ENCODER(species) OVER() AS onehotencoded_species", + "ML.ONE_HOT_ENCODER(species, 'none', 1000000, 0) OVER() AS onehotencoded_species", ], ) - transformed = model.transform(new_penguins_df).compute() + transformed = model.transform(new_penguins_df).to_pandas() expected = pandas.DataFrame( { "scaled_culmen_length_mm": [-0.8099, -0.9931, -1.103], diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 57507c1134..460f07b816 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas +import pandas as pd -import bigframes.ml.decomposition +from bigframes.ml import decomposition -def test_decomposition_configure_fit_predict( +def test_decomposition_configure_fit_score_predict( session, penguins_df_default_index, dataset_id ): - model = bigframes.ml.decomposition.PCA(n_components=3) + model = decomposition.PCA(n_components=3) model.fit(penguins_df_default_index) - pd_new_penguins = session.read_pandas( - pandas.DataFrame( + new_penguins = session.read_pandas( + pd.DataFrame( { "tag_number": [1633, 1672, 1690], "species": [ @@ -42,17 +42,31 @@ def test_decomposition_configure_fit_predict( ).set_index("tag_number") ) - result = model.predict(pd_new_penguins).compute() - expected = pandas.DataFrame( + # Check score to ensure the model was fitted + score_result = model.score(new_penguins).to_pandas() + score_expected = pd.DataFrame( + { + "total_explained_variance_ratio": [0.812383], + }, + dtype="Float64", + ) + score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + pd.testing.assert_frame_equal( + score_result, score_expected, check_exact=False, rtol=0.1 + ) + + result = model.predict(new_penguins).to_pandas() + expected = pd.DataFrame( { "principal_component_1": [-1.459, 2.258, -1.685], "principal_component_2": [-1.120, -1.351, -0.874], "principal_component_3": [-0.646, 0.443, -0.704], }, dtype="Float64", - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pandas.testing.assert_frame_equal( + pd.testing.assert_frame_equal( result.sort_index(), expected, check_exact=False, diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 0d5da380d1..88c5ccd2f0 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -15,10 +15,12 @@ from unittest import TestCase import pandas +import pytest import bigframes.ml.ensemble +@pytest.mark.flaky(retries=2, delay=120) def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() @@ -37,7 +39,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [97.368139], @@ -62,6 +64,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): ) +@pytest.mark.flaky(retries=2, delay=120) def test_xgbregressor_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -97,7 +100,7 @@ def test_xgbregressor_dart_booster_multiple_params( model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "mean_absolute_error", @@ -136,6 +139,7 @@ def test_xgbregressor_dart_booster_multiple_params( assert reloaded_model.num_parallel_tree == 2 +@pytest.mark.flaky(retries=2, delay=120) def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBClassifier() @@ -153,7 +157,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, 
train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -175,6 +179,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) +@pytest.mark.flaky(retries=2, delay=120) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -209,7 +214,7 @@ def test_xgbclassifier_dart_booster_multiple_params( model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -248,6 +253,7 @@ def test_xgbclassifier_dart_booster_multiple_params( assert reloaded_model.num_parallel_tree == 2 +@pytest.mark.flaky(retries=2, delay=120) def test_randomforestregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor() @@ -266,7 +272,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "mean_absolute_error", @@ -288,6 +294,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset ) +@pytest.mark.flaky(retries=2, delay=120) def test_randomforestregressor_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor( tree_method="AUTO", @@ -318,7 +325,7 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "mean_absolute_error", @@ -354,6 +361,7 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase assert reloaded_model.enable_global_explain is False +@pytest.mark.flaky(retries=2, delay=120) def test_randomforestclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier() @@ -371,7 +379,7 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -393,6 +401,7 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase ) +@pytest.mark.flaky(retries=2, delay=120) def test_randomforestclassifier_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier( tree_method="AUTO", @@ -422,7 +431,7 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 
98726eb289..d1e2d12296 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -27,7 +27,7 @@ def test_arima_plus_model_fit_score( result = model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] - ).compute() + ).to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [154.742547], diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 33010a005c..332b460fe5 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -35,7 +35,7 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [225.735767], @@ -80,7 +80,7 @@ def test_linear_regression_manual_split_configure_fit_score( model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [225.735767], @@ -121,7 +121,7 @@ def test_logistic_regression_auto_class_weights_configure_fit_score( model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() expected = pd.DataFrame( { "precision": [0.58085], @@ -169,7 +169,7 @@ def test_logistic_regression_manual_split_configure_fit_score( model.fit(train_X, train_y) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).compute() + result = model.score(train_X, train_y).to_pandas() expected = pd.DataFrame( { "precision": [0.616753], diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 9700ba2bf6..bec1a51a99 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pandas as pd +import pytest from bigframes.ml import ( cluster, @@ -38,24 +39,24 @@ def test_pipeline_linear_regression_fit_score_predict( ) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - train_y = df[["body_mass_g"]] - pl.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) # Check score to ensure the model was fitted - score_result = pl.score(train_X, train_y).compute() + score_result = pl.score(X_train, y_train).to_pandas() score_expected = pd.DataFrame( { - "mean_absolute_error": [309.477334], - "mean_squared_error": [152184.227218], + "mean_absolute_error": [309.477331], + "mean_squared_error": [152184.227219], "mean_squared_log_error": [0.009524], - "median_absolute_error": [257.727777], + "median_absolute_error": [257.728263], "r2_score": [0.764356], "explained_variance": [0.764356], }, @@ -96,6 +97,61 @@ def test_pipeline_linear_regression_fit_score_predict( ) +def test_pipeline_linear_regression_series_fit_score_predict( + session, penguins_df_default_index +): + """Test a supervised model with a minimal preprocessing step""" + pl = pipeline.Pipeline( + [ + ("scale", preprocessing.StandardScaler()), + ("linreg", linear_model.LinearRegression()), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df["culmen_length_mm"] + y_train = df["body_mass_g"] + pl.fit(X_train, y_train) + + # Check score to ensure the model was fitted + score_result = pl.score(X_train, y_train).to_pandas() + score_expected = pd.DataFrame( + { + "mean_absolute_error": [528.495599], + "mean_squared_error": [421722.261808], + "mean_squared_log_error": [0.022963], + "median_absolute_error": [468.895249], + "r2_score": [0.346999], + "explained_variance": [0.346999], + }, + dtype="Float64", + ) + score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + pd.testing.assert_frame_equal( + score_result, score_expected, check_exact=False, rtol=0.1 + ) + + # predict new labels + new_penguins = session.read_pandas( + pd.DataFrame( + { + "tag_number": [1633, 1672, 1690], + "culmen_length_mm": [39.5, 38.5, 37.9], + } + ).set_index("tag_number") + ) + predictions = pl.predict(new_penguins["culmen_length_mm"]).to_pandas() + expected = pd.DataFrame( + {"predicted_body_mass_g": [3818.845703, 3732.022253, 3679.928123]}, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + pd.testing.assert_frame_equal( + predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + ) + + def test_pipeline_logistic_regression_fit_score_predict( session, penguins_df_default_index ): @@ -108,18 +164,18 @@ def test_pipeline_logistic_regression_fit_score_predict( ) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - train_y = df[["sex"]] - pl.fit(train_X, train_y) + y_train = df[["sex"]] + pl.fit(X_train, y_train) # Check score to ensure the model was fitted - score_result = pl.score(train_X, train_y).compute() + score_result = pl.score(X_train, y_train).to_pandas() score_expected = pd.DataFrame( { "precision": [0.537091], @@ -166,6 +222,7 @@ def test_pipeline_logistic_regression_fit_score_predict( ) +@pytest.mark.flaky(retries=2, delay=120) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" pl = pipeline.Pipeline( @@ -176,26 +233,26 @@ 
def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - train_y = df[["body_mass_g"]] - pl.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) # Check score to ensure the model was fitted - score_result = pl.score(train_X, train_y).compute() + score_result = pl.score(X_train, y_train).to_pandas() score_expected = pd.DataFrame( { - "mean_absolute_error": [203.4001727989334], - "mean_squared_error": [74898.80551717622], - "mean_squared_log_error": [0.004394266810531861], - "median_absolute_error": [152.01806640625], - "r2_score": [0.8840255831308607], - "explained_variance": [0.8858505311591299], + "mean_absolute_error": [202.298434], + "mean_squared_error": [74515.108971], + "mean_squared_log_error": [0.004365], + "median_absolute_error": [142.949219], + "r2_score": [0.88462], + "explained_variance": [0.886454], }, dtype="Float64", ) @@ -240,6 +297,7 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ) +@pytest.mark.flaky(retries=2, delay=120) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index ): @@ -252,26 +310,26 @@ def test_pipeline_random_forest_classifier_fit_score_predict( ) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - train_y = df[["sex"]] - pl.fit(train_X, train_y) + y_train = df[["sex"]] + pl.fit(X_train, y_train) # Check score to ensure the model was fitted - score_result = pl.score(train_X, train_y).compute() + score_result = pl.score(X_train, y_train).to_pandas() score_expected = pd.DataFrame( { - "precision": [0.587673], - "recall": [0.588781], - "accuracy": [0.88024], - "f1_score": [0.587644], - "log_loss": [0.859459], - "roc_auc": [0.971737], + "precision": [0.585505], + "recall": [0.58676], + "accuracy": [0.877246], + "f1_score": [0.585657], + "log_loss": [0.880643], + "roc_auc": [0.970697], }, dtype="Float64", ) @@ -310,7 +368,7 @@ def test_pipeline_random_forest_classifier_fit_score_predict( ) -def test_pipeline_PCA_fit_predict(session, penguins_df_default_index): +def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" pl = pipeline.Pipeline( [ @@ -320,14 +378,14 @@ def test_pipeline_PCA_fit_predict(session, penguins_df_default_index): ) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - pl.fit(train_X) + pl.fit(X_train) # predict new labels new_penguins = session.read_pandas( @@ -347,12 +405,27 @@ def test_pipeline_PCA_fit_predict(session, penguins_df_default_index): } ).set_index("tag_number") ) + + # Check score to ensure the model was fitted + score_result = pl.score(new_penguins).to_pandas() + score_expected = pd.DataFrame( + { + "total_explained_variance_ratio": [1.0], + }, + dtype="Float64", + ) + score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + pd.testing.assert_frame_equal( + score_result, score_expected, check_exact=False, rtol=0.1 + ) + predictions = pl.predict(new_penguins).to_pandas() expected = pd.DataFrame( { - "principal_component_1": [-1.115259, -1.506141, -1.471174], - "principal_component_2": [-0.074824, 0.69664, 0.406104], - "principal_component_3": [0.500012, -0.544479, 
0.075849], + "principal_component_1": [-1.115259, -1.506141, -1.471173], + "principal_component_2": [-0.074825, 0.69664, 0.406103], + "principal_component_3": [0.500013, -0.544479, 0.075849], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -367,7 +440,8 @@ def test_pipeline_PCA_fit_predict(session, penguins_df_default_index): ) -def test_pipeline_standard_scaler_kmeans_fit_predict( +@pytest.mark.flaky(retries=2, delay=120) +def test_pipeline_standard_scaler_kmeans_fit_score_predict( session, penguins_pandas_df_default_index ): """Test an unsupervised model with a non-BQML implementation of StandardScaler""" @@ -380,14 +454,14 @@ def test_pipeline_standard_scaler_kmeans_fit_predict( # kmeans is sensitive to the order with this configuration, so use ordered source data df = session.read_pandas(penguins_pandas_df_default_index).dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - pl.fit(train_X) + pl.fit(X_train) # predict new labels pd_new_penguins = pd.DataFrame.from_dict( @@ -452,6 +526,19 @@ def test_pipeline_standard_scaler_kmeans_fit_predict( pd_new_penguins.index.name = "observation" new_penguins = session.read_pandas(pd_new_penguins) + + # Check score to ensure the model was fitted + score_result = pl.score(new_penguins).to_pandas() + score_expected = pd.DataFrame( + {"davies_bouldin_index": [7.542981], "mean_squared_distance": [94.692409]}, + dtype="Float64", + ) + score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) + + pd.testing.assert_frame_equal( + score_result, score_expected, check_exact=False, rtol=0.1 + ) + result = pl.predict(new_penguins).to_pandas().sort_index() expected = pd.DataFrame( {"CENTROID_ID": [1, 2, 1, 2, 1, 2]}, @@ -466,7 +553,7 @@ def test_pipeline_standard_scaler_kmeans_fit_predict( def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): - """Test a preprocessing step that manages heterogenous data with ColumnTransformer""" + """Test a preprocessing step that manages heterogeneous data with ColumnTransformer""" pl = pipeline.Pipeline( [ ( @@ -491,9 +578,9 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind ) df = penguins_df_default_index.dropna() - train_X = df[["species", "culmen_length_mm", "flipper_length_mm"]] - train_y = df[["body_mass_g"]] - pl.fit(train_X, train_y) + X_train = df[["species", "culmen_length_mm", "flipper_length_mm"]] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) # predict new labels new_penguins = session.read_pandas( @@ -522,3 +609,129 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind pd.testing.assert_frame_equal( predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 ) + + +def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ( + "transform", + compose.ColumnTransformer( + [ + ( + "ont_hot_encoder", + preprocessing.OneHotEncoder( + drop="most_frequent", + min_frequency=5, + max_categories=100, + ), + "species", + ), + ( + "standard_scaler", + preprocessing.StandardScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), + ] + ), + ), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[["species", "culmen_length_mm", "flipper_length_mm"]] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + 
f"{dataset_id}.test_penguins_pipeline_col_transformer", replace=True + ) + + assert isinstance(pl_loaded._transform, compose.ColumnTransformer) + transformers = pl_loaded._transform.transformers_ + assert len(transformers) == 3 + + assert transformers[0][0] == "ont_hot_encoder" + assert isinstance(transformers[0][1], preprocessing.OneHotEncoder) + one_hot_encoder = transformers[0][1] + assert one_hot_encoder.drop == "most_frequent" + assert one_hot_encoder.min_frequency == 5 + assert one_hot_encoder.max_categories == 100 + assert transformers[0][2] == "species" + + assert transformers[1][0] == "standard_scaler" + assert isinstance(transformers[1][1], preprocessing.StandardScaler) + assert transformers[1][2] == "culmen_length_mm" + + assert transformers[2][0] == "standard_scaler" + assert isinstance(transformers[2][1], preprocessing.StandardScaler) + assert transformers[2][2] == "flipper_length_mm" + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False + + +def test_pipeline_standard_scaler_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ("transform", preprocessing.StandardScaler()), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_standard_scaler", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.StandardScaler) + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False + + +def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ( + "transform", + preprocessing.OneHotEncoder( + drop="most_frequent", min_frequency=5, max_categories=100 + ), + ), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "sex", + "species", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_one_hot_encoder", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.OneHotEncoder) + + one_hot_encoder = pl_loaded._transform + assert one_hot_encoder.drop == "most_frequent" + assert one_hot_encoder.min_frequency == 5 + assert one_hot_encoder.max_categories == 100 + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 714252c9e0..8033f79c47 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -295,7 +295,7 @@ def square(x): bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(square) bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() ) pd_int64_col = scalars_pandas_df["int64_col"] @@ -339,7 +339,7 @@ def add_one(x): bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(remote_add_one) bf_result = ( - 
bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() ) pd_int64_col = scalars_pandas_df["int64_col"] @@ -392,7 +392,7 @@ def sign(num): bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(remote_sign) bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() ) pd_int64_col = scalars_pandas_df["int64_col"] @@ -437,7 +437,7 @@ def circumference(radius): bf_float64_col_filtered = bf_float64_col[bf_float64_col_filter] bf_result_col = bf_float64_col_filtered.apply(remote_circumference) bf_result = ( - bf_float64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_float64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() ) pd_float64_col = scalars_pandas_df["float64_col"] @@ -486,7 +486,7 @@ def find_team(num): bf_float64_col_filtered = bf_float64_col[bf_float64_col_filter] bf_result_col = bf_float64_col_filtered.apply(remote_find_team) bf_result = ( - bf_float64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_float64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() ) pd_float64_col = scalars_pandas_df["float64_col"] @@ -577,7 +577,9 @@ def inner_test(): bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(remote_add_one) bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_int64_col_filtered.to_frame() + .assign(result=bf_result_col) + .to_pandas() ) pd_int64_col = scalars_pandas_df["int64_col"] @@ -675,7 +677,7 @@ def is_odd(num): bf_int64_col = scalars_df["int64_col"] bf_result_col = bf_int64_col.mask(is_odd_remote) - bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).compute() + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] pd_result_col = pd_int64_col.mask(is_odd) @@ -718,7 +720,7 @@ def is_odd(num): # for now filter out the nulls and test the rest bf_int64_col = scalars_df["int64_col"] bf_result_col = bf_int64_col[bf_int64_col.notnull()].mask(is_odd_remote, -1) - bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).compute() + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] pd_result_col = pd_int64_col[pd_int64_col.notnull()].mask(is_odd, -1) @@ -754,7 +756,7 @@ def test_remote_udf_lambda( bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(add_one_lambda_remote) bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() ) pd_int64_col = scalars_pandas_df["int64_col"] diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 5c3abf25dc..8325cec50a 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -21,11 +21,17 @@ import pandas as pd import pytest -from bigframes.ml import core, forecasting, imported, llm -import bigframes.ml.cluster -import bigframes.ml.core -import bigframes.ml.ensemble -import bigframes.ml.linear_model +import bigframes +from bigframes.ml import ( + cluster, + core, + decomposition, + ensemble, + forecasting, + imported, + linear_model, + llm, +) 
@pytest.fixture(scope="session") @@ -34,17 +40,15 @@ def ml_connection() -> str: @pytest.fixture(scope="session") -def penguins_bqml_linear_model( - session, penguins_linear_model_name -) -> bigframes.ml.core.BqmlModel: +def penguins_bqml_linear_model(session, penguins_linear_model_name) -> core.BqmlModel: model = session.bqclient.get_model(penguins_linear_model_name) - return bigframes.ml.core.BqmlModel(session, model) + return core.BqmlModel(session, model) @pytest.fixture(scope="function") def ephemera_penguins_bqml_linear_model( penguins_bqml_linear_model, -) -> bigframes.ml.linear_model.LinearRegression: +) -> linear_model.LinearRegression: model = penguins_bqml_linear_model return model.copy( f"{model._model.project}.{model._model.dataset_id}.{uuid.uuid4().hex}" @@ -54,18 +58,18 @@ def ephemera_penguins_bqml_linear_model( @pytest.fixture(scope="session") def penguins_linear_model( session, penguins_linear_model_name: str -) -> bigframes.ml.linear_model.LinearRegression: +) -> linear_model.LinearRegression: return cast( - bigframes.ml.linear_model.LinearRegression, + linear_model.LinearRegression, session.read_gbq_model(penguins_linear_model_name), ) @pytest.fixture(scope="function") def ephemera_penguins_linear_model( - ephemera_penguins_bqml_linear_model: bigframes.ml.core.BqmlModel, -) -> bigframes.ml.linear_model.LinearRegression: - bf_model = bigframes.ml.linear_model.LinearRegression() + ephemera_penguins_bqml_linear_model: core.BqmlModel, +) -> linear_model.LinearRegression: + bf_model = linear_model.LinearRegression() bf_model._bqml_model = ephemera_penguins_bqml_linear_model return bf_model @@ -73,9 +77,9 @@ def ephemera_penguins_linear_model( @pytest.fixture(scope="session") def penguins_logistic_model( session, penguins_logistic_model_name -) -> bigframes.ml.linear_model.LogisticRegression: +) -> linear_model.LogisticRegression: return cast( - bigframes.ml.linear_model.LogisticRegression, + linear_model.LogisticRegression, session.read_gbq_model(penguins_logistic_model_name), ) @@ -83,9 +87,9 @@ def penguins_logistic_model( @pytest.fixture(scope="session") def penguins_xgbregressor_model( session, penguins_xgbregressor_model_name -) -> bigframes.ml.ensemble.XGBRegressor: +) -> ensemble.XGBRegressor: return cast( - bigframes.ml.ensemble.XGBRegressor, + ensemble.XGBRegressor, session.read_gbq_model(penguins_xgbregressor_model_name), ) @@ -93,9 +97,9 @@ def penguins_xgbregressor_model( @pytest.fixture(scope="session") def penguins_xgbclassifier_model( session, penguins_xgbclassifier_model_name -) -> bigframes.ml.ensemble.XGBClassifier: +) -> ensemble.XGBClassifier: return cast( - bigframes.ml.ensemble.XGBClassifier, + ensemble.XGBClassifier, session.read_gbq_model(penguins_xgbclassifier_model_name), ) @@ -103,9 +107,9 @@ def penguins_xgbclassifier_model( @pytest.fixture(scope="session") def penguins_randomforest_regressor_model( session, penguins_randomforest_regressor_model_name -) -> bigframes.ml.ensemble.RandomForestRegressor: +) -> ensemble.RandomForestRegressor: return cast( - bigframes.ml.ensemble.RandomForestRegressor, + ensemble.RandomForestRegressor, session.read_gbq_model(penguins_randomforest_regressor_model_name), ) @@ -113,9 +117,9 @@ def penguins_randomforest_regressor_model( @pytest.fixture(scope="session") def penguins_randomforest_classifier_model( session, penguins_randomforest_classifier_model_name -) -> bigframes.ml.ensemble.RandomForestClassifier: +) -> ensemble.RandomForestClassifier: return cast( - bigframes.ml.ensemble.RandomForestClassifier, + 
ensemble.RandomForestClassifier, session.read_gbq_model(penguins_randomforest_classifier_model_name), ) @@ -123,7 +127,7 @@ def penguins_randomforest_classifier_model( @pytest.fixture(scope="session") def penguins_kmeans_model( session: bigframes.Session, dataset_id_permanent, penguins_table_id -) -> bigframes.ml.cluster.KMeans: +) -> cluster.KMeans: """Provides a pretrained model as a test fixture that is cached across test runs. This lets us run system tests without having to wait for a model.fit(...)""" sql = f""" @@ -154,7 +158,7 @@ def penguins_kmeans_model( @pytest.fixture(scope="session") def penguins_pca_model( session: bigframes.Session, dataset_id_permanent, penguins_table_id -) -> bigframes.ml.decomposition.PCA: +) -> decomposition.PCA: # TODO(yunmengxie): Create a shared method to get different types of pretrained models. sql = f""" @@ -248,8 +252,10 @@ def ephemera_palm2_text_generator_model( @pytest.fixture(scope="session") def palm2_embedding_generator_model( session, ml_connection -) -> llm.PaLM2EmbeddingGenerator: - return llm.PaLM2EmbeddingGenerator(session=session, connection_name=ml_connection) +) -> llm.PaLM2TextEmbeddingGenerator: + return llm.PaLM2TextEmbeddingGenerator( + session=session, connection_name=ml_connection + ) @pytest.fixture(scope="session") @@ -257,7 +263,7 @@ def time_series_bqml_arima_plus_model( session, time_series_arima_plus_model_name ) -> core.BqmlModel: model = session.bqclient.get_model(time_series_arima_plus_model_name) - return bigframes.ml.core.BqmlModel(session, model) + return core.BqmlModel(session, model) @pytest.fixture(scope="session") @@ -287,8 +293,8 @@ def ephemera_imported_tensorflow_model(session) -> imported.TensorFlowModel: @pytest.fixture(scope="session") -def imported_onnx_model(session) -> imported.OnnxModel: - return imported.OnnxModel( +def imported_onnx_model(session) -> imported.ONNXModel: + return imported.ONNXModel( session=session, model_path="gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx", ) diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index 4aefc5fa69..a003cd1ec1 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -12,68 +12,81 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas +import pandas as pd -import bigframes.ml.cluster +from bigframes.ml import cluster from tests.system.utils import assert_pandas_df_equal_ignore_ordering - -def test_model_predict(session, penguins_kmeans_model: bigframes.ml.cluster.KMeans): - pd_new_penguins = pandas.DataFrame.from_dict( - { - "test1": { - "species": "Adelie Penguin (Pygoscelis adeliae)", - "island": "Dream", - "culmen_length_mm": 37.5, - "culmen_depth_mm": 18.5, - "flipper_length_mm": 199, - "body_mass_g": 4475, - "sex": "MALE", - }, - "test2": { - "species": "Chinstrap penguin (Pygoscelis antarctica)", - "island": "Dream", - "culmen_length_mm": 55.8, - "culmen_depth_mm": 19.8, - "flipper_length_mm": 207, - "body_mass_g": 4000, - "sex": "MALE", - }, - "test3": { - "species": "Adelie Penguin (Pygoscelis adeliae)", - "island": "Biscoe", - "culmen_length_mm": 39.7, - "culmen_depth_mm": 18.9, - "flipper_length_mm": 184, - "body_mass_g": 3550, - "sex": "MALE", - }, - "test4": { - "species": "Gentoo penguin (Pygoscelis papua)", - "island": "Biscoe", - "culmen_length_mm": 43.8, - "culmen_depth_mm": 13.9, - "flipper_length_mm": 208, - "body_mass_g": 4300, - "sex": "FEMALE", - }, +_PD_NEW_PENGUINS = pd.DataFrame.from_dict( + { + "test1": { + "species": "Adelie Penguin (Pygoscelis adeliae)", + "island": "Dream", + "culmen_length_mm": 37.5, + "culmen_depth_mm": 18.5, + "flipper_length_mm": 199, + "body_mass_g": 4475, + "sex": "MALE", }, - orient="index", - ) - pd_new_penguins.index.name = "observation" + "test2": { + "species": "Chinstrap penguin (Pygoscelis antarctica)", + "island": "Dream", + "culmen_length_mm": 55.8, + "culmen_depth_mm": 19.8, + "flipper_length_mm": 207, + "body_mass_g": 4000, + "sex": "MALE", + }, + "test3": { + "species": "Adelie Penguin (Pygoscelis adeliae)", + "island": "Biscoe", + "culmen_length_mm": 39.7, + "culmen_depth_mm": 18.9, + "flipper_length_mm": 184, + "body_mass_g": 3550, + "sex": "MALE", + }, + "test4": { + "species": "Gentoo penguin (Pygoscelis papua)", + "island": "Biscoe", + "culmen_length_mm": 43.8, + "culmen_depth_mm": 13.9, + "flipper_length_mm": 208, + "body_mass_g": 4300, + "sex": "FEMALE", + }, + }, + orient="index", +) - new_penguins = session.read_pandas(pd_new_penguins) - result = penguins_kmeans_model.predict(new_penguins).compute() - expected = pandas.DataFrame( + +def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): + new_penguins = session.read_pandas(_PD_NEW_PENGUINS) + result = penguins_kmeans_model.predict(new_penguins).to_pandas() + expected = pd.DataFrame( {"CENTROID_ID": [2, 3, 1, 2]}, dtype="Int64", - index=pandas.Index( - ["test1", "test2", "test3", "test4"], dtype="string[pyarrow]" - ), + index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) - expected.index.name = "observation" assert_pandas_df_equal_ignore_ordering(result, expected) +def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): + new_penguins = session.read_pandas(_PD_NEW_PENGUINS) + result = penguins_kmeans_model.score(new_penguins).to_pandas() + expected = pd.DataFrame( + {"davies_bouldin_index": [1.523606], "mean_squared_distance": [1.965944]}, + dtype="Float64", + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + ) + + def test_loaded_config(penguins_kmeans_model): assert penguins_kmeans_model.n_clusters == 3 diff --git a/tests/system/small/ml/test_core.py 
b/tests/system/small/ml/test_core.py index d63e401dd7..8b864d9b55 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -27,7 +27,7 @@ def test_model_eval( penguins_bqml_linear_model, ): - result = penguins_bqml_linear_model.evaluate().compute() + result = penguins_bqml_linear_model.evaluate().to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [227.01223], @@ -52,7 +52,7 @@ def test_model_eval( def test_model_eval_with_data(penguins_bqml_linear_model, penguins_df_default_index): result = penguins_bqml_linear_model.evaluate( penguins_df_default_index.dropna() - ).compute() + ).to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [225.817334], @@ -77,7 +77,7 @@ def test_model_eval_with_data(penguins_bqml_linear_model, penguins_df_default_in def test_model_predict( penguins_bqml_linear_model: bigframes.ml.core.BqmlModel, new_penguins_df ): - predictions = penguins_bqml_linear_model.predict(new_penguins_df).compute() + predictions = penguins_bqml_linear_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, dtype="Float64", @@ -105,7 +105,7 @@ def test_model_predict_with_unnamed_index( new_penguins_df[new_penguins_df.tag_number != 1672], ) - predictions = penguins_bqml_linear_model.predict(new_penguins_df).compute() + predictions = penguins_bqml_linear_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( {"predicted_body_mass_g": [4030.1, 3177.9]}, @@ -132,7 +132,7 @@ def test_model_generate_text( } df = bqml_palm2_text_generator_model.generate_text( llm_text_df, options=options - ).compute() + ).to_pandas() TestCase().assertSequenceEqual(df.shape, (3, 4)) TestCase().assertSequenceEqual( @@ -150,7 +150,7 @@ def test_model_generate_text( def test_model_forecast(time_series_bqml_arima_plus_model: bigframes.ml.core.BqmlModel): utc = pytz.utc - forecast = time_series_bqml_arima_plus_model.forecast().compute()[ + forecast = time_series_bqml_arima_plus_model.forecast().to_pandas()[ ["forecast_timestamp", "forecast_value"] ] expected = pd.DataFrame( diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 7e0421129e..01d5207750 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -12,44 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas - -import bigframes.ml.decomposition - - -def test_model_predict(session, penguins_pca_model: bigframes.ml.decomposition.PCA): - new_penguins = session.read_pandas( - pandas.DataFrame( - { - "tag_number": [1633, 1672, 1690], - "species": [ - "Adelie Penguin (Pygoscelis adeliae)", - "Gentoo penguin (Pygoscelis papua)", - "Adelie Penguin (Pygoscelis adeliae)", - ], - "island": ["Dream", "Biscoe", "Torgersen"], - "culmen_length_mm": [37.8, 46.5, 41.1], - "culmen_depth_mm": [18.1, 14.8, 18.6], - "flipper_length_mm": [193.0, 217.0, 189.0], - "body_mass_g": [3750.0, 5200.0, 3325.0], - "sex": ["MALE", "FEMALE", "MALE"], - } - ).set_index("tag_number") - ) +import pandas as pd + +from bigframes.ml import decomposition + +_PD_NEW_PENGUINS = pd.DataFrame( + { + "tag_number": [1633, 1672, 1690], + "species": [ + "Adelie Penguin (Pygoscelis adeliae)", + "Gentoo penguin (Pygoscelis papua)", + "Adelie Penguin (Pygoscelis adeliae)", + ], + "island": ["Dream", "Biscoe", "Torgersen"], + "culmen_length_mm": [37.8, 46.5, 41.1], + "culmen_depth_mm": [18.1, 14.8, 18.6], + "flipper_length_mm": [193.0, 217.0, 189.0], + "body_mass_g": [3750.0, 5200.0, 3325.0], + "sex": ["MALE", "FEMALE", "MALE"], + } +).set_index("tag_number") + - predictions = penguins_pca_model.predict(new_penguins).compute() - expected = pandas.DataFrame( +def test_pca_predict(session, penguins_pca_model: decomposition.PCA): + new_penguins = session.read_pandas(_PD_NEW_PENGUINS) + + predictions = penguins_pca_model.predict(new_penguins).to_pandas() + expected = pd.DataFrame( { "principal_component_1": [-1.459, 2.258, -1.685], "principal_component_2": [-1.120, -1.351, -0.874], "principal_component_3": [-0.646, 0.443, -0.704], }, dtype="Float64", - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pandas.testing.assert_frame_equal( + pd.testing.assert_frame_equal( predictions.sort_index(), expected, check_exact=False, rtol=0.1, ) + + +def test_pca_score(session, penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.score().to_pandas() + expected = pd.DataFrame( + {"total_explained_variance_ratio": [0.812383]}, + dtype="Float64", + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + ) diff --git a/tests/system/small/ml/test_ensemble.py b/tests/system/small/ml/test_ensemble.py index 61c60c249f..fde3cc431e 100644 --- a/tests/system/small/ml/test_ensemble.py +++ b/tests/system/small/ml/test_ensemble.py @@ -36,7 +36,44 @@ def test_xgbregressor_model_score( ] ] test_y = df[["sex"]] - result = penguins_xgbregressor_model.score(test_X, test_y).compute() + result = penguins_xgbregressor_model.score(test_X, test_y).to_pandas() + expected = pandas.DataFrame( + { + "mean_absolute_error": [108.77582], + "mean_squared_error": [20943.272738], + "mean_squared_log_error": [0.00135], + "median_absolute_error": [86.313477], + "r2_score": [0.967571], + "explained_variance": [0.967609], + }, + dtype="Float64", + ) + pandas.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + ) + + +def test_xgbregressor_model_score_series( + penguins_xgbregressor_model, penguins_df_default_index +): + df = penguins_df_default_index.dropna() + test_X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + 
"flipper_length_mm", + "body_mass_g", + ] + ] + test_y = df["sex"] + result = penguins_xgbregressor_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [108.77582], @@ -61,7 +98,7 @@ def test_xgbregressor_model_score( def test_xgbregressor_model_predict( penguins_xgbregressor_model: bigframes.ml.ensemble.XGBRegressor, new_penguins_df ): - result = penguins_xgbregressor_model.predict(new_penguins_df).compute() + result = penguins_xgbregressor_model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_body_mass_g": ["4293.1538089", "3410.0271", "3357.944"]}, dtype="Float64", @@ -94,7 +131,7 @@ def test_to_gbq_saved_xgbregressor_model_scores( ] ] test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).compute() + result = saved_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [109.016973], @@ -139,7 +176,35 @@ def test_xgbclassifier_model_score( ] ] test_y = df[["sex"]] - result = penguins_xgbclassifier_model.score(test_X, test_y).compute() + result = penguins_xgbclassifier_model.score(test_X, test_y).to_pandas() + TestCase().assertSequenceEqual(result.shape, (1, 6)) + for col_name in [ + "precision", + "recall", + "accuracy", + "f1_score", + "log_loss", + "roc_auc", + ]: + assert col_name in result.columns + + +def test_xgbclassifier_model_score_series( + penguins_xgbclassifier_model, penguins_df_default_index +): + df = penguins_df_default_index.dropna() + test_X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + test_y = df["sex"] + result = penguins_xgbclassifier_model.score(test_X, test_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -155,7 +220,7 @@ def test_xgbclassifier_model_score( def test_xgbclassifier_model_predict( penguins_xgbclassifier_model: bigframes.ml.ensemble.XGBClassifier, new_penguins_df ): - result = penguins_xgbclassifier_model.predict(new_penguins_df).compute() + result = penguins_xgbclassifier_model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", @@ -188,7 +253,7 @@ def test_to_gbq_saved_xgbclassifier_model_scores( ] ] test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).compute() + result = saved_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "precision": [1.0], @@ -235,7 +300,44 @@ def test_randomforestregressor_model_score( ] ] test_y = df[["sex"]] - result = penguins_randomforest_regressor_model.score(test_X, test_y).compute() + result = penguins_randomforest_regressor_model.score(test_X, test_y).to_pandas() + expected = pandas.DataFrame( + { + "mean_absolute_error": [317.031042], + "mean_squared_error": [159713.053504], + "mean_squared_log_error": [0.008449], + "median_absolute_error": [258.385742], + "r2_score": [0.752698], + "explained_variance": [0.756173], + }, + dtype="Float64", + ) + pandas.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigFramese + check_index_type=False, + ) + + +def test_randomforestregressor_model_score_series( + penguins_randomforest_regressor_model, penguins_df_default_index +): + df = penguins_df_default_index.dropna() + test_X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + 
test_y = df["sex"] + result = penguins_randomforest_regressor_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [317.031042], @@ -261,7 +363,7 @@ def test_randomforestregressor_model_predict( penguins_randomforest_regressor_model: bigframes.ml.ensemble.RandomForestRegressor, new_penguins_df, ): - result = penguins_randomforest_regressor_model.predict(new_penguins_df).compute() + result = penguins_randomforest_regressor_model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_body_mass_g": ["3897.341797", "3458.385742", "3458.385742"]}, dtype="Float64", @@ -294,7 +396,7 @@ def test_to_gbq_saved_randomforestregressor_model_scores( ] ] test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).compute() + result = saved_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [319.239235], @@ -343,7 +445,35 @@ def test_randomforestclassifier_model_score( ] ] test_y = df[["sex"]] - result = penguins_randomforest_classifier_model.score(test_X, test_y).compute() + result = penguins_randomforest_classifier_model.score(test_X, test_y).to_pandas() + TestCase().assertSequenceEqual(result.shape, (1, 6)) + for col_name in [ + "precision", + "recall", + "accuracy", + "f1_score", + "log_loss", + "roc_auc", + ]: + assert col_name in result.columns + + +def test_randomforestclassifier_model_score_series( + penguins_randomforest_classifier_model, penguins_df_default_index +): + df = penguins_df_default_index.dropna() + test_X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + test_y = df["sex"] + result = penguins_randomforest_classifier_model.score(test_X, test_y).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -360,7 +490,7 @@ def test_randomforestclassifier_model_predict( penguins_randomforest_classifier_model: bigframes.ml.ensemble.RandomForestClassifier, new_penguins_df, ): - result = penguins_randomforest_classifier_model.predict(new_penguins_df).compute() + result = penguins_randomforest_classifier_model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", @@ -393,7 +523,7 @@ def test_to_gbq_saved_randomforestclassifier_model_scores( ] ] test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).compute() + result = saved_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "precision": [0.636746], diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 153299f598..cb27dd388c 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -21,7 +21,7 @@ def test_model_predict(time_series_arima_plus_model): utc = pytz.utc - predictions = time_series_arima_plus_model.predict().compute() + predictions = time_series_arima_plus_model.predict().to_pandas() expected = pd.DataFrame( { "forecast_timestamp": [ @@ -42,3 +42,47 @@ def test_model_predict(time_series_arima_plus_model): rtol=0.1, check_index_type=False, ) + + +def test_model_score(time_series_arima_plus_model, new_time_series_df): + result = time_series_arima_plus_model.score( + new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] + ).to_pandas() + expected = pd.DataFrame( + { + "mean_absolute_error": [154.742547], + "mean_squared_error": [26844.868855], + "root_mean_squared_error": 
[163.844038], + "mean_absolute_percentage_error": [6.189702], + "symmetric_mean_absolute_percentage_error": [6.097155], + }, + dtype="Float64", + ) + pd.testing.assert_frame_equal( + result, + expected, + rtol=0.1, + check_index_type=False, + ) + + +def test_model_score_series(time_series_arima_plus_model, new_time_series_df): + result = time_series_arima_plus_model.score( + new_time_series_df["parsed_date"], new_time_series_df["total_visits"] + ).to_pandas() + expected = pd.DataFrame( + { + "mean_absolute_error": [154.742547], + "mean_squared_error": [26844.868855], + "root_mean_squared_error": [163.844038], + "mean_absolute_percentage_error": [6.189702], + "symmetric_mean_absolute_percentage_error": [6.097155], + }, + dtype="Float64", + ) + pd.testing.assert_frame_equal( + result, + expected, + rtol=0.1, + check_index_type=False, + ) diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index b2a9abc46e..6274ab1245 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import google.api_core.exceptions import numpy as np import pandas as pd +import pytest + +from bigframes.ml import imported def test_tensorflow_create_model(imported_tensorflow_model): @@ -23,7 +27,7 @@ def test_tensorflow_create_model(imported_tensorflow_model): def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): df = llm_text_df.rename(columns={"prompt": "input"}) - result = imported_tensorflow_model.predict(df).compute() + result = imported_tensorflow_model.predict(df).to_pandas() # The values are non-human-readable. As they are a dense layer of Neural Network. # And since it is pretrained and imported, the model is a opaque-box. # We may want to switch to better test model and cases. 
@@ -44,13 +48,21 @@ def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): ) +def test_tensorflow_model_to_gbq( + imported_tensorflow_model: imported.TensorFlowModel, dataset_id: str +): + imported_tensorflow_model.to_gbq(f"{dataset_id}.test_tf_model", replace=True) + with pytest.raises(google.api_core.exceptions.Conflict): + imported_tensorflow_model.to_gbq(f"{dataset_id}.test_tf_model") + + def test_onnx_create_model(imported_onnx_model): # Model creation doesn't return error assert imported_onnx_model is not None def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): - result = imported_onnx_model.predict(onnx_iris_df).compute() + result = imported_onnx_model.predict(onnx_iris_df).to_pandas() value1 = np.array([0.9999993443489075, 0.0, 0.0]) value2 = np.array([0.0, 0.0, 0.9999993443489075]) expected = pd.DataFrame( @@ -66,3 +78,9 @@ def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): check_exact=False, atol=0.1, ) + + +def test_onnx_model_to_gbq(imported_onnx_model: imported.ONNXModel, dataset_id: str): + imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model", replace=True) + with pytest.raises(google.api_core.exceptions.Conflict): + imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model") diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index d5d6957eec..bbb7e2820c 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -30,7 +30,44 @@ def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index ] ] test_y = df[["body_mass_g"]] - result = penguins_linear_model.score(test_X, test_y).compute() + result = penguins_linear_model.score(test_X, test_y).to_pandas() + expected = pandas.DataFrame( + { + "mean_absolute_error": [225.817334], + "mean_squared_error": [80540.705944], + "mean_squared_log_error": [0.004972], + "median_absolute_error": [173.080816], + "r2_score": [0.87529], + "explained_variance": [0.87529], + }, + dtype="Float64", + ) + pandas.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + ) + + +def test_linear_reg_model_score_series( + penguins_linear_model, penguins_df_default_index +): + df = penguins_df_default_index.dropna() + test_X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + test_y = df["body_mass_g"] + result = penguins_linear_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [225.817334], @@ -53,7 +90,7 @@ def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): - predictions = penguins_linear_model.predict(new_penguins_df).compute() + predictions = penguins_linear_model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, dtype="Float64", @@ -85,7 +122,7 @@ def test_to_gbq_saved_linear_reg_model_scores( ] ] test_y = df[["body_mass_g"]] - result = saved_model.score(test_X, test_y).compute() + result = saved_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [227.01223], @@ -126,7 +163,44 @@ def test_logistic_model_score(penguins_logistic_model, penguins_df_default_index ] ] test_y = df[["sex"]] - result = 
penguins_logistic_model.score(test_X, test_y).compute() + result = penguins_logistic_model.score(test_X, test_y).to_pandas() + expected = pandas.DataFrame( + { + "precision": [0.616753], + "recall": [0.618615], + "accuracy": [0.92515], + "f1_score": [0.617681], + "log_loss": [1.498832], + "roc_auc": [0.975807], + }, + dtype="Float64", + ) + pandas.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + ) + + +def test_logistic_model_score_series( + penguins_logistic_model, penguins_df_default_index +): + df = penguins_df_default_index.dropna() + test_X = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + test_y = df["sex"] + result = penguins_logistic_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "precision": [0.616753], @@ -149,7 +223,7 @@ def test_logistic_model_score(penguins_logistic_model, penguins_df_default_index def test_logsitic_model_predict(penguins_logistic_model, new_penguins_df): - predictions = penguins_logistic_model.predict(new_penguins_df).compute() + predictions = penguins_logistic_model.predict(new_penguins_df).to_pandas() expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", @@ -163,7 +237,7 @@ def test_logsitic_model_predict(penguins_logistic_model, new_penguins_df): ) -def test_to_gbq_saved_logsitic_model_score( +def test_logsitic_model_to_gbq_saved_score( penguins_logistic_model, dataset_id, penguins_df_default_index ): saved_model = penguins_logistic_model.to_gbq( @@ -181,7 +255,7 @@ def test_to_gbq_saved_logsitic_model_score( ] ] test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).compute() + result = saved_model.score(test_X, test_y).to_pandas() expected = pandas.DataFrame( { "precision": [0.616753], @@ -203,7 +277,7 @@ def test_to_gbq_saved_logsitic_model_score( ) -def test_to_logistic_model_gbq_replace(penguins_logistic_model, dataset_id): +def test_logistic_model_to_gbq_replace(penguins_logistic_model, dataset_id): penguins_logistic_model.to_gbq(f"{dataset_id}.test_penguins_model", replace=True) with pytest.raises(google.api_core.exceptions.Conflict): penguins_logistic_model.to_gbq(f"{dataset_id}.test_penguins_model") diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b579e754e5..74356c81e1 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -25,7 +25,17 @@ def test_create_text_generator_model(palm2_text_generator_model): def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): - df = palm2_text_generator_model.predict(llm_text_df).compute() + df = palm2_text_generator_model.predict(llm_text_df).to_pandas() + TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) + + +def test_text_generator_predict_series_default_params_success( + palm2_text_generator_model, llm_text_df +): + df = palm2_text_generator_model.predict(llm_text_df["prompt"]).to_pandas() TestCase().assertSequenceEqual(df.shape, (3, 1)) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] @@ -36,7 +46,7 @@ def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df ): 
llm_text_df = llm_text_df.rename(columns={"prompt": "arbitrary"}) - df = palm2_text_generator_model.predict(llm_text_df).compute() + df = palm2_text_generator_model.predict(llm_text_df).to_pandas() TestCase().assertSequenceEqual(df.shape, (3, 1)) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] @@ -48,7 +58,7 @@ def test_text_generator_predict_with_params_success( ): df = palm2_text_generator_model.predict( llm_text_df, temperature=0.5, max_output_tokens=100, top_k=20, top_p=0.5 - ).compute() + ).to_pandas() TestCase().assertSequenceEqual(df.shape, (3, 1)) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] @@ -63,10 +73,22 @@ def test_create_embedding_generator_model(palm2_embedding_generator_model): def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): - df = palm2_embedding_generator_model.predict(llm_text_df).compute() + df = palm2_embedding_generator_model.predict(llm_text_df).to_pandas() + TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert "text_embedding" in df.columns + series = df["text_embedding"] + value = series[0] + assert isinstance(value, np.ndarray) + assert value.size == 768 + + +def test_embedding_generator_predict_series_success( + palm2_embedding_generator_model, llm_text_df +): + df = palm2_embedding_generator_model.predict(llm_text_df["prompt"]).to_pandas() TestCase().assertSequenceEqual(df.shape, (3, 1)) - assert "ml_embed_text_embedding" in df.columns - series = df["ml_embed_text_embedding"] + assert "text_embedding" in df.columns + series = df["text_embedding"] value = series[0] assert isinstance(value, np.ndarray) assert value.size == 768 diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 5b4c486b57..b40982e282 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -23,10 +23,20 @@ def test_r2_score_perfect_fit(session): - pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) + pd_df = pd.DataFrame( + { + "y_true_arbitrary_name": [1, 7, 3, 2, 5], + "y_pred_arbitrary_name": [1, 7, 3, 2, 5], + } + ) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred"]]) == 1.0 + assert ( + bigframes.ml.metrics.r2_score( + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] + ) + == 1.0 + ) def test_r2_score_bad_fit(session): @@ -67,11 +77,28 @@ def test_r2_score_ok_fit_matches_sklearn(session): assert math.isclose(bf_result, sklearn_result) -def test_accuracy_score_perfect_fit(session): +def test_r2_score_series(session): pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) == 1.0 + assert bigframes.ml.metrics.r2_score(df["y_true"], df["y_pred"]) == 1.0 + + +def test_accuracy_score_perfect_fit(session): + pd_df = pd.DataFrame( + { + "y_true_arbitrary_name": [1, 7, 3, 2, 5], + "y_pred_arbitrary_name": [1, 7, 3, 2, 5], + } + ) + + df = session.read_pandas(pd_df) + assert ( + bigframes.ml.metrics.accuracy_score( + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] + ) + == 1.0 + ) def test_accuracy_score_bad_fit(session): @@ -93,7 +120,6 @@ def test_accuracy_score_not_normailze(session): ) -@pytest.mark.skipif(sklearn_metrics is None, reason="requires sklearn") def test_accuracy_score_fit_matches_sklearn(session): pd_df = pd.DataFrame({"y_true": 
[1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) @@ -105,22 +131,42 @@ def test_accuracy_score_fit_matches_sklearn(session): assert math.isclose(bf_result, sklearn_result) +def test_accuracy_score_series(session): + pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) + + df = session.read_pandas(pd_df) + assert bigframes.ml.metrics.accuracy_score(df["y_true"], df["y_pred"]) == 1.0 + + def test_roc_curve_binary_classification_prediction_returns_expected(session): pd_df = pd.DataFrame( { - "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], - "y_score": [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45], + "y_true_arbitrary_name": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], + "y_score_arbitrary_name": [ + 0.1, + 0.4, + 0.35, + 0.8, + 0.65, + 0.9, + 0.5, + 0.3, + 0.6, + 0.45, + ], } ) df = session.read_pandas(pd_df) fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( - df[["y_true"]], df[["y_score"]], drop_intermediate=False + df[["y_true_arbitrary_name"]], + df[["y_score_arbitrary_name"]], + drop_intermediate=False, ) - pd_fpr = fpr.compute() - pd_tpr = tpr.compute() - pd_thresholds = thresholds.compute() + pd_fpr = fpr.to_pandas() + pd_tpr = tpr.to_pandas() + pd_thresholds = thresholds.to_pandas() pd.testing.assert_series_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions @@ -181,9 +227,9 @@ def test_roc_curve_binary_classification_prediction_matches_sklearn(session): ) # sklearn returns float64 np arrays - np_fpr = fpr.compute().astype("float64").array - np_tpr = tpr.compute().astype("float64").array - np_thresholds = thresholds.compute().astype("float64").array + np_fpr = fpr.to_pandas().astype("float64").array + np_tpr = tpr.to_pandas().astype("float64").array + np_thresholds = thresholds.to_pandas().astype("float64").array np.testing.assert_array_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions @@ -217,9 +263,9 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): df[["y_true"]], df[["y_score"]], drop_intermediate=False ) - pd_fpr = fpr.compute() - pd_tpr = tpr.compute() - pd_thresholds = thresholds.compute() + pd_fpr = fpr.to_pandas() + pd_tpr = tpr.to_pandas() + pd_thresholds = thresholds.to_pandas() pd.testing.assert_series_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions @@ -276,9 +322,9 @@ def test_roc_curve_binary_classification_decision_matches_sklearn(session): ) # sklearn returns float64 np arrays - np_fpr = fpr.compute().astype("float64").array - np_tpr = tpr.compute().astype("float64").array - np_thresholds = thresholds.compute().astype("float64").array + np_fpr = fpr.to_pandas().astype("float64").array + np_tpr = tpr.to_pandas().astype("float64").array + np_thresholds = thresholds.to_pandas().astype("float64").array np.testing.assert_array_equal( # skip testing the first value, as it is redundant and inconsistent across sklearn versions @@ -295,7 +341,7 @@ def test_roc_curve_binary_classification_decision_matches_sklearn(session): ) -def test_roc_auc_score_returns_expected(session): +def test_roc_curve_binary_classification_prediction_series(session): pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -304,7 +350,79 @@ def test_roc_auc_score_returns_expected(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score(df[["y_true"]], df[["y_score"]]) + fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + df["y_true"], df["y_score"], 
drop_intermediate=False + ) + + pd_fpr = fpr.to_pandas() + pd_tpr = tpr.to_pandas() + pd_thresholds = thresholds.to_pandas() + + pd.testing.assert_series_equal( + # skip testing the first value, as it is redundant and inconsistent across sklearn versions + pd_thresholds[1:], + pd.Series( + [0.9, 0.8, 0.65, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.1], + dtype="Float64", + name="thresholds", + ), + check_index=False, + ) + pd.testing.assert_series_equal( + pd_fpr, + pd.Series( + [0.0, 0.0, 0.0, 0.25, 0.25, 0.5, 0.5, 0.75, 0.75, 0.75, 1.0], + dtype="Float64", + name="fpr", + ), + check_index_type=False, + ) + pd.testing.assert_series_equal( + pd_tpr, + pd.Series( + [ + 0.0, + 0.16666667, + 0.33333333, + 0.33333333, + 0.5, + 0.5, + 0.66666667, + 0.66666667, + 0.83333333, + 1.0, + 1.0, + ], + dtype="Float64", + name="tpr", + ), + check_index_type=False, + ) + + +def test_roc_auc_score_returns_expected(session): + pd_df = pd.DataFrame( + { + "y_true_arbitrary_name": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], + "y_score_arbitrary_name": [ + 0.1, + 0.4, + 0.35, + 0.8, + 0.65, + 0.9, + 0.5, + 0.3, + 0.6, + 0.45, + ], + } + ) + + df = session.read_pandas(pd_df) + score = bigframes.ml.metrics.roc_auc_score( + df[["y_true_arbitrary_name"]], df[["y_score_arbitrary_name"]] + ) assert score == 0.625 @@ -326,11 +444,25 @@ def test_roc_auc_score_returns_matches_sklearn(session): assert score == expected_score +def test_roc_auc_score_series(session): + pd_df = pd.DataFrame( + { + "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], + "y_score": [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45], + } + ) + + df = session.read_pandas(pd_df) + score = bigframes.ml.metrics.roc_auc_score(df["y_true"], df["y_score"]) + + assert score == 0.625 + + def test_auc_invalid_x_size(session): - pd_df = pd.DataFrame({"x": [0], "y": [0]}) + pd_df = pd.DataFrame({"x_arbitrary_name": [0], "y_arbitrary_name": [0]}) df = session.read_pandas(pd_df) with pytest.raises(ValueError): - bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) + bigframes.ml.metrics.auc(df[["x_arbitrary_name"]], df[["y_arbitrary_name"]]) def test_auc_nondecreasing_x(session): @@ -352,16 +484,23 @@ def test_auc_nonincreasing_x_negative(session): assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == -0.75 +def test_auc_series(session): + pd_df = pd.DataFrame({"x": [0, 0, 0.5, 0.5, 1], "y": [0, 0.5, 0.5, 1, 1]}) + + df = session.read_pandas(pd_df) + assert bigframes.ml.metrics.auc(df["x"], df["y"]) == 0.75 + + def test_confusion_matrix(session): pd_df = pd.DataFrame( { - "y_true": [2, 0, 2, 2, 0, 1], - "y_pred": [0, 0, 2, 2, 0, 2], + "y_true_arbitrary_name": [2, 0, 2, 2, 0, 1], + "y_pred_arbitrary_name": [0, 0, 2, 2, 0, 2], } ).astype("Int64") df = session.read_pandas(pd_df) confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] ) expected_pd_df = pd.DataFrame( { @@ -441,7 +580,7 @@ def test_confusion_matrix_str_matches_sklearn(session): ) -def test_recall_score(session): +def test_confusion_matrix_series(session): pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -449,8 +588,29 @@ def test_recall_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) + confusion_matrix = bigframes.ml.metrics.confusion_matrix(df["y_true"], df["y_pred"]) + expected_pd_df = pd.DataFrame( + { + 0: [2, 0, 1], + 1: [0, 0, 0], + 2: [0, 1, 2], + } + ).astype("int64") + pd.testing.assert_frame_equal( + confusion_matrix, expected_pd_df, check_index_type=False + ) + + +def 
test_recall_score(session): + pd_df = pd.DataFrame( + { + "y_true_arbitrary_name": [2, 0, 2, 2, 0, 1], + "y_pred_arbitrary_name": [0, 0, 2, 2, 0, 2], + } + ).astype("Int64") + df = session.read_pandas(pd_df) recall = bigframes.ml.metrics.recall_score( - df[["y_true"]], df[["y_pred"]], average=None + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -497,7 +657,7 @@ def test_recall_score_str_matches_sklearn(session): pd.testing.assert_series_equal(recall, expected_recall, check_index_type=False) -def test_precision_score(session): +def test_recall_score_series(session): pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -505,8 +665,24 @@ def test_precision_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) + recall = bigframes.ml.metrics.recall_score(df["y_true"], df["y_pred"], average=None) + expected_values = [1.000000, 0.000000, 0.666667] + expected_index = [0, 1, 2] + expected_recall = pd.Series(expected_values, index=expected_index) + + pd.testing.assert_series_equal(recall, expected_recall, check_index_type=False) + + +def test_precision_score(session): + pd_df = pd.DataFrame( + { + "y_true_arbitrary_name": [2, 0, 2, 2, 0, 1], + "y_pred_arbitrary_name": [0, 0, 2, 2, 0, 2], + } + ).astype("Int64") + df = session.read_pandas(pd_df) precision_score = bigframes.ml.metrics.precision_score( - df[["y_true"]], df[["y_pred"]], average=None + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -559,7 +735,7 @@ def test_precision_score_str_matches_sklearn(session): ) -def test_f1_score(session): +def test_precision_score_series(session): pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -567,8 +743,28 @@ def test_f1_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) + precision_score = bigframes.ml.metrics.precision_score( + df["y_true"], df["y_pred"], average=None + ) + expected_values = [0.666667, 0.000000, 0.666667] + expected_index = [0, 1, 2] + expected_precision = pd.Series(expected_values, index=expected_index) + + pd.testing.assert_series_equal( + precision_score, expected_precision, check_index_type=False + ) + + +def test_f1_score(session): + pd_df = pd.DataFrame( + { + "y_true_arbitrary_name": [2, 0, 2, 2, 0, 1], + "y_pred_arbitrary_name": [0, 0, 2, 2, 0, 2], + } + ).astype("Int64") + df = session.read_pandas(pd_df) f1_score = bigframes.ml.metrics.f1_score( - df[["y_true"]], df[["y_pred"]], average=None + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -613,3 +809,19 @@ def test_f1_score_str_matches_sklearn(session): expected_index = ["ant", "bird", "cat"] expected_f1 = pd.Series(expected_values, index=expected_index) pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + + +def test_f1_score_series(session): + pd_df = pd.DataFrame( + { + "y_true": [2, 0, 2, 2, 0, 1], + "y_pred": [0, 0, 2, 2, 0, 2], + } + ).astype("Int64") + df = session.read_pandas(pd_df) + f1_score = bigframes.ml.metrics.f1_score(df["y_true"], df["y_pred"], average=None) + expected_values = [0.8, 0.000000, 0.666667] + expected_index = [0, 1, 2] + expected_f1 = pd.Series(expected_values, index=expected_index) + + pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) diff --git 
a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index add455ab9a..9eb3645591 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -15,7 +15,8 @@ import pandas as pd import pytest -import bigframes.ml.model_selection +from bigframes.ml import model_selection +import bigframes.pandas as bpd def test_train_test_split_default_correct_shape(penguins_df_default_index): @@ -27,9 +28,7 @@ def test_train_test_split_default_correct_shape(penguins_df_default_index): ] ] y = penguins_df_default_index[["body_mass_g"]] - X_train, X_test, y_train, y_test = bigframes.ml.model_selection.train_test_split( - X, y - ) + X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y) # even though the default seed is random, it should always result in this shape assert X_train.shape == (258, 3) @@ -38,6 +37,22 @@ def test_train_test_split_default_correct_shape(penguins_df_default_index): assert y_test.shape == (86, 1) +def test_train_test_split_series_default_correct_shape(penguins_df_default_index): + X = penguins_df_default_index[["species"]] + y = penguins_df_default_index["body_mass_g"] + X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y) + assert isinstance(X_train, bpd.DataFrame) + assert isinstance(X_test, bpd.DataFrame) + assert isinstance(y_train, bpd.Series) + assert isinstance(y_test, bpd.Series) + + # even though the default seed is random, it should always result in this shape + assert X_train.shape == (258, 1) + assert X_test.shape == (86, 1) + assert y_train.shape == (258,) + assert y_test.shape == (86,) + + def test_train_test_double_split_correct_shape(penguins_df_default_index): X = penguins_df_default_index[ [ @@ -47,7 +62,7 @@ def test_train_test_double_split_correct_shape(penguins_df_default_index): ] ] y = penguins_df_default_index[["body_mass_g"]] - X_train, X_test, y_train, y_test = bigframes.ml.model_selection.train_test_split( + X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2, train_size=0.4 ) @@ -78,7 +93,7 @@ def test_train_test_three_dataframes_correct_shape(penguins_df_default_index): B_test, C_train, C_test, - ) = bigframes.ml.model_selection.train_test_split(A, B, C) + ) = model_selection.train_test_split(A, B, C) assert A_train.shape == (258, 2) assert A_test.shape == (86, 2) @@ -111,7 +126,7 @@ def test_train_test_split_seeded_correct_rows( ] ] y = df[["body_mass_g"]] - X_train, X_test, y_train, y_test = bigframes.ml.model_selection.train_test_split( + X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, random_state=42 ) @@ -124,31 +139,31 @@ def test_train_test_split_seeded_correct_rows( [ 144, 146, - 148, 168, 183, 186, 217, + 221, 225, - 226, 237, + 240, 244, 245, + 257, 260, 262, 263, + 264, 266, + 267, 268, - 269, - 289, 290, - 291, ], dtype="Int64", name="rowindex", ) test_index = pd.Index( - [161, 221, 240, 257, 264, 267, 278], dtype="Int64", name="rowindex" + [148, 161, 226, 269, 278, 289, 291], dtype="Int64", name="rowindex" ) all_data.index.name = "_" @@ -209,6 +224,6 @@ def test_train_test_split_value_error(penguins_df_default_index, train_size, tes ] y = penguins_df_default_index[["body_mass_g"]] with pytest.raises(ValueError): - bigframes.ml.model_selection.train_test_split( + model_selection.train_test_split( X, y, train_size=train_size, test_size=test_size ) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py 
index f31b93b4cc..420a80754f 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -20,7 +20,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): - # TODO(bmil): add a second test that compares output to sklearn.preprocessing.StandardScaler + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. scaler = bigframes.ml.preprocessing.StandardScaler() scaler.fit( penguins_df_default_index[ @@ -34,10 +34,9 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): ] ).to_pandas() - # If standard-scaled correctly, mean should be 0.0 and standard deviation 1.0 + # If standard-scaled correctly, mean should be 0.0 for column in result.columns: assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) - assert math.isclose(result[column].std(), 1.0, abs_tol=1e-3) result = scaler.transform(new_penguins_df).to_pandas() @@ -48,9 +47,9 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): expected = pd.DataFrame( { - "scaled_culmen_depth_mm": [0.8349, 0.02473, 0.4805], - "scaled_culmen_length_mm": [-0.8099, -0.9931, -1.103], - "scaled_flipper_length_mm": [-0.3495, -1.4163, -0.9185], + "scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], + "scaled_culmen_length_mm": [-0.81112, -0.994552, -1.104611], + "scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -59,9 +58,42 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=1e-3) -def test_one_hot_encoder_encodes(penguins_df_default_index, new_penguins_df): +def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + scaler = bigframes.ml.preprocessing.StandardScaler() + scaler.fit(penguins_df_default_index["culmen_length_mm"]) + + result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() + + # If standard-scaled correctly, mean should be 0.0 + for column in result.columns: + assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) + + result = scaler.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... 
+ result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "scaled_culmen_length_mm": [ + -0.811119671289163, + -0.9945520581113803, + -1.104611490204711, + ], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() - encoder.fit(penguins_df_default_index["species", "sex"]) + encoder.fit(new_penguins_df["species", "sex"]) result = encoder.transform(new_penguins_df).to_pandas() @@ -87,3 +119,91 @@ def test_one_hot_encoder_encodes(penguins_df_default_index, new_penguins_df): ) pd.testing.assert_frame_equal(result, expected) + + +def test_one_hot_encoder_series_default_params(new_penguins_df): + encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder.fit(new_penguins_df["species"]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_one_hot_encoder_params(new_penguins_df): + encoder = bigframes.ml.preprocessing.OneHotEncoder("most_frequent", 100, 2) + encoder.fit(new_penguins_df["species", "sex"]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "onehotencoded_sex": [ + [{"index": 0, "value": 1.0}], + [{"index": 0, "value": 1.0}], + [{"index": 0, "value": 1.0}], + ], + "onehotencoded_species": [ + [{"index": 0, "value": 1.0}], + [{"index": 0, "value": 1.0}], + [{"index": 0, "value": 1.0}], + ], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_df): + encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder.fit(penguins_df_default_index["species", "sex"]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "onehotencoded_sex": [ + [{"index": 3, "value": 1.0}], + [{"index": 2, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +# TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. 
diff --git a/tests/system/small/ml/test_register.py b/tests/system/small/ml/test_register.py index ad378d3508..bcf1f4a5b0 100644 --- a/tests/system/small/ml/test_register.py +++ b/tests/system/small/ml/test_register.py @@ -28,7 +28,7 @@ def test_linear_reg_register( ) # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. assert ( - model_name + model_name[:63] # truncated in cast(core.BqmlModel, model._bqml_model).model.training_runs[-1][ "vertexAiModelId" ] @@ -44,7 +44,7 @@ def test_linear_reg_register_with_params( # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. assert ( - model_name + model_name[:63] # truncated in cast(core.BqmlModel, model._bqml_model).model.training_runs[-1][ "vertexAiModelId" ] @@ -62,7 +62,7 @@ def test_palm2_text_generator_register( ) # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. assert ( - model_name + model_name[:63] # truncated in cast(core.BqmlModel, model._bqml_model).model.training_runs[-1][ "vertexAiModelId" ] @@ -80,7 +80,7 @@ def test_imported_tensorflow_register( ) # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. assert ( - model_name + model_name[:63] # truncated in cast(core.BqmlModel, model._bqml_model).model.training_runs[-1][ "vertexAiModelId" ] diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 25d3eb69ad..7dc55b9367 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -30,7 +30,7 @@ def test_day(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.day.compute() + bf_result = bf_series.dt.day.to_pandas() pd_result = scalars_pandas_df[col_name].dt.day assert_series_equal_ignoring_order( @@ -48,7 +48,7 @@ def test_date(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.date.compute() + bf_result = bf_series.dt.date.to_pandas() pd_result = scalars_pandas_df[col_name].dt.date assert_series_equal_ignoring_order( @@ -66,7 +66,7 @@ def test_dayofweek(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.dayofweek.compute() + bf_result = bf_series.dt.dayofweek.to_pandas() pd_result = scalars_pandas_df[col_name].dt.dayofweek assert_series_equal_ignoring_order(pd_result, bf_result, check_dtype=False) @@ -81,7 +81,7 @@ def test_hour(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.hour.compute() + bf_result = bf_series.dt.hour.to_pandas() pd_result = scalars_pandas_df[col_name].dt.hour assert_series_equal_ignoring_order( @@ -99,7 +99,7 @@ def test_minute(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = 
scalars_df[col_name] - bf_result = bf_series.dt.minute.compute() + bf_result = bf_series.dt.minute.to_pandas() pd_result = scalars_pandas_df[col_name].dt.minute assert_series_equal_ignoring_order( @@ -117,7 +117,7 @@ def test_month(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.month.compute() + bf_result = bf_series.dt.month.to_pandas() pd_result = scalars_pandas_df[col_name].dt.month assert_series_equal_ignoring_order( @@ -135,7 +135,7 @@ def test_quarter(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.quarter.compute() + bf_result = bf_series.dt.quarter.to_pandas() pd_result = scalars_pandas_df[col_name].dt.quarter assert_series_equal_ignoring_order( @@ -153,7 +153,7 @@ def test_second(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.second.compute() + bf_result = bf_series.dt.second.to_pandas() pd_result = scalars_pandas_df[col_name].dt.second assert_series_equal_ignoring_order( @@ -171,7 +171,7 @@ def test_time(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.time.compute() + bf_result = bf_series.dt.time.to_pandas() pd_result = scalars_pandas_df[col_name].dt.time assert_series_equal_ignoring_order( @@ -189,7 +189,7 @@ def test_year(scalars_dfs, col_name): pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.dt.year.compute() + bf_result = bf_series.dt.year.to_pandas() pd_result = scalars_pandas_df[col_name].dt.year assert_series_equal_ignoring_order( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 76736e3cdd..d7bf3312f0 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re + import pandas as pd import pytest @@ -24,7 +26,7 @@ def test_find(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.find("W").compute() + bf_result = bf_series.str.find("W").to_pandas() pd_result = scalars_pandas_df[col_name].str.find("W") # One of type mismatches to be documented. 
Here, the `bf_result.dtype` is `Int64` but @@ -35,11 +37,138 @@ def test_find(scalars_dfs): ) +@pytest.mark.parametrize( + ("pat", "case", "flags", "regex"), + [ + ("hEllo", True, 0, False), + ("hEllo", False, 0, False), + ("hEllo", False, re.I, True), + (".*", True, 0, True), + (".*", True, 0, False), + ], +) +def test_str_contains(scalars_dfs, pat, case, flags, regex): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series: bigframes.series.Series = scalars_df[col_name] + + bf_result = bf_series.str.contains( + pat, case=case, flags=flags, regex=regex + ).to_pandas() + pd_result = scalars_pandas_df[col_name].str.contains( + pat, case=case, flags=flags, regex=regex + ) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("pat"), + [(r"(ell)(lo)"), (r"(?Ph..)"), (r"(?Pe.*o)([g-l]+)")], +) +def test_str_extract(scalars_dfs, pat): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series: bigframes.series.Series = scalars_df[col_name] + + bf_result = bf_series.str.extract(pat).to_pandas() + pd_result = scalars_pandas_df[col_name].str.extract(pat) + + # Pandas produces int col labels, while bq df only supports str labels at present + pd_result = pd_result.set_axis(pd_result.columns.astype(str), axis=1) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("pat", "repl", "case", "flags", "regex"), + [ + ("hEllo", "blah", True, 0, False), + ("hEllo", "blah", False, 0, False), + ("hEllo", "blah", False, re.I, True), + (".*", "blah", True, 0, True), + ("h.l", "blah", False, 0, True), + (re.compile("(?i).e.."), "blah", None, 0, True), + ], +) +def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series: bigframes.series.Series = scalars_df[col_name] + + bf_result = bf_series.str.replace( + pat, repl=repl, case=case, flags=flags, regex=regex + ).to_pandas() + pd_result = scalars_pandas_df[col_name].str.replace( + pat, repl=repl, case=case, flags=flags, regex=regex + ) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("pat",), + [ + ("こん",), + ("Tag!",), + ( + ( + "Tag!", + "Hel", + ), + ), + ], +) +def test_str_startswith(scalars_dfs, pat): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series: bigframes.series.Series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].astype("object") + + bf_result = bf_series.str.startswith(pat).to_pandas() + pd_result = pd_series.str.startswith(pat) + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("pat",), + [ + ("こん",), + ("Tag!",), + ( + ( + "Tag!", + "Hel", + ), + ), + ], +) +def test_str_endswith(scalars_dfs, pat): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series: bigframes.series.Series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name].astype("object") + + bf_result = bf_series.str.endswith(pat).to_pandas() + pd_result = pd_series.str.endswith(pat) + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + def test_len(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.len().compute() + bf_result = bf_series.str.len().to_pandas() pd_result = scalars_pandas_df[col_name].str.len() # One of 
dtype mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but @@ -54,7 +183,7 @@ def test_lower(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.lower().compute() + bf_result = bf_series.str.lower().to_pandas() pd_result = scalars_pandas_df[col_name].str.lower() assert_series_equal_ignoring_order( @@ -67,7 +196,7 @@ def test_reverse(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.reverse().compute() + bf_result = bf_series.str.reverse().to_pandas() pd_result = scalars_pandas_df[col_name].copy() for i in pd_result.index: cell = pd_result.loc[i] @@ -89,7 +218,7 @@ def test_slice(scalars_dfs, start, stop): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.slice(start, stop).compute() + bf_result = bf_series.str.slice(start, stop).to_pandas() pd_series = scalars_pandas_df[col_name] pd_result = pd_series.str.slice(start, stop) @@ -103,7 +232,7 @@ def test_strip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.strip().compute() + bf_result = bf_series.str.strip().to_pandas() pd_result = scalars_pandas_df[col_name].str.strip() assert_series_equal_ignoring_order( @@ -116,7 +245,7 @@ def test_upper(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.upper().compute() + bf_result = bf_series.str.upper().to_pandas() pd_result = scalars_pandas_df[col_name].str.upper() assert_series_equal_ignoring_order( @@ -149,7 +278,7 @@ def test_isnumeric(session): df = session.read_pandas(pandas_df) pd_result = pandas_df.numeric_string_col.str.isnumeric() - bf_result = df.numeric_string_col.str.isnumeric().compute() + bf_result = df.numeric_string_col.str.isnumeric().to_pandas() assert_series_equal_ignoring_order( bf_result, @@ -163,7 +292,7 @@ def test_rstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.rstrip().compute() + bf_result = bf_series.str.rstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.rstrip() assert_series_equal_ignoring_order( @@ -176,7 +305,7 @@ def test_lstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.lstrip().compute() + bf_result = bf_series.str.lstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.lstrip() assert_series_equal_ignoring_order( @@ -190,7 +319,7 @@ def test_repeat(scalars_dfs, repeats): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.repeat(repeats).compute() + bf_result = bf_series.str.repeat(repeats).to_pandas() pd_result = scalars_pandas_df[col_name].str.repeat(repeats) assert_series_equal_ignoring_order( @@ -203,7 +332,7 @@ def test_capitalize(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = 
bf_series.str.capitalize().compute() + bf_result = bf_series.str.capitalize().to_pandas() pd_result = scalars_pandas_df[col_name].str.capitalize() assert_series_equal_ignoring_order( @@ -218,7 +347,7 @@ def test_cat_with_series(scalars_dfs): bf_filter: bigframes.series.Series = scalars_df["bool_col"] bf_left: bigframes.series.Series = scalars_df[col_name][bf_filter] bf_right: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_left.str.cat(others=bf_right).compute() + bf_result = bf_left.str.cat(others=bf_right).to_pandas() pd_filter = scalars_pandas_df["bool_col"] pd_left = scalars_pandas_df[col_name][pd_filter] pd_right = scalars_pandas_df[col_name] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1833c674fd..c7e17f5a2d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -13,6 +13,8 @@ # limitations under the License. import operator +import typing +from typing import Tuple import geopandas as gpd # type: ignore import numpy as np @@ -24,6 +26,7 @@ import bigframes import bigframes._config.display_options as display_options import bigframes.dataframe as dataframe +import bigframes.series as series from tests.system.utils import ( assert_pandas_df_equal_ignore_ordering, assert_series_equal_ignoring_order, @@ -33,7 +36,7 @@ def test_df_construct_copy(scalars_dfs): columns = ["int64_col", "string_col", "float64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_df, columns=columns).compute() + bf_result = dataframe.DataFrame(scalars_df, columns=columns).to_pandas() pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -41,7 +44,7 @@ def test_df_construct_copy(scalars_dfs): def test_df_construct_pandas(scalars_dfs): columns = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).compute() + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -56,7 +59,7 @@ def test_df_construct_pandas_set_dtype(scalars_dfs): _, scalars_pandas_df = scalars_dfs bf_result = dataframe.DataFrame( scalars_pandas_df, columns=columns, dtype="Float64" - ).compute() + ).to_pandas() pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64") pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -66,7 +69,7 @@ def test_df_construct_from_series(scalars_dfs): bf_result = dataframe.DataFrame( {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, dtype="string[pyarrow]", - ).compute() + ).to_pandas() pd_result = pd.DataFrame( {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, dtype="string[pyarrow]", @@ -78,7 +81,7 @@ def test_get_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" series = scalars_df[col_name] - bf_result = series.compute() + bf_result = series.to_pandas() pd_result = scalars_pandas_df[col_name] assert_series_equal_ignoring_order(bf_result, pd_result) @@ -96,7 +99,7 @@ def test_head_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde "string_col": "言語列", } bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = 
scalars_pandas_df_index.rename(columns=rename_mapping).head(3) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -107,7 +110,7 @@ def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde "string_col": "言語列", } bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -115,7 +118,7 @@ def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde def test_get_column_by_attr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs series = scalars_df.int64_col - bf_result = series.compute() + bf_result = series.to_pandas() pd_result = scalars_pandas_df.int64_col assert_series_equal_ignoring_order(bf_result, pd_result) @@ -124,7 +127,7 @@ def test_get_columns(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = ["bool_col", "float64_col", "int64_col"] df_subset = scalars_df.get(col_names) - df_pandas = df_subset.compute() + df_pandas = df_subset.to_pandas() pd.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df[col_names].columns ) @@ -140,7 +143,7 @@ def test_get_columns_default(scalars_dfs): def test_drop_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" - df_pandas = scalars_df.drop(columns=col_name).compute() + df_pandas = scalars_df.drop(columns=col_name).to_pandas() pd.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns ) @@ -149,12 +152,22 @@ def test_drop_column(scalars_dfs): def test_drop_columns(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = ["int64_col", "geography_col", "time_col"] - df_pandas = scalars_df.drop(columns=col_names).compute() + df_pandas = scalars_df.drop(columns=col_names).to_pandas() pd.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns ) +def test_drop_labels_axis_1(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + labels = ["int64_col", "geography_col", "time_col"] + + pd_result = scalars_pandas_df.drop(labels=labels, axis=1) + bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + def test_drop_with_custom_column_labels(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs rename_mapping = { @@ -166,17 +179,44 @@ def test_drop_with_custom_column_labels(scalars_dfs): "timestamp_col", ] bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( columns=dropped_columns ) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +def test_drop_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) + bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_labels_axis_0(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) + bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index_and_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2], 
columns="int64_col") + bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + def test_rename(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"bool_col": "boolean_col"} - df_pandas = scalars_df.rename(columns=col_name_dict).compute() + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() pd.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns ) @@ -210,7 +250,7 @@ def test_repr_w_all_rows(scalars_dfs): def test_repr_html_w_all_rows(scalars_dfs): scalars_df, _ = scalars_dfs # get a pandas df of the expected format - df, _ = scalars_df._block.compute() + df, _ = scalars_df._block.to_pandas() pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) pandas_df.index.name = scalars_df.index.name @@ -229,7 +269,7 @@ def test_repr_html_w_all_rows(scalars_dfs): def test_df_column_name_with_space(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"bool_col": "bool col"} - df_pandas = scalars_df.rename(columns=col_name_dict).compute() + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() pd.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns ) @@ -238,7 +278,7 @@ def test_df_column_name_with_space(scalars_dfs): def test_df_column_name_duplicate(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"int64_too": "int64_col"} - df_pandas = scalars_df.rename(columns=col_name_dict).compute() + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() pd.testing.assert_index_equal( df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns ) @@ -248,7 +288,7 @@ def test_get_df_column_name_duplicate(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name_dict = {"int64_too": "int64_col"} - bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].compute() + bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) @@ -257,7 +297,7 @@ def test_filter_df(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_bool_series = scalars_df["bool_col"] - bf_result = scalars_df[bf_bool_series].compute() + bf_result = scalars_df[bf_bool_series].to_pandas() pd_bool_series = scalars_pandas_df["bool_col"] pd_result = scalars_pandas_df[pd_bool_series] @@ -269,7 +309,7 @@ def test_assign_new_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs kwargs = {"new_col": 2} df = scalars_df.assign(**kwargs) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign(**kwargs) # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. @@ -284,7 +324,7 @@ def test_assign_new_column_w_loc(scalars_dfs): pd_df = scalars_pandas_df.copy() bf_df.loc[:, "new_col"] = 2 pd_df.loc[:, "new_col"] = 2 - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = pd_df # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. @@ -299,7 +339,7 @@ def test_assign_new_column_w_setitem(scalars_dfs): pd_df = scalars_pandas_df.copy() bf_df["new_col"] = 2 pd_df["new_col"] = 2 - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = pd_df # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
@@ -312,7 +352,7 @@ def test_assign_existing_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs kwargs = {"int64_col": 2} df = scalars_df.assign(**kwargs) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign(**kwargs) # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. @@ -325,7 +365,7 @@ def test_assign_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs column_name = "int64_col" df = scalars_df.assign(new_col=scalars_df[column_name]) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -335,7 +375,7 @@ def test_assign_series_overwrite(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs column_name = "int64_col" df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign( **{column_name: scalars_pandas_df[column_name] + 3} ) @@ -347,7 +387,7 @@ def test_assign_sequential(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} df = scalars_df.assign(**kwargs) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign(**kwargs) # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. @@ -371,7 +411,7 @@ def test_assign_same_table_different_index_performs_self_join( ) bf_df_2 = bf_df.set_index("alternative_index") pd_df_2 = pd_df.set_index("alternative_index") - bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).compute() + bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -383,7 +423,7 @@ def test_assign_different_df( ): column_name = "int64_col" df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) - bf_result = df.compute() + bf_result = df.to_pandas() # Doesn't matter to pandas if it comes from the same DF or a different DF. pd_result = scalars_pandas_df_index.assign( new_col=scalars_pandas_df_index[column_name] @@ -402,7 +442,7 @@ def test_assign_different_df_w_loc( assert "int64_col" in pd_df.columns bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = pd_df # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. @@ -421,7 +461,7 @@ def test_assign_different_df_w_setitem( assert "int64_col" in pd_df.columns bf_df["int64_col"] = bf_df2["int64_col"] + 1 pd_df["int64_col"] = pd_df["int64_col"] + 1 - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_result = pd_df # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. @@ -434,7 +474,7 @@ def test_assign_callable_lambda(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} df = scalars_df.assign(**kwargs) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign(**kwargs) # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
@@ -446,7 +486,7 @@ def test_assign_callable_lambda(scalars_dfs): def test_dropna(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs df = scalars_df.dropna() - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.dropna() assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -472,7 +512,7 @@ def test_merge(scalars_dfs, merge_how): right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) df = left.merge(right, merge_how, on, sort=True) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df[left_columns].merge( scalars_pandas_df[right_columns].assign( @@ -506,7 +546,7 @@ def test_merge_custom_col_name(scalars_dfs, merge_how): left = left.rename(columns=rename_columns) right = scalars_df[right_columns] df = left.merge(right, merge_how, on, sort=True) - bf_result = df.compute() + bf_result = df.to_pandas() pandas_left_df = scalars_pandas_df[left_columns] pandas_left_df = pandas_left_df.rename(columns=rename_columns) @@ -536,7 +576,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): df = left.merge( right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True ) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df[left_columns].merge( scalars_pandas_df[right_columns], @@ -633,7 +673,7 @@ def test_empty_false(scalars_dfs): assert bf_result == pd_result -def test_empty_true(scalars_dfs): +def test_empty_true_column_filter(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[[]].empty @@ -642,6 +682,31 @@ def test_empty_true(scalars_dfs): assert bf_result == pd_result +def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): + scalars_df, scalars_pandas_df = scalars_dfs + bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) + pd_bool: pd.Series = scalars_pandas_df["bool_col"] + bf_false = bf_bool.notna() & (bf_bool != bf_bool) + pd_false = pd_bool.notna() & (pd_bool != pd_bool) + + bf_result = scalars_df[bf_false].empty + pd_result = scalars_pandas_df[pd_false].empty + + assert pd_result + assert bf_result == pd_result + + +def test_empty_true_memtable(session: bigframes.Session): + bf_df = dataframe.DataFrame(session=session) + pd_df = pd.DataFrame() + + bf_result = bf_df.empty + pd_result = pd_df.empty + + assert pd_result + assert bf_result == pd_result + + @pytest.mark.parametrize( ("drop",), ((True,), (False,)), @@ -650,7 +715,7 @@ def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): df = scalars_df_index.reset_index(drop=drop) assert df.index.name is None - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df_index.reset_index(drop=drop) # Pandas uses int64 instead of Int64 (nullable) dtype. @@ -666,7 +731,7 @@ def test_reset_index_then_filter( ): bf_filter = scalars_df_index["bool_col"].fillna(True) bf_df = scalars_df_index.reset_index()[bf_filter] - bf_result = bf_df.compute() + bf_result = bf_df.to_pandas() pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) pd_result = scalars_pandas_df_index.reset_index()[pd_filter] @@ -693,7 +758,7 @@ def test_reset_index_with_unnamed_index( # reset_index(drop=False) creates a new column "index". assert df.columns[0] == "index" - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df_index.reset_index(drop=False) # Pandas uses int64 instead of Int64 (nullable) dtype. 
@@ -720,7 +785,7 @@ def test_reset_index_with_unnamed_index_and_index_column( # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. assert df.columns[0] == "level_0" - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df_index.assign( index=scalars_pandas_df_index["int64_col"] ).reset_index(drop=False) @@ -753,7 +818,7 @@ def test_reset_index_with_unnamed_index_and_index_column( def test_set_index(scalars_dfs, index_column, drop, append): scalars_df, scalars_pandas_df = scalars_dfs df = scalars_df.set_index(index_column, append=append, drop=drop) - bf_result = df.compute() + bf_result = df.to_pandas() pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) # Sort to disambiguate when there are duplicate index labels. @@ -767,11 +832,30 @@ def test_set_index(scalars_dfs, index_column, drop, append): pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("ascending",), + ((True,), (False,)), +) +@pytest.mark.parametrize( + ("na_position",), + (("first",), ("last",)), +) +def test_sort_index(scalars_dfs, ascending, na_position): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column) + bf_result = df.sort_index(ascending=ascending, na_position=na_position).to_pandas() + pd_result = scalars_pandas_df.set_index(index_column).sort_index( + ascending=ascending, na_position=na_position + ) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + def test_df_abs(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs columns = ["int64_col", "int64_too", "float64_col"] - bf_result = scalars_df[columns].abs().compute() + bf_result = scalars_df[columns].abs().to_pandas() pd_result = scalars_pandas_df[columns].abs() assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -781,7 +865,7 @@ def test_df_isnull(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs columns = ["int64_col", "int64_too", "string_col", "bool_col"] - bf_result = scalars_df[columns].isnull().compute() + bf_result = scalars_df[columns].isnull().to_pandas() pd_result = scalars_pandas_df[columns].isnull() # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is @@ -798,7 +882,7 @@ def test_df_notnull(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs columns = ["int64_col", "int64_too", "string_col", "bool_col"] - bf_result = scalars_df[columns].notnull().compute() + bf_result = scalars_df[columns].notnull().to_pandas() pd_result = scalars_pandas_df[columns].notnull() # One of dtype mismatches to be documented. 
Here, the `bf_result.dtype` is @@ -819,6 +903,8 @@ def test_df_notnull(scalars_dfs): operator.mul, operator.truediv, operator.floordiv, + operator.eq, + operator.ne, operator.gt, operator.ge, operator.lt, @@ -830,6 +916,8 @@ def test_df_notnull(scalars_dfs): "multiply", "true_divide", "floor_divide", + "eq", + "ne", "gt", "ge", "lt", @@ -845,7 +933,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op - bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).compute() + bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -857,7 +945,7 @@ def test_mod(scalars_dfs, other_scalar): # This is likely a pandas bug as mod 0 is undefined in other dtypes, and most programming languages. scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).compute() + bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -867,7 +955,7 @@ def test_scalar_binop_str_exception(scalars_dfs): scalars_df, _ = scalars_dfs columns = ["string_col"] with pytest.raises(TypeError): - (scalars_df[columns] + 1).compute() + (scalars_df[columns] + 1).to_pandas() @pytest.mark.parametrize( @@ -913,7 +1001,7 @@ def test_series_binop_axis_index( df_columns = ["int64_col", "float64_col"] series_column = "int64_too" - bf_result = op(scalars_df[df_columns], scalars_df[series_column]).compute() + bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -963,7 +1051,7 @@ def test_dataframe_binop_axis_index_throws_not_implemented( other_df_columns = ["int64_too"] with pytest.raises(NotImplementedError): - op(scalars_df[df_columns], scalars_df[other_df_columns]).compute() + op(scalars_df[df_columns], scalars_df[other_df_columns]).to_pandas() # Differnt table will only work for explicit index, since default index orders are arbitrary. 
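For orientation, a minimal sketch of the pattern these hunks keep applying: results are materialized with `.to_pandas()` rather than the removed `.compute()`, and the scalar-binop matrix now also covers `eq`/`ne`. The public dataset name and an active BigQuery session below are assumptions for illustration, not something this change depends on.
import operator
import bigframes.pandas as bpd
# Hypothetical source table; any table with a numeric column would do.
bf_df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
bf_eq = operator.eq(bf_df["body_mass_g"], 3800)
# Materialize the BigQuery-backed result locally; this call was `.compute()` before the rename.
pd_eq = bf_eq.to_pandas()
print(pd_eq.dtype)  # expected to be the nullable `boolean` dtype used by BigQuery DataFrames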
@@ -976,7 +1064,7 @@ def test_series_binop_add_different_table( bf_result = ( scalars_df_index[df_columns] .add(scalars_df_2_index[series_column], axis="index") - .compute() + .to_pandas() ) pd_result = scalars_pandas_df_index[df_columns].add( scalars_pandas_df_index[series_column], axis="index" @@ -1001,14 +1089,12 @@ def test_series_binop_add_different_table( @all_joins def test_join_same_table(scalars_dfs, how): bf_df, pd_df = scalars_dfs - if how == "right" and pd_df.index.name != "rowindex": - pytest.skip("right join not supported without an index") - bf_df_a = bf_df[["string_col", "int64_col"]] - bf_df_b = bf_df[["float64_col"]] - bf_result = bf_df_a.join(bf_df_b, how=how).compute() - pd_df_a = pd_df[["string_col", "int64_col"]] - pd_df_b = pd_df[["float64_col"]] + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]] + pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -1019,7 +1105,7 @@ def test_join_different_table( ): bf_df_a = scalars_df_index[["string_col", "int64_col"]] bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] - bf_result = bf_df_a.join(bf_df_b, how=how).compute() + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) @@ -1031,7 +1117,23 @@ def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): df_a = scalars_df[["string_col", "float64_col"]] df_b = scalars_df[["float64_col"]] with pytest.raises(NotImplementedError): - df_a.join(df_b, how="outer").compute() + df_a.join(df_b, how="outer").to_pandas() + + +@all_joins +def test_join_param_on(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_df_b = bf_df[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @pytest.mark.parametrize( @@ -1050,7 +1152,7 @@ def test_dataframe_sort_values( # Test needs values to be unique bf_result = scalars_df_index.sort_values( by, ascending=ascending, na_position=na_position - ).compute() + ).to_pandas() pd_result = scalars_pandas_df_index.sort_values( by, ascending=ascending, na_position=na_position ) @@ -1061,6 +1163,22 @@ def test_dataframe_sort_values( ) +def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.sort_values("int64_col", kind="stable") + .sort_values("bool_col", kind="stable") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.sort_values( + "int64_col", kind="stable" + ).sort_values("bool_col", kind="stable") + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("operator", "columns"), [ @@ -1086,7 +1204,7 @@ def test_dataframe_numeric_analytic_op( # TODO: Add nullable ints (pandas 1.x has poor behavior on these) bf_series = 
operator(scalars_df_index[columns]) pd_series = operator(scalars_pandas_df_index[columns]) - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) @@ -1111,13 +1229,90 @@ def test_dataframe_general_analytic_op( col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] bf_series = operator(scalars_df_index[col_names]) pd_series = operator(scalars_pandas_df_index[col_names]) - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() pd.testing.assert_frame_equal( pd_series, bf_result, ) +def test_dataframe_agg_single_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg("sum").to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg("sum") + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_dataframe_agg_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + aggregations = [ + "sum", + "mean", + "median", + "std", + "var", + "min", + "max", + "nunique", + "count", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Drop median, as it's an approximation. + bf_median = bf_result.loc["median", :] + bf_result = bf_result.drop(labels=["median"]) + pd_result = pd_result.drop(labels=["median"]) + + pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) + + # Double-check that median is at least plausible. + assert ( + (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) + ).all() + + +def test_df_describe(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # pyarrows time columns fail in pandas + unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] + bf_result = scalars_df.describe().to_pandas() + + modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns) + pd_result = modified_pd_df.describe() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Drop quartiles, as they are approximate + bf_min = bf_result.loc["min", :] + bf_p25 = bf_result.loc["25%", :] + bf_p50 = bf_result.loc["50%", :] + bf_p75 = bf_result.loc["75%", :] + bf_max = bf_result.loc["max", :] + + bf_result = bf_result.drop(labels=["25%", "50%", "75%"]) + pd_result = pd_result.drop(labels=["25%", "50%", "75%"]) + + pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) + + # Double-check that quantiles are at least plausible. 
+ assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p75) + & (bf_p75 <= bf_max) + ).all() + + def test_ipython_key_completions_with_drop(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = "string_col" @@ -1206,7 +1401,7 @@ def test__dir__with_rename(scalars_dfs): ], ) def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): - bf_result = scalars_df_index.iloc[start:stop:step].compute() + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] # Pandas may assign non-object dtype to empty series and series index @@ -1230,7 +1425,7 @@ def test_iloc_slice_zero_step(scalars_df_index): def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.iloc[1:].iloc[1:].compute() + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas() pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] pd.testing.assert_frame_equal( @@ -1241,7 +1436,7 @@ def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index): @pytest.mark.parametrize( "index", - [0, 5], + [0, 5, -2], ) def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): bf_result = scalars_df_index.iloc[index] @@ -1261,7 +1456,7 @@ def test_iloc_single_integer_out_of_bound_error( def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[scalars_df_index.bool_col].compute() + bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] pd.testing.assert_frame_equal( @@ -1271,7 +1466,7 @@ def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, "int64_col"].compute() + bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() pd_result = scalars_pandas_df_index.loc[:, "int64_col"] pd.testing.assert_series_equal( bf_result, @@ -1279,6 +1474,32 @@ def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): ) +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!"
+ bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_series_equal( + bf_result.to_pandas().iloc[0, :], + pd_result, + ) + + @pytest.mark.parametrize( ("op"), [ @@ -1297,7 +1518,7 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op): col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] bf_series = op(scalars_df_index[col_names]) pd_series = op(scalars_pandas_df_index[col_names]) - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_series = pd_series.astype("Float64") @@ -1305,6 +1526,21 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op): pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) +def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() + pd_result = scalars_pandas_df_index[col_names].agg(["min", "max"]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Median is an approximation, but double-check that median is plausible. 
+ for col in col_names: + assert (pd_result.loc["min", col] <= bf_result[col]) and ( + bf_result[col] <= pd_result.loc["max", col] + ) + + @pytest.mark.parametrize( ("op"), [ @@ -1320,7 +1556,7 @@ def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op ) bf_series = op(scalars_df_index) pd_series = op(scalars_pandas_df_index).astype("boolean") - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() # Pandas has object index type pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) @@ -1330,7 +1566,7 @@ def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col"] bf_series = scalars_df_index[col_names].prod() pd_series = scalars_pandas_df_index[col_names].prod() - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_series = pd_series.astype("Float64") @@ -1358,7 +1594,7 @@ def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): def test_sample(scalars_dfs, frac, n, random_state): scalars_df, _ = scalars_dfs df = scalars_df.sample(frac=frac, n=n, random_state=random_state) - bf_result = df.compute() + bf_result = df.to_pandas() n = 1 if n is None else n expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n @@ -1384,7 +1620,7 @@ def test_sample_raises_value_error(scalars_dfs): def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis): if pd.__version__.startswith("1."): pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") - bf_result = scalars_df_index.add_prefix("prefix_", axis).compute() + bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas() pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis) @@ -1405,7 +1641,7 @@ def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis): def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): if pd.__version__.startswith("1."): pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") - bf_result = scalars_df_index.add_suffix("_suffix", axis).compute() + bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas() pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis) @@ -1462,11 +1698,11 @@ def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): scalars_df_index = scalars_df_index.set_index("string_col") scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") - bf_result = scalars_df_index.loc[index_list] + bf_result = scalars_df_index.loc[index_list].to_pandas() pd_result = scalars_pandas_df_index.loc[index_list] pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result, pd_result, ) @@ -1478,7 +1714,7 @@ def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.loc[index_list] pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1490,7 +1726,7 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.iloc[index_list] pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1501,7 +1737,7 @@ def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] - bf_result = bf_result.compute() + bf_result = bf_result.to_pandas() assert bf_result.shape == pd_result.shape 
# types are known to be different @@ -1510,7 +1746,7 @@ def test_rename_axis(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.rename_axis("newindexname") pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1526,7 +1762,7 @@ def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.loc[pd_string_series] pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1539,7 +1775,7 @@ def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.loc[pd_index] pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1559,7 +1795,7 @@ def test_loc_bf_index_integer_index_renamed_col( pd_result = scalars_pandas_df_index.loc[pd_index] pd.testing.assert_frame_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1568,6 +1804,7 @@ def test_loc_bf_index_integer_index_renamed_col( ("subset"), [ None, + "bool_col", ["bool_col", "int64_too"], ], ) @@ -1581,7 +1818,7 @@ def test_loc_bf_index_integer_index_renamed_col( ) def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): columns = ["bool_col", "int64_too", "int64_col"] - bf_series = scalars_df_index[columns].drop_duplicates(subset, keep=keep).compute() + bf_series = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() pd_series = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) pd.testing.assert_frame_equal( pd_series, @@ -1606,7 +1843,7 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub ) def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): columns = ["bool_col", "int64_too", "int64_col"] - bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).compute() + bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas() pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep) pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) @@ -1625,7 +1862,7 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): bf_result = ( scalars_df[["string_col", "bool_col"]] .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) - .compute() + .to_pandas() ) pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( subset, normalize=normalize, ascending=ascending, dropna=dropna @@ -1670,7 +1907,7 @@ def test_df_rank_with_nulls( ascending=ascending, numeric_only=numeric_only, ) - .compute() + .to_pandas() ) pd_result = ( scalars_pandas_df_index.drop(columns=unsupported_columns) @@ -1692,3 +1929,14 @@ def test_df_rank_with_nulls( def test_df_bool_interpretation_error(scalars_df_index): with pytest.raises(ValueError): True if scalars_df_index else False + + +def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): + job_ids = set() + repr(scalars_df_default_index) + assert scalars_df_default_index.query_job is not None + job_ids.add(scalars_df_default_index.query_job.job_id) + scalars_df_default_index.to_pandas() + job_ids.add(scalars_df_default_index.query_job.job_id) + + assert len(job_ids) == 2 diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index a4318a8658..7ad753e1bc 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -72,15 +72,17 
@@ def test_to_csv_index( gcs_folder: str, index: bool, ): + if pd.__version__.startswith("1."): + pytest.skip("date_format parameter not supported in pandas 1.x.") """Test the `to_csv` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs index_col = None if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_csv_index_{index}" + path = gcs_folder + f"test_index_df_to_csv_index_{index}*.csv" if index: index_col = scalars_df.index.name else: - path = gcs_folder + f"test_default_index_df_to_csv_index_{index}" + path = gcs_folder + f"test_default_index_df_to_csv_index_{index}*.csv" # TODO(swast): Support "date_format" parameter and make sure our # DATETIME/TIMESTAMP column export is the same format as pandas by default. @@ -90,14 +92,60 @@ def test_to_csv_index( # BigQuery-backed dataframes, so manually convert the dtypes specifically # here. dtype = scalars_df.reset_index().dtypes.to_dict() - dtype.pop("timestamp_col") dtype.pop("geography_col") + dtype.pop("rowindex") gcs_df = pd.read_csv( - path, dtype=dtype, parse_dates=["timestamp_col"], index_col=index_col + path, + dtype=dtype, + date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"}, + index_col=index_col, ) convert_pandas_dtypes(gcs_df, bytes_col=True) + gcs_df.index.name = scalars_df.index.name - assert_pandas_df_equal_ignore_ordering(gcs_df, scalars_pandas_df) + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.index = scalars_pandas_df.index.astype("int64") + + # Ordering should be maintained for tables smaller than 1 GB. + pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) + + +def test_to_csv_tabs( + scalars_dfs: Tuple[bigframes.dataframe.DataFrame, pd.DataFrame], + gcs_folder: str, +): + if pd.__version__.startswith("1."): + pytest.skip("date_format parameter not supported in pandas 1.x.") + """Test the `to_csv` API with the `sep` parameter.""" + scalars_df, scalars_pandas_df = scalars_dfs + index_col = scalars_df.index.name + path = gcs_folder + "test_to_csv_tabs*.csv" + + # TODO(swast): Support "date_format" parameter and make sure our + # DATETIME/TIMESTAMP column export is the same format as pandas by default. + scalars_df.to_csv(path, sep="\t", index=True) + + # Pandas dataframes dtypes from read_csv are not fully compatible with + # BigQuery-backed dataframes, so manually convert the dtypes specifically + # here. + dtype = scalars_df.reset_index().dtypes.to_dict() + dtype.pop("geography_col") + dtype.pop("rowindex") + gcs_df = pd.read_csv( + path, + sep="\t", + dtype=dtype, + date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"}, + index_col=index_col, + ) + convert_pandas_dtypes(gcs_df, bytes_col=True) + gcs_df.index.name = scalars_df.index.name + + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.index = scalars_pandas_df.index.astype("int64") + + # Ordering should be maintained for tables smaller than 1 GB. 
+ pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) @pytest.mark.parametrize( @@ -190,9 +238,9 @@ def test_to_json_index_invalid_orient( ): scalars_df, scalars_pandas_df = scalars_dfs if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}" + path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}" + path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl" with pytest.raises(ValueError): scalars_df.to_json(path, index=index, lines=True) @@ -208,9 +256,9 @@ def test_to_json_index_invalid_lines( ): scalars_df, scalars_pandas_df = scalars_dfs if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}" + path = gcs_folder + f"test_index_df_to_json_index_{index}.jsonl" else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}" + path = gcs_folder + f"test_default_index_df_to_json_index_{index}.jsonl" with pytest.raises(NotImplementedError): scalars_df.to_json(path, index=index) @@ -227,9 +275,9 @@ def test_to_json_index_records_orient( """Test the `to_json` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_json_index_{index}" + path = gcs_folder + f"test_index_df_to_json_index_{index}*.jsonl" else: - path = gcs_folder + f"test_default_index_df_to_json_index_{index}" + path = gcs_folder + f"test_default_index_df_to_json_index_{index}*.jsonl" """ Test the `to_json` API with `orient` is `records` and `lines` is True""" scalars_df.to_json(path, index=index, orient="records", lines=True) @@ -241,7 +289,13 @@ def test_to_json_index_records_orient( assert len(gcs_df.index) == len(scalars_pandas_df.index) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) - assert_pandas_df_equal_ignore_ordering(gcs_df, scalars_pandas_df) + + gcs_df.index.name = scalars_df.index.name + gcs_df.index = gcs_df.index.astype("Int64") + scalars_pandas_df.index = scalars_pandas_df.index.astype("Int64") + + # Ordering should be maintained for tables smaller than 1 GB. + pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) @pytest.mark.parametrize( @@ -251,10 +305,12 @@ def test_to_json_index_records_orient( def test_to_parquet_index(scalars_dfs, gcs_folder, index): """Test the `to_parquet` API with the `index` parameter.""" scalars_df, scalars_pandas_df = scalars_dfs + scalars_pandas_df = scalars_pandas_df.copy() + if scalars_df.index.name is not None: - path = gcs_folder + f"test_index_df_to_parquet_{index}" + path = gcs_folder + f"test_index_df_to_parquet_{index}*.parquet" else: - path = gcs_folder + f"test_default_index_df_to_parquet_{index}" + path = gcs_folder + f"test_default_index_df_to_parquet_{index}*.parquet" # TODO(b/268693993): Type GEOGRAPHY is not currently supported for parquet. scalars_df = scalars_df.drop(columns="geography_col") @@ -265,20 +321,26 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index): # table. 
scalars_df.to_parquet(path, index=index) - gcs_df = pd.read_parquet(path) + gcs_df = pd.read_parquet(path.replace("*", "000000000000")) convert_pandas_dtypes(gcs_df, bytes_col=False) if index and scalars_df.index.name is not None: gcs_df = gcs_df.set_index(scalars_df.index.name) assert len(gcs_df.index) == len(scalars_pandas_df.index) pd.testing.assert_index_equal(gcs_df.columns, scalars_pandas_df.columns) - assert_pandas_df_equal_ignore_ordering(gcs_df, scalars_pandas_df) + + gcs_df.index.name = scalars_df.index.name + gcs_df.index = gcs_df.index.astype("Int64") + scalars_pandas_df.index = scalars_pandas_df.index.astype("Int64") + + # Ordering should be maintained for tables smaller than 1 GB. + pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) def test_to_sql_query_named_index_included( session, scalars_df_index, scalars_pandas_df_index ): - sql, index_columns = scalars_df_index.to_sql_query(always_include_index=True) + sql, index_columns = scalars_df_index._to_sql_query(always_include_index=True) assert len(index_columns) == 1 index_column, is_named = index_columns[0] assert index_column == "rowindex" @@ -294,7 +356,7 @@ def test_to_sql_query_unnamed_index_excluded( session, scalars_df_default_index, scalars_pandas_df_default_index ): # The .sql property should return SQL without the unnamed indexes - sql, index_columns = scalars_df_default_index.to_sql_query( + sql, index_columns = scalars_df_default_index._to_sql_query( always_include_index=False ) assert len(index_columns) == 0 @@ -310,7 +372,7 @@ def test_to_sql_query_unnamed_index_always_include( scalars_df_default_index: bigframes.dataframe.DataFrame, scalars_pandas_df_default_index, ): - sql, index_columns = scalars_df_default_index.to_sql_query( + sql, index_columns = scalars_df_default_index._to_sql_query( always_include_index=True ) assert len(index_columns) == 1 diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 066e20bb12..e72d75729b 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -41,11 +41,27 @@ def test_dataframe_groupby_numeric_aggregate( col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = operator(scalars_df_index[col_names].groupby("string_col")) pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) - bf_result_computed = bf_result.compute() + bf_result_computed = bf_result.to_pandas() # Pandas std function produces float64, not matching Float64 from bigframes pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col").median(numeric_only=True) + ) + pd_min = ( + scalars_pandas_df_index[col_names].groupby("string_col").min(numeric_only=True) + ) + pd_max = ( + scalars_pandas_df_index[col_names].groupby("string_col").max(numeric_only=True) + ) + bf_result_computed = bf_result.to_pandas() + # Median is approximate. Just check for plausibility. 
+ assert ((pd_min <= bf_result_computed) & (bf_result_computed <= pd_max)).all().all() + + @pytest.mark.parametrize( ("operator"), [ @@ -65,7 +81,7 @@ def test_dataframe_groupby_aggregate( col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = operator(scalars_df_index[col_names].groupby("string_col")) pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) - bf_result_computed = bf_result.compute() + bf_result_computed = bf_result.to_pandas() pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) @@ -91,7 +107,7 @@ def test_dataframe_groupby_multi_sum( .groupby(["bool_col", "int64_col"], as_index=as_index) .sum(numeric_only=True) ) - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() if not as_index: # BigQuery DataFrames default indices use nullable Int64 always @@ -124,6 +140,36 @@ def test_dataframe_groupby_analytic( col_names = ["float64_col", "int64_col", "bool_col", "string_col"] bf_result = operator(scalars_df_index[col_names].groupby("string_col")) pd_result = operator(scalars_pandas_df_index[col_names].groupby("string_col")) - bf_result_computed = bf_result.compute() + bf_result_computed = bf_result.to_pandas() pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + + +def test_dataframe_groupby_getitem( + scalars_df_index, + scalars_pandas_df_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col")["int64_col"].min().to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[col_names].groupby("string_col")["int64_col"].min() + ) + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_dataframe_groupby_getitem_list( + scalars_df_index, + scalars_pandas_df_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col")[col_names].min().to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[col_names].groupby("string_col")[col_names].min() + ) + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py new file mode 100644 index 0000000000..58b78e0048 --- /dev/null +++ b/tests/system/small/test_ibis.py @@ -0,0 +1,39 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for monkeypatched ibis code.""" + +import ibis.expr.types as ibis_types + +import bigframes +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops + + +def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): + num_bins = 3 + ibis_client = session.ibis_client + _, dataset, table_id = scalars_table_id.split(".") + ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) + ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] + quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( # type: ignore + ibis_column, num_bins=num_bins + ).to_expr() + value = quantiles[1] + num_edges = quantiles.length() + + sql = ibis_client.compile(value) + num_edges_result = num_edges.to_pandas() + + assert "APPROX_QUANTILES" in sql + assert num_edges_result == num_bins + 1 diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 312e86d7f2..ac1f8c7220 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -19,7 +19,7 @@ def test_get_index(scalars_df_index, scalars_pandas_df_index): index = scalars_df_index.index - bf_result = index.compute() + bf_result = index.to_pandas() pd_result = scalars_pandas_df_index.index assert_pandas_index_equal_ignore_index_type(bf_result, pd_result) @@ -40,7 +40,13 @@ def test_index_len(scalars_df_index, scalars_pandas_df_index): def test_index_array(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.__array__() - pd_result = scalars_pandas_df_index.__array__() + bf_result = scalars_df_index.index.__array__() + pd_result = scalars_pandas_df_index.index.__array__() numpy.array_equal(bf_result, pd_result) + + +def test_index_getitem_int(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.index[-2] + pd_result = scalars_pandas_df_index.index[-2] + assert bf_result == pd_result diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index bf2a2080de..914be6dae4 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,10 +16,11 @@ import pytest import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal_ignore_ordering def test_set_multi_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.set_index(["bool_col", "int64_too"]).compute() + bf_result = scalars_df_index.set_index(["bool_col", "int64_too"]).to_pandas() pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"]) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -27,7 +28,7 @@ def test_set_multi_index(scalars_df_index, scalars_pandas_df_index): def test_reset_multi_index(scalars_df_index, scalars_pandas_df_index): bf_result = ( - scalars_df_index.set_index(["bool_col", "int64_too"]).reset_index().compute() + scalars_df_index.set_index(["bool_col", "int64_too"]).reset_index().to_pandas() ) pd_result = scalars_pandas_df_index.set_index( ["bool_col", "int64_too"] @@ -51,7 +52,7 @@ def test_binop_series_series_matching_multi_indices( pd_result = pd_left["int64_col"] + pd_right["int64_too"] pandas.testing.assert_series_equal( - bf_result.sort_index().compute(), pd_result.sort_index() + bf_result.sort_index().to_pandas(), pd_result.sort_index() ) @@ -67,7 +68,7 @@ def test_binop_df_series_matching_multi_indices( pd_result = pd_left[["int64_col", "int64_too"]].add(pd_right["int64_too"], axis=0) pandas.testing.assert_frame_equal( - 
bf_result.sort_index().compute(), pd_result.sort_index() + bf_result.sort_index().to_pandas(), pd_result.sort_index() ) @@ -80,7 +81,7 @@ def test_binop_multi_index_mono_index(scalars_df_index, scalars_pandas_df_index) bf_result = bf_left["int64_col"] + bf_right["int64_too"] pd_result = pd_left["int64_col"] + pd_right["int64_too"] - pandas.testing.assert_series_equal(bf_result.compute(), pd_result) + pandas.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_binop_overlapping_multi_indices(scalars_df_index, scalars_pandas_df_index): @@ -93,7 +94,7 @@ def test_binop_overlapping_multi_indices(scalars_df_index, scalars_pandas_df_ind pd_result = pd_left["int64_col"] + pd_right["int64_too"] pandas.testing.assert_series_equal( - bf_result.sort_index().compute(), pd_result.sort_index() + bf_result.sort_index().to_pandas(), pd_result.sort_index() ) @@ -108,7 +109,7 @@ def test_concat_compatible_multi_indices(scalars_df_index, scalars_pandas_df_ind bf_result = bpd.concat([bf_left, bf_right]) pd_result = pandas.concat([pd_left, pd_right]) - pandas.testing.assert_frame_equal(bf_result.compute(), pd_result) + pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) def test_concat_multi_indices_ignore_index(scalars_df_index, scalars_pandas_df_index): @@ -123,12 +124,12 @@ def test_concat_multi_indices_ignore_index(scalars_df_index, scalars_pandas_df_i # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pandas.Int64Dtype()) - pandas.testing.assert_frame_equal(bf_result.compute(), pd_result) + pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) def test_multi_index_loc(scalars_df_index, scalars_pandas_df_index): bf_result = ( - scalars_df_index.set_index(["int64_too", "bool_col"]).loc[[2, 0]].compute() + scalars_df_index.set_index(["int64_too", "bool_col"]).loc[[2, 0]].to_pandas() ) pd_result = scalars_pandas_df_index.set_index(["int64_too", "bool_col"]).loc[[2, 0]] @@ -139,7 +140,7 @@ def test_multi_index_getitem_bool(scalars_df_index, scalars_pandas_df_index): bf_frame = scalars_df_index.set_index(["int64_too", "bool_col"]) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) - bf_result = bf_frame[bf_frame["int64_col"] > 0].compute() + bf_result = bf_frame[bf_frame["int64_col"] > 0].to_pandas() pd_result = pd_frame[pd_frame["int64_col"] > 0] pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -159,12 +160,30 @@ def test_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, level) bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) - bf_result = bf_frame.droplevel(level).compute() + bf_result = bf_frame.droplevel(level).to_pandas() pd_result = pd_frame.droplevel(level) pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("labels", "level"), + [ + (1, 0), + ([0, 1], 0), + ([True, None], 1), + ], +) +def test_multi_index_drop(scalars_df_index, scalars_pandas_df_index, labels, level): + bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) + + bf_result = bf_frame.drop(labels=labels, axis="index", level=level).to_pandas() + pd_result = pd_frame.drop(labels=labels, axis="index", level=level) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("order"), [ @@ -182,7 +201,7 @@ def 
test_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_index, o bf_frame = scalars_df_index.set_index(["int64_too", "bool_col", "int64_col"]) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col", "int64_col"]) - bf_result = bf_frame.reorder_levels(order).compute() + bf_result = bf_frame.reorder_levels(order).to_pandas() pd_result = pd_frame.reorder_levels(order) pandas.testing.assert_frame_equal(bf_result, pd_result) @@ -194,7 +213,7 @@ def test_multi_index_series_groupby(scalars_df_index, scalars_pandas_df_index): bf_frame["float64_col"] .groupby([bf_frame.int64_col % 2, "bool_col"]) .mean() - .compute() + .to_pandas() ) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) pd_result = ( @@ -220,7 +239,7 @@ def test_multi_index_series_groupby_level( scalars_df_index.set_index(["int64_too", "bool_col"])["float64_col"] .groupby(level=level) .mean() - .compute() + .to_pandas() ) pd_result = ( scalars_pandas_df_index.set_index(["int64_too", "bool_col"])["float64_col"] @@ -236,7 +255,7 @@ def test_multi_index_dataframe_groupby(scalars_df_index, scalars_pandas_df_index bf_result = ( bf_frame.groupby([bf_frame.int64_col % 2, "bool_col"]) .mean(numeric_only=True) - .compute() + .to_pandas() ) pd_frame = scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) pd_result = pd_frame.groupby([pd_frame.int64_col % 2, "bool_col"]).mean( @@ -247,27 +266,180 @@ def test_multi_index_dataframe_groupby(scalars_df_index, scalars_pandas_df_index @pytest.mark.parametrize( - ("level"), + ("level", "as_index"), [ - (1), - ([0]), - (["bool_col"]), - (["bool_col", "int64_too"]), + (1, True), + ([0], False), + (["bool_col"], True), + (["bool_col", "int64_too"], False), ], ) -def test_multi_index_dataframe_groupby_level( - scalars_df_index, scalars_pandas_df_index, level +def test_multi_index_dataframe_groupby_level_aggregate( + scalars_df_index, scalars_pandas_df_index, level, as_index ): bf_result = ( scalars_df_index.set_index(["int64_too", "bool_col"]) - .groupby(level=level) + .groupby(level=level, as_index=as_index) .mean(numeric_only=True) - .compute() + .to_pandas() ) pd_result = ( scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) - .groupby(level=level) + .groupby(level=level, as_index=as_index) .mean(numeric_only=True) ) - pandas.testing.assert_frame_equal(bf_result, pd_result) + # Pandas will have int64 index, while bigquery will have Int64 when resetting + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("level", "as_index"), + [ + (1, True), + ([0], False), + ( + ["bool_col"], + True, + ), + (["bool_col", "int64_too"], False), + ], +) +def test_multi_index_dataframe_groupby_level_analytic( + scalars_df_index, scalars_pandas_df_index, level, as_index +): + bf_result = ( + scalars_df_index.set_index(["int64_too", "bool_col"]) + .groupby(level=level, as_index=as_index, dropna=False) + .cumsum(numeric_only=True) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) + .groupby(level=level, as_index=as_index, dropna=False) + .cumsum(numeric_only=True) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +all_joins = pytest.mark.parametrize( + ("how",), + ( + ("outer",), + ("left",), + ("right",), + ("inner",), + ), +) + + +@all_joins +# Both DFs are multi-index +def test_multi_index_dataframe_join(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df.set_index((["bool_col", 
"rowindex_2"]))[["string_col", "int64_col"]] + bf_df_b = bf_df.assign(rowindex_2=bf_df["rowindex_2"] + 2).set_index( + (["bool_col", "rowindex_2"]) + )[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + + pd_df_a = pd_df.set_index((["bool_col", "rowindex_2"]))[["string_col", "int64_col"]] + pd_df_b = pd_df.assign(rowindex_2=pd_df["rowindex_2"] + 2).set_index( + (["bool_col", "rowindex_2"]) + )[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, how=how) + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@all_joins +# Only left DF is multi-index +def test_multi_index_dataframe_join_on(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df.set_index((["int64_too", "bool_col"]))[ + ["string_col", "int64_col", "rowindex_2"] + ] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_df_b = bf_df[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df.set_index((["int64_too", "bool_col"]))[ + ["string_col", "int64_col", "rowindex_2"] + ] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("level",), + [ + (1,), + ([0],), + (["bool_col"],), + (["bool_col", "int64_too"],), + ], +) +def test_multi_index_series_groupby_level_aggregate( + scalars_df_index, scalars_pandas_df_index, level +): + bf_result = ( + scalars_df_index.set_index(["int64_too", "bool_col"])["float64_col"] + .groupby(level=level) + .mean() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index.set_index(["int64_too", "bool_col"])["float64_col"] + .groupby(level=level) + .mean() + ) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("level",), + [ + (1,), + ([0],), + (["bool_col"],), + (["bool_col", "int64_too"],), + ], +) +def test_multi_index_series_groupby_level_analytic( + scalars_df_index, scalars_pandas_df_index, level +): + bf_result = ( + scalars_df_index.set_index(["int64_too", "bool_col"])["float64_col"] + .groupby(level=level, dropna=False) + .cumsum() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index.set_index(["int64_too", "bool_col"])["float64_col"] + .groupby(level=level, dropna=False) + .cumsum() + ) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_multi_index_series_rename_dict_same_type( + scalars_df_index, scalars_pandas_df_index +): + bf_result = ( + scalars_df_index.set_index(["rowindex_2", "int64_too"])["string_col"] + .rename({1: 100, 2: 200}) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index(["rowindex_2", "int64_too"])[ + "string_col" + ].rename({1: 100, 2: 200}) + + pandas.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a612ad946f..98bafc6392 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -21,7 +21,7 @@ def test_concat_dataframe(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat(11 * [scalars_df]) - bf_result = bf_result.compute() + bf_result = bf_result.to_pandas() pd_result = pd.concat(11 * [scalars_pandas_df]) pd.testing.assert_frame_equal(bf_result, pd_result) @@ -32,7 +32,7 @@ def test_concat_series(scalars_dfs): bf_result = 
bpd.concat( [scalars_df.int64_col, scalars_df.int64_too, scalars_df.int64_col] ) - bf_result = bf_result.compute() + bf_result = bf_result.to_pandas() pd_result = pd.concat( [ scalars_pandas_df.int64_col, @@ -45,10 +45,10 @@ def test_concat_series(scalars_dfs): @pytest.mark.parametrize( - ("how",), + ("how"), [ - ("inner",), - ("outer",), + ("inner"), + ("outer"), ], ) def test_concat_dataframe_mismatched_columns(scalars_dfs, how): @@ -56,9 +56,52 @@ def test_concat_dataframe_mismatched_columns(scalars_dfs, how): cols2 = ["int64_col", "string_col", "int64_too"] scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat([scalars_df[cols1], scalars_df[cols2]], join=how) - bf_result = bf_result.compute() + bf_result = bf_result.to_pandas() pd_result = pd.concat( - [scalars_pandas_df[cols1], scalars_pandas_df[cols2]], join=how + [scalars_pandas_df[cols1], scalars_pandas_df[cols2]], + join=how, ) pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("how",), + [ + ("inner",), + ("outer",), + ], +) +def test_concat_axis_1(scalars_dfs, how): + if pd.__version__.startswith("1."): + pytest.skip("pandas has different behavior in 1.x") + scalars_df, scalars_pandas_df = scalars_dfs + cols1 = ["int64_col", "float64_col", "rowindex_2"] + cols2 = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + part1 = scalars_df[cols1] + part1.index.name = "newindexname" + # Offset the rows somewhat so that outer join can have an effect. + part2 = ( + scalars_df[cols2] + .assign(rowindex_2=scalars_df["rowindex_2"] + 2) + .sort_values(["string_col"], kind="stable") + ) + part3 = scalars_df["int64_too"].cumsum().iloc[2:] + + bf_result = bpd.concat([part1, part2, part3], join=how, axis=1) + + # Copy since modifying index + pd_part1 = scalars_pandas_df.copy()[cols1] + pd_part1.index.name = "newindexname" + # Offset the rows somewhat so that outer join can have an effect. + pd_part2 = ( + scalars_pandas_df[cols2] + .assign(rowindex_2=scalars_pandas_df["rowindex_2"] + 2) + .sort_values(["string_col"], kind="stable") + ) + pd_part3 = scalars_pandas_df["int64_too"].cumsum().iloc[2:] + + pd_result = pd.concat([pd_part1, pd_part2, pd_part3], join=how, axis=1) + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index 261e0d5b14..96697dbcab 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -12,7 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
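# Illustrative sketch (not from the patch): the new test_concat_axis_1 case above
# compares bigframes against this plain-pandas behavior. With axis=1, concat
# aligns the inputs on their index labels, keeping only shared labels for
# join="inner" and the NA-filled union for join="outer". The toy frames below
# are made up for illustration.
import pandas as pd

left = pd.DataFrame({"a": [1, 2, 3]}, index=[0, 1, 2])
right = pd.DataFrame({"b": [10, 20, 30]}, index=[1, 2, 3])

outer = pd.concat([left, right], axis=1, join="outer")  # index 0..3, missing cells become NaN
inner = pd.concat([left, right], axis=1, join="inner")  # index [1, 2] only, no NaN introduced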
+import datetime +from unittest import mock + import google.api_core.exceptions +import google.auth +import google.auth.exceptions import pytest import bigframes.pandas as bpd @@ -79,6 +84,10 @@ def test_read_gbq_start_sets_session_location( # There should still be the previous location set in the bigquery options assert bpd.options.bigquery.location == tokyo_location + # Reset the location to be able to query another location + bpd.options.bigquery.location = None + assert not bpd.options.bigquery.location + # Starting over the user journey with read_gbq* should work for a table # in another location, in this case US df = read_method(query) @@ -143,3 +152,188 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # read_gbq* from a table in the default location should work df = read_method(query) assert df is not None + + +@pytest.mark.parametrize( + ("read_method", "query_prefix"), + [ + (bpd.read_gbq, None), + (bpd.read_gbq, "SELECT COUNT(1) FROM "), + (bpd.read_gbq_table, None), + (bpd.read_gbq_query, "SELECT COUNT(1) FROM "), + ], + ids=[ + "read_gbq-on-table-name", + "read_gbq-on-sql", + "read_gbq_table-on-table-name", + "read_gbq_query-on-sql", + ], +) +def test_read_gbq_must_comply_with_set_location_US( + test_data_tables, + test_data_tables_tokyo, + dataset_id_permanent_tokyo, + read_method, + query_prefix, +): + # Form query as a table name or a SQL depending on the test scenario + query_tokyo = test_data_tables_tokyo["scalars"] + query = test_data_tables["scalars"] + if query_prefix: + query_tokyo = f"{query_prefix} {query_tokyo}" + query = f"{query_prefix} {query}" + + # Initially there is no location set in the bigquery options + assert not bpd.options.bigquery.location + + # Explicitly set location + bpd.options.bigquery.location = "US" + assert bpd.options.bigquery.location == "US" + + # Starting user journey with read_gbq* from another location should fail + with pytest.raises( + google.api_core.exceptions.NotFound, + match=f"404 Not found: Dataset {dataset_id_permanent_tokyo} was not found in location US", + ): + read_method(query_tokyo) + + # Starting user journey with read_gbq* should work for a table in the same + # location, in this case US + df = read_method(query) + assert df is not None + + +@pytest.mark.parametrize( + ("read_method", "query_prefix"), + [ + (bpd.read_gbq, None), + (bpd.read_gbq, "SELECT COUNT(1) FROM "), + (bpd.read_gbq_table, None), + (bpd.read_gbq_query, "SELECT COUNT(1) FROM "), + ], + ids=[ + "read_gbq-on-table-name", + "read_gbq-on-sql", + "read_gbq_table-on-table-name", + "read_gbq_query-on-sql", + ], +) +def test_read_gbq_must_comply_with_set_location_non_US( + tokyo_location, + test_data_tables, + test_data_tables_tokyo, + dataset_id_permanent, + read_method, + query_prefix, +): + # Form query as a table name or a SQL depending on the test scenario + query_tokyo = test_data_tables_tokyo["scalars"] + query = test_data_tables["scalars"] + if query_prefix: + query_tokyo = f"{query_prefix} {query_tokyo}" + query = f"{query_prefix} {query}" + + # Initially there is no location set in the bigquery options + assert not bpd.options.bigquery.location + + # Explicitly set location + bpd.options.bigquery.location = tokyo_location + assert bpd.options.bigquery.location == tokyo_location + + # Starting user journey with read_gbq* from another location should fail + with pytest.raises( + google.api_core.exceptions.NotFound, + match=f"404 Not found: Dataset {dataset_id_permanent} was not found in location {tokyo_location}", + ): +
read_method(query) + + # Starting user journey with read_gbq* should work for a table in the same + # location, in this case tokyo + df = read_method(query_tokyo) + assert df is not None + + +def test_reset_session_after_bq_session_ended(): + # Use a simple test query to verify that default session works to interact + # with BQ + test_query = "SELECT 1" + + # Confirm that there is a session id in the default session + session = bpd.get_global_session() + assert session._session_id + + # Confirm that session works as usual + df = bpd.read_gbq(test_query) + assert df is not None + + # Abort the session to simulate the auto-expiration + # https://cloud.google.com/bigquery/docs/sessions-terminating#auto-terminate_a_session + abort_session_query = "CALL BQ.ABORT_SESSION()" + query_job = session.bqclient.query(abort_session_query) + query_job.result() # blocks until finished + + # Confirm that session is unusable to run any jobs + with pytest.raises( + google.api_core.exceptions.BadRequest, + match=f"Session {session._session_id} has expired and is no longer available.", + ): + query_job = session.bqclient.query(test_query) + query_job.result() # blocks until finished + + # Confirm that as a result bigframes.pandas interface is unusable + with pytest.raises( + google.api_core.exceptions.BadRequest, + match=f"Session {session._session_id} has expired and is no longer available.", + ): + bpd.read_gbq(test_query) + + # Now try to reset session and verify that it works + bpd.reset_session() + assert bpd._global_session is None + + # Now verify that use is able to start over + df = bpd.read_gbq(test_query) + assert df is not None + + +def test_reset_session_after_credentials_need_reauthentication(monkeypatch): + # Use a simple test query to verify that default session works to interact + # with BQ + test_query = "SELECT 1" + + # Confirm that default session has BQ client with valid credentials + session = bpd.get_global_session() + assert session.bqclient._credentials.valid + + # Confirm that default session works as usual + df = bpd.read_gbq(test_query) + assert df is not None + + with monkeypatch.context() as m: + # Simulate expired credentials to trigger the credential refresh flow + m.setattr(session.bqclient._credentials, "expiry", datetime.datetime.utcnow()) + assert not session.bqclient._credentials.valid + + # Simulate an exception during the credential refresh flow + m.setattr( + session.bqclient._credentials, + "refresh", + mock.Mock(side_effect=google.auth.exceptions.RefreshError()), + ) + + # Confirm that session is unusable to run any jobs + with pytest.raises(google.auth.exceptions.RefreshError): + query_job = session.bqclient.query(test_query) + query_job.result() # blocks until finished + + # Confirm that as a result bigframes.pandas interface is unusable + with pytest.raises(google.auth.exceptions.RefreshError): + bpd.read_gbq(test_query) + + # Now verify that resetting the session works + bpd.reset_session() + assert bpd._global_session is None + + # Now verify that use is able to start over + df = bpd.read_gbq(test_query) + assert df is not None diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 6eeee3a3a4..00380c2639 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
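# Illustrative sketch (not from the patch): the user-facing recovery flow that
# test_reset_session_after_bq_session_ended and
# test_reset_session_after_credentials_need_reauthentication above simulate.
# Once the backing BigQuery session has expired or credentials can no longer be
# refreshed, discarding the global session lets bigframes.pandas start over.
import google.api_core.exceptions
import google.auth.exceptions

import bigframes.pandas as bpd

try:
    df = bpd.read_gbq("SELECT 1")
except (
    google.api_core.exceptions.BadRequest,  # e.g. "Session ... has expired"
    google.auth.exceptions.RefreshError,
):
    bpd.reset_session()  # drop the stale global session
    df = bpd.read_gbq("SELECT 1")  # the next call creates a fresh session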
+import tempfile + +import pandas as pd + import bigframes as bf import bigframes.formatting_helpers as formatting_helpers @@ -20,11 +24,12 @@ def test_progress_bar_dataframe( penguins_df_default_index: bf.dataframe.DataFrame, capsys ): bf.options.display.progress_bar = "notebook" - penguins_df_default_index.compute() + penguins_df_default_index.to_pandas() html_check = "HTML(value=" open_job_check = "Open Job" lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) + lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 assert penguins_df_default_index.query_job is not None for line in lines: assert html_check in line and open_job_check in line @@ -33,11 +38,12 @@ def test_progress_bar_dataframe( def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys): bf.options.display.progress_bar = "notebook" series = penguins_df_default_index["body_mass_g"].head(10) - series.compute() + series.to_pandas() html_check = "HTML(value=" open_job_check = "Open Job" lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) + lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 assert series.query_job is not None for line in lines: assert html_check in line and open_job_check in line @@ -49,18 +55,63 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, html_check = "HTML(value=" open_job_check = "Open Job" lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) + lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 for line in lines: assert html_check in line and open_job_check in line -def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): +def test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys): + bf.options.display.progress_bar = "notebook" + session.read_gbq(penguins_table_id) + html_check = "HTML(value=" + open_job_check = "Open Job" + lines = capsys.readouterr().out.split("\n") + lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 + for line in lines: + assert html_check in line and open_job_check in line + + +def test_progress_bar_extract_jobs( + penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys +): + bf.options.display.progress_bar = "notebook" + path = gcs_folder + "test_read_csv_progress_bar*.csv" + penguins_df_default_index.to_csv(path) + html_check = "HTML(value=" + open_job_check = "Open Job" + lines = capsys.readouterr().out.split("\n") + lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 + for line in lines: + assert html_check in line and open_job_check in line + + +def test_progress_bar_load_jobs( + session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys +): + bf.options.display.progress_bar = "notebook" + with tempfile.TemporaryDirectory() as dir: + path = dir + "/test_read_csv_progress_bar*.csv" + penguins_pandas_df_default_index.to_csv(path, index=False) + session.read_csv(path) + html_check = "HTML(value=" + open_job_check = "Open Job" + lines = capsys.readouterr().out.split("\n") + lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 + for line in lines: + assert html_check in line and open_job_check in line + + +def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): bf.options.display.progress_bar = "notebook" penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( False ) - 
penguins_df_default_index.compute() - query_job_repr = formatting_helpers.repr_query_job( + penguins_df_default_index.to_pandas() + query_job_repr = formatting_helpers.repr_query_job_html( penguins_df_default_index.query_job ).value string_checks = [ @@ -72,3 +123,33 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): ] for string in string_checks: assert string in query_job_repr + + +def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): + penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + False + ) + penguins_df_default_index.to_pandas() + query_job_repr = formatting_helpers.repr_query_job( + penguins_df_default_index.query_job + ) + string_checks = [ + "Job", + "Destination Table", + "Slot Time", + "Bytes Processed", + "Cache hit", + ] + for string in string_checks: + assert string in query_job_repr + + +def test_query_job_dry_run( + penguins_df_default_index: bf.dataframe.DataFrame, capsys, deferred_repr +): + repr(penguins_df_default_index) + repr(penguins_df_default_index["body_mass_g"]) + lines = capsys.readouterr().out.split("\n") + lines = filter(None, lines) + for line in lines: + assert "Computation deferred. Computation will process" in line diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 4c75b5d3d2..fe4b1c5a97 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -16,7 +16,7 @@ import pytest import bigframes -from bigframes.remote_function import remote_function +from bigframes.remote_function import read_gbq_function, remote_function from tests.system.utils import assert_pandas_df_equal_ignore_ordering @@ -28,6 +28,38 @@ def bq_cf_connection() -> str: return "bigframes-rf-conn" +@pytest.fixture(scope="module") +def bq_cf_connection_location() -> str: + """Pre-created BQ connection to invoke cloud function for bigframes-dev + $ bq show --connection --location=us --project_id=bigframes-dev bigframes-rf-conn + """ + return "us.bigframes-rf-conn" + + +@pytest.fixture(scope="module") +def bq_cf_connection_location_mistached() -> str: + """Pre-created BQ connection to invoke cloud function for bigframes-dev + $ bq show --connection --location=us-east1 --project_id=bigframes-dev bigframes-rf-conn + """ + return "us-east1.bigframes-rf-conn" + + +@pytest.fixture(scope="module") +def bq_cf_connection_location_project() -> str: + """Pre-created BQ connection to invoke cloud function for bigframes-dev + $ bq show --connection --location=us --project_id=bigframes-dev bigframes-rf-conn + """ + return "bigframes-dev.us.bigframes-rf-conn" + + +@pytest.fixture(scope="module") +def bq_cf_connection_location_project_mistached() -> str: + """Pre-created BQ connection to invoke cloud function for bigframes-dev + $ bq show --connection --location=us-east1 --project_id=bigframes-metrics bigframes-rf-conn + """ + return "bigframes-metrics.us-east1.bigframes-rf-conn" + + @pytest.fixture(scope="module") def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: return bigframes.Session( @@ -35,10 +67,42 @@ def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: ) +@pytest.fixture(scope="module") +def session_with_bq_connection_location_specified( + bq_cf_connection_location, +) -> bigframes.Session: + return bigframes.Session( + bigframes.BigQueryOptions(remote_udf_connection=bq_cf_connection_location) + ) + + +@pytest.fixture(scope="module") +def 
session_with_bq_connection_location_mistached( + bq_cf_connection_location_mistached, +) -> bigframes.Session: + return bigframes.Session( + bigframes.BigQueryOptions( + remote_udf_connection=bq_cf_connection_location_mistached + ) + ) + + +@pytest.fixture(scope="module") +def session_with_bq_connection_location_project_specified( + bq_cf_connection_location_project, +) -> bigframes.Session: + return bigframes.Session( + bigframes.BigQueryOptions( + remote_udf_connection=bq_cf_connection_location_project + ) + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, + cloudfunctions_client, scalars_dfs, dataset_id_permanent, bq_cf_connection, @@ -48,6 +112,7 @@ def test_remote_function_direct_no_session_param( int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, + cloud_functions_client=cloudfunctions_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. @@ -56,13 +121,112 @@ def test_remote_function_direct_no_session_param( def square(x): return x * x + assert square.bigframes_remote_function + assert square.bigframes_cloud_function + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pd.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_direct_no_session_param_location_specified( + bigquery_client, + bigqueryconnection_client, + cloudfunctions_client, + scalars_dfs, + dataset_id_permanent, + bq_cf_connection_location, +): + @remote_function( + [int], + int, + bigquery_client=bigquery_client, + bigquery_connection_client=bigqueryconnection_client, + cloud_functions_client=cloudfunctions_client, + dataset=dataset_id_permanent, + bigquery_connection=bq_cf_connection_location, + # See e2e tests for tests that actually deploy the Cloud Function. 
+ reuse=True, + ) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pd.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_direct_no_session_param_location_mistached( + bigquery_client, + bigqueryconnection_client, + cloudfunctions_client, + scalars_dfs, + dataset_id_permanent, + bq_cf_connection_location_mistached, +): + @remote_function( + [int], + int, + bigquery_client=bigquery_client, + bigquery_connection_client=bigqueryconnection_client, + cloud_functions_client=cloudfunctions_client, + dataset=dataset_id_permanent, + bigquery_connection=bq_cf_connection_location_mistached, + # See e2e tests for tests that actually deploy the Cloud Function. + reuse=True, + ) + def square(x): + return x * x + scalars_df, scalars_pandas_df = scalars_dfs bf_int64_col = scalars_df["int64_col"] bf_int64_col_filter = bf_int64_col.notnull() bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) pd_int64_col = scalars_pandas_df["int64_col"] pd_int64_col_filter = pd_int64_col.notnull() @@ -78,6 +242,78 @@ def square(x): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_direct_no_session_param_location_project_specified( + bigquery_client, + bigqueryconnection_client, + cloudfunctions_client, + scalars_dfs, + dataset_id_permanent, + bq_cf_connection_location_project, +): + @remote_function( + [int], + int, + bigquery_client=bigquery_client, + bigquery_connection_client=bigqueryconnection_client, + cloud_functions_client=cloudfunctions_client, + dataset=dataset_id_permanent, + bigquery_connection=bq_cf_connection_location_project, + # See e2e tests for tests that actually deploy the Cloud Function. 
+ reuse=True, + ) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pd.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_direct_no_session_param_project_mismatched( + bigquery_client, + bigqueryconnection_client, + cloudfunctions_client, + dataset_id_permanent, + bq_cf_connection_location_project_mistached, +): + with pytest.raises(ValueError): + + @remote_function( + [int], + int, + bigquery_client=bigquery_client, + bigquery_connection_client=bigqueryconnection_client, + cloud_functions_client=cloudfunctions_client, + dataset=dataset_id_permanent, + bigquery_connection=bq_cf_connection_location_project_mistached, + # See e2e tests for tests that actually deploy the Cloud Function. + reuse=True, + ) + def square(x): + return x * x + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): @remote_function( @@ -94,7 +330,9 @@ def square(x): bf_int64_col_filter = bf_int64_col.notnull() bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) pd_int64_col = scalars_pandas_df["int64_col"] pd_int64_col_filter = pd_int64_col.notnull() @@ -129,7 +367,9 @@ def square(x): bf_int64_col_filter = bf_int64_col.notnull() bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) pd_int64_col = scalars_pandas_df["int64_col"] pd_int64_col_filter = pd_int64_col.notnull() @@ -166,7 +406,9 @@ def square(x): bf_int64_col_filter = bf_int64_col.notnull() bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) pd_int64_col = scalars_pandas_df["int64_col"] pd_int64_col_filter = pd_int64_col.notnull() @@ -210,7 +452,9 @@ def square(x): bf_int64_col_filter = bf_int64_col.notnull() bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = 
bf_int64_col_filtered.to_frame().assign(result=bf_result_col).compute() + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) pd_int64_col = scalars_pandas_df["int64_col"] pd_int64_col_filter = pd_int64_col.notnull() @@ -238,7 +482,7 @@ def add_one(x): bf_int64_df = scalars_df[int64_cols] bf_int64_df_filtered = bf_int64_df.dropna() - bf_result = bf_int64_df_filtered.applymap(remote_add_one).compute() + bf_result = bf_int64_df_filtered.applymap(remote_add_one).to_pandas() pd_int64_df = scalars_pandas_df[int64_cols] pd_int64_df_filtered = pd_int64_df.dropna() @@ -264,7 +508,7 @@ def add_one(x): int64_cols = ["int64_col", "int64_too"] bf_int64_df = scalars_df[int64_cols] - bf_result = bf_int64_df.applymap(remote_add_one, na_action="ignore").compute() + bf_result = bf_int64_df.applymap(remote_add_one, na_action="ignore").to_pandas() pd_int64_df = scalars_pandas_df[int64_cols] pd_result = pd_int64_df.applymap(add_one, na_action="ignore") @@ -276,3 +520,56 @@ def add_one(x): pd_result[col] = pd_result[col].astype(pd_int64_df[col].dtype) assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_like_original( + bigquery_client, + bigqueryconnection_client, + cloudfunctions_client, + scalars_df_index, + dataset_id_permanent, + bq_cf_connection, +): + @remote_function( + [int], + int, + bigquery_client=bigquery_client, + bigquery_connection_client=bigqueryconnection_client, + dataset=dataset_id_permanent, + cloud_functions_client=cloudfunctions_client, + bigquery_connection=bq_cf_connection, + reuse=True, + ) + def square1(x): + return x * x + + square2 = read_gbq_function( + function_name=square1.bigframes_remote_function, + bigquery_client=bigquery_client, + ) + + # The newly-created function (square1) should have a remote function AND a + # cloud function associated with it, while the read-back version (square2) + # should only have a remote function. + assert square1.bigframes_remote_function + assert square1.bigframes_cloud_function + + assert square2.bigframes_remote_function + assert not hasattr(square2, "bigframes_cloud_function") + + # They should point to the same function. + assert square1.bigframes_remote_function == square2.bigframes_remote_function + + # The result of applying them should be the same. + int64_col = scalars_df_index["int64_col"] + int64_col_filter = int64_col.notnull() + int64_col_filtered = int64_col[int64_col_filter] + + s1_result_col = int64_col_filtered.apply(square1) + s1_result = int64_col_filtered.to_frame().assign(result=s1_result_col) + + s2_result_col = int64_col_filtered.apply(square2) + s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col) + + assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas()) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 751521bd75..1c1e7b035b 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -13,6 +13,7 @@ # limitations under the License. 
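# Illustrative sketch (not from the patch): the round trip that
# test_read_gbq_function_like_original above verifies. A remote function that
# was already deployed can be re-attached by name with read_gbq_function and
# applied to a Series like the decorated original. The project, dataset,
# routine, table, and column names below are placeholders, not values from the
# test suite.
from google.cloud import bigquery

import bigframes.pandas as bpd
from bigframes.remote_function import read_gbq_function

square = read_gbq_function(
    function_name="my-project.my_dataset.square",  # existing remote function (placeholder)
    bigquery_client=bigquery.Client(),
)

df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table
result = df["int64_col"].apply(square).to_pandas()  # placeholder column name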
import math +import re import tempfile import geopandas as gpd # type: ignore @@ -21,6 +22,7 @@ import pyarrow as pa # type: ignore import pytest +import bigframes.pandas import bigframes.series as series from tests.system.utils import ( assert_pandas_df_equal_ignore_ordering, @@ -32,7 +34,7 @@ def test_series_construct_copy(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = series.Series( scalars_df["int64_col"], name="test_series", dtype="Float64" - ).compute() + ).to_pandas() pd_result = pd.Series( scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" ) @@ -48,11 +50,11 @@ def test_series_construct_pandas(scalars_dfs): scalars_pandas_df["int64_col"], name="test_series", dtype="Float64" ) assert bf_result.shape == pd_result.shape - pd.testing.assert_series_equal(bf_result.compute(), pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_series_construct_from_list(): - bf_result = series.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64").compute() + bf_result = series.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64").to_pandas() pd_result = pd.Series([1, 1, 2, 3, 5, 8, 13], dtype="Int64") # BigQuery DataFrame default indices use nullable Int64 always @@ -61,6 +63,22 @@ def test_series_construct_from_list(): pd.testing.assert_series_equal(bf_result, pd_result) +def test_series_construct_from_list_escaped_strings(): + """Check that special characters are supported.""" + strings = [ + "string\nwith\nnewline", + "string\twith\ttabs", + "string\\with\\backslashes", + ] + bf_result = series.Series(strings, name="test_series", dtype="string[pyarrow]") + pd_result = pd.Series(strings, name="test_series", dtype="string[pyarrow]") + + # BigQuery DataFrame default indices use nullable Int64 always + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + @pytest.mark.parametrize( ["col_name", "expected_dtype"], [ @@ -83,7 +101,7 @@ def test_series_construct_from_list(): def test_get_column(scalars_dfs, col_name, expected_dtype): scalars_df, scalars_pandas_df = scalars_dfs series = scalars_df[col_name] - series_pandas = series.compute() + series_pandas = series.to_pandas() assert series_pandas.dtype == expected_dtype assert series_pandas.shape[0] == scalars_pandas_df.shape[0] @@ -94,6 +112,54 @@ def test_series_get_column_default(scalars_dfs): assert result == "default_val" +def test_series_get_with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].get(key) + pd_result = scalars_pandas_df[col_name].get(key) + assert bf_result.to_pandas().iloc[0] == pd_result + + +@pytest.mark.parametrize( + ("index_col", "key"), + ( + ("int64_too", 2), + ("string_col", "Hello, World!"), + ("int64_too", slice(2, 6)), + ), +) +def test_series___getitem__(scalars_dfs, index_col, key): + col_name = "float64_col" + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = scalars_pandas_df.set_index(index_col, drop=False) + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + +def test_series___getitem___with_int_key(scalars_dfs): + col_name = "int64_too" + index_col = "string_col" + key = 2 + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.set_index(index_col, drop=False) + scalars_pandas_df = 
scalars_pandas_df.set_index(index_col, drop=False) + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + assert bf_result == pd_result + + +def test_series___getitem___with_default_index(scalars_dfs): + col_name = "float64_col" + key = 2 + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name][key] + pd_result = scalars_pandas_df[col_name][key] + assert bf_result.to_pandas().iloc[0] == pd_result + + @pytest.mark.parametrize( ("col_name",), ( @@ -103,7 +169,7 @@ def test_series_get_column_default(scalars_dfs): ) def test_abs(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col_name].abs().compute() + bf_result = scalars_df[col_name].abs().to_pandas() pd_result = scalars_pandas_df[col_name].abs() assert_series_equal_ignoring_order(pd_result, bf_result) @@ -112,7 +178,7 @@ def test_abs(scalars_dfs, col_name): def test_fillna(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_result = scalars_df[col_name].fillna("Missing").compute() + bf_result = scalars_df[col_name].fillna("Missing").to_pandas() pd_result = scalars_pandas_df[col_name].fillna("Missing") assert_series_equal_ignoring_order( pd_result, @@ -120,6 +186,25 @@ def test_fillna(scalars_dfs): ) +def test_series_agg_single_string(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].agg("sum") + pd_result = scalars_pandas_df["int64_col"].agg("sum") + assert math.isclose(pd_result, bf_result) + + +def test_series_agg_multi_string(scalars_dfs): + aggregations = ["sum", "mean", "std", "var", "min", "max", "nunique", "count"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df["int64_col"].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + @pytest.mark.parametrize( ("col_name",), ( @@ -176,6 +261,36 @@ def test_kurt(scalars_dfs, col_name): assert math.isclose(pd_result, bf_result) +@pytest.mark.parametrize( + ("col_name",), + ( + ("float64_col",), + ("int64_col",), + ), +) +def test_skew(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col_name].skew() + pd_result = scalars_pandas_df[col_name].skew() + assert math.isclose(pd_result, bf_result) + + +def test_skew_undefined(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].iloc[:2].skew() + pd_result = scalars_pandas_df["int64_col"].iloc[:2].skew() + # both should be pd.NA + assert pd_result is bf_result + + +def test_kurt_undefined(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].iloc[:3].kurt() + pd_result = scalars_pandas_df["int64_col"].iloc[:3].kurt() + # both should be pd.NA + assert pd_result is bf_result + + @pytest.mark.parametrize( ("col_name",), ( @@ -198,7 +313,7 @@ def test_var(scalars_dfs, col_name): ), ) def test_mode_stat(scalars_df_index, scalars_pandas_df_index, col_name): - bf_result = scalars_df_index[col_name].mode().compute() + bf_result = scalars_df_index[col_name].mode().to_pandas() pd_result = scalars_pandas_df_index[col_name].mode() ## Mode implicitly resets index, and bigframes default indices use nullable Int64 @@ -244,7 +359,7 @@ def test_series_int_int_operators_scalar( 
maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator - bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).compute() + bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) assert_series_equal_ignoring_order(pd_result, bf_result) @@ -270,7 +385,7 @@ def test_series_bool_bool_operators_scalar( maybe_reversed_op = (lambda x, y: operator(y, x)) if reverse_operands else operator - bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).compute() + bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) @@ -309,12 +424,101 @@ def test_series_bool_bool_operators_scalar( ) def test_series_int_int_operators_series(scalars_dfs, operator): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).compute() + bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) - assert_series_equal_ignoring_order(pd_result, bf_result) +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("int64_too",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("col_y",), + [ + ("int64_col",), + ("int64_too",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("mod",), + ("rmod",), + ], +) +def test_mods(scalars_dfs, col_x, col_y, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]).to_pandas() + pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) + pd.testing.assert_series_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("col_y",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("divmod",), + ("rdivmod",), + ], +) +def test_divmods_series(scalars_dfs, col_x, col_y, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]) + pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( + scalars_pandas_df[col_y] + ) + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + + +@pytest.mark.parametrize( + ("col_x",), + [ + ("int64_col",), + ("float64_col",), + ], +) +@pytest.mark.parametrize( + ("other",), + [ + (-1000,), + (678,), + ], +) +@pytest.mark.parametrize( + ("method",), + [ + ("divmod",), + ("rdivmod",), + ], +) +def test_divmods_scalars(scalars_dfs, col_x, other, method): + scalars_df, scalars_pandas_df = scalars_dfs + bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) + pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + + @pytest.mark.parametrize( ("other",), [ @@ -324,7 +528,7 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_series_add_scalar(scalars_dfs, other): scalars_df, scalars_pandas_df = 
scalars_dfs - bf_result = (scalars_df["float64_col"] + other).compute() + bf_result = (scalars_df["float64_col"] + other).to_pandas() pd_result = scalars_pandas_df["float64_col"] + other assert_series_equal_ignoring_order(pd_result, bf_result) @@ -340,7 +544,7 @@ def test_series_add_scalar(scalars_dfs, other): ) def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (scalars_df[left_col] + scalars_df[right_col]).compute() + bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] assert_series_equal_ignoring_order(pd_result, bf_result) @@ -360,7 +564,7 @@ def test_series_add_bigframes_series_nested( scalars_df, scalars_pandas_df = scalars_dfs bf_result = ( (scalars_df[left_col] + scalars_df[right_col]) + scalars_df[righter_col] - ).compute() + ).to_pandas() pd_result = ( scalars_pandas_df[left_col] + scalars_pandas_df[right_col] ) + scalars_pandas_df[righter_col] @@ -375,12 +579,12 @@ def test_series_add_different_table_default_index( bf_result = ( scalars_df_default_index["float64_col"] + scalars_df_2_default_index["float64_col"] - ).compute() + ).to_pandas() pd_result = ( # Default index may not have a well defined order, but it should at - # least be consistent across compute() calls. - scalars_df_default_index["float64_col"].compute() - + scalars_df_2_default_index["float64_col"].compute() + # least be consistent across to_pandas() calls. + scalars_df_default_index["float64_col"].to_pandas() + + scalars_df_2_default_index["float64_col"].to_pandas() ) # TODO(swast): Can remove sort_index() when there's default ordering. pd.testing.assert_series_equal(bf_result.sort_index(), pd_result.sort_index()) @@ -394,7 +598,7 @@ def test_series_add_different_table_with_index( # When index values are unique, we can emulate with values from the same # DataFrame. 
pd_result = scalars_pandas_df["float64_col"] + scalars_pandas_df["int64_col"] - pd.testing.assert_series_equal(bf_result.compute(), pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): @@ -413,7 +617,7 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): # BigQuery DataFrames default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_series_equal(bf_result.compute(), pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -440,7 +644,7 @@ def test_reset_index_no_drop(scalars_df_index, scalars_pandas_df_index, name): # BigQuery DataFrames default indices use nullable Int64 always pd_result.index = pd_result.index.astype("Int64") - pd.testing.assert_frame_equal(bf_result.compute(), pd_result) + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) def test_series_add_pandas_series_not_implemented(scalars_dfs): @@ -451,7 +655,7 @@ def test_series_add_pandas_series_not_implemented(scalars_dfs): + pd.Series( [1, 1, 1, 1], ) - ).compute() + ).to_pandas() def test_copy(scalars_df_index, scalars_pandas_df_index): @@ -467,14 +671,14 @@ def test_copy(scalars_df_index, scalars_pandas_df_index): pd_copy.loc[0] = 5.6 pd_series.loc[0] = 3.4 - assert bf_copy.compute().loc[0] != bf_series.compute().loc[0] - pd.testing.assert_series_equal(bf_copy.compute(), pd_copy) + assert bf_copy.to_pandas().loc[0] != bf_series.to_pandas().loc[0] + pd.testing.assert_series_equal(bf_copy.to_pandas(), pd_copy) def test_isnull(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "float64_col" - bf_series = scalars_df[col_name].isnull().compute() + bf_series = scalars_df[col_name].isnull().to_pandas() pd_series = scalars_pandas_df[col_name].isnull() # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but @@ -485,7 +689,7 @@ def test_isnull(scalars_dfs): def test_notnull(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series = scalars_df[col_name].notnull().compute() + bf_series = scalars_df[col_name].notnull().to_pandas() pd_series = scalars_pandas_df[col_name].notnull() # One of dtype mismatches to be documented. 
Here, the `bf_series.dtype` is `BooleanDtype` but @@ -496,7 +700,7 @@ def test_notnull(scalars_dfs): def test_round(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "float64_col" - bf_result = scalars_df[col_name].round().compute() + bf_result = scalars_df[col_name].round().to_pandas() pd_result = scalars_pandas_df[col_name].round() assert_series_equal_ignoring_order(pd_result, bf_result) @@ -505,7 +709,7 @@ def test_round(scalars_dfs): def test_eq_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = scalars_df[col_name].eq(0).compute() + bf_result = scalars_df[col_name].eq(0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(0) assert_series_equal_ignoring_order(pd_result, bf_result) @@ -514,7 +718,7 @@ def test_eq_scalar(scalars_dfs): def test_eq_wider_type_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = scalars_df[col_name].eq(1.0).compute() + bf_result = scalars_df[col_name].eq(1.0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(1.0) assert_series_equal_ignoring_order(pd_result, bf_result) @@ -523,7 +727,7 @@ def test_eq_wider_type_scalar(scalars_dfs): def test_ne_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = (scalars_df[col_name] != 0).compute() + bf_result = (scalars_df[col_name] != 0).to_pandas() pd_result = scalars_pandas_df[col_name] != 0 assert_series_equal_ignoring_order(pd_result, bf_result) @@ -532,7 +736,7 @@ def test_ne_scalar(scalars_dfs): def test_eq_int_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = (scalars_df[col_name] == 0).compute() + bf_result = (scalars_df[col_name] == 0).to_pandas() pd_result = scalars_pandas_df[col_name] == 0 assert_series_equal_ignoring_order(pd_result, bf_result) @@ -549,7 +753,7 @@ def test_eq_int_scalar(scalars_dfs): def test_eq_same_type_series(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_result = (scalars_df[col_name] == scalars_df[col_name]).compute() + bf_result = (scalars_df[col_name] == scalars_df[col_name]).to_pandas() pd_result = scalars_pandas_df[col_name] == scalars_pandas_df[col_name] # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but @@ -564,18 +768,18 @@ def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): pd_series = scalars_pandas_df_index["string_col"].copy() bf_series.loc[2] = "This value isn't in the test data." pd_series.loc[2] = "This value isn't in the test data." - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() pd_result = pd_series pd.testing.assert_series_equal(bf_result, pd_result) # Per Copy-on-Write semantics, other references to the original DataFrame # should remain unchanged. - pd.testing.assert_series_equal(bf_original.compute(), pd_original) + pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original) def test_ne_obj_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_result = (scalars_df[col_name] != scalars_df[col_name]).compute() + bf_result = (scalars_df[col_name] != scalars_df[col_name]).to_pandas() pd_result = scalars_pandas_df[col_name] != scalars_pandas_df[col_name] # One of dtype mismatches to be documented. 
Here, the `bf_series.dtype` is `BooleanDtype` but @@ -586,7 +790,7 @@ def test_ne_obj_series(scalars_dfs): def test_indexing_using_unselected_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].compute() + bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] assert_series_equal_ignoring_order( @@ -600,7 +804,7 @@ def test_indexing_using_selected_series(scalars_dfs): col_name = "string_col" bf_result = scalars_df[col_name][ scalars_df["string_col"].eq("Hello, World!") - ].compute() + ].to_pandas() pd_result = scalars_pandas_df[col_name][ scalars_pandas_df["string_col"].eq("Hello, World!") ] @@ -618,7 +822,7 @@ def test_nested_filter(scalars_dfs): bool_col = scalars_df["bool_col"] == bool( True ) # Convert from nullable bool to nonnullable bool usable as indexer - bf_result = string_col[int64_too == 0][~bool_col].compute() + bf_result = string_col[int64_too == 0][~bool_col].to_pandas() pd_string_col = scalars_pandas_df["string_col"] pd_int64_too = scalars_pandas_df["int64_too"] @@ -633,12 +837,35 @@ def test_nested_filter(scalars_dfs): ) +def test_binop_repeated_application_does_row_identity_joins(scalars_dfs): + """Make sure row identity joins kick in so that we don't do way more joins than expected.""" + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + num_joins = 10 + for _ in range(num_joins): + bf_series = bf_series + bf_series + pd_series = pd_series + pd_series + + bf_result = bf_series.to_pandas() + pd_result = pd_series + assert_series_equal_ignoring_order( + bf_result, + pd_result, + ) + + bf_sql, _ = bf_series.to_frame()._to_sql_query(always_include_index=True) + selects = re.findall("SELECT", bf_sql.upper()) + assert 0 < len(selects) < (num_joins // 2) + + def test_binop_opposite_filters(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs int64_col1 = scalars_df["int64_col"] int64_col2 = scalars_df["int64_col"] bool_col = scalars_df["bool_col"] - bf_result = (int64_col1[bool_col] + int64_col2[bool_col.__invert__()]).compute() + bf_result = (int64_col1[bool_col] + int64_col2[bool_col.__invert__()]).to_pandas() pd_int64_col1 = scalars_pandas_df["int64_col"] pd_int64_col2 = scalars_pandas_df["int64_col"] @@ -656,7 +883,7 @@ def test_binop_left_filtered(scalars_dfs): int64_col = scalars_df["int64_col"] float64_col = scalars_df["float64_col"] bool_col = scalars_df["bool_col"] - bf_result = (int64_col[bool_col] + float64_col).compute() + bf_result = (int64_col[bool_col] + float64_col).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] pd_float64_col = scalars_pandas_df["float64_col"] @@ -674,7 +901,7 @@ def test_binop_right_filtered(scalars_dfs): int64_col = scalars_df["int64_col"] float64_col = scalars_df["float64_col"] bool_col = scalars_df["bool_col"] - bf_result = (float64_col + int64_col[bool_col]).compute() + bf_result = (float64_col + int64_col[bool_col]).to_pandas() pd_int64_col = scalars_pandas_df["int64_col"] pd_float64_col = scalars_pandas_df["float64_col"] @@ -695,6 +922,16 @@ def test_mean(scalars_dfs): assert math.isclose(pd_result, bf_result) +def test_median(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].median() + pd_max = scalars_pandas_df[col_name].max() + pd_min = scalars_pandas_df[col_name].min() + # 
Median is approximate, so just check for plausibility. + assert pd_min < bf_result < pd_max + + def test_repr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs if scalars_pandas_df.index.name != "rowindex": @@ -722,6 +959,19 @@ def test_product(scalars_dfs): assert math.isclose(pd_result, bf_result) +def test_cumprod(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("Series.cumprod NA mask are different in pandas 1.x.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "float64_col" + bf_result = scalars_df[col_name].cumprod() + pd_result = scalars_pandas_df[col_name].cumprod() + pd.testing.assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + def test_count(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -762,7 +1012,7 @@ def test_groupby_sum(scalars_dfs): scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).sum() ) # TODO(swast): Update groupby to use index based on group by key(s). - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() assert_series_equal_ignoring_order( pd_series, bf_result, @@ -780,7 +1030,7 @@ def test_groupby_std(scalars_dfs): .std() .astype(pd.Float64Dtype()) ) - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() assert_series_equal_ignoring_order( pd_series, bf_result, @@ -795,7 +1045,7 @@ def test_groupby_var(scalars_dfs): pd_series = ( scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() ) - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() assert_series_equal_ignoring_order( pd_series, bf_result, @@ -815,7 +1065,7 @@ def test_groupby_level_sum(scalars_dfs): # TODO(swast): Update groupby to use index based on group by key(s). pd.testing.assert_series_equal( pd_series.sort_index(), - bf_series.compute().sort_index(), + bf_series.to_pandas().sort_index(), ) @@ -831,7 +1081,7 @@ def test_groupby_level_list_sum(scalars_dfs): # TODO(swast): Update groupby to use index based on group by key(s). pd.testing.assert_series_equal( pd_series.sort_index(), - bf_series.compute().sort_index(), + bf_series.to_pandas().sort_index(), ) @@ -847,13 +1097,36 @@ def test_groupby_mean(scalars_dfs): .mean() ) # TODO(swast): Update groupby to use index based on group by key(s). - bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() assert_series_equal_ignoring_order( pd_series, bf_result, ) +def test_groupby_median(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() + ) + pd_max = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .max() + ) + pd_min = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .min() + ) + # TODO(swast): Update groupby to use index based on group by key(s). + bf_result = bf_series.to_pandas() + + # Median is approximate, so just check that it's plausible. + assert ((pd_min <= bf_result) & (bf_result <= pd_max)).all() + + def test_groupby_prod(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" @@ -862,7 +1135,7 @@ def test_groupby_prod(scalars_dfs): scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() ) # TODO(swast): Update groupby to use index based on group by key(s). 
- bf_result = bf_series.compute() + bf_result = bf_series.to_pandas() assert_series_equal_ignoring_order( pd_series, bf_result, @@ -897,7 +1170,7 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) group_key = "int64_too" # has some duplicates values, good for grouping bf_series = ( operator(scalars_df_index[col_name].groupby(scalars_df_index[group_key])) - ).compute() + ).to_pandas() pd_series = operator( scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key]) ).astype(pd.Int64Dtype()) @@ -909,7 +1182,7 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) def test_drop_label(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" - bf_series = scalars_df_index[col_name].drop(1).compute() + bf_series = scalars_df_index[col_name].drop(1).to_pandas() pd_series = scalars_pandas_df_index[col_name].drop(1) pd.testing.assert_series_equal( pd_series, @@ -919,7 +1192,7 @@ def test_drop_label(scalars_df_index, scalars_pandas_df_index): def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" - bf_series = scalars_df_index[col_name].drop([1, 3]).compute() + bf_series = scalars_df_index[col_name].drop([1, 3]).to_pandas() pd_series = scalars_pandas_df_index[col_name].drop([1, 3]) pd.testing.assert_series_equal( pd_series, @@ -943,7 +1216,7 @@ def test_drop_label_list(scalars_df_index, scalars_pandas_df_index): ], ) def test_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, col_name): - bf_series = scalars_df_index[col_name].drop_duplicates(keep=keep).compute() + bf_series = scalars_df_index[col_name].drop_duplicates(keep=keep).to_pandas() pd_series = scalars_pandas_df_index[col_name].drop_duplicates(keep=keep) pd.testing.assert_series_equal( pd_series, @@ -980,7 +1253,7 @@ def test_unique(scalars_df_index, scalars_pandas_df_index, col_name): ], ) def test_duplicated(scalars_df_index, scalars_pandas_df_index, keep, col_name): - bf_series = scalars_df_index[col_name].duplicated(keep=keep).compute() + bf_series = scalars_df_index[col_name].duplicated(keep=keep).to_pandas() pd_series = scalars_pandas_df_index[col_name].duplicated(keep=keep) pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) @@ -1021,7 +1294,7 @@ def test_empty_false(scalars_dfs): assert pd_result == bf_result -def test_empty_true(scalars_dfs): +def test_empty_true_row_filter(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["string_col"][ @@ -1031,9 +1304,21 @@ def test_empty_true(scalars_dfs): scalars_pandas_df["string_col"] == "won't find this" ].empty + assert pd_result assert pd_result == bf_result +def test_empty_true_memtable(session: bigframes.Session): + bf_series: series.Series = series.Series(session=session) + pd_series: pd.Series = pd.Series() + + bf_result = bf_series.empty + pd_result = pd_series.empty + + assert pd_result + assert bf_result == pd_result + + def test_dtype(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs @@ -1058,7 +1343,7 @@ def test_head(scalars_dfs): if scalars_df.index.name is None: pytest.skip("Require explicit index for offset ops.") - bf_result = scalars_df["string_col"].head(2).compute() + bf_result = scalars_df["string_col"].head(2).to_pandas() pd_result = scalars_pandas_df["string_col"].head(2) assert_series_equal_ignoring_order( @@ -1073,7 +1358,7 @@ def test_tail(scalars_dfs): if scalars_df.index.name is None: pytest.skip("Require explicit index for offset ops.") - bf_result = 
scalars_df["string_col"].tail(2).compute() + bf_result = scalars_df["string_col"].tail(2).to_pandas() pd_result = scalars_pandas_df["string_col"].tail(2) assert_series_equal_ignoring_order( @@ -1088,7 +1373,7 @@ def test_head_then_scalar_operation(scalars_dfs): if scalars_df.index.name is None: pytest.skip("Require explicit index for offset ops.") - bf_result = (scalars_df["float64_col"].head(1) + 4).compute() + bf_result = (scalars_df["float64_col"].head(1) + 4).to_pandas() pd_result = scalars_pandas_df["float64_col"].head(1) + 4 pd.testing.assert_series_equal( @@ -1105,7 +1390,7 @@ def test_head_then_series_operation(scalars_dfs): bf_result = ( scalars_df["float64_col"].head(4) + scalars_df["float64_col"].head(2) - ).compute() + ).to_pandas() pd_result = scalars_pandas_df["float64_col"].head(4) + scalars_pandas_df[ "float64_col" ].head(2) @@ -1118,7 +1403,7 @@ def test_head_then_series_operation(scalars_dfs): def test_shift(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" - bf_result = scalars_df_index[col_name].shift().compute() + bf_result = scalars_df_index[col_name].shift().to_pandas() # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].shift().astype(pd.Int64Dtype()) @@ -1133,7 +1418,7 @@ def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") col_name = "int64_col" - bf_result = scalars_df_index[col_name].cumsum().compute() + bf_result = scalars_df_index[col_name].cumsum().to_pandas() # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Int64Dtype()) @@ -1143,6 +1428,27 @@ def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): ) +def test_cumsum_int_ordered(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("Series.cumsum NA mask are different in pandas 1.x.") + + col_name = "int64_col" + bf_result = ( + scalars_df_index.sort_values(by="rowindex_2")[col_name].cumsum().to_pandas() + ) + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index.sort_values(by="rowindex_2")[col_name] + .cumsum() + .astype(pd.Int64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("na_option",), [ @@ -1167,7 +1473,7 @@ def test_cumsum_int(scalars_df_index, scalars_pandas_df_index): def test_rank_with_nulls(scalars_df_index, scalars_pandas_df_index, na_option, method): col_name = "bool_col" bf_result = ( - scalars_df_index[col_name].rank(na_option=na_option, method=method).compute() + scalars_df_index[col_name].rank(na_option=na_option, method=method).to_pandas() ) pd_result = ( scalars_pandas_df_index[col_name] @@ -1191,7 +1497,7 @@ def test_rank_with_nulls(scalars_df_index, scalars_pandas_df_index, na_option, m ) def test_nlargest(scalars_df_index, scalars_pandas_df_index, keep): col_name = "bool_col" - bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).compute() + bf_result = scalars_df_index[col_name].nlargest(4, keep=keep).to_pandas() pd_result = scalars_pandas_df_index[col_name].nlargest(4, keep=keep) pd.testing.assert_series_equal( @@ -1200,6 +1506,29 @@ def test_nlargest(scalars_df_index, scalars_pandas_df_index, keep): ) +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], 
+) +def test_diff(scalars_df_index, scalars_pandas_df_index, periods): + bf_result = scalars_df_index["int64_col"].diff(periods=periods).to_pandas() + # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA + pd_result = ( + scalars_pandas_df_index["int64_col"] + .diff(periods=periods) + .astype(pd.Int64Dtype()) + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("keep",), [ @@ -1210,7 +1539,7 @@ def test_nlargest(scalars_df_index, scalars_pandas_df_index, keep): ) def test_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): col_name = "bool_col" - bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).compute() + bf_result = scalars_df_index[col_name].nsmallest(2, keep=keep).to_pandas() pd_result = scalars_pandas_df_index[col_name].nsmallest(2, keep=keep) pd.testing.assert_series_equal( @@ -1221,7 +1550,7 @@ def test_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): def test_rank_ints(scalars_df_index, scalars_pandas_df_index): col_name = "int64_too" - bf_result = scalars_df_index[col_name].rank().compute() + bf_result = scalars_df_index[col_name].rank().to_pandas() pd_result = scalars_pandas_df_index[col_name].rank().astype(pd.Float64Dtype()) pd.testing.assert_series_equal( @@ -1230,9 +1559,33 @@ def test_rank_ints(scalars_df_index, scalars_pandas_df_index): ) +def test_cast_float_to_int(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].astype(pd.Int64Dtype()).to_pandas() + # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_cast_float_to_bool(scalars_df_index, scalars_pandas_df_index): + col_name = "float64_col" + bf_result = scalars_df_index[col_name].astype(pd.BooleanDtype()).to_pandas() + # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA + pd_result = scalars_pandas_df_index[col_name].astype(pd.BooleanDtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + def test_cumsum_nested(scalars_df_index, scalars_pandas_df_index): col_name = "float64_col" - bf_result = scalars_df_index[col_name].cumsum().cumsum().cumsum().compute() + bf_result = scalars_df_index[col_name].cumsum().cumsum().cumsum().to_pandas() # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA pd_result = ( scalars_pandas_df_index[col_name] @@ -1252,7 +1605,7 @@ def test_cumsum_int_filtered(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" bf_col = scalars_df_index[col_name] - bf_result = bf_col[bf_col > -2].cumsum().compute() + bf_result = bf_col[bf_col > -2].cumsum().to_pandas() pd_col = scalars_pandas_df_index[col_name] # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA @@ -1266,7 +1619,7 @@ def test_cumsum_int_filtered(scalars_df_index, scalars_pandas_df_index): def test_cumsum_float(scalars_df_index, scalars_pandas_df_index): col_name = "float64_col" - bf_result = scalars_df_index[col_name].cumsum().compute() + bf_result = scalars_df_index[col_name].cumsum().to_pandas() # cumsum does not behave well on nullable floats in pandas, produces object type and never ignores NA pd_result = scalars_pandas_df_index[col_name].cumsum().astype(pd.Float64Dtype()) @@ 
-1278,7 +1631,7 @@ def test_cumsum_float(scalars_df_index, scalars_pandas_df_index): def test_cummin_int(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" - bf_result = scalars_df_index[col_name].cummin().compute() + bf_result = scalars_df_index[col_name].cummin().to_pandas() pd_result = scalars_pandas_df_index[col_name].cummin() pd.testing.assert_series_equal( @@ -1289,7 +1642,7 @@ def test_cummin_int(scalars_df_index, scalars_pandas_df_index): def test_cummax_int(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" - bf_result = scalars_df_index[col_name].cummax().compute() + bf_result = scalars_df_index[col_name].cummax().to_pandas() pd_result = scalars_pandas_df_index[col_name].cummax() pd.testing.assert_series_equal( @@ -1302,7 +1655,7 @@ def test_value_counts(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = scalars_df[col_name].value_counts().compute() + bf_result = scalars_df[col_name].value_counts().to_pandas() pd_result = scalars_pandas_df[col_name].value_counts() # Older pandas version may not have these values, bigframes tries to emulate 2.0+ @@ -1315,9 +1668,29 @@ def test_value_counts(scalars_dfs): ) +def test_value_counts_w_cut(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_cut = bigframes.pandas.cut(scalars_df[col_name], 3, labels=False) + pd_cut = pd.cut(scalars_pandas_df[col_name], 3, labels=False) + + bf_result = bf_cut.value_counts().to_pandas() + pd_result = pd_cut.value_counts() + # Older pandas version may not have these values, bigframes tries to emulate 2.0+ + pd_result.name = "count" + pd_result.index.name = col_name + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].compute() + bf_result = scalars_df_index["string_col"].iloc[1:].iloc[1:].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[1:].iloc[1:] pd.testing.assert_series_equal( @@ -1347,7 +1720,7 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): ], ) def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): - bf_result = scalars_df_index["string_col"].iloc[start:stop:step].compute() + bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] # Pandas may assign non-object dtype to empty series and series index @@ -1362,7 +1735,7 @@ def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, ste def test_series_add_prefix(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].add_prefix("prefix_").compute() + bf_result = scalars_df_index["int64_too"].add_prefix("prefix_").to_pandas() pd_result = scalars_pandas_df_index["int64_too"].add_prefix("prefix_") @@ -1375,7 +1748,7 @@ def test_series_add_prefix(scalars_df_index, scalars_pandas_df_index): def test_series_add_suffix(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index["int64_too"].add_suffix("_suffix").compute() + bf_result = scalars_df_index["int64_too"].add_suffix("_suffix").to_pandas() pd_result = scalars_pandas_df_index["int64_too"].add_suffix("_suffix") @@ -1391,7 +1764,7 @@ def test_where_with_series(scalars_df_index, scalars_pandas_df_index): bf_result = ( 
scalars_df_index["int64_col"] .where(scalars_df_index["bool_col"], scalars_df_index["int64_too"]) - .compute() + .to_pandas() ) pd_result = scalars_pandas_df_index["int64_col"].where( scalars_pandas_df_index["bool_col"], scalars_pandas_df_index["int64_too"] @@ -1411,7 +1784,7 @@ def test_where_with_different_indices(scalars_df_index, scalars_pandas_df_index) scalars_df_index["bool_col"].iloc[2:], scalars_df_index["int64_too"].iloc[:5], ) - .compute() + .to_pandas() ) pd_result = ( scalars_pandas_df_index["int64_col"] @@ -1430,7 +1803,7 @@ def test_where_with_different_indices(scalars_df_index, scalars_pandas_df_index) def test_where_with_default(scalars_df_index, scalars_pandas_df_index): bf_result = ( - scalars_df_index["int64_col"].where(scalars_df_index["bool_col"]).compute() + scalars_df_index["int64_col"].where(scalars_df_index["bool_col"]).to_pandas() ) pd_result = scalars_pandas_df_index["int64_col"].where( scalars_pandas_df_index["bool_col"] @@ -1446,7 +1819,7 @@ def test_clip(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"] lower_bf = scalars_df_index["int64_too"] - 1 upper_bf = scalars_df_index["int64_too"] + 1 - bf_result = col_bf.clip(lower_bf, upper_bf).compute() + bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() col_pd = scalars_pandas_df_index["int64_col"] lower_pd = scalars_pandas_df_index["int64_too"] - 1 @@ -1463,7 +1836,7 @@ def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 upper_bf = scalars_df_index["int64_too"].iloc[:5] + 1 - bf_result = col_bf.clip(lower_bf, upper_bf).compute() + bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 @@ -1479,7 +1852,7 @@ def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): def test_clip_filtered_one_sided(scalars_df_index, scalars_pandas_df_index): col_bf = scalars_df_index["int64_col"].iloc[::2] lower_bf = scalars_df_index["int64_too"].iloc[2:] - 1 - bf_result = col_bf.clip(lower_bf, None).compute() + bf_result = col_bf.clip(lower_bf, None).to_pandas() col_pd = scalars_pandas_df_index["int64_col"].iloc[::2] lower_pd = scalars_pandas_df_index["int64_too"].iloc[2:] - 1 @@ -1510,7 +1883,9 @@ def test_dot(scalars_dfs): ], ) def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusive): - bf_result = scalars_df_index["int64_col"].between(left, right, inclusive).compute() + bf_result = ( + scalars_df_index["int64_col"].between(left, right, inclusive).to_pandas() + ) pd_result = scalars_pandas_df_index["int64_col"].between(left, right, inclusive) pd.testing.assert_series_equal( @@ -1522,7 +1897,7 @@ def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusi def test_to_frame(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_col"].to_frame().compute() + bf_result = scalars_df["int64_col"].to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].to_frame() assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @@ -1651,7 +2026,7 @@ def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_po bf_result = ( scalars_df_index["int64_col"] .sort_values(ascending=ascending, na_position=na_position) - .compute() + .to_pandas() ) pd_result = scalars_pandas_df_index["int64_col"].sort_values( ascending=ascending, 
na_position=na_position @@ -1671,7 +2046,9 @@ def test_sort_values(scalars_df_index, scalars_pandas_df_index, ascending, na_po ], ) def test_sort_index(scalars_df_index, scalars_pandas_df_index, ascending): - bf_result = scalars_df_index["int64_too"].sort_index(ascending=ascending).compute() + bf_result = ( + scalars_df_index["int64_too"].sort_index(ascending=ascending).to_pandas() + ) pd_result = scalars_pandas_df_index["int64_too"].sort_index(ascending=ascending) pd.testing.assert_series_equal( @@ -1685,7 +2062,7 @@ def test_mask_default_value(scalars_dfs): bf_col = scalars_df["int64_col"] bf_col_masked = bf_col.mask(bf_col % 2 == 1) - bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).compute() + bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas() pd_col = scalars_pandas_df["int64_col"] pd_col_masked = pd_col.mask(pd_col % 2 == 1) @@ -1699,7 +2076,7 @@ def test_mask_custom_value(scalars_dfs): bf_col = scalars_df["int64_col"] bf_col_masked = bf_col.mask(bf_col % 2 == 1, -1) - bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).compute() + bf_result = bf_col.to_frame().assign(int64_col_masked=bf_col_masked).to_pandas() pd_col = scalars_pandas_df["int64_col"] pd_col_masked = pd_col.mask(pd_col % 2 == 1, -1) @@ -1739,14 +2116,14 @@ def test_mask_custom_value(scalars_dfs): ], ) def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): - bf_result = scalars_df_index[column].astype(to_type).compute() + bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) pd.testing.assert_series_equal(bf_result, pd_result) @pytest.mark.parametrize( "index", - [0, 5], + [0, 5, -2], ) def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): bf_result = scalars_df_index.string_col.iloc[index] @@ -1763,7 +2140,7 @@ def test_iloc_single_integer_out_of_bound_error( def test_loc_bool_series_explicit_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.string_col.loc[scalars_df_index.bool_col].compute() + bf_result = scalars_df_index.string_col.loc[scalars_df_index.bool_col].to_pandas() pd_result = scalars_pandas_df_index.string_col.loc[scalars_pandas_df_index.bool_col] pd.testing.assert_series_equal( @@ -1777,7 +2154,7 @@ def test_loc_bool_series_default_index( ): bf_result = scalars_df_default_index.string_col.loc[ scalars_df_default_index.bool_col - ].compute() + ].to_pandas() pd_result = scalars_pandas_df_default_index.string_col.loc[ scalars_pandas_df_default_index.bool_col ] @@ -1815,7 +2192,19 @@ def test_rename(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.string_col.rename("newname") pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_dict_same_type(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.string_col.rename({1: 100, 2: 200}) + pd_result = scalars_pandas_df_index.string_col.rename({1: 100, 2: 200}) + + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result, ) @@ -1825,7 +2214,7 @@ def test_rename_axis(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.string_col.rename_axis("newindexname") pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1842,7 +2231,7 @@ def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): 
pd_result = scalars_pandas_df_index.string_col.loc[index_list] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1854,7 +2243,7 @@ def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.bool_col.loc[index_list] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1866,7 +2255,7 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.string_col.iloc[index_list] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1880,7 +2269,7 @@ def test_iloc_list_nameless(scalars_df_index, scalars_pandas_df_index): pd_result = pd_series.iloc[index_list] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1895,7 +2284,7 @@ def test_loc_list_nameless(scalars_df_index, scalars_pandas_df_index): pd_result = pd_series.loc[index_list] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1911,7 +2300,7 @@ def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.date_col.loc[pd_string_series] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) @@ -1924,11 +2313,46 @@ def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): pd_result = scalars_pandas_df_index.date_col.loc[pd_index] pd.testing.assert_series_equal( - bf_result.compute(), + bf_result.to_pandas(), pd_result, ) +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" 
+ bf_result = scalars_df_index.date_col.loc[index] + pd_result = scalars_pandas_df_index.date_col.loc[index] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.date_col.loc[index] + pd_result = scalars_pandas_df_index.date_col.loc[index] + assert bf_result.to_pandas().iloc[0] == pd_result + + def test_series_bool_interpretation_error(scalars_df_index): with pytest.raises(ValueError): True if scalars_df_index["string_col"] else False + + +def test_query_job_setters(scalars_dfs): + job_ids = set() + df, _ = scalars_dfs + series = df["int64_col"] + assert series.query_job is not None + repr(series) + job_ids.add(series.query_job.job_id) + series.to_pandas() + job_ids.add(series.query_job.job_id) + assert len(job_ids) == 2 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 31ab4dee12..599b8aabbc 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -27,6 +27,8 @@ import bigframes.dtypes import bigframes.ml.linear_model +FIRST_FILE = "000000000000" + def test_read_gbq_tokyo( session_tokyo: bigframes.Session, @@ -35,7 +37,7 @@ def test_read_gbq_tokyo( tokyo_location: str, ): df = session_tokyo.read_gbq(scalars_table_tokyo, index_col=["rowindex"]) - result = df.sort_index().compute() + result = df.sort_index().to_pandas() expected = scalars_pandas_df_index _, query_job = df._block.expr.start_query() @@ -86,7 +88,11 @@ def test_read_gbq_w_col_order( @pytest.mark.parametrize( ("query_or_table", "index_col"), [ - pytest.param("{scalars_table_id}", ["bool_col", "int64_col"], id="multiindex"), + pytest.param( + "{scalars_table_id}", + ["bool_col", "int64_col"], + id="unique_multiindex_table", + ), pytest.param( """SELECT t.float64_col * 2 AS my_floats, @@ -97,11 +103,43 @@ def test_read_gbq_w_col_order( ["my_strings"], id="string_index", ), + pytest.param( + "SELECT GENERATE_UUID() AS uuid, 0 AS my_value FROM UNNEST(GENERATE_ARRAY(1, 20))", + ["uuid"], + id="unique_uuid_index_query", + ), pytest.param( "{scalars_table_id}", ["bool_col"], id="non_unique_index", ), + pytest.param( + "{scalars_table_id}", + ["float64_col"], + id="non_unique_float_index", + ), + pytest.param( + "{scalars_table_id}", + [ + "timestamp_col", + "float64_col", + "datetime_col", + "int64_too", + ], + id="multi_part_index_direct", + ), + pytest.param( + "SELECT * FROM {scalars_table_id}", + [ + "timestamp_col", + "float64_col", + "string_col", + "bool_col", + "int64_col", + "int64_too", + ], + id="multi_part_index_w_query", + ), ], ) def test_read_gbq_w_index_col( @@ -116,6 +154,11 @@ def test_read_gbq_w_index_col( ) assert list(df.index.names) == index_col + # Verify that we get the expected number of results. 
+ bf_shape = df.shape + result = df.to_pandas() + assert bf_shape == result.shape + @pytest.mark.parametrize( ("query_or_table", "max_results"), @@ -152,7 +195,7 @@ def test_read_gbq_w_max_results( query_or_table.format(scalars_table_id=scalars_table_id), max_results=max_results, ) - bf_result = df.compute() + bf_result = df.to_pandas() assert bf_result.shape[0] == max_results @@ -181,7 +224,7 @@ def test_read_pandas(session, scalars_dfs): df = session.read_pandas(scalars_pandas_df) assert df._block._expr._ordering is not None - result = df.compute() + result = df.to_pandas() expected = scalars_pandas_df pd.testing.assert_frame_equal(result, expected) @@ -189,7 +232,7 @@ def test_read_pandas(session, scalars_dfs): def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index): df = session.read_pandas(scalars_pandas_df_multi_index) - result = df.compute() + result = df.to_pandas() pd.testing.assert_frame_equal(result, scalars_pandas_df_multi_index) @@ -199,7 +242,8 @@ def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default ) df = session.read_pandas(scalars_pandas_df_default_index) - assert df._block._expr._ordering.ordering_id == "rowid_2" + total_order_col = df._block._expr._ordering.total_order_col + assert total_order_col and total_order_col.column_id == "rowid_2" def test_read_pandas_tokyo( @@ -208,7 +252,7 @@ def test_read_pandas_tokyo( tokyo_location: str, ): df = session_tokyo.read_pandas(scalars_pandas_df_index) - result = df.compute() + result = df.to_pandas() expected = scalars_pandas_df_index _, query_job = df._block.expr.start_query() @@ -220,14 +264,15 @@ def test_read_pandas_tokyo( def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs if scalars_df.index.name is not None: - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv" else: - path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index*.csv" + read_path = path.replace("*", FIRST_FILE) scalars_df.to_csv(path, index=False) dtype = scalars_df.dtypes.to_dict() dtype.pop("geography_col") df = session.read_csv( - path, + read_path, # Convert default pandas dtypes to match BigQuery DataFrames dtypes. 
dtype=dtype, ) @@ -247,9 +292,9 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs if scalars_df.index.name is not None: - path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index.csv" + path = gcs_folder + "test_read_csv_gcs_bq_engine_w_index*.csv" else: - path = gcs_folder + "test_read_csv_gcs_bq_engine_wo_index.csv" + path = gcs_folder + "test_read_csv_gcs_bq_engine_wo_index*.csv" scalars_df.to_csv(path, index=False) df = session.read_csv(path, engine="bigquery") @@ -419,22 +464,25 @@ def test_read_csv_default_engine_throws_not_implemented_error( match, ): path = ( - gcs_folder + "test_read_csv_gcs_default_engine_throws_not_implemented_error.csv" + gcs_folder + + "test_read_csv_gcs_default_engine_throws_not_implemented_error*.csv" ) + read_path = path.replace("*", FIRST_FILE) scalars_df_index.to_csv(path) with pytest.raises(NotImplementedError, match=match): - session.read_csv(path, **kwargs) + session.read_csv(read_path, **kwargs) def test_read_csv_gcs_default_engine_w_header(session, scalars_df_index, gcs_folder): - path = gcs_folder + "test_read_csv_gcs_default_engine_w_header.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_w_header*.csv" + read_path = path.replace("*", FIRST_FILE) scalars_df_index.to_csv(path) # Skips header=N rows, normally considers the N+1th row as the header, but overridden by # passing the `names` argument. In this case, pandas will skip the N+1th row too, take # the column names from `names`, and begin reading data from the N+2th row. df = session.read_csv( - path, + read_path, header=2, names=scalars_df_index.columns.to_list(), ) @@ -443,7 +491,7 @@ def test_read_csv_gcs_default_engine_w_header(session, scalars_df_index, gcs_fol def test_read_csv_gcs_bq_engine_w_header(session, scalars_df_index, gcs_folder): - path = gcs_folder + "test_read_csv_gcs_bq_engine_w_header.csv" + path = gcs_folder + "test_read_csv_gcs_bq_engine_w_header*.csv" scalars_df_index.to_csv(path, index=False) # Skip the header and the first 2 data rows. 
Without provided schema, the column names @@ -487,10 +535,11 @@ def test_read_csv_local_bq_engine_w_header(session, scalars_pandas_df_index): def test_read_csv_gcs_default_engine_w_index_col_name( session, scalars_df_default_index, gcs_folder ): - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_name.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_name*.csv" + read_path = path.replace("*", FIRST_FILE) scalars_df_default_index.to_csv(path) - df = session.read_csv(path, index_col="rowindex") + df = session.read_csv(read_path, index_col="rowindex") scalars_df_default_index = scalars_df_default_index.set_index( "rowindex" ).sort_index() @@ -501,11 +550,12 @@ def test_read_csv_gcs_default_engine_w_index_col_name( def test_read_csv_gcs_default_engine_w_index_col_index( session, scalars_df_default_index, gcs_folder ): - path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_index.csv" + path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_index*.csv" + read_path = path.replace("*", FIRST_FILE) scalars_df_default_index.to_csv(path) index_col = scalars_df_default_index.columns.to_list().index("rowindex") - df = session.read_csv(path, index_col=index_col) + df = session.read_csv(read_path, index_col=index_col) scalars_df_default_index = scalars_df_default_index.set_index( "rowindex" ).sort_index() @@ -559,11 +609,12 @@ def test_read_csv_local_default_engine_w_index_col_index( ) def test_read_csv_gcs_w_usecols(session, scalars_df_index, gcs_folder, engine): path = gcs_folder + "test_read_csv_gcs_w_usecols" - path = path + "_default_engine.csv" if engine is None else path + "_bq_engine.csv" + path = path + "_default_engine*.csv" if engine is None else path + "_bq_engine*.csv" + read_path = path.replace("*", FIRST_FILE) if engine is None else path scalars_df_index.to_csv(path) # df should only have 1 column which is bool_col. 
- df = session.read_csv(path, usecols=["bool_col"], engine=engine) + df = session.read_csv(read_path, usecols=["bool_col"], engine=engine) assert len(df.columns) == 1 diff --git a/tests/system/small/test_window.py b/tests/system/small/test_window.py index 34b56fcd73..e2f0fe999b 100644 --- a/tests/system/small/test_window.py +++ b/tests/system/small/test_window.py @@ -19,42 +19,31 @@ @pytest.mark.parametrize( ("windowing"), [ - (lambda x: x.expanding()), - (lambda x: x.rolling(3, min_periods=3)), - (lambda x: x.groupby(x % 2).rolling(3, min_periods=3)), - (lambda x: x.groupby(x % 3).expanding(min_periods=2)), - ], - ids=[ - "expanding", - "rolling", - "rollinggroupby", - "expandinggroupby", + pytest.param(lambda x: x.expanding(), id="expanding"), + pytest.param(lambda x: x.rolling(3, min_periods=3), id="rolling"), + pytest.param( + lambda x: x.groupby(x % 2).rolling(3, min_periods=3), id="rollinggroupby" + ), + pytest.param( + lambda x: x.groupby(x % 3).expanding(min_periods=2), id="expandinggroupby" + ), ], ) @pytest.mark.parametrize( ("agg_op"), [ - (lambda x: x.sum()), - (lambda x: x.min()), - (lambda x: x.max()), - (lambda x: x.mean()), - (lambda x: x.count()), - (lambda x: x.std()), - (lambda x: x.var()), - ], - ids=[ - "sum", - "min", - "max", - "mean", - "count", - "std", - "var", + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.std(), id="std"), + pytest.param(lambda x: x.var(), id="var"), ], ) def test_window_agg_ops(scalars_df_index, scalars_pandas_df_index, windowing, agg_op): col_name = "int64_too" - bf_series = agg_op(windowing(scalars_df_index[col_name])).compute() + bf_series = agg_op(windowing(scalars_df_index[col_name])).to_pandas() pd_series = agg_op(windowing(scalars_pandas_df_index[col_name])) # Pandas always converts to float64, even for min/max/count, which is not desired diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 92cb52a681..6f5c96da49 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -45,8 +45,10 @@ def test_standard_scaler_produces_correct_sql(): def test_one_hot_encoder_produces_correct_sql(): - sql = ml_sql.ml_one_hot_encoder("col_a", "encoded_col_a") - assert sql == "ML.ONE_HOT_ENCODER(col_a) OVER() AS encoded_col_a" + sql = ml_sql.ml_one_hot_encoder("col_a", "none", 1000000, 0, "encoded_col_a") + assert ( + sql == "ML.ONE_HOT_ENCODER(col_a, 'none', 1000000, 0) OVER() AS encoded_col_a" + ) def test_create_model_produces_correct_sql(): @@ -57,7 +59,7 @@ def test_create_model_produces_correct_sql(): ) assert ( sql - == """CREATE MODEL `my_dataset.my_model` + == """CREATE TEMP MODEL `my_dataset.my_model` my_options_sql AS my_source_sql""" ) @@ -65,14 +67,14 @@ def test_create_model_produces_correct_sql(): def test_create_model_transform_produces_correct_sql(): sql = ml_sql.create_model( - model_name="my_dataset.my_model", + model_name="my_model", source_sql="my_source_sql", options_sql="my_options_sql", transform_sql="my_transform_sql", ) assert ( sql - == """CREATE MODEL `my_dataset.my_model` + == """CREATE TEMP MODEL `my_model` my_transform_sql my_options_sql AS my_source_sql""" @@ -81,18 +83,30 @@ def test_create_model_transform_produces_correct_sql(): def test_create_remote_model_produces_correct_sql(): sql = ml_sql.create_remote_model( - model_name="my_dataset.my_model", + model_name="my_model", 
connection_name="my_project.us.my_connection", options_sql="my_options_sql", ) assert ( sql - == """CREATE MODEL `my_dataset.my_model` + == """CREATE TEMP MODEL `my_model` REMOTE WITH CONNECTION `my_project.us.my_connection` my_options_sql""" ) +def test_create_imported_model_produces_correct_sql(): + sql = ml_sql.create_imported_model( + model_name="my_model", + options_sql="my_options_sql", + ) + assert ( + sql + == """CREATE TEMP MODEL `my_model` +my_options_sql""" + ) + + def test_alter_model_correct_sql(): sql = ml_sql.alter_model( model_name="my_dataset.my_model", diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 2528618963..123dae7939 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -12,80 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. -from unittest import mock - import ibis -from ibis.expr.types import Column, Table +from ibis.expr.types import Table from bigframes import core +ORDERING = core.ExpressionOrdering( + [ + core.OrderingColumnReference("int64_col"), + core.OrderingColumnReference("string_col"), + ], + total_ordering_columns=frozenset(["int64_col", "string_col"]), +) + def test_constructor_from_ibis_table_adds_all_columns( session, scalars_ibis_table: Table ): - actual = core.ArrayValue(session=session, table=scalars_ibis_table) + columns = tuple(scalars_ibis_table[key] for key in scalars_ibis_table.columns) + actual = core.ArrayValue( + session=session, table=scalars_ibis_table, columns=columns, ordering=ORDERING + ) assert actual._table is scalars_ibis_table assert len(actual._columns) == len(scalars_ibis_table.columns) -def test_builder_doesnt_change_original(session): - mock_table = mock.create_autospec(Table) - mock_column = mock.create_autospec(Column) - original = core.ArrayValue(session=session, table=mock_table, columns=[mock_column]) - assert original._table is mock_table - assert len(original._columns) == 1 - assert original._columns[0] is mock_column - - # Create a new expression from a builder. - builder = original.builder() - new_table = mock.create_autospec(Table) - assert new_table is not mock_table - builder.table = new_table - new_column = mock.create_autospec(Column) - assert new_column is not mock_column - builder.columns.append(new_column) - actual = builder.build() - - # Expected values are present. - assert actual._table is new_table - assert len(actual._columns) == 2 - assert actual._columns[0] is mock_column - assert actual._columns[1] is new_column - # Don't modify the original. - assert original._table is mock_table - assert len(original._columns) == 1 - assert original._columns[0] is mock_column - - -def test_projection_doesnt_change_original(session): - mock_table = mock.create_autospec(Table) - mock_column = mock.create_autospec(Column) - original = core.ArrayValue(session=session, table=mock_table, columns=[mock_column]) - assert original._table is mock_table - assert len(original._columns) == 1 - assert original._columns[0] is mock_column - - # Create a new expression from a projection. - new_column_1 = mock.create_autospec(Column) - new_column_2 = mock.create_autospec(Column) - assert new_column_1 is not mock_column - assert new_column_2 is not mock_column - actual = original.projection([new_column_1, mock_column, new_column_2]) - - # Expected values are present. 
- assert actual._table is mock_table - assert len(actual._columns) == 3 - assert actual._columns[0] is new_column_1 - assert actual._columns[1] is mock_column - assert actual._columns[2] is new_column_2 - # Don't modify the original. - assert original._table is mock_table - assert len(original._columns) == 1 - assert original._columns[0] is mock_column - - def test_to_ibis_expr_with_projection(session, scalars_ibis_table: Table): - expr = core.ArrayValue(session=session, table=scalars_ibis_table).projection( + columns = tuple(scalars_ibis_table[key] for key in scalars_ibis_table.columns) + expr = core.ArrayValue( + session=session, table=scalars_ibis_table, columns=columns, ordering=ORDERING + ).projection( [ scalars_ibis_table["int64_col"], ibis.literal(123456789).name("literals"), diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index ec49315f0f..dafed08980 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -175,10 +175,9 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): def test_remote_function_io_types_are_supported_bigframes_types(): - from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type + from bigframes.remote_function import ( + _supported_io_ibis_types as rf_supported_io_ibis_types, + ) - from bigframes.remote_function import _supported_io_types as rf_supported_io_types - - for python_type in rf_supported_io_types: - ibis_type = python_type_to_bigquery_type(python_type) + for ibis_type in rf_supported_io_ibis_types: assert ibis_type in bigframes.dtypes.IBIS_TO_BIGFRAMES diff --git a/tests/unit/test_formatting_helper.py b/tests/unit/test_formatting_helper.py new file mode 100644 index 0000000000..ea29869e82 --- /dev/null +++ b/tests/unit/test_formatting_helper.py @@ -0,0 +1,17 @@ +import pytest + +import bigframes.formatting_helpers as formatter + + +@pytest.mark.parametrize( + "test_input, expected", [(None, "N/A"), ("string", "N/A"), (100000, "100.0 kB")] +) +def test_get_formatted_bytes(test_input, expected): + assert formatter.get_formatted_bytes(test_input) == expected + + +@pytest.mark.parametrize( + "test_input, expected", [(None, None), ("string", "string"), (100000, "a minute")] +) +def test_get_formatted_time(test_input, expected): + assert formatter.get_formatted_time(test_input) == expected diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py new file mode 100644 index 0000000000..9db9b372e2 --- /dev/null +++ b/tests/unit/test_formatting_helpers.py @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest.mock as mock + +import google.api_core.exceptions as api_core_exceptions +import google.cloud.bigquery as bigquery +import pytest + +import bigframes.constants as constants +import bigframes.formatting_helpers as formatting_helpers + + +def test_wait_for_query_job_error_includes_feedback_link(): + mock_query_job = mock.create_autospec(bigquery.QueryJob) + mock_query_job.result.side_effect = api_core_exceptions.BadRequest( + "Test message 123." + ) + + with pytest.raises(api_core_exceptions.BadRequest) as cap_exc: + formatting_helpers.wait_for_query_job(mock_query_job) + + cap_exc.match("Test message 123.") + cap_exc.match(constants.FEEDBACK_LINK) + + +def test_wait_for_job_error_includes_feedback_link(): + mock_job = mock.create_autospec(bigquery.LoadJob) + mock_job.result.side_effect = api_core_exceptions.BadRequest("Test message 123.") + + with pytest.raises(api_core_exceptions.BadRequest) as cap_exc: + formatting_helpers.wait_for_job(mock_job) + + cap_exc.match("Test message 123.") + cap_exc.match(constants.FEEDBACK_LINK) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 3e8e16ee6a..a178a45438 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -15,10 +15,12 @@ import inspect import re import sys +import unittest.mock as mock +import pandas as pd import pytest -import bigframes.pandas +import bigframes.pandas as bpd import bigframes.session leading_whitespace = re.compile(r"^\s+", flags=re.MULTILINE) @@ -79,3 +81,31 @@ def test_method_matches_session(method_name: str): 1: ] assert pandas_signature.return_annotation == session_signature.return_annotation + + +def test_cut_raises_with_labels(): + with pytest.raises(NotImplementedError, match="Only labels=False"): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) + bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) + + +@pytest.mark.parametrize( + ("bins",), + ( + (0,), + (-1,), + ), +) +def test_cut_raises_with_invalid_bins(bins: int): + with pytest.raises(ValueError, match="`bins` should be a positive integer."): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) + bigframes.pandas.cut(mock_series, bins, labels=False) + + +def test_pandas_attribute(): + assert bpd.NA is pd.NA + assert bpd.BooleanDtype is pd.BooleanDtype + assert bpd.Float64Dtype is pd.Float64Dtype + assert bpd.Int64Dtype is pd.Int64Dtype + assert bpd.StringDtype is pd.StringDtype + assert bpd.ArrowDtype is pd.ArrowDtype diff --git a/third_party/bigframes_vendored/ibis/LICENSE.txt b/third_party/bigframes_vendored/ibis/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/third_party/bigframes_vendored/ibis/README.md b/third_party/bigframes_vendored/ibis/README.md new file mode 100644 index 0000000000..8a00750e92 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/README.md @@ -0,0 +1,196 @@ +# Ibis + +[![Documentation Status](https://img.shields.io/badge/docs-docs.ibis--project.org-blue.svg)](http://ibis-project.org) +[![Anaconda-Server Badge](https://anaconda.org/conda-forge/ibis-framework/badges/version.svg)](https://anaconda.org/conda-forge/ibis-framework) +[![PyPI](https://img.shields.io/pypi/v/ibis-framework.svg)](https://pypi.org/project/ibis-framework) +[![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml?query=branch%3Amaster) +[![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml?query=branch%3Amaster) +[![Codecov branch](https://img.shields.io/codecov/c/github/ibis-project/ibis/master.svg)](https://codecov.io/gh/ibis-project/ibis) + +## What is Ibis? + +Ibis is a Python library that provides a lightweight, universal interface for data wrangling. It helps Python users explore and transform data of any size, stored anywhere. + +Ibis has three primary components: + +1. **A dataframe API for Python**. + Python users can write Ibis code to manipulate tabular data. +2. **Interfaces to 15+ query engines.** + Wherever data is stored, people can use Ibis as their API of choice to communicate with any of those query engines. +3. **Deferred execution**. + Ibis uses deferred execution, so execution of code is pushed to the query engine. + Users can execute at the speed of their backend, not their local computer. + +## Why Use Ibis? + +Ibis aims to be a future-proof solution to interacting with data using Python and can accomplish this goal through its main features: + +- **Familiar API**: Ibis’s API design borrows from popular APIs like pandas and dplyr that most users already know and like to use. +- **Consistent syntax**: Ibis aims to be a universal Python API for tabular data of any size, big or small. +- **Deferred execution**: Ibis pushes code execution to the query engine and only moves required data into memory when necessary. + Analytics workflows are faster and more efficient +- **Interactive mode**: Ibis provides an interactive mode in which users can quickly diagnose problems, explore data, and mock up workflows and pipelines locally. +- **10+ supported backends**: Ibis supports multiple query engines and DataFrame APIs. + Use one interface to transform with your data wherever it lives: from DataFrames in pandas to Parquet files through DuckDB to tables in BigQuery. +- **Minimize rewrites**: Teams can often keep their Ibis code the same regardless of backend changes, like increasing or decreasing computing power, changing the number or size of their databases, or switching backends entirely. +- **Flexibility when you need it**: When Ibis doesn't support something, it provides a way to jump directly into SQL. + +## Common Use Cases + +- **Speed up prototype to production.** + Scale code written and tested locally to a distributed system or cloud SQL engine with minimal rewrites. +- **Boost performance of existing Python or pandas code.** + For example a general rule of thumb for pandas is "Have 5 to 10 times as much RAM as the size of your dataset". 
+ When a dataset exceeds this rule using in-memory frameworks like pandas can be slow. + Instead, using Ibis will significantly speed up your workflows because of its deferred execution. + Ibis also empowers you to switch to a faster database engine, without changing much of your code. +- **Get rid of long, error-prone, `f`-strings.** + Ibis provides one syntax for multiple query engines and dataframe APIs that lets you avoid learning new flavors of SQL or other framework-specific code. + Learn the syntax once and use that syntax anywhere. + +## Backends + +Ibis acts as a universal frontend to the following systems: + +- [Apache Arrow DataFusion](https://ibis-project.org/backends/datafusion/) (experimental) +- [Apache Druid](https://ibis-project.org/backends/druid/) (experimental) +- [Apache Impala](https://ibis-project.org/backends/impala/) +- [Apache PySpark](https://ibis-project.org/backends/pyspark/) +- [BigQuery](https://ibis-project.org/backends/bigquery/) +- [ClickHouse](https://ibis-project.org/backends/clickhouse/) +- [Dask](https://ibis-project.org/backends/dask/) +- [DuckDB](https://ibis-project.org/backends/duckdb/) +- [HeavyAI](https://github.com/heavyai/ibis-heavyai) +- [MySQL](https://ibis-project.org/backends/mysql/) +- [Oracle](https://ibis-project.org/backends/oracle/) (experimental) +- [Pandas](https://ibis-project.org/backends/pandas/) +- [Polars](https://ibis-project.org/backends/polars/) (experimental) +- [PostgreSQL](https://ibis-project.org/backends/postgresql/) +- [SQL Server](https://ibis-project.org/backends/mssql/) +- [SQLite](https://ibis-project.org/backends/sqlite/) +- [Snowflake](https://ibis-project.org/backends/snowflake) (experimental) +- [Trino](https://ibis-project.org/backends/trino/) (experimental) + +The list of supported backends is continuously growing. Anyone can get involved +in adding new ones! Learn more about contributing to ibis in our contributing +documentation at https://github.com/ibis-project/ibis/blob/master/docs/CONTRIBUTING.md + +## Installation + +Install Ibis from PyPI with: + +```bash +pip install 'ibis-framework[duckdb]' +``` + +Or from conda-forge with: + +```bash +conda install ibis-framework -c conda-forge +``` + +(It’s a common mistake to `pip install ibis`. If you try to use Ibis and get errors early on try uninstalling `ibis` and installing `ibis-framework`) + +To discover ibis, we suggest starting with the DuckDB backend (which is included by default in the conda-forge package). The DuckDB backend is performant and fully featured. + +To use ibis with other backends, include the backend name in brackets for PyPI: + +```bash +pip install 'ibis-framework[postgres]' +``` + +Or use `ibis-$BACKEND` where `$BACKEND` is the specific backend you want to use when installing from conda-forge: + +```bash +conda install ibis-postgres -c conda-forge +``` + +## Getting Started with Ibis + +We provide a number of tutorial and example notebooks in the +[ibis-examples](https://github.com/ibis-project/ibis-examples). The easiest way +to try these out is through the online interactive notebook environment +provided here: +[![Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ibis-project/ibis-examples/main) + +You can also get started analyzing any dataset, anywhere with just a few lines +of Ibis code. Here’s an example of how to use Ibis with a SQLite database. + +Download the SQLite database from the `ibis-tutorial-data` GCS (Google Cloud +Storage) bucket, then connect to it using ibis. 
+ +```bash +curl -LsS -o geography.db 'https://storage.googleapis.com/ibis-tutorial-data/geography.db' +``` + +Connect to the database and show the available tables + +```python +>>> import ibis +>>> from ibis import _ +>>> ibis.options.interactive = True +>>> con = ibis.sqlite.connect("geography.db") +>>> con.tables +Tables +------ +- countries +- gdp +- independence +``` + +Choose the `countries` table and preview its first few rows + +```python +>>> countries = con.tables.countries +>>> countries.head() +┏━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ iso_alpha2 ┃ iso_alpha3 ┃ iso_numeric ┃ fips ┃ name ┃ capital ┃ area_km2 ┃ population ┃ continent ┃ +┡━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ string │ string │ int32 │ string │ string │ string │ float64 │ int32 │ string │ +├────────────┼────────────┼─────────────┼────────┼──────────────────────┼──────────────────┼──────────┼────────────┼───────────┤ +│ AD │ AND │ 20 │ AN │ Andorra │ Andorra la Vella │ 468.0 │ 84000 │ EU │ +│ AE │ ARE │ 784 │ AE │ United Arab Emirates │ Abu Dhabi │ 82880.0 │ 4975593 │ AS │ +│ AF │ AFG │ 4 │ AF │ Afghanistan │ Kabul │ 647500.0 │ 29121286 │ AS │ +│ AG │ ATG │ 28 │ AC │ Antigua and Barbuda │ St. Johns │ 443.0 │ 86754 │ NA │ +│ AI │ AIA │ 660 │ AV │ Anguilla │ The Valley │ 102.0 │ 13254 │ NA │ +└────────────┴────────────┴─────────────┴────────┴──────────────────────┴──────────────────┴──────────┴────────────┴───────────┘ +``` + +Show the 5 least populous countries in Asia + +```python + +>>> ( +... countries.filter(_.continent == "AS") +... .select("name", "population") +... .order_by(_.population) +... .limit(5) +... ) +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ +┃ name ┃ population ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ +│ string │ int32 │ +├────────────────────────────────┼────────────┤ +│ Cocos [Keeling] Islands │ 628 │ +│ British Indian Ocean Territory │ 4000 │ +│ Brunei │ 395027 │ +│ Maldives │ 395650 │ +│ Macao │ 449198 │ +└────────────────────────────────┴────────────┘ +``` + +## Community and Contributing + +Ibis is an open source project and welcomes contributions from anyone in the community. + +- Read [the contributing guide](https://github.com/ibis-project/ibis/blob/master/docs/CONTRIBUTING.md). +- We care about keeping the community welcoming for all. Check out [the code of conduct](https://github.com/ibis-project/ibis/blob/master/docs/CODE_OF_CONDUCT.md). +- The Ibis project is open sourced under the [Apache License](https://github.com/ibis-project/ibis/blob/master/LICENSE.txt). + +Join our community here: + +- Twitter: https://twitter.com/IbisData +- Gitter: https://gitter.im/ibis-dev/Lobby +- StackOverflow: https://stackoverflow.com/questions/tagged/ibis + +For more information visit https://ibis-project.org/. 
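The vendored README above lists BigQuery among Ibis's backends, and that backend is the layer BigQuery DataFrames builds on. As a rough, hedged sketch of what using Ibis's BigQuery backend directly looks like (it assumes the `ibis-framework[bigquery]` extra is installed; the billing project below is a placeholder, while the dataset and table are from BigQuery's public data):

```python
import ibis

ibis.options.interactive = True

# "my-billing-project" is a placeholder; any project you can run jobs in works.
con = ibis.bigquery.connect(
    project_id="my-billing-project",
    dataset_id="bigquery-public-data.usa_names",
)

names = con.table("usa_1910_current")

# Five states with the most recorded names, computed server-side in BigQuery.
(
    names.group_by("state")
    .aggregate(total=names.number.sum())
    .order_by(ibis.desc("total"))
    .limit(5)
)
```

Deferred execution applies here exactly as the README describes: the aggregation is compiled to SQL and run in BigQuery, and only the handful of result rows is pulled back locally.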
diff --git a/third_party/bigframes_vendored/ibis/__init__.py b/third_party/bigframes_vendored/ibis/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/ibis/backends/__init__.py b/third_party/bigframes_vendored/ibis/backends/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py new file mode 100644 index 0000000000..d209284ab7 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -0,0 +1,19 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/registry.py +"""Module to convert from Ibis expression to SQL string.""" + +from ibis.backends.bigquery.registry import OPERATION_REGISTRY + +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops + + +def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): + arg = translator.translate(op.arg) + num_bins = translator.translate(op.num_bins) + return f"APPROX_QUANTILES({arg}, {num_bins})" + + +patched_ops = { + vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, +} + +OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/__init__.py b/third_party/bigframes_vendored/ibis/expr/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py new file mode 100644 index 0000000000..f3ab753a3b --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -0,0 +1,4 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py +from __future__ import annotations + +from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py new file mode 100644 index 0000000000..5e6ad9ecf2 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -0,0 +1,23 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/reductions.py + +from __future__ import annotations + +import ibis.expr.datatypes as dt +from ibis.expr.operations.reductions import Filterable, Reduction +import ibis.expr.rules as rlz + + +class ApproximateMultiQuantile(Filterable, Reduction): + """Calculate (approximately) evenly-spaced quantiles. + + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles + """ + + arg = rlz.any + num_bins = rlz.value(dt.int64) + output_dtype = dt.Array(dt.float64) + + +__all__ = [ + "ApproximateMultiQuantile", +] diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 10f5546f10..198654015e 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -16,10 +16,44 @@ Encapsulates configuration for displaying objects. 
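The vendored `reductions.py` and `registry.py` above define an `ApproximateMultiQuantile` reduction and register a BigQuery translation for it (`APPROX_QUANTILES`). A minimal sketch of how the patched operation might be exercised, assuming the vendored modules are importable under `third_party.bigframes_vendored.ibis` as laid out in this tree (the table and column names are made up for illustration):

```python
import ibis

# Importing the vendored registry module applies the OPERATION_REGISTRY patch.
import third_party.bigframes_vendored.ibis.backends.bigquery.registry  # noqa: F401
import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops

t = ibis.table({"x": "float64"}, name="samples")

# Build the reduction node and wrap it back into an ordinary Ibis expression.
quartiles = vendored_ibis_ops.ApproximateMultiQuantile(t.x, num_bins=4).to_expr()

# With the registry patched, the BigQuery compiler should render this as
# APPROX_QUANTILES(`x`, 4).
print(ibis.bigquery.compile(quartiles.name("quartiles")))
```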
Attributes: - max_columns (int): - If max_columns is exceeded, switch to truncate view. - max_rows (int): - If max_rows is exceeded, switch to truncate view. - progress_bar Optional(str): - Determines if progress bars are shown during job runs. Use auto | notebook | terminal. Set to 'None' to remove progress bars + max_columns (int, default 20): + If `max_columns` is exceeded, switch to truncate view. + max_rows (int, default 25): + If `max_rows` is exceeded, switch to truncate view. + progress_bar (Optional[str], default "auto"): + Determines if progress bars are shown during job runs. + Valid values are `auto`, `notebook`, and `terminal`. Set + to `None` to remove progress bars. + repr_mode (Literal[`head`, `deferred`]): + `head`: + Execute, download, and display results (limited to head) from + DataFrame and Series objects during repr. + `deferred`: + Prevent executions from repr statements in DataFrame and Series objects. + Instead, estimated bytes processed will be shown. DataFrame and Series + objects can still be computed with methods that explicitly execute and + download results. +""" + +sampling_options_doc = """ +Encapsulates configuration for data sampling. + +Attributes: + max_download_size (int, default 500): + Download size threshold in MB. If set to None, the download size + won't be checked. + enable_downsampling (bool, default False): + Whether to enable downsampling. If max_download_size is exceeded when + downloading data (e.g., to_pandas()), the data will be downsampled + if enable_downsampling is True; otherwise, an error will be raised. + sampling_method (str, default "uniform"): + Downsampling algorithm to be chosen from; the choices are: + "head": This algorithm returns a portion of the data from + the beginning. It is fast and requires minimal computations + to perform the downsampling. "uniform": This algorithm returns + uniform random samples of the data. + random_state (int, default None): + The seed for the uniform downsampling algorithm. If provided, + the uniform method may take longer to execute and require more + computation. """ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 93b7c7683c..653b65c834 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -13,6 +13,8 @@ from typing import Iterable, Literal, Mapping, Optional, Sequence, Union +import numpy + from third_party.bigframes_vendored.pandas.core.generic import NDFrame # ----------------------------------------------------------------------- @@ -33,8 +35,61 @@ def shape(self) -> tuple[int, int]: """Return a tuple representing the dimensionality of the DataFrame.""" raise NotImplementedError("abstract method") + @property + def axes(self) -> list: + """ + Return a list representing the axes of the DataFrame. + + It has the row axis labels and column axis labels as the only members. + They are returned in that order. + + Examples + .. code-block:: + + df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + df.axes + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], + dtype='object')] + """ + return [self.index, self.columns] + + @property + def values(self) -> numpy.ndarray: + """Return the values of DataFrame in the form of a NumPy array. + + Args: + dtype (default None): + The dtype to pass to `numpy.asarray()`. + copy (bool, default False): + Whether to ensure that the returned value is not a view + on another array.
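The display and sampling docstrings above describe options surfaced on the BigQuery DataFrames options object. A hedged sketch of setting them, assuming they are reachable as `bigframes.pandas.options.display` and `bigframes.pandas.options.sampling` (the attribute names come straight from the docstrings; the accessor path is an assumption):

```python
import bigframes.pandas as bpd

# Display: truncate previews sooner than the documented defaults (20 columns, 25 rows).
bpd.options.display.max_columns = 10
bpd.options.display.max_rows = 10

# Show a terminal-style progress bar for BigQuery jobs; None would hide it.
bpd.options.display.progress_bar = "terminal"

# Sampling: allow uniform downsampling when a download would exceed the size threshold.
bpd.options.sampling.enable_downsampling = True
bpd.options.sampling.max_download_size = 250  # MB
bpd.options.sampling.sampling_method = "uniform"
bpd.options.sampling.random_state = 42
```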
+ na_value (default None): + The value to use for missing values. + """ + raise NotImplementedError("abstract method") + # ---------------------------------------------------------------------- # IO methods (to / from other formats) + def to_numpy( + self, dtype=None, copy=False, na_value=None, **kwargs + ) -> numpy.ndarray: + """ + Convert the DataFrame to a NumPy array. + + Args: + dtype (None): + The dtype to pass to `numpy.asarray()`. + copy (bool, default None): + Whether to ensure that the returned value is not a view + on another array. + na_value (Any, default None): + The value to use for missing values. The default value + depends on dtype and the dtypes of the DataFrame columns. + + Returns: + numpy.ndarray: The converted NumPy array. + """ + raise NotImplementedError("abstract method") def to_gbq( self, @@ -42,15 +97,16 @@ def to_gbq( *, if_exists: Optional[Literal["fail", "replace", "append"]] = "fail", index: bool = True, + ordering_id: Optional[str] = None, ) -> None: - """Write a DataFrame to a Google BigQuery table. + """Write a DataFrame to a BigQuery table. Args: - destination_table : str + destination_table (str): Name of table to be written, in the form ``dataset.tablename`` or ``project.dataset.tablename``. - if_exists : str, default 'fail' + if_exists (str, default 'fail'): Behavior when the destination table exists. Value can be one of: ``'fail'`` @@ -60,7 +116,12 @@ def to_gbq( ``'append'`` If table exists, insert data. Create if does not exist. - index: whether write row names (index) or not. + index (bool. default True): + whether write row names (index) or not. + + ordering_id (Optional[str], default None): + If set, write the ordering of the DataFrame as a column in the + result table with this name. """ raise NotImplementedError("abstract method") @@ -70,19 +131,19 @@ def to_parquet( *, index: bool = True, ) -> None: - """Write a DataFrame to the binary parquet format. + """Write a DataFrame to the binary Parquet format. This function writes the dataframe as a `parquet file - `_ to Google Cloud Storage. + `_ to Cloud Storage. Args: - path: - Destination URI(s) of GCS files(s) to store the extracted dataframe + path (str): + Destination URI(s) of Cloud Storage files(s) to store the extracted dataframe in format of ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. - index : bool, default None + index (bool, default True): If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -101,6 +162,12 @@ def assign(self, **kwargs) -> DataFrame: Returns a new object with all original columns in addition to new ones. Existing columns that are re-assigned will be overwritten. + .. note:: + Assigning multiple columns within the same ``assign`` is possible. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in + order. + Args: kwargs: A dictionary of ``{str: values}``. The column names are @@ -108,14 +175,8 @@ def assign(self, **kwargs) -> DataFrame: are simply assigned to the column. Returns: - A new DataFrame with the new columns in addition to + DataFrame: A new DataFrame with the new columns in addition to all the existing columns. - - Notes: - Assigning multiple columns within the same ``assign`` is possible. 
- Later items in '\*\*kwargs' may refer to newly created or modified - columns in 'df'; items are computed and assigned into 'df' in - order. """ raise NotImplementedError("abstract method") @@ -123,24 +184,31 @@ def assign(self, **kwargs) -> DataFrame: # Reindexing and alignment def drop( - self, - *, - columns: Union[str, Iterable[str]], + self, labels=None, *, axis=0, index=None, columns=None, level=None ) -> DataFrame | None: """Drop specified labels from columns. Remove columns by directly specifying column names. Args: + labels: + Index or column labels to drop. + axis: + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index: + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). columns: - Single label or list-like. - + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + level: + For MultiIndex, level from which the labels will be removed. Returns: - DataFrame without the removed column labels. + DataFrame: DataFrame without the removed column labels. Raises: - KeyError: - If any of the labels is not found in the selected axis. + KeyError: If any of the labels is not found in the selected axis. """ raise NotImplementedError("abstract method") @@ -155,15 +223,14 @@ def rename( will be left as-is. Extra labels listed don't throw an error. Args: - columns: + columns (Mapping): Dict-like from old column labels to new column labels. Returns: - DataFrame with the renamed axis labels. + DataFrame: DataFrame with the renamed axis labels. Raises: - KeyError: - If any of the labels is not found. + KeyError: If any of the labels is not found. """ raise NotImplementedError("abstract method") @@ -171,14 +238,16 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: """ Set the name of the axis for the index. - Note: currently only accepts a single string parameter (the new name of the index) + .. Note:: + + Currently only accepts a single string parameter (the new name of the index). Args: - mapper : str + mapper str: Value to set the axis name attribute. Returns: - DataFrame with the new index name + DataFrame: DataFrame with the new index name """ raise NotImplementedError("abstract method") @@ -197,12 +266,11 @@ def set_index( Args: keys: A label. This parameter can be a single column key. - drop: + drop : Delete columns to be used as the new index. Returns: - DataFrame: - Changed row labels. + DataFrame: Changed row labels. """ raise NotImplementedError("abstract method") @@ -211,12 +279,12 @@ def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: Rearrange index levels using input order. May not drop or duplicate levels. Args: - order: list of int or list of str + order (list of int or list of str): List representing new level order. Reference level by number (position) or by key (label). Returns: - DataFrame + DataFrame: DataFrame of rearranged index. """ raise NotImplementedError("abstract method") @@ -225,12 +293,12 @@ def droplevel(self, level): Return DataFrame with requested index / column level(s) removed. Args: - level: int, str, or list-like + level (int, str, or list-like): If a string is given, must be the name of a level If list-like, elements must be names or positional indexes of levels. Returns: - DataFrame with requested index / column level(s) removed. + DataFrame: DataFrame with requested index / column level(s) removed. 
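The label and index manipulation docstrings here (drop, rename, set_index, droplevel, and the reset_index that follows) mirror the pandas API that these vendored stubs re-document. As a small illustration of the same calls using plain pandas, with the bigframes versions intended to behave analogously where implemented:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})

df = df.set_index("a")               # use column "a" as the row labels
df = df.rename(columns={"b": "x"})   # relabel column "b" to "x"
df = df.drop(columns=["c"])          # remove column "c"
df = df.reset_index()                # restore the default integer index
```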
""" raise NotImplementedError("abstract method") @@ -244,12 +312,12 @@ def reset_index( Reset the index of the DataFrame, and use the default one instead. Args: - drop: + drop (bool, default False): Do not try to insert index into dataframe columns. This resets the index to the default integer index. Returns: - DataFrame with the new index. + DataFrame: DataFrame with the new index. """ raise NotImplementedError("abstract method") @@ -265,10 +333,10 @@ def drop_duplicates( are ignored. Args: - subset : column label or sequence of labels, optional + subset (column label or sequence of labels, optional): Only consider certain columns for identifying duplicates, by default use all of the columns. - keep : {'first', 'last', ``False``}, default 'first' + keep ({'first', 'last', ``False``}, default 'first'): Determines which duplicates (if any) to keep. - 'first' : Drop duplicates except for the first occurrence. @@ -276,7 +344,7 @@ def drop_duplicates( - ``False`` : Drop all duplicates. Returns: - DataFrame with duplicates removed + DataFrame: DataFrame with duplicates removed """ raise NotImplementedError("abstract method") @@ -287,10 +355,10 @@ def duplicated(self, subset=None, keep="first"): Considering certain columns is optional. Args: - subset : column label or sequence of labels, optional + subset (column label or sequence of labels, optional): Only consider certain columns for identifying duplicates, by default use all of the columns. - keep : {'first', 'last', False}, default 'first' + keep ({'first', 'last', False}, default 'first'): Determines which duplicates (if any) to mark. - ``first`` : Mark duplicates as ``True`` except for the first occurrence. @@ -311,7 +379,7 @@ def dropna( """Remove missing values. Returns: - DataFrame with NA entries dropped from it. + DataFrame: DataFrame with NA entries dropped from it. """ raise NotImplementedError("abstract method") @@ -323,18 +391,23 @@ def sort_values( by: str | Sequence[str], *, ascending: bool | Sequence[bool] = True, + kind: str = "quicksort", na_position="last", ) -> DataFrame: """Sort by the values along row axis. Args: - by: + by (str or Sequence[str]): Name or list of names to sort by. - ascending: + ascending (bool or Sequence[bool], default True): Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - na_position: + kind (str, default `quicksort`): + Choice of sorting algorithm. Accepts 'quicksort’, ‘mergesort’, + ‘heapsort’, ‘stable’. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. + na_position ({'first', 'last'}, default `last`): ``{'first', 'last'}``, default 'last' Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. @@ -356,8 +429,9 @@ def sort_index( # ---------------------------------------------------------------------- # Arithmetic Methods - def le(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`). + def eq(self, other, axis: str | int = "columns") -> DataFrame: + """ + Get equal to of DataFrame and other, element-wise (binary operator `eq`). Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. @@ -366,24 +440,20 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: (rows or columns) and level for comparison. 
Args: - other: scalar, sequence, Series, or DataFrame + other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. - axis: {{0 or 'index', 1 or 'columns'}}, default 'columns' + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). Returns: - DataFrame of bool. The result of the comparison. - - Notes: - Mismatched indices will be unioned together. `NaN` values in - floating point columns are considered different - (i.e. `NaN` != `NaN`). + Result of the comparison. """ raise NotImplementedError("abstract method") - def lt(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'less than' of dataframe and other, element-wise (binary operator `<`). + def ne(self, other, axis: str | int = "columns") -> DataFrame: + """ + Get not equal to of DataFrame and other, element-wise (binary operator `ne`). Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. @@ -392,24 +462,44 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: (rows or columns) and level for comparison. Args: - other: scalar, sequence, Series, or DataFrame + other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. - axis: {{0 or 'index', 1 or 'columns'}}, default 'columns' + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). - Returns: - DataFrame of bool. The result of the comparison. + DataFrame: Result of the comparison. + """ + raise NotImplementedError("abstract method") - Notes: + def le(self, other, axis: str | int = "columns") -> DataFrame: + """Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`). + + Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison + operators. + + Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis + (rows or columns) and level for comparison. + + .. note:: Mismatched indices will be unioned together. `NaN` values in floating point columns are considered different (i.e. `NaN` != `NaN`). + + Args: + other (scalar, sequence, Series, or DataFrame): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). + + Returns: + DataFrame: DataFrame of bool. The result of the comparison. """ raise NotImplementedError("abstract method") - def ge(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'greater than or equal to' of dataframe and other, element-wise (binary operator `>=`). + def lt(self, other, axis: str | int = "columns") -> DataFrame: + """Get 'less than' of DataFrame and other, element-wise (binary operator `<`). Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. @@ -417,25 +507,51 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + .. note:: + Mismatched indices will be unioned together. `NaN` values in + floating point columns are considered different + (i.e. `NaN` != `NaN`). + Args: - other: scalar, sequence, Series, or DataFrame + other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. 
- axis: {{0 or 'index', 1 or 'columns'}}, default 'columns' + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). Returns: - DataFrame of bool. The result of the comparison. + DataFrame: DataFrame of bool. The result of the comparison. + """ + raise NotImplementedError("abstract method") - Notes: + def ge(self, other, axis: str | int = "columns") -> DataFrame: + """Get 'greater than or equal to' of DataFrame and other, element-wise (binary operator `>=`). + + Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison + operators. + + Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis + (rows or columns) and level for comparison. + + .. note:: Mismatched indices will be unioned together. `NaN` values in floating point columns are considered different (i.e. `NaN` != `NaN`). + + Args: + other (scalar, sequence, Series, or DataFrame): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). + + Returns: + DataFrame: DataFrame of bool. The result of the comparison. """ raise NotImplementedError("abstract method") def gt(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'greater than' of dataframe and other, element-wise (binary operator `>`). + """Get 'greater than' of DataFrame and other, element-wise (binary operator `>`). Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. @@ -443,260 +559,250 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + .. note:: + Mismatched indices will be unioned together. `NaN` values in + floating point columns are considered different + (i.e. `NaN` != `NaN`). + Args: - other: scalar, sequence, Series, or DataFrame + other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. - axis: {{0 or 'index', 1 or 'columns'}}, default 'columns' + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). Returns: - DataFrame of bool. The result of the comparison. - - Notes: - Mismatched indices will be unioned together. `NaN` values in - floating point columns are considered different - (i.e. `NaN` != `NaN`). + DataFrame: DataFrame of bool: The result of the comparison. """ raise NotImplementedError("abstract method") def add(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'addition' of dataframe and other, element-wise (binary operator `+`). + """Get addition of DataFrame and other, element-wise (binary operator `+`). Equivalent to ``dataframe + other``. With reverse version, `radd`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. 
+ (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def sub(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'subtraction' of dataframe and other, element-wise (binary operator `-`). + """Get subtraction of DataFrame and other, element-wise (binary operator `-`). Equivalent to ``dataframe - other``. With reverse version, `rsub`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def rsub(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'subtraction' of dataframe and other, element-wise (binary operator `-`). + """Get subtraction of DataFrame and other, element-wise (binary operator `-`). Equivalent to ``other - dataframe``. With reverse version, `sub`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def mul(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'multiplication' of dataframe and other, element-wise (binary operator `*`). + """Get multiplication of DataFrame and other, element-wise (binary operator `*`). Equivalent to ``dataframe * other``. With reverse version, `rmul`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). 
For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def truediv(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'floating division' of dataframe and other, element-wise (binary operator `/`). + """Get floating division of DataFrame and other, element-wise (binary operator `/`). Equivalent to ``dataframe / other``. With reverse version, `rtruediv`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'floating division' of dataframe and other, element-wise (binary operator `/`). + """Get floating division of DataFrame and other, element-wise (binary operator `/`). Equivalent to ``other / dataframe``. With reverse version, `truediv`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. Returns: DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. """ raise NotImplementedError("abstract method") def floordiv(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'integer division' of dataframe and other, element-wise (binary operator `//`). + """Get integer division of DataFrame and other, element-wise (binary operator `//`). Equivalent to ``dataframe // other``. With reverse version, `rfloordiv`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). 
For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'integer division' of dataframe and other, element-wise (binary operator `//`). + """Get integer division of DataFrame and other, element-wise (binary operator `//`). Equivalent to ``other // dataframe``. With reverse version, `rfloordiv`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def mod(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'modulo' of dataframe and other, element-wise (binary operator `%`). + """Get modulo of DataFrame and other, element-wise (binary operator `%`). Equivalent to ``dataframe % other``. With reverse version, `rmod`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: other: Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") def rmod(self, other, axis: str | int = "columns") -> DataFrame: - """Get 'modulo' of dataframe and other, element-wise (binary operator `%`). + """Get modulo of DataFrame and other, element-wise (binary operator `%`). Equivalent to ``other % dataframe``. With reverse version, `mod`. Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + .. note:: + Mismatched indices will be unioned together. + Args: - other: + other (float, int, or Series): Any single or multiple element data structure, or list-like object. - axis: - ``{{0 or 'index', 1 or 'columns'}}``. Whether to compare by the - index (0 or 'index') or columns. (1 or 'columns'). For Series - input, axis to match Series index on. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. 
Returns: - DataFrame result of the arithmetic operation. - - Notes: - Mismatched indices will be unioned together. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError("abstract method") @@ -719,20 +825,20 @@ def groupby( groups. Args: - by: + by (str, Sequence[str]): A label or list of labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted as a (single) key. - level : int, level name, or sequence of such, default None + level (int, level name, or sequence of such, default None): If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both ``by`` and ``level``. - as_index: + as_index (bool, default True): Default True. Return object with group labels as the index. Only relevant for DataFrame input. ``as_index=False`` is effectively "SQL-style" grouped output. This argument has no effect on filtrations such as ``head()``, ``tail()``, ``nth()`` and in transformations. - dropna: + dropna (bool, default True): Default True. If True, and if group keys contain NA values, NA values together with row/column will be dropped. If False, NA values will also be treated as the key in groups. @@ -748,31 +854,31 @@ def groupby( def map(self, func, na_action: Optional[str] = None) -> DataFrame: """Apply a function to a Dataframe elementwise. + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + .. note:: In pandas 2.1.0, DataFrame.applymap is deprecated and renamed to DataFrame.map. - This method applies a function that accepts and returns a scalar - to every element of a DataFrame. - Args: func: Python function wrapped by ``remote_function`` decorator, returns a single value from a single value. - na_action: + na_action (Optional[str], default None): ``{None, 'ignore'}``, default None. If ‘ignore’, propagate NaN values, without passing them to func. Returns: - Transformed DataFrame. + DataFrame: Transformed DataFrame. """ raise NotImplementedError("abstract method") # ---------------------------------------------------------------------- # Merging / joining methods - def join(self, other, *, how: str) -> DataFrame: + def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: """Join columns of another DataFrame. Join columns with `other` DataFrame on index @@ -780,22 +886,20 @@ def join(self, other, *, how: str) -> DataFrame: Args: other: DataFrame with an Index similar to the Index of this one. - how: - ``{'left', 'right', 'outer', 'inner'}, default 'left'`` - + on: + Column in the caller to join on the index in other, otherwise + joins index-on-index. Like an Excel VLOOKUP operation. + how ({'left', 'right', 'outer', 'inner'}, default 'left'`): How to handle the operation of the two objects. - - * left: use calling frame's index (or column if on is specified) - * right: use `other`'s index. - * outer: form union of calling frame's index (or column if on is - specified) with `other`'s index, and sort it. - lexicographically. - * inner: form intersection of calling frame's index (or column if - on is specified) with `other`'s index, preserving the order - of the calling's one. + ``left``: use calling frame's index (or column if on is specified) + ``right``: use `other`'s index. ``outer``: form union of calling + frame's index (or column if on is specified) with `other`'s index, + and sort it lexicographically. 
``inner``: form intersection of + calling frame's index (or column if on is specified) with `other`'s + index, preserving the order of the calling's one. Returns: - A dataframe containing columns from both the caller and `other`. + DataFrame: A dataframe containing columns from both the caller and `other`. """ raise NotImplementedError("abstract method") @@ -824,7 +928,6 @@ def merge( allowed. .. warning:: - If both key columns contain rows where the key is a null value, those rows will be matched against each other. This is different from usual SQL join behaviour and can lead to unexpected results. @@ -835,15 +938,14 @@ def merge( how: ``{'left', 'right', 'outer', 'inner'}, default 'inner'`` Type of merge to be performed. - - * left: use only keys from left frame, similar to a SQL left outer join; - preserve key order. - * right: use only keys from right frame, similar to a SQL right outer join; - preserve key order. - * outer: use union of keys from both frames, similar to a SQL full outer - join; sort keys lexicographically. - * inner: use intersection of keys from both frames, similar to a SQL inner - join; preserve the order of the left keys. + ``left``: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + ``right``: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + ``outer``: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + ``inner``: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. on: Column join on. It must be found in both DataFrames. Either on or left_on + right_on @@ -867,7 +969,7 @@ def merge( no suffix. At least one of the values must not be None. Returns: - A DataFrame of the two merged objects. + DataFrame: A DataFrame of the two merged objects. """ raise NotImplementedError("abstract method") @@ -883,7 +985,7 @@ def any(self, *, bool_only: bool = False): non-empty). Args: - bool_only: + bool_only (bool. default False): Include only boolean columns. Returns: @@ -895,12 +997,12 @@ def all(self, *, bool_only: bool = False): """ Return whether all elements are True, potentially over an axis. - Returns True unless there at least one element within a series or - along a Dataframe axis that is False or equivalent (e.g. zero or + Returns True unless there at least one element within a Series or + along a DataFrame axis that is False or equivalent (e.g. zero or empty). Args: - bool_only: + bool_only (bool. default False): Include only boolean columns. Returns: @@ -913,7 +1015,7 @@ def prod(self, *, numeric_only: bool = False): Return the product of the values over the requested axis. Args: - numeric_only: + numeric_only (bool. default False): Include only float, int, boolean columns. Returns: @@ -928,7 +1030,7 @@ def min(self, *, numeric_only: bool = False): equivalent of the ``numpy.ndarray`` method ``argmin``. Args: - numeric_only: + numeric_only (bool, default False): Default False. Include only float, int, boolean columns. Returns: @@ -943,7 +1045,7 @@ def max(self, *, numeric_only: bool = False): the equivalent of the ``numpy.ndarray`` method ``argmax``. Args: - numeric_only: + numeric_only (bool. default False): Default False. Include only float, int, boolean columns. Returns: @@ -957,7 +1059,7 @@ def sum(self, *, numeric_only: bool = False): This is equivalent to the method ``numpy.sum``. Args: - numeric_only: + numeric_only (bool. default False): Default False. 
Include only float, int, boolean columns. Returns: @@ -969,7 +1071,7 @@ def mean(self, *, numeric_only: bool = False): """Return the mean of the values over the requested axis. Args: - numeric_only: + numeric_only (bool. default False): Default False. Include only float, int, boolean columns. Returns: @@ -977,13 +1079,28 @@ def mean(self, *, numeric_only: bool = False): """ raise NotImplementedError("abstract method") + def median(self, *, numeric_only: bool = False, exact: bool = False): + """Return the median of the values over the requested axis. + + Args: + numeric_only (bool. default False): + Default False. Include only float, int, boolean columns. + exact (bool. default False): + Default False. Get the exact median instead of an approximate + one. Note: ``exact=True`` not yet supported. + + Returns: + Series + """ + raise NotImplementedError("abstract method") + def var(self, *, numeric_only: bool = False): """Return unbiased variance over requested axis. Normalized by N-1 by default. Args: - numeric_only: + numeric_only (bool. default False): Default False. Include only float, int, boolean columns. Returns: @@ -997,7 +1114,7 @@ def std(self, *, numeric_only: bool = False): Normalized by N-1 by default. Args: - numeric_only: + numeric_only (bool. default False): Default False. Include only float, int, boolean columns. Returns: @@ -1013,7 +1130,7 @@ def count(self, *, numeric_only: bool = False): on `pandas.options.mode.use_inf_as_na`) are considered NA. Args: - numeric_only : bool, default False + numeric_only (bool, default False): Include only `float`, `int` or `boolean` data. Returns: @@ -1039,7 +1156,7 @@ def cummin(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative minimum. Returns: - Return cumulative minimum of DataFrame. + DataFrame: Return cumulative minimum of DataFrame. """ raise NotImplementedError("abstract method") @@ -1049,7 +1166,7 @@ def cummax(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative maximum. Returns: - Return cumulative maximum of DataFrame. + DataFrame: Return cumulative maximum of DataFrame. """ raise NotImplementedError("abstract method") @@ -1059,7 +1176,7 @@ def cumsum(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative sum. Returns: - Return cumulative sum of DataFrame. + DataFrame: Return cumulative sum of DataFrame. """ raise NotImplementedError("abstract method") @@ -1069,7 +1186,49 @@ def cumprod(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative product. Returns: - Return cumulative product of DataFrame. + DataFrame: Return cumulative product of DataFrame. + """ + raise NotImplementedError("abstract method") + + def agg(self, func): + """ + Aggregate using one or more operations over the specified axis. + + Args: + func (function): + Function to use for aggregating the data. + Accepted combinations are: string function name, list of + function names, e.g. ``['sum', 'mean']``. + + Returns: + Series or DataFrame: Aggregated results + """ + raise NotImplementedError("abstract method") + + def describe(self): + """ + Generate descriptive statistics. + + Descriptive statistics include those that summarize the central + tendency, dispersion and shape of a + dataset's distribution, excluding ``NaN`` values. + + Only supports numeric columns. + + .. note:: + Percentile values are approximates only. + + Returns: + Summary statistics of the Series or Dataframe provided. 
+ + + Notes + ----- + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default, the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. """ raise NotImplementedError("abstract method") @@ -1107,18 +1266,18 @@ def value_counts( Return a Series containing counts of unique rows in the DataFrame. Args: - subset : label or list of labels, optional + subset (label or list of labels, optional): Columns to use when counting unique combinations. - normalize : bool, default False + normalize (bool, default False): Return proportions rather than frequencies. - sort : bool, default True + sort (bool, default True): Sort by frequencies. - ascending : bool, default False + ascending (bool, default False): Sort in ascending order. - dropna : bool, default True + dropna (bool, default True): Don’t include counts of rows that contain NA values. Returns: - Series + Series: Series containing counts of unique rows in the DataFrame. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index b6ee19b46a..4843c971da 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -19,7 +19,8 @@ class NDFrame(indexing.IndexingMixin): def ndim(self) -> int: """Return an int representing the number of axes / array dimensions. - Return 1 if Series. Otherwise return 2 if DataFrame. + Returns: + int: Return 1 if Series. Otherwise return 2 if DataFrame. """ raise NotImplementedError("abstract method") @@ -27,8 +28,9 @@ def ndim(self) -> int: def size(self) -> int: """Return an int representing the number of elements in this object. - Return the number of rows if Series. Otherwise return the number of - rows times number of columns if DataFrame. + Returns: + int: Return the number of rows if Series. Otherwise return the number of + rows times number of columns if DataFrame. """ raise NotImplementedError("abstract method") @@ -42,6 +44,7 @@ def abs(self): Returns: Series/DataFrame containing the absolute value of each element. + Returns a Series/DataFrame containing the absolute value of each element. """ raise NotImplementedError("abstract method") @@ -49,18 +52,18 @@ def astype(self, dtype): """ Cast a pandas object to a specified dtype ``dtype``. - Parameters - ---------- - dtype : str, data type, Series or Mapping of column name -> data type - Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to - cast entire pandas object to the same type. Alternatively, use a - mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is - a numpy.dtype or Python type to cast one or more of the DataFrame's - columns to column-specific types. + Args: + dtype (str or pandas.ExtensionDtype): + A dtype supported by BigQuery DataFrames. Supported dtypes include 'boolean', 'Float64', 'Int64', + 'string', 'string[pyarrow]', 'timestamp[us, tz=UTC][pyarrow]', + 'timestamp[us][pyarrow]', 'date32[day][pyarrow]', and 'time64[us][pyarrow]'. + A pandas.ExtensionDtype is also accepted, including pandas.BooleanDtype(), pandas.Float64Dtype(), + pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), + pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), + pd.ArrowDtype(pa.timestamp("us")), and pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
- Returns - ------- - same type as caller + Returns: + same type as caller """ raise NotImplementedError("abstract method") @@ -70,17 +73,17 @@ def astype(self, dtype): @property def empty(self) -> bool: - """Indicator whether Series/DataFrame is empty. + """Indicates whether Series/DataFrame is empty. True if Series/DataFrame is entirely empty (no items), meaning any of the axes are of length 0. - Returns: - If Series/DataFrame is empty, return True, if not return False. - - Note: + .. note:: If Series/DataFrame contains only NA values, it is still not considered empty. + + Returns: + bool: If Series/DataFrame is empty, return True, if not return False. """ raise NotImplementedError("abstract method") @@ -97,28 +100,28 @@ def to_json( index: bool = True, lines: bool = False, ) -> str | None: - """Convert the object to a JSON string, written to GCS. + """Convert the object to a JSON string, written to Cloud Storage. Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. + .. note:: + Only ``orient='records'`` and ``lines=True`` is supported so far. + Args: - path_or_buf: - A destination URI of GCS files(s) to store the extracted dataframe - in format of ``gs:///``. + path_or_buf (str): + A destination URI of Cloud Storage files(s) to store the extracted + dataframe in format of ``gs:///``. + Must contain a wildcard `*` character. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. None, file-like objects or local file paths not yet supported. - orient: + orient ({`split`, `records`, `index`, `columns`, `values`, `table`}, default 'columns): Indication of expected JSON string format. - .. note:: - - In BigQuery DataFrame, only `orient='records'` is supported so far. - * Series: - default is 'index' @@ -141,30 +144,25 @@ def to_json( - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} Describing the data, where data component is like ``orient='records'``. + index (bool, default True): + If True, write row names (index). - lines: + lines (bool, default False): If 'orient' is 'records' write out line-delimited json format. Will throw ValueError if incorrect 'orient' since others are not list-like. - .. note:: - - BigQuery DataFrames only supports ``lines=True`` so far. - - index: - If True, write row names (index). - Returns: - None. String output not yet supported. + None: String output not yet supported. """ raise NotImplementedError("abstract method") def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: - """Write object to a comma-separated values (csv) file on GCS. + """Write object to a comma-separated values (csv) file on Cloud Storage. Args: - path_or_buf: - A destination URI of GCS files(s) to store the extracted dataframe + path_or_buf (str): + A destination URI of Cloud Storage files(s) to store the extracted dataframe in format of ``gs:///``. If the data size is more than 1GB, you must use a wildcard to @@ -173,11 +171,11 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: None, file-like objects or local file paths not yet supported. - index: + index (bool, default True): If True, write row names (index). Returns: - None. String output not yet supported. + None: String output not yet supported. """ raise NotImplementedError("abstract method") @@ -208,11 +206,11 @@ def add_prefix(self, prefix: str, axis: int | str | None = None): For DataFrame, the column labels are prefixed. 
Args: - prefix: + prefix (str): The string to add before each label. - axis: + axis (int or str or None, default None): ``{{0 or 'index', 1 or 'columns', None}}``, default None. Axis - to add prefix on + to add prefix on. Returns: New Series or DataFrame with updated labels. @@ -250,7 +248,7 @@ def head(self, n: int = 5): If n is larger than the number of rows, this function returns all rows. Args: - n: + n (int, default 5): Default 5. Number of rows to select. Returns: @@ -271,7 +269,8 @@ def tail(self, n: int = 5): If n is larger than the number of rows, this function returns all rows. Args: - n: int, default 5. Number of rows to select. + n (int, default 5): + Number of rows to select. Returns: The last `n` rows of the caller object. @@ -290,12 +289,12 @@ def sample( You can use `random_state` for reproducibility. Args: - n: + n (Optional[int], default None): Number of items from axis to return. Cannot be used with `frac`. Default = 1 if `frac` = None. - frac: + frac (Optional[float], default None): Fraction of axis items to return. Cannot be used with `n`. - random_state: + random_state (Optional[int], default None): Seed for random number generator. Returns: @@ -360,7 +359,7 @@ def notna(self) -> NDFrame: NA values get mapped to False values. Returns: - Mask of bool values for each element that indicates whether an + NDFrame: Mask of bool values for each element that indicates whether an element is not an NA value. """ raise NotImplementedError("abstract method") @@ -376,11 +375,11 @@ def shift( Shifts the index without realigning the data. Args: - periods: + periods int: Number of periods to shift. Can be positive or negative. Returns: - Copy of input object, shifted. + NDFrame: Copy of input object, shifted. """ raise NotImplementedError("abstract method") @@ -398,34 +397,27 @@ def rank( By default, equal values are assigned a rank that is the average of the ranks of those values. - Parameters - ---------- - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value (i.e. ties): - - * average: average rank of the group - * min: lowest rank in the group - * max: highest rank in the group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups. - - numeric_only : bool, default False - For DataFrame objects, rank only numeric columns if set to True. + Args: + method ({'average', 'min', 'max', 'first', 'dense'}, default 'average'): + How to rank the group of records that have the same value (i.e. ties): + `average`: average rank of the group, `min`: lowest rank in the group + max`: highest rank in the group, `first`: ranks assigned in order they + appear in the array, `dense`: like 'min', but rank always increases by + 1 between groups. - na_option : {'keep', 'top', 'bottom'}, default 'keep' - How to rank NaN values: + numeric_only (bool, default False): + For DataFrame objects, rank only numeric columns if set to True. - * keep: assign NaN rank to NaN values - * top: assign lowest rank to NaN values - * bottom: assign highest rank to NaN values + na_option ({'keep', 'top', 'bottom'}, default 'keep'): + How to rank NaN values: `keep`: assign NaN rank to NaN values, + , `top`: assign lowest rank to NaN values, `bottom`: assign highest + rank to NaN values. - ascending : bool, default True - Whether or not the elements should be ranked in ascending order. 
+ ascending (bool, default True): + Whether or not the elements should be ranked in ascending order. - Returns - ------- - same type as caller - Return a Series or DataFrame with data ranks as values. + Returns: + same type as caller: Return a Series or DataFrame with data ranks as values. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 5b5a9f206b..6975e6edf2 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -17,26 +17,23 @@ class GroupBy: def any(self): """ - Return True if any value in the group is truthful, else False. + Return True if any value in the group is true, else False. - - Returns - ------- - Series or DataFrame - DataFrame or Series of boolean values, where a value is True if any element - is True within its respective group, False otherwise. + Returns: + Series or DataFrame: DataFrame or Series of boolean values, + where a value is True if any element is True within its + respective group, False otherwise. """ raise NotImplementedError("abstract property") def all(self): """ - Return True if all values in the group are truthful, else False. + Return True if all values in the group are true, else False. - Returns - ------- - Series or DataFrame - DataFrame or Series of boolean values, where a value is True if all elements - are True within its respective group, False otherwise. + Returns: + Series or DataFrame: DataFrame or Series of boolean values, + where a value is True if all elements are True within its + respective group, False otherwise. """ raise NotImplementedError("abstract property") @@ -44,10 +41,8 @@ def count(self): """ Compute count of group, excluding missing values. - Returns - ------- - Series or DataFrame - Count of values within each group. + Returns: + Series or DataFrame: Count of values within each group. """ raise NotImplementedError("abstract property") @@ -58,14 +53,33 @@ def mean( """ Compute mean of groups, excluding missing values. - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. - Returns - ------- - pandas.Series or pandas.DataFrame + Returns: + pandas.Series or pandas.DataFrame: Mean of groups. + """ + raise NotImplementedError("abstract property") + + def median( + self, + numeric_only: bool = False, + *, + exact: bool = False, + ): + """ + Compute median of groups, excluding missing values. + + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + exact (bool, default False): + Calculate the exact median instead of an approximation. Note: + ``exact=True`` not yet supported. + + Returns: + pandas.Series or pandas.DataFrame: Median of groups. """ raise NotImplementedError("abstract property") @@ -79,15 +93,12 @@ def std( For multiple groupings, the result index will be a MultiIndex. - Parameters - ---------- - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. + Args: + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. - Returns - ------- - Series or DataFrame - Standard deviation of values within each group. + Returns: + Series or DataFrame: Standard deviation of values within each group. 
""" raise NotImplementedError("abstract property") @@ -101,15 +112,13 @@ def var( For multiple groupings, the result index will be a MultiIndex. - Parameters - ---------- - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. + Args: + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. - Returns - ------- - Series or DataFrame - Variance of values within each group. + Returns: + Series or DataFrame + Variance of values within each group. """ raise NotImplementedError("abstract property") @@ -121,17 +130,15 @@ def sum( """ Compute sum of group values. - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - Returns - ------- - Series or DataFrame - Computed sum of values within each group. + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + min_count (int, default 0): + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns: + Series or DataFrame: Computed sum of values within each group. """ raise NotImplementedError("abstract property") @@ -139,17 +146,15 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): """ Compute prod of group values. - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - Returns - ------- - Series or DataFrame - Computed prod of values within each group. + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + min_count (int, default 0): + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns: + Series or DataFrame: Computed prod of values within each group. """ raise NotImplementedError("abstract property") @@ -161,17 +166,15 @@ def min( """ Compute min of group values. - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - Returns - ------- - Series or DataFrame - Computed min of values within each group. + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + min_count (int, default 0): + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns: + Series or DataFrame: Computed min of values within each group. """ raise NotImplementedError("abstract property") @@ -183,17 +186,15 @@ def max( """ Compute max of group values. - Parameters - ---------- - numeric_only : bool, default False - Include only float, int, boolean columns. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - Returns - ------- - Series or DataFrame - Computed max of values within each group. 
+ Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + min_count (int, default 0): + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + Returns: + Series or DataFrame: Computed max of values within each group. """ raise NotImplementedError("abstract property") @@ -201,15 +202,12 @@ def cumcount(self, ascending: bool = True): """ Number each item in each group from 0 to the length of that group - 1. - Parameters - ---------- - ascending : bool, default True - If False, number in reverse, from length of group - 1 to 0. + Args: + ascending (bool, default True): + If False, number in reverse, from length of group - 1 to 0. - Returns - ------- - Series - Sequence number of each element within each group. + Returns: + Series: Sequence number of each element within each group. """ raise NotImplementedError("abstract property") @@ -217,9 +215,8 @@ def cumprod(self, *args, **kwargs): """ Cumulative product for each group. - Returns - ------- - Series or DataFrame + Returns: + Series or DataFrame: Cumulative product for each group. """ raise NotImplementedError("abstract property") @@ -227,9 +224,8 @@ def cumsum(self, *args, **kwargs): """ Cumulative sum for each group. - Returns - ------- - Series or DataFrame + Returns: + Series or DataFrame: Cumulative sum for each group. """ raise NotImplementedError("abstract property") @@ -237,9 +233,8 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): """ Cumulative min for each group. - Returns - ------- - Series or DataFrame + Returns: + Series or DataFrame: Cumulative min for each group. """ raise NotImplementedError("abstract property") @@ -247,9 +242,8 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): """ Cumulative max for each group. - Returns - ------- - Series or DataFrame + Returns: + Series or DataFrame: Cumulative max for each group. """ raise NotImplementedError("abstract property") @@ -259,35 +253,30 @@ def diff(self): Calculates the difference of each element compared with another element in the group (default is element in previous row). - Returns - ------- - Series or DataFrame - First differences. + Returns: + Series or DataFrame: First differences. """ raise NotImplementedError("abstract property") def shift(self, periods: int = 1): """ Shift each group by periods observations. - If freq is passed, the index will be increased using the periods and the freq. - Parameters - ---------- - periods : int, default 1 - Number of periods to shift. - Returns - ------- - Series or DataFrame - Object shifted within each group. + Args: + periods (int, default 1): + Number of periods to shift. + + Returns: + Series or DataFrame: Object shifted within each group. """ raise NotImplementedError("abstract property") def rolling(self, *args, **kwargs): """ - Return a rolling grouper, providing rolling functionality per group. + Returns a rolling grouper, providing rolling functionality per group. Args: - min_periods : int, default None + min_periods (int, default None): Minimum number of observations in window required to have a value; otherwise, result is ``np.nan``. @@ -298,14 +287,16 @@ def rolling(self, *args, **kwargs): to the size of the window. Returns: - Return a new grouper with our rolling appended. + Series or DataFrame: Return a new grouper with our rolling appended. 
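For orientation, a minimal sketch of the grouped aggregations documented above (an assumption-laden example, not part of the patch: it presumes ``bigframes.pandas`` with a configured BigQuery session, and that these GroupBy methods are available in this release):

.. code-block::

    import bigframes.pandas as pd

    pd.options.display.progress_bar = None

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    grouped = df.groupby("key")

    grouped.sum()      # per-group totals
    grouped.mean()     # per-group means
    grouped.cumsum()   # running totals within each group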
""" raise NotImplementedError("abstract property") def expanding(self, *args, **kwargs): """ - Return an expanding grouper, providing expanding - functionality per group. + Provides expanding functionality. + + Returns: + Series or DataFrame: A expanding grouper, providing expanding functionality per group. """ raise NotImplementedError("abstract property") diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 5d431a8a94..d59886e8aa 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -1,6 +1,6 @@ class DatetimeProperties: """ - Accessor object for datetimelike properties of the Series values. + Accessor object for datetime-like properties of the Series values. """ @property @@ -19,20 +19,19 @@ def dayofweek(self): values (using the `dt` accessor) or DatetimeIndex. Returns: - Series or Index - Containing integers indicating the day number. + Series or Index: Containing integers indicating the day number. """ raise NotImplementedError("abstract method") @property def date(self): - """Returns numpy array of python :class:`datetime.date` objects. + """Returns numpy array of Python :class:`datetime.date` objects. Namely, the date part of Timestamps without time and timezone information. - warning: + .. warning:: This method returns a Series whereas pandas returns a numpy array. """ @@ -69,7 +68,7 @@ def time(self): The time part of the Timestamps. - warning: + .. warning:: This method returns a Series whereas pandas returns a numpy array. """ @@ -80,7 +79,7 @@ def time(self): def quarter(self): """The quarter of the date. - warning: + .. warning:: This method returns a Series whereas pandas returns a numpy array. """ diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index bdd05a0100..ebad5eb918 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -9,7 +9,7 @@ class Index: @property def name(self): - """Return Index name.""" + """Returns Index name.""" raise NotImplementedError("abstract method") @property @@ -18,3 +18,19 @@ def shape(self): Return a tuple of the shape of the underlying data. """ raise NotImplementedError("abstract method") + + def to_numpy(self, dtype): + """ + A NumPy ndarray representing the values in this Series or Index. + + Args: + dtype: + The dtype to pass to :meth:`numpy.asarray`. + **kwargs: + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + Returns: + numpy.ndarray + """ + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/indexing.py b/third_party/bigframes_vendored/pandas/core/indexing.py index 87fb34b32a..d5b9f3c079 100644 --- a/third_party/bigframes_vendored/pandas/core/indexing.py +++ b/third_party/bigframes_vendored/pandas/core/indexing.py @@ -21,7 +21,7 @@ def iloc(self): - A slice object with ints, e.g. ``1:7``. - **Not supported yet** A boolean array. - **Not supported yet** A ``callable`` function with one argument (the - calling Series or DataFrame) and that returns valid output for + calling Series or DataFrame) that returns valid output for indexing (one of the above). 
This is useful in method chains, when you don't have a reference to the calling object, but would like to base your selection on some value. @@ -43,30 +43,24 @@ def loc(self): Allowed inputs are: - - **Not supported yet** A single label, e.g. ``5`` or ``'a'``, (note + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index, and **never** as an integer position along the index). - - **Not supported yet** A list or array of labels, e.g. ``['a', 'b', 'c']``. - - **Not supported yet** A slice object with labels, e.g. ``'a':'f'``. - - .. warning:: Note that contrary to usual python slices, **both** the - start and the stop are included - - - **Not supported yet** A boolean array of the same length as the axis being sliced, + - A list of labels, e.g. ``['a', 'b', 'c']``. + - A boolean series of the same length as the axis being sliced, e.g. ``[True, False, True]``. - - An alignable boolean Series. The index of the key will be aligned before - masking. - - **Not supported yet** An alignable Index. The Index of the returned + - An alignable Index. The index of the returned selection will be the input. + - **Not supported yet** An alignable boolean Series. The index of the key will be aligned before + masking. + - **Not supported yet** A slice object with labels, e.g. ``'a':'f'``. + Note: contrary to usual python slices, **both** the start and the stop are included. - **Not supported yet** A ``callable`` function with one argument (the - calling Series or DataFrame) - and that returns valid output for indexing (one of the above) + calling Series or DataFrame) that returns valid output for indexing + (one of the above). Raises: - KeyError: If any items are not found. - IndexingError: - If an indexed key is passed and its index is unalignable to the - frame index. + NotImplementError: if the inputs are not supported. """ raise NotImplementedError("abstract methdod") diff --git a/third_party/bigframes_vendored/pandas/core/reshape/__init__.py b/third_party/bigframes_vendored/pandas/core/reshape/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index 98b43c7dfb..6a5a9fdde9 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -8,6 +8,7 @@ def concat( objs, *, + axis=0, join: str = "outer", ignore_index: bool = False, ): @@ -25,6 +26,8 @@ def concat( objs: Objects to concatenate. Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. + axis : {0/'index', 1/'columns'}, default 0 + The axis to concatenate along. join: {'inner', 'outer'}, default 'outer' How to handle indexes on other axis (or axes). ignore_index : bool, default False diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py new file mode 100644 index 0000000000..9381ad4552 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -0,0 +1,65 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/tile.py +""" +Quantilization functions and related stuff +""" +from __future__ import annotations + + +def cut( + x, + bins, + *, + labels=None, +): + """ + Bin values into discrete intervals. + + Use `cut` when you need to segment and sort data values into bins. 
This + function is also useful for going from a continuous variable to a + categorical variable. For example, `cut` could convert ages to groups of + age ranges. Supports binning into an equal number of bins, or a + pre-specified array of bins. + + ``labels=False`` implies you just want the bins back. + + Examples: + + .. code-block:: + + import bigframes.pandas as pd + + pd.options.display.progress_bar = None + s = pd.Series([0, 1, 1, 2]) + pd.cut(s, bins=4, labels=False) + + 0 0 + 1 1 + 2 1 + 3 3 + dtype: Int64 + + Args: + x (Series): + The input Series to be binned. Must be 1-dimensional. + bins (int): + The criteria to bin by. + + int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + labels (None): + Specifies the labels for the returned bins. Must be the same length as + the resulting bins. If False, returns only integer indicators of the + bins. This affects the type of the output container (see below). + If True, raises an error. When `ordered=False`, labels must be + provided. + + Returns: + Series: A Series representing the respective bin for each value + of `x`. The type depends on the value of `labels`. + sequence of scalars : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are whatever the type in the sequence is. + False : returns an ndarray of integers. + """ + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a254fcf842..215f7ec4e0 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -21,7 +21,7 @@ class Series(NDFrame): # type: ignore[misc] @property def dt(self): """ - Accessor object for datetimelike properties of the Series values. + Accessor object for datetime-like properties of the Series values. """ raise NotImplementedError("abstract property") @@ -58,10 +58,9 @@ def name(self) -> Hashable: to form a DataFrame. It is also used whenever displaying the Series using the interpreter. - Returns - ------- - label (hashable object) - The name of the Series, also the column name if part of a DataFrame. + Returns: + hashable object: The name of the Series, also the column name + if part of a DataFrame. """ raise NotImplementedError("abstract property") @@ -78,24 +77,21 @@ def reset_index( when the index is meaningless and needs to be reset to the default before another operation. - Parameters - ---------- - drop : bool, default False - Just reset the index, without inserting it as a column in - the new DataFrame. - name : object, optional - The name to use for the column containing the original Series - values. Uses ``self.name`` by default. This argument is ignored - when `drop` is True. + Args: + drop (bool, default False): + Just reset the index, without inserting it as a column in + the new DataFrame. + name (object, optional): + The name to use for the column containing the original Series + values. Uses ``self.name`` by default. This argument is ignored + when `drop` is True. - Returns - ------- - Series or DataFrame or None - When `drop` is False (the default), a DataFrame is returned. - The newly created columns will come first in the DataFrame, - followed by the original Series values. - When `drop` is True, a `Series` is returned. - In either case, if ``inplace=True``, no value is returned. 
+ Returns: + Series or DataFrame or None; When `drop` is False (the default), + a DataFrame is returned. The newly created columns will come first + in the DataFrame, followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. """ raise NotImplementedError("abstract method") @@ -125,36 +121,34 @@ def to_string( """ Render a string representation of the Series. - Parameters - ---------- - buf : StringIO-like, optional - Buffer to write to. - na_rep : str, optional - String representation of NaN to use, default 'NaN'. - float_format : one-parameter function, optional - Formatter function to apply to columns' elements if they are - floats, default None. - header : bool, default True - Add the Series header (index name). - index : bool, optional - Add index (row) labels, default True. - length : bool, default False - Add the Series length. - dtype : bool, default False - Add the Series dtype. - name : bool, default False - Add the Series name if not None. - max_rows : int, optional - Maximum number of rows to show before truncating. If None, show - all. - min_rows : int, optional - The number of rows to display in a truncated repr (when number - of rows is above `max_rows`). + Args: + buf (StringIO-like, optional): + Buffer to write to. + na_rep (str, optional): + String representation of NaN to use, default 'NaN'. + float_format (one-parameter function, optional): + Formatter function to apply to columns' elements if they are + floats, default None. + header (bool, default True): + Add the Series header (index name). + index (bool, optional): + Add index (row) labels, default True. + length (bool, default False): + Add the Series length. + dtype (bool, default False): + Add the Series dtype. + name (bool, default False): + Add the Series name if not None. + max_rows (int, optional): + Maximum number of rows to show before truncating. If None, show + all. + min_rows (int, optional): + The number of rows to display in a truncated repr (when number + of rows is above `max_rows`). - Returns - ------- - str or None - String representation of Series if ``buf=None``, otherwise None. + Returns: + str or None: String representation of Series if ``buf=None``, + otherwise None. """ formatter = fmt.SeriesFormatter( self, @@ -183,18 +177,16 @@ def to_markdown( """ Print {klass} in Markdown-friendly format. - Parameters - ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. - mode : str, optional - Mode in which file is opened, "wt" by default. - index : bool, optional, default True - Add index (row) labels. - Returns - ------- - str - {klass} in Markdown-friendly format. + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. + mode (str, optional): + Mode in which file is opened, "wt" by default. + index (bool, optional, default True): + Add index (row) labels. + + Returns: + str: {klass} in Markdown-friendly format. """ raise NotImplementedError("abstract method") @@ -202,18 +194,15 @@ def to_dict(self, into: type[dict] = dict) -> Mapping: """ Convert Series to {label -> value} dict or dict-like object. - Parameters - ---------- - into : class, default dict - The collections.abc.Mapping subclass to use as the return - object. Can be the actual class or an empty - instance of the mapping type you want. 
If you want a - collections.defaultdict, you must pass it initialized. + Args: + into (class, default dict): + The collections.abc.Mapping subclass to use as the return + object. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. - Returns - ------- - collections.abc.Mapping - Key-value representation of Series. + Returns: + collections.abc.Mapping: Key-value representation of Series. """ raise NotImplementedError("abstract method") @@ -221,10 +210,8 @@ def to_frame(self) -> DataFrame: """ Convert Series to DataFrame. - Returns - ------- - DataFrame - DataFrame representation of Series. + Returns: + DataFrame: DataFrame representation of Series. """ raise NotImplementedError("abstract method") @@ -242,12 +229,11 @@ def to_excel(self, excel_writer, sheet_name): Note that creating an `ExcelWriter` object with a file name that already exists will result in the contents of the existing file being erased. - Parameters - ---------- - excel_writer : path-like, file-like, or ExcelWriter object - File path or existing ExcelWriter. - sheet_name : str, default 'Sheet1' - Name of sheet which will contain DataFrame. + Args: + excel_writer (path-like, file-like, or ExcelWriter object): + File path or existing ExcelWriter. + sheet_name (str, default 'Sheet1'): + Name of sheet to contain Series. """ raise NotImplementedError("abstract method") @@ -255,23 +241,20 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): """ Render object to a LaTeX tabular, longtable, or nested table. - Parameters - ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. - columns : list of label, optional - The subset of columns to write. Writes all columns by default. - header : bool or list of str, default True - Write out the column names. If a list of strings is given, - it is assumed to be aliases for the column names. - index : bool, default True - Write row names (index). - + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. + columns (list of label, optional): + The subset of columns to write. Writes all columns by default. + header (bool or list of str, default True): + Write out the column names. If a list of strings is given, + it is assumed to be aliases for the column names. + index (bool, default True): + Write row names (index). - Returns - ------- - str or None - If buf is None, returns the result as a string. Otherwise returns None. + Returns: + str or None: If buf is None, returns the result as a string. + Otherwise returns None. """ raise NotImplementedError("abstract method") @@ -281,11 +264,10 @@ def tolist(self) -> list: These are each a scalar type, which is a Python scalar (for str, int, float) or a pandas scalar - (for Timestamp/Timedelta/Interval/Period) + (for Timestamp/Timedelta/Interval/Period). - Returns - ------- - list + Returns: + list: list of the values """ raise NotImplementedError("abstract method") @@ -295,25 +277,24 @@ def to_numpy(self, dtype, copy=False, na_value=None): """ A NumPy ndarray representing the values in this Series or Index. - Parameters - ---------- - dtype : str or numpy.dtype, optional - The dtype to pass to :meth:`numpy.asarray`. - copy : bool, default False - Whether to ensure that the returned value is not a view on - another array. 
Note that ``copy=False`` does not *ensure* that - ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that - a copy is made, even if not strictly necessary. - na_value : Any, optional - The value to use for missing values. The default value depends - on `dtype` and the type of the array. - **kwargs - Additional keywords passed through to the ``to_numpy`` method - of the underlying array (for extension arrays). + Args: + dtype (str or numpy.dtype, optional): + The dtype to pass to :meth:`numpy.asarray`. + copy (bool, default False): + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value (Any, optional): + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + ``**kwargs``: + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). - Returns - ------- - numpy.ndarray + Returns: + numpy.ndarray: A NumPy ndarray representing the values in this + Series or Index. """ raise NotImplementedError("abstract method") @@ -321,12 +302,11 @@ def to_pickle(self, path, **kwargs): """ Pickle (serialize) object to file. - Parameters - ---------- - path : str, path object, or file-like object - String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a binary ``write()`` function. File path where - the pickled object will be stored. + Args: + path (str, path object, or file-like object): + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. File path where + the pickled object will be stored. """ raise NotImplementedError("abstract method") @@ -334,11 +314,10 @@ def to_xarray(self): """ Return an xarray object from the pandas object. - Returns - ------- - xarray.DataArray or xarray.Dataset - Data in the pandas structure converted to Dataset if the object is - a DataFrame, or a DataArray if the object is a Series. + Returns: + xarray.DataArray or xarray.Dataset: Data in the pandas structure + converted to Dataset if the object is a DataFrame, or a DataArray if + the object is a Series. """ raise NotImplementedError("abstract method") @@ -356,43 +335,24 @@ def to_json( Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. - Parameters - ---------- - path_or_buf : str, path object, file-like object, or None, default None - String, path object (implementing os.PathLike[str]), or file-like - object implementing a write() function. If None, the result is - returned as a string. - orient: - Indication of expected JSON string format. - - * Series: - - - default is 'index' - - allowed values are: {{'split', 'records', 'index', 'table'}}. - - * DataFrame: - - - default is 'columns' - - allowed values are: {{'split', 'records', 'index', 'columns', - 'values', 'table'}}. - - * The format of the JSON string: - - - 'split' : dict like {{'index' -> [index], 'columns' -> [columns], - 'data' -> [values]}} - - 'records' : list like [{{column -> value}}, ... 
, {{column -> value}}] - - 'index' : dict like {{index -> {{column -> value}}}} - - 'columns' : dict like {{column -> {{index -> value}}}} - - 'values' : just the values array - - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} - + Args: + path_or_buf (str, path object, file-like object, or None, default None): + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. + orient ({"split", "records", "index", "columns", "values", "table"}, default "columns"): + Indication of expected JSON string format. + 'split' : dict like {{'index' -> [index], 'columns' -> [columns],'data' -> [values]}} + 'records' : list like [{{column -> value}}, ... , {{column -> value}}] + 'index' : dict like {{index -> {{column -> value}}}} + 'columns' : dict like {{column -> {{index -> value}}}} + 'values' : just the values array + 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} Describing the data, where data component is like ``orient='records'``. - Returns - ------- - None or str - If path_or_buf is None, returns the resulting json format as a - string. Otherwise returns None. + Returns: + None or str: If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. """ raise NotImplementedError("abstract method") @@ -400,20 +360,32 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: """ Write object to a comma-separated values (csv) file. - Parameters - ---------- - path_or_buf : str, path object, file-like object, or None, default None - String, path object (implementing os.PathLike[str]), or file-like - object implementing a write() function. If None, the result is - returned as a string. If a non-binary file object is passed, it should - be opened with `newline=''`, disabling universal newlines. If a binary - file object is passed, `mode` might need to contain a `'b'`. + Args: + path_or_buf (str, path object, file-like object, or None, default None): + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. If a non-binary file object is passed, it should + be opened with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` might need to contain a `'b'`. - Returns - ------- - None or str - If path_or_buf is None, returns the resulting csv format as a - string. Otherwise returns None. + Returns: + None or str: If path_or_buf is None, returns the resulting csv format + as a string. Otherwise returns None. + """ + raise NotImplementedError("abstract method") + + def agg(self, func): + """ + Aggregate using one or more operations over the specified axis. + + Args: + func (function): + Function to use for aggregating the data. + Accepted combinations are: string function name, list of + function names, e.g. ``['sum', 'mean']``. + + Returns: + scalar or Series: Aggregated results """ raise NotImplementedError("abstract method") @@ -421,10 +393,9 @@ def count(self): """ Return number of non-NA/null observations in the Series. - Returns - ------- - int or Series (if level specified) - Number of non-null values in the Series. + Returns: + int or Series (if level specified): Number of non-null values in the + Series. """ raise NotImplementedError("abstract method") @@ -434,9 +405,8 @@ def nunique(self) -> int: Excludes NA values by default. 
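A short, hedged sketch of the Series aggregation helpers documented above (``count``, ``nunique``, and ``agg`` with a list of function names); it assumes a configured bigframes session and is illustrative only:

.. code-block::

    import bigframes.pandas as pd

    pd.options.display.progress_bar = None

    s = pd.Series([1, 2, 2, None])
    s.count()               # 3 non-null values
    s.nunique()             # 2 distinct non-null values
    s.agg(["sum", "mean"])  # aggregate with a list of function names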
- Returns - ------- - int + Returns: + int: number of unique elements in the object. """ raise NotImplementedError("abstract method") @@ -448,10 +418,8 @@ def mode(self) -> Series: Always returns Series even if only one value is returned. - Returns - ------- - Series - Modes of the Series in sorted order. + Returns: + Series: Modes of the Series in sorted order. """ raise NotImplementedError("abstract method") @@ -464,15 +432,15 @@ def drop_duplicates( Return Series with duplicate values removed. Args: - keep : {'first', 'last', ``False``}, default 'first' + keep ({'first', 'last', ``False``}, default 'first'): Method to handle dropping duplicates: - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. + 'first' : Drop duplicates except for the first occurrence. + 'last' : Drop duplicates except for the last occurrence. + ``False`` : Drop all duplicates. Returns: - Series with duplicates dropped or None if ``inplace=True``. + Series: Series with duplicates dropped or None if ``inplace=True``. """ raise NotImplementedError("abstract method") @@ -485,17 +453,17 @@ def duplicated(self, keep="first") -> Series: last occurrence of duplicates can be indicated. Args: - keep : {'first', 'last', False}, default 'first' + keep ({'first', 'last', False}, default 'first'): Method to handle dropping duplicates: - - 'first' : Mark duplicates as ``True`` except for the first - occurrence. - - 'last' : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. + 'first' : Mark duplicates as ``True`` except for the first + occurrence. + 'last' : Mark duplicates as ``True`` except for the last + occurrence. + ``False`` : Mark all duplicates as ``True``. Returns: - Series indicating whether each value has occurred in the + Series: Series indicating whether each value has occurred in the preceding values. """ raise NotImplementedError("abstract method") @@ -504,16 +472,13 @@ def round(self, decimals: int = 0) -> Series: """ Round each value in a Series to the given number of decimals. - Parameters - ---------- - decimals : int, default 0 - Number of decimal places to round to. If decimals is negative, - it specifies the number of positions to the left of the decimal point. + Args: + decimals (int, default 0): + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. - Returns - ------- - Series - Rounded values of the Series. + Returns: + Series: Rounded values of the Series. """ raise NotImplementedError("abstract method") @@ -524,10 +489,13 @@ def diff(self) -> Series: Calculates the difference of a {klass} element compared with another element in the {klass} (default is element in previous row). - Returns - ------- - {klass} - First differences of the Series. + Args: + periods (int, default 1): + Periods to shift for calculating difference, accepts negative + values. + + Returns: + {klass}: First differences of the Series. """ raise NotImplementedError("abstract method") @@ -541,23 +509,22 @@ def dot(self, other) -> Series | np.ndarray: It can also be called using `self @ other` in Python >= 3.5. - Parameters - ---------- - other : Series, DataFrame or array-like - The other object to compute the dot product with its columns. + .. note:: + The Series and other has to share the same index if other is a Series + or a DataFrame. 
+ BigQuery Dataframes does not validate this property and will produce + incorrect results if indices are not equal. - Returns - ------- - scalar, Series or numpy.ndarray - Return the dot product of the Series and other if other is a - Series, the Series of the dot product of Series and each rows of - other if other is a DataFrame or a numpy.ndarray between the Series - and each columns of the numpy array. + Args: + other (Series): + The other object to compute the dot product with its columns. + + Returns: + scalar, Series or numpy.ndarray: Return the dot product of the Series + and other if other is a Series, the Series of the dot product of + Series and each rows of other if other is a DataFrame or a + numpy.ndarray between the Series and each columns of the numpy array. - Notes - ----- - The Series and other has to share the same index if other is a Series - or a DataFrame. """ raise NotImplementedError("abstract method") @@ -579,6 +546,7 @@ def sort_values( *, axis: Axis = 0, ascending: bool | int | Sequence[bool] | Sequence[int] = True, + kind: str = "quicksort", na_position: str = "last", ) -> Series | None: """ @@ -587,20 +555,21 @@ def sort_values( Sort a Series in ascending or descending order by some criterion. - Parameters - ---------- - axis : {0 or 'index'} - Unused. Parameter needed for compatibility with DataFrame. - ascending : bool or list of bools, default True - If True, sort values in ascending order, otherwise descending. - na_position : {'first' or 'last'}, default 'last' - Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at - the end. + Args: + axis (0 or 'index'): + Unused. Parameter needed for compatibility with DataFrame. + ascending (bool or list of bools, default True): + If True, sort values in ascending order, otherwise descending. + kind (str, default to 'quicksort'): + Choice of sorting algorithm. Accepts 'quicksort’, ‘mergesort’, + ‘heapsort’, ‘stable’. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder + na_position ({'first' or 'last'}, default 'last'): + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. - Returns - ------- - Series or None - Series ordered by values or None if ``inplace=True``. + Returns: + Series or None: Series ordered by values or None if ``inplace=True``. """ raise NotImplementedError("abstract method") @@ -617,21 +586,19 @@ def sort_index( Returns a new Series sorted by label if `inplace` argument is ``False``, otherwise updates the original series and returns None. - Parameters - ---------- - axis : {0 or 'index'} - Unused. Parameter needed for compatibility with DataFrame. - ascending : bool or list-like of bools, default True - Sort ascending vs. descending. When the index is a MultiIndex the - sort direction can be controlled for each level individually. - na_position : {'first', 'last'}, default 'last' - If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. - Not implemented for MultiIndex. + Args: + axis ({0 or 'index'}): + Unused. Parameter needed for compatibility with DataFrame. + ascending (bool or list-like of bools, default True): + Sort ascending vs. descending. When the index is a MultiIndex the + sort direction can be controlled for each level individually. + na_position ({'first', 'last'}, default 'last'): + If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. + Not implemented for MultiIndex. 
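To make the ``ascending`` and ``na_position`` options above concrete, a minimal sketch (assumes bigframes.pandas and a configured BigQuery session; not part of the patch):

.. code-block::

    import bigframes.pandas as pd

    pd.options.display.progress_bar = None

    s = pd.Series([3, None, 1, 2])
    s.sort_values(ascending=False)      # largest values first, NaNs last by default
    s.sort_values(na_position="first")  # put missing values at the beginning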
- Returns - ------- - Series or None - The original Series sorted by the labels or None if ``inplace=True``. + Returns: + Series or None: The original Series sorted by the labels or None if + ``inplace=True``. """ @@ -643,25 +610,21 @@ def nlargest( """ Return the largest `n` elements. - Parameters - ---------- - n : int, default 5 - Return this many descending sorted values. - keep : {'first', 'last', 'all'}, default 'first' - When there are duplicate values that cannot all fit in a - Series of `n` elements: - - - ``first`` : return the first `n` occurrences in order - of appearance. - - ``last`` : return the last `n` occurrences in reverse - order of appearance. - - ``all`` : keep all occurrences. This can result in a Series of - size larger than `n`. + Args: + n (int, default 5): + Return this many descending sorted values. + keep ({'first', 'last', 'all'}, default 'first'): + When there are duplicate values that cannot all fit in a + Series of `n` elements: + ``first`` : return the first `n` occurrences in order + of appearance. + ``last`` : return the last `n` occurrences in reverse + order of appearance. + ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. - Returns - ------- - Series - The `n` largest values in the Series, sorted in decreasing order. + Returns: + Series: The `n` largest values in the Series, sorted in decreasing order. """ raise NotImplementedError("abstract method") @@ -669,25 +632,22 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: """ Return the smallest `n` elements. - Parameters - ---------- - n : int, default 5 - Return this many ascending sorted values. - keep : {'first', 'last', 'all'}, default 'first' - When there are duplicate values that cannot all fit in a - Series of `n` elements: - - - ``first`` : return the first `n` occurrences in order - of appearance. - - ``last`` : return the last `n` occurrences in reverse - order of appearance. - - ``all`` : keep all occurrences. This can result in a Series of - size larger than `n`. + Args: + n (int, default 5): + Return this many ascending sorted values. + keep ({'first', 'last', 'all'}, default 'first'): + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + ``first`` : return the first `n` occurrences in order + of appearance. + ``last`` : return the last `n` occurrences in reverse + order of appearance. + ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. - Returns - ------- - Series - The `n` smallest values in the Series, sorted in increasing order. + Returns: + Series: The `n` smallest values in the Series, sorted in increasing order. """ raise NotImplementedError("abstract method") @@ -704,14 +664,13 @@ def apply( Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values. - Parameters - ---------- - func : function - Python function or NumPy ufunc to apply. - Returns - ------- - Series or DataFrame - If func returns a Series object the result will be a DataFrame. + Args: + func (function): + Python function or NumPy ufunc to apply. + + Returns: + Series or DataFrame: If func returns a Series object the result + will be a DataFrame. """ raise NotImplementedError("abstract method") @@ -731,49 +690,45 @@ def groupby( used to group large amounts of data and compute operations on these groups. - Parameters - ---------- - by : mapping, function, label, pd.Grouper or list of such - Used to determine the groups for the groupby. 
- If ``by`` is a function, it's called on each value of the object's - index. If a dict or Series is passed, the Series or dict VALUES - will be used to determine the groups (the Series' values are first - aligned; see ``.align()`` method). If a list or ndarray of length - equal to the selected axis is passed (see the `groupby user guide - `_), - the values are used as-is to determine the groups. A label or list - of labels may be passed to group by the columns in ``self``. - Notice that a tuple is interpreted as a (single) key. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). For `Series` this parameter - is unused and defaults to 0. - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. Do not specify both ``by`` and ``level``. - as_index : bool, default True - Return object with group labels as the - index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output. This argument has no effect - on filtrations (see the `filtrations in the user guide - `_), - such as ``head()``, ``tail()``, ``nth()`` and in transformations - (see the `transformations in the user guide - `_). - dropna : bool, default True - If True, and if group keys contain NA values, NA values together - with row/column will be dropped. - If False, NA values will also be treated as the key in groups. + Args: + by (mapping, function, label, pd.Grouper or list of such, default None): + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If a list or ndarray of length + equal to the selected axis is passed (see the `groupby user guide + `_), + the values are used as-is to determine the groups. A label or list + of labels may be passed to group by the columns in ``self``. + Notice that a tuple is interpreted as a (single) key. + axis ({0 or 'index', 1 or 'columns'}, default 0): + Split along rows (0) or columns (1). For `Series` this parameter + is unused and defaults to 0. + level (int, level name, or sequence of such, default None): + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. Do not specify both ``by`` and ``level``. + as_index (bool, default True): + Return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. This argument has no effect + on filtrations (see the "filtrations in the user guide" + ``_), + such as ``head()``, ``tail()``, ``nth()`` and in transformations + (see the "transformations in the user guide" + ``_). + dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. + If False, NA values will also be treated as the key in groups. - Returns - ------- - SeriesGroupBy - Returns a groupby object that contains information about the groups. + Returns: + SeriesGroupBy: Returns a groupby object that contains information about the groups. """ raise NotImplementedError("abstract method") def drop( - self, - labels=None, + self, labels=None, *, axis=0, index=None, columns=None, level=None ) -> Series | None: """ Return Series with specified index labels removed. 
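To illustrate the ``Series.groupby`` behavior documented above, a minimal sketch follows; passing ``index=`` to the constructor and grouping by an aligned Series are assumptions about what this early release supports, and a configured BigQuery session is required:

.. code-block::

    import bigframes.pandas as pd

    pd.options.display.progress_bar = None

    s = pd.Series(
        [380.0, 370.0, 24.0, 26.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
    )
    s.groupby(level=0).mean()   # group by the index labels
    s.groupby(s > 100).count()  # group by the values of an aligned Series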
@@ -782,10 +737,18 @@ def drop( When using a multi-index, labels on different levels can be removed by specifying the level. - Parameters - ---------- - labels : single label or list-like - Index labels to drop. + Args: + labels (single label or list-like): + Index labels to drop. + axis: + Unused. Parameter needed for compatibility with DataFrame. + index: + Redundant for application on Series, but 'index' can be used instead + of 'labels'. + columns: + No change is made to the Series; use 'index' or 'labels' instead. + level: + For MultiIndex, level for which the labels will be removed. Returns ------- @@ -806,7 +769,7 @@ def reorder_levels(self, order: Sequence) -> Series: May not drop or duplicate levels. Args: - order: list of int representing new level order + order (list of int representing new level order): Reference level by number or key. Returns: @@ -819,7 +782,7 @@ def droplevel(self, level): Return Series with requested index / column level(s) removed. Args: - level: int, str, or list-like + level (int, str, or list-like): If a string is given, must be the name of a level If list-like, elements must be names or positional indexes of levels. @@ -833,6 +796,16 @@ def fillna( self, value=None, ) -> Series | None: + """ + Fill NA/NaN values using the specified method. + + Args: + value (scalar, dict, Series, or DataFrame, default None): + Value to use to fill holes (e.g. 0). + + Returns: + Series or None: Object with missing values filled or None. + """ raise NotImplementedError("abstract method") def between( @@ -848,24 +821,33 @@ def between( corresponding Series element is between the boundary values `left` and `right`. NA values are treated as `False`. - Parameters - ---------- - left : scalar or list-like - Left boundary. - right : scalar or list-like - Right boundary. - inclusive : {"both", "neither", "left", "right"} - Include boundaries. Whether to set each bound as closed or open. + Args: + left (scalar or list-like): + Left boundary. + right (scalar or list-like): + Right boundary. + inclusive ({"both", "neither", "left", "right"}): + Include boundaries. Whether to set each bound as closed or open. - Returns - ------- - Series - Series representing whether each element is between left and + Returns: + Series: Series representing whether each element is between left and right (inclusive). """ raise NotImplementedError("abstract method") + def cumprod(self): + """ + Return cumulative product over a DataFrame or Series axis. + + Returns a DataFrame or Series of the same size containing the cumulative + product. + + Returns: + Return cumulative sum of scalar or Series. + """ + raise NotImplementedError("abstract method") + def cumsum(self): """ Return cumulative sum over a DataFrame or Series axis. @@ -873,16 +855,13 @@ def cumsum(self): Returns a DataFrame or Series of the same size containing the cumulative sum. - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The index or the name of the axis. 0 is equivalent to None or 'index'. - For `Series` this parameter is unused and defaults to 0. + Args: + axis ({0 or 'index', 1 or 'columns'}, default 0): + The index or the name of the axis. 0 is equivalent to None or 'index'. + For `Series` this parameter is unused and defaults to 0. - Returns - ------- - scalar or Series - Return cumulative sum of scalar or Series. + Returns: + scalar or Series: Return cumulative sum of scalar or Series. 
""" raise NotImplementedError("abstract method") @@ -893,16 +872,13 @@ def cummax(self): Returns a DataFrame or Series of the same size containing the cumulative maximum. - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The index or the name of the axis. 0 is equivalent to None or 'index'. - For `Series` this parameter is unused and defaults to 0. + Args: + axis ({{0 or 'index', 1 or 'columns'}}, default 0): + The index or the name of the axis. 0 is equivalent to None or 'index'. + For `Series` this parameter is unused and defaults to 0. - Returns - ------- - scalar or Series - Return cumulative maximum of scalar or Series. + Returns: + scalar or Series: Return cumulative maximum of scalar or Series. """ raise NotImplementedError("abstract method") @@ -913,57 +889,54 @@ def cummin(self): Returns a DataFrame or Series of the same size containing the cumulative minimum. - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The index or the name of the axis. 0 is equivalent to None or 'index'. - For `Series` this parameter is unused and defaults to 0. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - *args, **kwargs - Additional keywords have no effect but might be accepted for - compatibility with NumPy. + Args: + axis ({0 or 'index', 1 or 'columns'}, default 0): + The index or the name of the axis. 0 is equivalent to None or 'index'. + For `Series` this parameter is unused and defaults to 0. + skipna (bool, default True): + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + `*args`, `**kwargs`: + Additional keywords have no effect but might be accepted for + compatibility with NumPy. - Returns - ------- - scalar or Series - Return cumulative minimum of scalar or Series. + Returns: + scalar or Series: Return cumulative minimum of scalar or Series. """ raise NotImplementedError("abstract method") def eq(self, other) -> Series: - """Return Equal of series and other, element-wise (binary operator eq). + """Return equal of Series and other, element-wise (binary operator eq). Equivalent to ``other == series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def ne(self, other) -> Series: - """Return Not equal of series and other, element-wise (binary operator ne). + """Return not equal of Series and other, element-wise (binary operator ne). Equivalent to ``other != series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the comparison. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def le(self, other) -> Series: - """Get 'less than or equal to' of series and other, element-wise (binary operator `<=`). + """Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`). Equivalent to ``series <= other``, but with support to substitute a fill_value for missing data in either one of the inputs. 
@@ -978,61 +951,76 @@ def le(self, other) -> Series: raise NotImplementedError("abstract method") def lt(self, other) -> Series: - """Get 'less than' of series and other, element-wise (binary operator `<`). + """Get 'less than' of Series and other, element-wise (binary operator `<`). - Equivalent to ``series < other``, but with support to substitute a fill_value for - missing data in either one of the inputs. + Equivalent to ``series < other``, but with support to substitute a fill_value for + missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): - Returns: - Series. The result of the comparison. + Returns: + Series: The result of the operation. """ raise NotImplementedError("abstract method") def ge(self, other) -> Series: - """Get 'greater than or equal to' of series and other, element-wise (binary operator `>=`). + """Get 'greater than or equal to' of Series and other, element-wise (binary operator `>=`). Equivalent to ``series >= other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the comparison. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def gt(self, other) -> Series: - """Get 'less than or equal to' of series and other, element-wise (binary operator `<=`). + """Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`). Equivalent to ``series <= other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def add(self, other) -> Series: - """Return Addition of series and other, element-wise (binary operator add). + """Return addition of Series and other, element-wise (binary operator add). Equivalent to ``series + other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): + + Returns: + Series: The result of the operation. + + """ + raise NotImplementedError("abstract method") + + def radd(self, other) -> Series: + """Return addition of Series and other, element-wise (binary operator radd). + + Equivalent to ``other + series``, but with support to substitute a fill_value for + missing data in either one of the inputs. + + Args: + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1041,136 +1029,180 @@ def sub( self, other, ) -> Series: - """Return Subtraction of series and other, element-wise (binary operator sub). + """Return subtraction of Series and other, element-wise (binary operator sub). Equivalent to ``series - other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def rsub(self, other) -> Series: - """Return Subtraction of series and other, element-wise (binary operator rsub). + """Return subtraction of Series and other, element-wise (binary operator rsub). 
Equivalent to ``other - series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def mul(self, other) -> Series: - """Return Multiplication of series and other, element-wise (binary operator mul). + """Return multiplication of Series and other, element-wise (binary operator mul). Equivalent to ``other * series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") + def rmul(self, other) -> Series: + """Return multiplication of Series and other, element-wise (binary operator mul). + + Equivalent to ``series * others``, but with support to substitute a fill_value for + missing data in either one of the inputs. + + Args: + other (Series, or scalar value): + + Returns: + Series: The result of the operation. + """ + raise NotImplementedError("abstract method") + def truediv(self, other) -> Series: - """Return Floating division of series and other, element-wise (binary operator truediv). + """Return floating division of Series and other, element-wise (binary operator truediv). Equivalent to ``series / other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def rtruediv(self, other) -> Series: - """Return Floating division of series and other, element-wise (binary operator rtruediv). + """Return floating division of Series and other, element-wise (binary operator rtruediv). Equivalent to ``other / series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def floordiv(self, other) -> Series: - """Return Integer division of series and other, element-wise (binary operator floordiv). + """Return integer division of Series and other, element-wise (binary operator floordiv). Equivalent to ``series // other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def rfloordiv(self, other) -> Series: - """Return Integer division of series and other, element-wise (binary operator rfloordiv). + """Return integer division of Series and other, element-wise (binary operator rfloordiv). Equivalent to ``other // series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. 
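These operator methods follow the usual pandas convention: `s.add(x)` is `s + x`, and the `r`-prefixed variants swap the operands. A small pandas sketch of that convention (illustrative only, not part of the diff):

```
import pandas as pd

s = pd.Series([2, 4, 8])

print(s.ge(4))          # element-wise comparison -> [False, True, True]
print(s.add(1))         # same as s + 1
print(s.rsub(10))       # 10 - s -> [8, 6, 2]
print(s.mul(s))         # element-wise product with another Series
print(s.rfloordiv(9))   # 9 // s -> [4, 2, 1]
```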
""" raise NotImplementedError("abstract method") def mod(self, other) -> Series: - """Return Modulo of series and other, element-wise (binary operator mod). + """Return modulo of Series and other, element-wise (binary operator mod). Equivalent to ``series % other``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: - other: Series, or scalar value + other (Series, or scalar value): Returns: - Series. The result of the operation. + Series: The result of the operation. """ raise NotImplementedError("abstract method") def rmod(self, other) -> Series: - """Get Modulo of series and other, element-wise (binary operator `rmod`). + """Get modulo of Series and other, element-wise (binary operator `rmod`). Equivalent to ``other % series``, but with support to substitute a fill_value for missing data in either one of the inputs. + Args: + other (Series, or scalar value): + + Returns: + Series: The result of the operation. + + """ + raise NotImplementedError("abstract method") + + def divmod(self, other) -> Series: + """Return integer division and modulo of Series and other, element-wise (binary operator divmod). + + Equivalent to divmod(series, other). + Args: other: Series, or scalar value Returns: - Series. The result of the operation. + 2-Tuple of Series. The result of the operation. The result is always + consistent with (floordiv, mod) (though pandas may not). + + """ + raise NotImplementedError("abstract method") + + def rdivmod(self, other) -> Series: + """Return integer division and modulo of Series and other, element-wise (binary operator rdivmod). + + Equivalent to other divmod series. + + Args: + other: Series, or scalar value + + Returns: + 2-Tuple of Series. The result of the operation. The result is always + consistent with (rfloordiv, rmod) (though pandas may not). """ raise NotImplementedError("abstract method") @@ -1181,15 +1213,12 @@ def all( """ Return whether all elements are True, potentially over an axis. - Returns True unless there at least one element within a series or along a - Dataframe axis that is False or equivalent (e.g. zero or empty). + Returns True unless there at least one element within a Series or along a + DataFrame axis that is False or equivalent (e.g. zero or empty). - - Returns - ------- - scalar or Series - If level is specified, then, Series is returned; otherwise, scalar - is returned. + Returns: + scalar or Series: If level is specified, then, Series is returned; + otherwise, scalar is returned. """ raise NotImplementedError("abstract method") @@ -1202,12 +1231,9 @@ def any( Returns False unless there is at least one element within a series or along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty). - - Returns - ------- - scalar or Series - If level is specified, then, Series is returned; otherwise, scalar - is returned. + Returns: + scalar or Series: If level is specified, then, Series is returned; + otherwise, scalar is returned. """ raise NotImplementedError("abstract method") @@ -1221,9 +1247,8 @@ def max( of the ``numpy.ndarray`` method ``argmax``. - Returns - ------- - scalar or scalar + Returns: + scalar or scalar """ raise NotImplementedError("abstract method") @@ -1236,10 +1261,8 @@ def min( If you want the index of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``. 
- - Returns - ------- - scalar or scalar + Returns: + scalar or scalar """ raise NotImplementedError("abstract method") @@ -1249,7 +1272,7 @@ def std( """ Return sample standard deviation over requested axis. - Normalized by N-1 by default. This can be changed using the ddof argument. + Normalized by N-1 by default. Returns @@ -1264,12 +1287,10 @@ def var( """ Return unbiased variance over requested axis. - Normalized by N-1 by default. This can be changed using the ddof argument. + Normalized by N-1 by default. - - Returns - ------- - scalar or Series (if level specified) + Returns: + scalar or Series (if level specified) """ raise NotImplementedError("abstract method") @@ -1291,6 +1312,19 @@ def mean(self): """ raise NotImplementedError("abstract method") + def median(self, *, exact: bool = False): + """Return the median of the values over the requested axis. + + Args: + exact (bool. default False): + Default False. Get the exact median instead of an approximate + one. Note: ``exact=True`` not yet supported. + + Returns: + scalar + """ + raise NotImplementedError("abstract method") + def prod(self): """Return the product of the values over the requested axis. @@ -1299,66 +1333,71 @@ def prod(self): """ raise NotImplementedError("abstract method") + def skew(self): + """Return unbiased skew over requested axis. + + Normalized by N-1. + + Returns: + scalar + """ + raise NotImplementedError("abstract method") + def kurt(self): """Return unbiased kurtosis over requested axis. Kurtosis obtained using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - Returns - ------- - scalar or scalar + Returns: + scalar or scalar: Unbiased kurtosis over requested axis. """ raise NotImplementedError("abstract method") def where(self, cond, other): """Replace values where the condition is False. - Parameters - ---------- - cond: bool Series/DataFrame, array-like, or callable - Where cond is True, keep the original value. Where False, replace - with corresponding value from other. If cond is callable, it is - computed on the Series/DataFrame and should return boolean - Series/DataFrame or array. The callable must not change input - Series/DataFrame (though pandas doesn’t check it). - other: scalar, Series/DataFrame, or callable - Entries where cond is False are replaced with corresponding value - from other. If other is callable, it is computed on the - Series/DataFrame and should return scalar or Series/DataFrame. - The callable must not change input Series/DataFrame (though pandas - doesn’t check it). If not specified, entries will be filled with - the corresponding NULL value (np.nan for numpy dtypes, pd.NA for - extension dtypes). + Args: + cond (bool Series/DataFrame, array-like, or callable): + Where cond is True, keep the original value. Where False, replace + with corresponding value from other. If cond is callable, it is + computed on the Series/DataFrame and returns boolean + Series/DataFrame or array. The callable must not change input + Series/DataFrame (though pandas doesn’t check it). + other (scalar, Series/DataFrame, or callable): + Entries where cond is False are replaced with corresponding value + from other. If other is callable, it is computed on the + Series/DataFrame and returns scalar or Series/DataFrame. + The callable must not change input Series/DataFrame (though pandas + doesn’t check it). If not specified, entries will be filled with + the corresponding NULL value (np.nan for numpy dtypes, pd.NA for + extension dtypes). 
- Returns - ------- - Series + Returns: + Series """ raise NotImplementedError("abstract method") def mask(self, cond, other): """Replace values where the condition is True. - Parameters - ---------- - cond: bool Series/DataFrame, array-like, or callable - Where cond is False, keep the original value. Where True, replace - with corresponding value from other. If cond is callable, it is - computed on the Series/DataFrame and should return boolean - Series/DataFrame or array. The callable must not change input - Series/DataFrame (though pandas doesn’t check it). - other: scalar, Series/DataFrame, or callable - Entries where cond is True are replaced with corresponding value - from other. If other is callable, it is computed on the - Series/DataFrame and should return scalar or Series/DataFrame. - The callable must not change input Series/DataFrame (though pandas - doesn’t check it). If not specified, entries will be filled with - the corresponding NULL value (np.nan for numpy dtypes, pd.NA for - extension dtypes). + Args: + cond (bool Series/DataFrame, array-like, or callable): + Where cond is False, keep the original value. Where True, replace + with corresponding value from other. If cond is callable, it is + computed on the Series/DataFrame and should return boolean + Series/DataFrame or array. The callable must not change input + Series/DataFrame (though pandas doesn’t check it). + other (scalar, Series/DataFrame, or callable): + Entries where cond is True are replaced with corresponding value + from other. If other is callable, it is computed on the + Series/DataFrame and should return scalar or Series/DataFrame. + The callable must not change input Series/DataFrame (though pandas + doesn’t check it). If not specified, entries will be filled with + the corresponding NULL value (np.nan for numpy dtypes, pd.NA for + extension dtypes). - Returns - ------- - Series + Returns: + Series """ raise NotImplementedError("abstract method") @@ -1369,18 +1408,15 @@ def clip(self): singular values or array like, and in the latter case the clipping is performed element-wise in the specified axis. - Parameters - ---------- - - lower: float or array-like, default None - Minimum threshold value. All values below this threshold will be set to it. A missing threshold (e.g NA) will not clip the value. + Args: + lower (float or array-like, default None): + Minimum threshold value. All values below this threshold will be set to it. A missing threshold (e.g NA) will not clip the value. - upper: float or array-like, default None - Maximum threshold value. All values above this threshold will be set to it. A missing threshold (e.g NA) will not clip the value. + upper (float or array-like, default None): + Maximum threshold value. All values above this threshold will be set to it. A missing threshold (e.g NA) will not clip the value. - Returns - ------- - Series + Returns: + Series. """ raise NotImplementedError("abstract method") @@ -1390,10 +1426,8 @@ def argmax(self): If the minimum is achieved in multiple locations, the first row position is returned. - Returns - ------- - Series - Row position of the maximum value. + Returns: + Series: Row position of the maximum value. """ raise NotImplementedError("abstract method") @@ -1403,10 +1437,8 @@ def argmin(self): If the maximum is achieved in multiple locations, the first row position is returned. - Returns - ------- - Series - Row position of the minimum value. + Returns: + Series: Row position of the minimum value. 
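A compact pandas illustration of the `divmod` pair, the boolean reductions, and `where`/`mask`/`clip` described above (illustrative only, not part of the diff):

```
import pandas as pd

s = pd.Series([7, 9, 13])
quotient, remainder = s.divmod(4)             # 2-tuple, consistent with floordiv and mod
print(quotient.tolist(), remainder.tolist())  # [1, 2, 3] [3, 1, 1]

print(pd.Series([1, 2, 0]).all())   # False: a zero counts as falsy
print(pd.Series([0, 0, 3]).any())   # True: one truthy element is enough

t = pd.Series([-2, -1, 0, 1, 2])
print(t.where(t > 0, other=0))      # keep positives, replace the rest with 0
print(t.mask(t > 0, other=0))       # the mirror image: replace positives
print(t.clip(lower=-1, upper=1))    # bound values to [-1, 1]
```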
""" raise NotImplementedError("abstract method") @@ -1420,18 +1452,15 @@ def rename(self, index, **kwargs) -> Series | None: Alternatively, change ``Series.name`` with a scalar value. - Parameters - ---------- - index : scalar, hashable sequence, dict-like or function optional - Functions or dict-like are transformations to apply to - the index. - Scalar or hashable sequence-like will alter the ``Series.name`` - attribute. + Args: + index (scalar, hashable sequence, dict-like or function optional): + Functions or dict-like are transformations to apply to + the index. + Scalar or hashable sequence-like will alter the ``Series.name`` + attribute. - Returns - ------- - Series - Series with index labels + Returns: + Series: Series with index labels """ raise NotImplementedError("abstract method") @@ -1440,14 +1469,12 @@ def rename_axis(self, mapper, **kwargs): """ Set the name of the axis for the index or columns. - Parameters - ---------- - mapper : scalar, list-like, optional - Value to set the axis name attribute. + Args: + mapper (scalar, list-like, optional): + Value to set the axis name attribute. - Returns - ------- - Series + Returns: + Series: Series with the name of the axis set. """ raise NotImplementedError("abstract method") @@ -1459,39 +1486,35 @@ def rolling( """ Provide rolling window calculations. - Parameters - ---------- - window : int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. + Args: + window (int, timedelta, str, offset, or BaseIndexer subclass): + Size of the moving window. - If an integer, the fixed number of observations used for - each window. + If an integer, the fixed number of observations used for + each window. - If a timedelta, str, or offset, the time period of each window. Each - window will be a variable sized based on the observations included in - the time-period. This is only valid for datetimelike indexes. - To learn more about the offsets & frequency strings, please see `this link - `__. + If a timedelta, str, or offset, the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetime-like indexes. + To learn more about the offsets & frequency strings, please see `this link + `__. - If a BaseIndexer subclass, the window boundaries - based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely ``min_periods``, ``center``, ``closed`` and - ``step`` will be passed to ``get_window_bounds``. + If a BaseIndexer subclass, the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. - min_periods : int, default None - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. + min_periods (int, default None): + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. - For a window that is specified by an offset, ``min_periods`` will default to 1. + For a window that is specified by an offset, ``min_periods`` will default to 1. - For a window that is specified by an integer, ``min_periods`` will default - to the size of the window. + For a window that is specified by an integer, ``min_periods`` will default + to the size of the window. 
- Returns - ------- - ``Window`` subclass if a ``win_type`` is passed - - ``Rolling`` subclass if ``win_type`` is not passed + Returns: + ``Window`` subclass if a ``win_type`` is passed.``Rolling`` subclass if ``win_type`` is not passed """ raise NotImplementedError("abstract method") @@ -1499,14 +1522,12 @@ def expanding(self, min_periods=1): """ Provide expanding window calculations. - Parameters - ---------- - min_periods : int, default 1 - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. + Args: + min_periods (int, default 1): + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. - Returns - ------- + Returns: ``Expanding`` subclass """ raise NotImplementedError("abstract method") @@ -1527,18 +1548,18 @@ def value_counts( Excludes NA values by default. Args: - normalize : bool, default False + normalize (bool, default False): If True then the object returned will contain the relative frequencies of the unique values. - sort : bool, default True + sort (bool, default True): Sort by frequencies. - ascending : bool, default False + ascending (bool, default False): Sort in ascending order. - dropna : bool, default True + dropna (bool, default True): Don't include counts of NaN. Returns: - Series + Series: Series containing counts of unique values. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 81b9bd3d14..e464843c77 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -1,3 +1,7 @@ +import re +import typing + + class StringMethods: """ Vectorized string functions for Series and Index. @@ -7,6 +11,29 @@ class StringMethods: R's stringr package. """ + def extract(self, pat: str, flags: int = 0): + """ + Extract capture groups in the regex `pat` as columns in a DataFrame. + + For each subject string in the Series, extract groups from the + first match of regular expression `pat`. + + Args: + pat: + Regular expression pattern with capturing groups. + flags: + Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that + modify regular expression matching for things like case, + spaces, etc. For more details, see :mod:`re`. + + Returns: + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. + """ + raise NotImplementedError("abstract method") + def find(self, sub, start: int = 0, end=None): """Return lowest indexes in each strings in the Series/Index. @@ -17,15 +44,14 @@ def find(self, sub, start: int = 0, end=None): Args: sub: Substring being searched. - start: + start (int, default 0): Left edge index. - end: + end (None): Right edge index. Returns: - Series or Index of int. + bigframes.series.Series: Series with lowest indexes in each strings. """ - raise NotImplementedError("abstract method") def len(self): @@ -35,9 +61,8 @@ def len(self): (such as a dictionary). Returns: - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. + bigframes.series.Series: A Series or Index of integer values indicating + the length of each element in the Series or Index. 
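A pandas sketch of the windowing and `value_counts` behavior documented above (illustrative only, not part of the diff):

```
import pandas as pd

s = pd.Series([1, 2, 3, 4])
print(s.rolling(window=2).sum())          # first window is incomplete -> NaN, 3, 5, 7
print(s.expanding(min_periods=1).max())   # running maximum

animals = pd.Series(["cat", "dog", "cat", None])
print(animals.value_counts())                 # cat: 2, dog: 1; NaN excluded by default
print(animals.value_counts(normalize=True))   # relative frequencies instead of counts
```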
""" raise NotImplementedError("abstract method") @@ -48,7 +73,7 @@ def lower(self): Equivalent to :meth:`str.lower`. Returns: - Series or Index of object + bigframes.series.Series: Series with lowercase. """ raise NotImplementedError("abstract method") @@ -57,16 +82,16 @@ def slice(self, start=None, stop=None): """Slice substrings from each element in the Series or Index. Args: - start : int, optional + start (int, optional): Start position for slice operation. - stop : int, optional + stop (int, optional): Stop position for slice operation. - step : int, optional + step (int, optional): Step size for slice operation. Returns: - Series or Index of object - Series or Index from sliced substring from original string object. + bigframes.series.Series:: Series or Index from sliced + substring from original string object. """ raise NotImplementedError("abstract method") @@ -80,7 +105,8 @@ def strip(self): Equivalent to :meth:`str.strip`. Returns: - Series or Index of object + bigframes.series.Series: Series or Index without leading + and trailing characters. """ raise NotImplementedError("abstract method") @@ -91,7 +117,7 @@ def upper(self): Equivalent to :meth:`str.upper`. Returns: - Series or Index of object + bigframes.series.Series: Series with uppercase strings. """ raise NotImplementedError("abstract method") @@ -104,9 +130,8 @@ def isnumeric(self): has zero characters, ``False`` is returned for that check. Returns: - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. + bigframes.series.Series: Series or Index of boolean values with the + same length as the original Series/Index. """ raise NotImplementedError("abstract method") @@ -120,7 +145,7 @@ def rstrip(self): Equivalent to :meth:`str.rstrip`. Returns: - Series or Index of object + bigframes.series.Series: Series without trailing characters. """ raise NotImplementedError("abstract method") @@ -134,7 +159,7 @@ def lstrip(self): Equivalent to :meth:`str.lstrip`. Returns: - Series or Index of object` + bigframes.series.Series: Series without leading characters. """ raise NotImplementedError("abstract method") @@ -147,9 +172,8 @@ def repeat(self, repeats: int): Same value for all (int) or different value per (sequence). Returns: - Series or pandas.Index - Series or Index of repeated string objects specified by - input parameter repeats. + bigframes.series.Series: Series or Index of repeated string + objects specified by input parameter repeats. """ raise NotImplementedError("abstract method") @@ -160,7 +184,7 @@ def capitalize(self): Equivalent to :meth:`str.capitalize`. Returns: - Series or Index of object + bigframes.series.Series: Series with captitalized strings. """ raise NotImplementedError("abstract method") @@ -172,16 +196,121 @@ def cat(self, others, *, join): and elements of `others` element-wise. Args: - others : Series + others (Series): - join : {'left', 'outer'}, default 'left' + join ({'left', 'outer'}, default 'left'): Determines the join-style between the calling Series and any Series in `others` (objects without an index need to match the length of the calling Series). To disable alignment, use `.values` on any Series/Index/DataFrame in `others`. Returns: - Series + bigframes.series.Series: Series with concatenated strings. """ raise NotImplementedError("abstract method") + + def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True): + """ + Test if pattern or regex is contained within a string of a Series or Index. 
+ + Return boolean Series or Index based on whether a given pattern or regex is + contained within a string of a Series or Index. + + Args: + pat (str, re.Pattern): + Character sequence or regular expression. + case (bool, default True): + If True, case sensitive. + flags (int, default 0): + Flags to pass through to the re module, e.g. re.IGNORECASE. + regex (bool, default True): + If True, assumes the pat is a regular expression. + If False, treats the pat as a literal string. + + Returns: + bigframes.series.Series: A Series or Index of boolean values indicating + whether the given pattern is contained within the string of each + element of the Series or Index. + """ + raise NotImplementedError("abstract method") + + def replace( + self, + pat: typing.Union[str, re.Pattern], + repl: str, + *, + case: typing.Optional[bool] = None, + flags: int = 0, + regex: bool = False, + ): + """ + Replace each occurrence of pattern/regex in the Series/Index. + + Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on + the regex value. + + Args: + pat (str, re.Pattern): + String can be a character sequence or regular expression. + repl (str): + Replacement string. + case (default None): + Determines if replace is case sensitive: + + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex. + flags (int, default 0): + Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled + regex. + regex (bool: default False): + Determines if the passed-in pattern is a regular expression: + + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + Returns: + bigframes.series.Series: A copy of the object with all matching occurrences + of `pat` replaced by `repl`. + + """ + raise NotImplementedError("abstract method") + + def startswith( + self, + pat: typing.Union[str, tuple[str, ...]], + ): + """ + Test if the start of each string element matches a pattern. + + Args: + pat (str, tuple[str, ...]): + Character sequence or tuple of strings. Regular expressions are not + accepted. + + Returns: + bigframes.series.Series: A Series of booleans indicating whether the given + pattern matches the start of each string element. + """ + raise NotImplementedError("abstract method") + + def endswith( + self, + pat: typing.Union[str, tuple[str, ...]], + ): + """ + Test if the end of each string element matches a pattern. + + Args: + pat (str, tuple[str, ...]): + Character sequence or tuple of strings. Regular expressions are not + accepted. + + Returns: + bigframes.series.Series: A Series of booleans indicating whether the given + pattern matches the end of each string element. + """ + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/io/common.py b/third_party/bigframes_vendored/pandas/io/common.py index 057ed96721..506984e64d 100644 --- a/third_party/bigframes_vendored/pandas/io/common.py +++ b/third_party/bigframes_vendored/pandas/io/common.py @@ -17,9 +17,10 @@ def dedup_names( but a custom pattern may be supported in the future. 
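The `.str` accessor methods documented above also track pandas; a short pandas illustration (illustrative only, not part of the diff):

```
import pandas as pd

s = pd.Series(["Apple pie", "banana", "Cherry"])

print(s.str.lower())
print(s.str.contains("an"))                  # [False, True, False]
print(s.str.replace("a", "o", regex=False))  # literal, non-regex replacement
print(s.str.startswith(("A", "C")))          # a tuple of prefixes is accepted
print(s.str.extract(r"^(\w+)"))              # first word, as a one-column DataFrame
```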
Examples - -------- - >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False) + ``` + dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False) ['x', 'y', 'x.1', 'x.2'] + ``` """ names = list(names) # so we can index counts: DefaultDict[Hashable, int] = defaultdict(int) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 4f4b1b8199..9425ead0e3 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -15,23 +15,23 @@ def read_gbq( col_order: Iterable[str] = (), max_results: Optional[int] = None, ): - """Loads DataFrame from Google BigQuery. + """Loads DataFrame from BigQuery. Args: - query: + query (str): A SQL string to be executed or a BigQuery table to be read. The table must be specified in the format of `project.dataset.tablename` or `dataset.tablename`. - index_col: + index_col (Iterable[str] or str): Name of result column(s) to use for index in results DataFrame. - col_order: + col_order (Iterable[str]): List of BigQuery column names in the desired order for results DataFrame. - max_results: + max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. Returns: - A DataFrame representing results of the query or table. + bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index ee2e9a65f2..6f0a2b3cb4 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -8,15 +8,18 @@ def read_parquet( self, path: str, ): - r"""Load a parquet object from the file path (local or GCS), returning a DataFrame. + r"""Load a Parquet object from the file path (local or Cloud Storage), returning a DataFrame. - Args: - path: - Local or GCS path to parquet file. - - Note: + .. note:: This method will not guarantee the same ordering as the file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + + Args: + path (str): + Local or Cloud Storage path to Parquet file. + + Returns: + bigframes.dataframe.DataFrame: A BigQuery DataFrames. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index e5ed39b54a..e01eb734fb 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -32,27 +32,30 @@ def read_csv( encoding: Optional[str] = None, **kwargs, ): - r"""Loads DataFrame from comma-separated values (csv) file locally or from GCS. + """Loads DataFrame from comma-separated values (csv) file locally or from + Cloud Storage. The CSV file data will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. - Note: using `engine="bigquery"` will not guarantee the same ordering as the - file. Instead, set a serialized index column as the index and sort by - that in the resulting DataFrame. + .. note:: + using `engine="bigquery"` will not guarantee the same ordering as the + file. Instead, set a serialized index column as the index and sort by + that in the resulting DataFrame. Args: - filepath_or_buffer: a string path including GCS and local file. 
-
-        sep: the separator for fields in a CSV file. For the BigQuery engine, the separator
+        filepath_or_buffer (str):
+            A local or Cloud Storage path to the file.
+        sep (Optional[str], default ","):
+            The separator for fields in a CSV file. For the BigQuery engine, the separator
             can be any ISO-8859-1 single-byte character. To use a character in the range
             128-255, you must encode the character as UTF-8. Both engines support
             `sep="\t"` to specify tab character as separator. Default engine supports
             having any number of spaces as separator by specifying `sep="\s+"`. Separators
             longer than 1 character are interpreted as regular expressions by the default
             engine. BigQuery engine only supports single character separators.
-
-        header: row number to use as the column names.
+        header (Optional[int], default 0):
+            Row number to use as the column names.
             - ``None``: Instructs autodetect that there are no headers and data should be
               read starting from the first row.
             - ``0``: If using `engine="bigquery"`, Autodetect tries to detect headers in the
@@ -68,19 +71,19 @@ def read_csv(
             contains column names unless the `names` argument is specified. If `names` is
             provided, row N+1 will be ignored, row N+2 will be read as data, and column names
             are inferred from `names`.
-
-        names: a list of column names to use. If the file contains a header row and you
+        names (default None):
+            A list of column names to use. If the file contains a header row and you
             want to pass this parameter, then `header=0` should be passed as well so the
             first (header) row is ignored. Only to be used with default engine.
-
-        index_col: column(s) to use as the row labels of the DataFrame, either given as
+        index_col (default None):
+            Column(s) to use as the row labels of the DataFrame, either given as
             string name or column index. `index_col=False` can be used with the default
             engine only to enforce that the first column is not used as the index. Using
             column index instead of column name is only supported with the default engine.
             The BigQuery engine only supports having a single column name as the `index_col`.
             Neither engine supports having a multi-column index.
-
-        usecols: list of column names to use. The BigQuery engine only supports having a list
+        usecols (default None):
+            List of column names to use. The BigQuery engine only supports having a list
             of string column names. Column indices and callable functions are only supported
             with the default engine. Using the default engine, the column names in `usecols`
             can be defined to correspond to column names provided with the `names` parameter
@@ -89,22 +92,22 @@
             column names provided with the BigQuery engine will be consistent in the resulting
             dataframe. If using a callable function with the default engine, only column names
             that evaluate to True by the callable function will be in the resulting dataframe.
-
-        dtype: data type for data or columns. Only to be used with default engine.
-
-        engine: type of engine to use. If `engine="bigquery"` is specified, then BigQuery's
-            load API will be used. Otherwise, the engine will be passed to `pandas.read_csv`.
-
-        encoding: the character encoding of the data. The default encoding is `UTF-8` for both
+        dtype (data type for data or columns):
+            Data type for data or columns. Only to be used with the default engine.
+        engine (Optional[str], default None):
+            Type of engine to use. If `engine="bigquery"` is specified, then BigQuery's
+            load API will be used. Otherwise, the engine will be passed to `pandas.read_csv`.
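The remaining `read_csv` parameters continue below. As a minimal sketch of the two loaders documented in this file, assuming a public BigQuery table and a hypothetical Cloud Storage path (illustrative only, not part of the diff):

```
import bigframes.pandas as bpd

# Any readable table or SQL string works; this uses a BigQuery public dataset.
df = bpd.read_gbq(
    "bigquery-public-data.usa_names.usa_1910_2013",
    col_order=["name", "number"],
    max_results=100,
)

# engine="bigquery" loads through BigQuery's load API; the gs:// path is hypothetical.
csv_df = bpd.read_csv("gs://my-bucket/my-data.csv", engine="bigquery", header=0)
```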
+        encoding (Optional[str], default None):
+            The character encoding of the data. The default encoding is `UTF-8` for both
         engines. The default engine accepts a wide range of encodings. Refer to Python
         documentation for a comprehensive list,
         https://docs.python.org/3/library/codecs.html#standard-encodings
         The BigQuery engine only supports `UTF-8` and `ISO-8859-1`.
-
-        **kwargs: keyword arguments.
+        **kwargs:
+            Keyword arguments.

     Returns:
-        A BigQuery DataFrames.
+        bigframes.dataframe.DataFrame: A BigQuery DataFrames.
     """
     raise NotImplementedError("abstract method")
diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py
index 925bb76e1a..03958f7595 100644
--- a/third_party/bigframes_vendored/sklearn/base.py
+++ b/third_party/bigframes_vendored/sklearn/base.py
@@ -13,7 +13,7 @@ class BaseEstimator:
     """Base class for all estimators.

-    Notes:
+    .. note::
         All estimators should specify all the parameters that can be set
         at the class level in their ``__init__`` as explicit keyword
         arguments (no ``*args`` or ``**kwargs``).
@@ -54,12 +54,12 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
         """Get parameters for this estimator.

         Args:
-            deep:
+            deep (bool, default True):
                 Default ``True``. If True, will return the parameters for this
                 estimator and contained subobjects that are estimators.

         Returns:
-            A dictionary of parameter names mapped to their values.
+            Dictionary: A dictionary of parameter names mapped to their values.
         """
         out: Dict = dict()
         for key in self._get_param_names():
@@ -84,15 +84,15 @@ def score(self, X, y):
         each label set be correctly predicted.

         Args:
-            X:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 DataFrame of shape (n_samples, n_features). Test samples.

-            y:
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 DataFrame of shape (n_samples,) or (n_samples, n_outputs). True
                 labels for `X`.

         Returns:
-            A DataFrame of the evaluation result.
+            bigframes.dataframe.DataFrame: A DataFrame of the evaluation result.
         """
         raise NotImplementedError("abstract method")

@@ -106,19 +106,19 @@ def score(self, X, y):
         """Return the evaluation metrics of the model.

         Args:
-            X:
-                DataFrame of shape (n_samples, n_features). Test samples. For
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Series or DataFrame of shape (n_samples, n_features). Test samples. For
                 some estimators this may be a precomputed kernel matrix or a list of generic
                 objects instead with shape ``(n_samples, n_samples_fitted)``, where
                 ``n_samples_fitted`` is the number of samples used in the fitting for the
                 estimator.

-            y:
-                DataFrame of shape (n_samples,) or (n_samples, n_outputs). True
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Series or DataFrame of shape (n_samples,) or (n_samples, n_outputs). True
                 values for `X`.

         Returns:
-            A DataFrame of the evaluation result.
+            bigframes.dataframe.DataFrame: A DataFrame of the evaluation result.
         """
         raise NotImplementedError("abstract method")
diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
index 8f67cab1c3..bddb82c7ba 100644
--- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
+++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
@@ -24,13 +24,13 @@ def predict(self, X):
         """Predict the closest cluster each sample in X belongs to.

         Args:
-            X:
-                DataFrame of shape (n_samples, n_features).
The data matrix for + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. Returns: - DataFrame of shape (n_samples,), containing the class labels for - each sample. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the + class labels for each sample. """ raise NotImplementedError("abstract method") @@ -39,7 +39,7 @@ class KMeans(_BaseKMeans): """K-Means clustering. Args: - n_clusters: int, default=8 + n_clusters (int, default 8): The number of clusters to form as well as the number of centroids to generate. Default to 8. """ @@ -53,19 +53,51 @@ def fit( """Compute k-means clustering. Args: - X: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). Training data. - y: Ignored + y (default None): Not used, present here for API consistency by convention. + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. - transforms: - An optional list of SQL expressions to apply over top of the - model inputs as preprocessing. This preprocessing will be - automatically reapplied to new input data (e.g. in .predict), - and may contain steps (like ML.STANDARD_SCALER) that fit to the - training data. Returns: - Fitted Estimator. + KMeans: Fitted Estimator. + """ + raise NotImplementedError("abstract method") + + def predict( + self, + X, + ): + """Predict the closest cluster each sample in X belongs to. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples, n_features). New data to predict. + y: (default None) + Not used, present here for API consistency by convention. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to. + """ + raise NotImplementedError("abstract method") + + def score( + self, + X, + y=None, + ): + """Metrics of the model. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples, n_features). New Data. + y (default None) + Not used, present here for API consistency by convention. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of the metrics. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index dadfa5d013..bc8bc3980a 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -35,13 +35,13 @@ def fit( """Fit all transformers using X. Args: - X: - DataFrame of shape (n_samples, n_features). Training vector, + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. Returns: - Fitted estimator. + ColumnTransformer: Fitted estimator. """ raise NotImplementedError("abstract method") @@ -52,10 +52,10 @@ def transform( """Transform X separately by each transformer, concatenate results. Args: - X: - The DataFrame to be transformed by subset. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Series or DataFrame to be transformed by subset. Returns: - Transformed result. + bigframes.dataframe.DataFrame: Transformed result. 
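A minimal sketch of the fit/predict/score surface these estimators share, shown with KMeans and assuming `df` is a numeric BigQuery DataFrames DataFrame prepared by the caller (illustrative only, not part of the diff):

```
from bigframes.ml.cluster import KMeans

model = KMeans(n_clusters=4)
model.fit(df)               # df: DataFrame of shape (n_samples, n_features)
labels = model.predict(df)  # DataFrame with the closest cluster per sample
print(model.score(df))      # model metrics as a DataFrame
```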
""" raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 6f0d17b2e6..619c13f35d 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -30,24 +30,54 @@ class PCA(BaseEstimator, metaclass=ABCMeta): truncated SVD. Args: - n_components: Optional[int] - Number of components to keep. if n_components is not set all components are kept. + n_components (Optional[int], default 3): + Number of components to keep. if n_components is not set all components + are kept. """ - def fit( - self, - X, - ): + def fit(self, X, y=None): """Fit the model according to the given training data. Args: - X: - DataFrame of shape (n_samples, n_features). Training vector, + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. + y (default None): + Ignored. + + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. + Returns: - Fitted estimator. + PCA: Fitted estimator. """ raise NotImplementedError("abstract method") + + def score(self, X=None, y=None): + """Return the metrics of the model. + + Args: + X (default None): + Ignored. + + y (default None): + Ignored. + Returns: + bigframes.dataframe.DataFrame: DataFrame that represents model metrics. + """ + raise NotImplementedError("abstract method") + + def predict(self, X): + """Predict the closest cluster for each sample in X. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or a DataFrame to predict. + + Returns: + bigframes.dataframe.DataFrame: predicted DataFrames.""" + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index c91821c762..73f4684dc3 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -46,12 +46,17 @@ def fit(self, X, y): Args: X: - DataFrame of shape (n_samples, n_features). Training data. + Series or DataFrame of shape (n_samples, n_features). Training data. y: - DataFrame of shape (n_samples,) or (n_samples, n_targets). + Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. + + Returns: Fitted Estimator. """ @@ -71,7 +76,7 @@ def predict(self, X): Args: X: - DataFrame of shape (n_samples, n_features). The data matrix for + Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. Returns: @@ -137,7 +142,7 @@ def predict(self, X): Args: X: - DataFrame of shape (n_samples, n_features). The data matrix for + Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. 
Returns: diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index e67edac1aa..65e895298d 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -30,11 +30,11 @@ def predict(self, X): """Predict using the linear model. Args: - X: - DataFrame of shape (n_samples, n_features). Samples. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Samples. Returns: - DataFrame of shape (n_samples,). Returns predicted values. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,). Returns predicted values. """ raise NotImplementedError("abstract method") @@ -44,13 +44,13 @@ def predict(self, X): """Predict class labels for samples in X. Args: - X: - DataFrame of shape (n_samples, n_features). The data matrix for + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. Returns: - DataFrame of shape (n_samples,), containing the class labels for - each sample. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing + the class labels for each sample. """ raise NotImplementedError("abstract method") @@ -63,7 +63,7 @@ class LinearRegression(RegressorMixin, LinearModel): the dataset, and the targets predicted by the linear approximation. Args: - fit_intercept: + fit_intercept (default True): Default ``True``. Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). @@ -78,21 +78,18 @@ def fit( """Fit linear model. Args: - X: - DataFrame of shape (n_samples, n_features). Training data. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Training data. - y: - DataFrame of shape (n_samples,) or (n_samples, n_targets). + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. - transforms: - An optional list of SQL expressions to apply over top of the - model inputs as preprocessing. This preprocessing will be - automatically reapplied to new input data (e.g. in .predict), - and may contain steps (like ML.STANDARD_SCALER) that fit to the - training data. + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. Returns: - Fitted Estimator. + LinearRegression: Fitted Estimator. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 6de5bf65e2..8525e57068 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -24,33 +24,12 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): """Logistic Regression (aka logit, MaxEnt) classifier. Args: - fit_intercept: + fit_intercept (default True): Default True. Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - auto_class_weights: + auto_class_weights (default False): Default False. 
If True, balance class labels using weights for each class in inverse proportion to the frequency of that class. - - References: - L-BFGS-B -- Software for Large-scale Bound-constrained Optimization - Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales. - http://users.iems.northwestern.edu/~nocedal/lbfgsb.html - - LIBLINEAR -- A Library for Large Linear Classification - https://www.csie.ntu.edu.tw/~cjlin/liblinear/ - - SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach - Minimizing Finite Sums with the Stochastic Average Gradient - https://hal.inria.fr/hal-00860051/document - - SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014). - "SAGA: A Fast Incremental Gradient Method With Support - for Non-Strongly Convex Composite Objectives" (Arxiv <1407.0202>) - - Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent - methods for logistic regression and maximum entropy models. - Machine Learning 85(1-2):41-75. - https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf """ def fit( @@ -62,22 +41,20 @@ def fit( """Fit the model according to the given training data. Args: - X: - DataFrame of shape (n_samples, n_features). Training vector, + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. - y: + y (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples,). Target vector relative to X. - transforms: - An optional list of SQL expressions to apply over top of the - model inputs as preprocessing. This preprocessing will be - automatically reapplied to new input data (e.g. in .predict), - and may contain steps (like ML.STANDARD_SCALER) that fit to the - training data. + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. + Returns: - Fitted estimator. + LogisticRegression: Fitted Estimator. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 05fb5f6fb6..6d9692ac8d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -25,18 +25,17 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: """Accuracy classification score. Args: - y_true: DataFrame of shape (n_samples,) + y_true (Series or DataFrame of shape (n_samples,)): Ground truth (correct) labels. - y_pred: DataFrame of shape (n_samples,) + y_pred (Series or DataFrame of shape (n_samples,)): Predicted labels, as returned by a classifier. - normalize: bool, default=True + normalize (bool, default True): Default to True. If ``False``, return the number of correctly classified samples. Otherwise, return the fraction of correctly classified samples. Returns: - score: float. - If ``normalize == True``, return the fraction of correctly + float: If ``normalize == True``, return the fraction of correctly classified samples (float), else returns the number of correctly classified samples (int). """ @@ -58,17 +57,16 @@ def confusion_matrix( :math:`C_{1,1}` and false positives is :math:`C_{0,1}`. Args: - y_true: DataFrame of shape (n_samples,) + y_true (Series or DataFrame of shape (n_samples,)): Ground truth (correct) target values. 
- y_pred: DataFrame of shape (n_samples,) + y_pred (Series or DataFrame of shape (n_samples,)): Estimated targets as returned by a classifier. Returns: - C: DataFrame of shape (n_samples, n_features). - Confusion matrix whose i-th row and j-th - column entry indicates the number of - samples with true label being i-th class - and predicted label being j-th class. + DataFrame of shape (n_samples, n_features): Confusion matrix whose + i-th row and j-th column entry indicates the number of + samples with true label being i-th class and predicted label + being j-th class. """ raise NotImplementedError("abstract method") @@ -87,19 +85,18 @@ def recall_score( The best value is 1 and the worst value is 0. Args: - y_true: DataFrame of shape (n_samples,) + y_true (Series or DataFrame of shape (n_samples,)): Ground truth (correct) target values. - y_pred: DataFrame of shape (n_samples,) + y_pred (Series or DataFrame of shape (n_samples,)): Estimated targets as returned by a classifier. - average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ - default='binary' + average ({'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary'): This parameter is required for multiclass/multilabel targets. Possible values are 'None', 'micro', 'macro', 'samples', 'weighted', 'binary'. Returns: - recall: float (if average is not None) or Series of float of shape \ - (n_unique_labels,). - Recall of the positive class in binary classification or weighted + float (if average is not None) or Series of float of shape n_unique_labels,): Recall + of the positive class in binary classification or weighted average of the recall of each class for the multiclass task. """ raise NotImplementedError("abstract method") @@ -120,9 +117,9 @@ def precision_score( The best value is 1 and the worst value is 0. Args: - y_true: DataFrame of shape (n_samples,) + y_true: Series or DataFrame of shape (n_samples,) Ground truth (correct) target values. - y_pred: DataFrame of shape (n_samples,) + y_pred: Series or DataFrame of shape (n_samples,) Estimated targets as returned by a classifier. average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ default='binary' @@ -148,16 +145,16 @@ def f1_score( The F1 score can be interpreted as a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are - equal. The formula for the F1 score is: F1 = 2 * (precision * recall) / (precision + recall) + equal. The formula for the F1 score is: F1 = 2 * (precision * recall) / (precision + recall). In the multi-class and multi-label case, this is the average of the F1 score of each class with weighting depending on the ``average`` parameter. Args: - y_true: DataFrame of shape (n_samples,) + y_true: Series or DataFrame of shape (n_samples,) Ground truth (correct) target values. - y_pred: DataFrame of shape (n_samples,) + y_pred: Series or DataFrame of shape (n_samples,) Estimated targets as returned by a classifier. average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ default='binary' diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index b7625171b5..693996070f 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -26,15 +26,14 @@ def auc(x, y) -> float: :func:`average_precision_score`. 
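A sketch of the metrics module documented in these files, with `y_true` and `y_pred` standing in for Series of ground-truth and predicted values (illustrative only, not part of the diff):

```
from bigframes.ml import metrics

print(metrics.accuracy_score(y_true, y_pred))
print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.r2_score(y_true, y_pred))
```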
Args: - x : DataFrame of shape (n_samples,) + x (Series or DataFrame of shape (n_samples,)): X coordinates. These must be either monotonic increasing or monotonic decreasing. - y : DataFrame of shape (n_samples,) + y (Series or DataFrame of shape (n_samples,)): Y coordinates. Returns: - auc : float - Area Under the Curve. + float: Area Under the Curve. """ raise NotImplementedError("abstract method") @@ -44,11 +43,11 @@ def roc_auc_score(y_true, y_score) -> float: from prediction scores. Args: - y_true: DataFrame of shape (n_samples,) + y_true (Series or DataFrame of shape (n_samples,)): True labels or binary label indicators. The binary and multiclass cases expect labels with shape (n_samples,) while the multilabel case expects binary label indicators with shape (n_samples, n_classes). - y_score: DataFrame of shape (n_samples,) + y_score (Series or DataFrame of shape (n_samples,)): Target scores. * In the binary case, it corresponds to an array of shape `(n_samples,)`. Both probability estimates and non-thresholded @@ -59,8 +58,7 @@ def roc_auc_score(y_true, y_score) -> float: corresponds to the output of `estimator.decision_function(X, y)`. Returns: - auc: float. - Area Under the Curve score. + float: Area Under the Curve score. """ raise NotImplementedError("abstract method") @@ -73,10 +71,10 @@ def roc_curve( """Compute Receiver operating characteristic (ROC). Args: - y_true: DataFrame of shape (n_samples,) + y_true: Series or DataFrame of shape (n_samples,) True binary labels. If labels are not either {-1, 1} or {0, 1}, then pos_label should be explicitly given. - y_score: DataFrame of shape (n_samples,) + y_score: Series or DataFrame of shape (n_samples,) Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 44ceab48a6..b90c415887 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -41,13 +41,12 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: predictions) respectively. Args: - y_true: DataFrame of shape (n_samples,) + y_true (Series or DataFrame of shape (n_samples,)): Ground truth (correct) target values. - y_pred: DataFrame of shape (n_samples,) + y_pred (Series or DataFrame of shape (n_samples,)): Estimated target values. Returns: - z: float. - The :math:`R^2` score. + float: The :math:`R^2` score. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index 7374edf990..f8bbae86df 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -18,13 +18,13 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta): """Pipeline of transforms with a final estimator. Sequentially apply a list of transforms and a final estimator. - Intermediate steps of the pipeline must be 'transforms', that is, they + Intermediate steps of the pipeline must be `transforms`, that is, they must implement `fit` and `transform` methods. The final estimator only needs to implement `fit`. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. 
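For the regression metric above, a minimal sketch under the same assumptions (active session, `read_pandas` available):

.. code-block::

    import pandas as pd

    import bigframes.pandas as bpd
    from bigframes.ml import metrics

    # Hypothetical ground truth and predictions.
    frame = bpd.read_pandas(
        pd.DataFrame({"y_true": [3.0, -0.5, 2.0, 7.0], "y_pred": [2.5, 0.0, 2.0, 8.0]})
    )

    # R^2 of the predictions against the ground truth.
    r2 = metrics.r2_score(frame["y_true"], frame["y_pred"])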
This simplifies code, and allows deploying an estimator - and preprocessing together, e.g. with `Pipeline.to_gbq(...)`. """ def fit( @@ -38,14 +38,14 @@ def fit( data. Finally, fit the transformed data using the final estimator. Args: - X: - A BigQuery DataFrames representing training data. Must match the + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A DataFrame or Series representing training data. Must match the input requirements of the first step of the pipeline. - y: - A BigQuery DataFrames representing training targets, if applicable. + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + A DataFrame or Series representing training targets, if applicable. Returns: - Pipeline with fitted steps. + Pipeline: Pipeline with fitted steps. """ raise NotImplementedError("abstract method") @@ -58,14 +58,14 @@ def score(self, X, y): `score` method. Only valid if the final estimator implements `score`. Args: - X: - A BigQuery DataFrames as evaluation data. - y: - A BigQuery DataFrames as evaluation labels. - - Returns: - A BigQuery DataFrames representing the result of calling - `score` on the final estimator. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A DataFrame or Series as evaluation data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + A DataFrame or Series as evaluation labels. + + Returns: + DataFrame: A DataFrame representing the result + of calling `score` on the final estimator. """ raise NotImplementedError("abstract method") @@ -74,10 +74,11 @@ def predict(self, X): """Predict the pipeline result for each sample in X. Args: - X: - A BigQuery DataFrames to predict. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A DataFrame or Series to predict. Returns: - A BigQuery DataFrames Dataframe representing predicted result. + bigframes.dataframe.DataFrame: A DataFrame representing + the predicted result. """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index 7a44c4e66d..c57d1f2230 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -32,12 +32,22 @@ class StandardScaler(BaseEstimator): def fit(self, X): """Compute the mean and std to be used for later scaling. + Examples: + + .. code-block:: + + from bigframes.ml.preprocessing import StandardScaler + + enc = StandardScaler() + X = [['Male', 1], ['Female', 3], ['Female', 2]] + enc.fit(X) + Args: - X: - A dataframe with training data. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series with training data. Returns: - Fitted scaler. + StandardScaler: Fitted scaler. """ raise NotImplementedError("abstract method") @@ -45,9 +55,10 @@ def transform(self, X): """Perform standardization by centering and scaling. Args: - X: - The DataFrame to be transformed. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. Returns: - Transformed result.""" + bigframes.dataframe.DataFrame: Transformed result. 
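A hedged sketch of the Pipeline workflow described above, assuming `bigframes.ml.pipeline.Pipeline` accepts scikit-learn-style `(name, step)` tuples and that `X_train`, `y_train`, and `X_new` are existing BigQuery DataFrames (both names are hypothetical):

.. code-block::

    from bigframes.ml.linear_model import LinearRegression
    from bigframes.ml.pipeline import Pipeline
    from bigframes.ml.preprocessing import StandardScaler

    # X_train, y_train and X_new are assumed to be existing BigQuery DataFrames.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("linreg", LinearRegression()),
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_new)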
+ """ raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 6f0565ac13..a6c32d91c1 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -14,18 +14,52 @@ class OneHotEncoder(BaseEstimator): encoding scheme. Note that this method deviates from Scikit-Learn; instead of producing sparse - binary columns, the encoding is a single column of STRUCT + binary columns, the encoding is a single column of `STRUCT`. + + Args: + drop (Optional[Literal["most_frequent"]], default None): + Specifies a methodology to use to drop one of the categories per feature. + This is useful in situations where perfectly collinear features cause problems, + such as when feeding the resulting data into an unregularized linear regression model. + However, dropping one category breaks the symmetry of the original representation + and can therefore induce a bias in downstream models, for instance for penalized + linear classification or regression models. + Default None: retain all the categories. + "most_frequent": Drop the most frequent category found in the string expression. + Selecting this value causes the function to use dummy encoding. + min_frequency (Optional[int], default None): + Specifies the minimum frequency below which a category will be considered infrequent. + Default None. + int: categories with a smaller cardinality will be considered infrequent as index 0. + max_categories (Optional[int], default None): + Specifies an upper limit to the number of output features for each input feature + when considering infrequent categories. If there are infrequent categories, + max_categories includes the category representing the infrequent categories along with the frequent categories. + Default None, set limit to 1,000,000. """ def fit(self, X): """Fit OneHotEncoder to X. + Examples: + + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. + + .. code-block:: + + from bigframes.ml.preprocessing import OneHotEncoder + + enc = OneHotEncoder() + X = [['Male', 1], ['Female', 3], ['Female', 2]] + enc.fit(X) + Args: - X: - A dataframe with training data. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series with training data. Returns: - Fitted encoder. + OneHotEncoder: Fitted encoder. """ raise NotImplementedError("abstract method") @@ -33,9 +67,10 @@ def transform(self, X): """Transform X using one-hot encoding. Args: - X: - The DataFrame to be transformed. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. Returns: - Transformed result.""" + bigframes.dataframe.DataFrame: The result is categorized as index: number, value: number. + Where index is the position of the dict that seeing the category, and value is 0 or 1.""" raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index 268484b2c2..fcb5d2ec59 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -12,11 +12,11 @@ def predict(self, X): """Predict using the XGB model. Args: - X: - DataFrame of shape (n_samples, n_features). Samples. 
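A sketch of the OneHotEncoder flow documented above, mirroring the docstring's own example but starting from a BigQuery DataFrames object (assumes an active session and that `read_pandas` is available):

.. code-block::

    import pandas as pd

    import bigframes.pandas as bpd
    from bigframes.ml.preprocessing import OneHotEncoder

    # Hypothetical categorical data loaded into a BigQuery DataFrame.
    frame = bpd.read_pandas(pd.DataFrame({"gender": ["Male", "Female", "Female"]}))

    enc = OneHotEncoder()
    enc.fit(frame)
    # Each input column becomes a single STRUCT-encoded output column.
    encoded = enc.transform(frame)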
+ X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Samples. Returns: - DataFrame of shape (n_samples,). Returns predicted values. + DataFrame of shape (n_samples,): Returns predicted values. """ raise NotImplementedError("abstract method") @@ -28,15 +28,19 @@ def fit(self, X, y): pass ``xgb_model`` argument. Args: - X: - DataFrame of shape (n_samples, n_features). Training data. + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). Training data. - y: + y (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. + transforms (Optional[List[str]], default None): + Do not use. Internal param to be deprecated. + Use bigframes.ml.pipeline instead. + Returns: - Fitted Estimator. + XGBModel: Fitted Estimator. """ raise NotImplementedError("abstract method") @@ -53,46 +57,47 @@ class XGBRegressor(XGBModel, XGBRegressorBase): XGBoost regression model. Args: - num_parallel_tree: Optional[int] + num_parallel_tree (Optional[int]): Number of parallel trees constructed during each iteration. Default to 1. - booster: Optional[str] + booster (Optional[str]): Specify which booster to use: gbtree or dart. Default to "gbtree". - dart_normalized_type": Optional[str] + dart_normalized_type (Optional[str]): Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE". - tree_method: Optional[str] + tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to - default, XGBoost will choose the most conservative option available. - min_child_weight : Optional[float] + default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx", + "hist". + min_child_weight (Optional[float]): Minimum sum of instance weight(hessian) needed in a child. Default to 1. - colsample_bytree : Optional[float] + colsample_bytree (Optional[float]): Subsample ratio of columns when constructing each tree. Default to 1.0. - colsample_bylevel : Optional[float] + colsample_bylevel (Optional[float]): Subsample ratio of columns for each level. Default to 1.0. - colsample_bynode : Optional[float] + colsample_bynode (Optional[float]): Subsample ratio of columns for each split. Default to 1.0. - gamma : Optional[float] + gamma (Optional[float]): (min_split_loss) Minimum loss reduction required to make a further partition on a leaf node of the tree. Default to 0.0. - max_depth : Optional[int] + max_depth (Optional[int]): Maximum tree depth for base learners. Default to 6. - subsample : Optional[float] + subsample (Optional[float]): Subsample ratio of the training instance. Default to 1.0. - reg_alpha : Optional[float] + reg_alpha (Optional[float]): L1 regularization term on weights (xgb's alpha). Default to 0.0. - reg_lambda : Optional[float] + reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop: Optional[bool] + early_stop (Optional[bool]): Whether training should stop after the first iteration. Default to True. - learning_rate: Optional[float] + learning_rate (Optional[float]): Boosting learning rate (xgb's "eta"). Default to 0.3. - max_iterations: Optional[int] + max_iterations (Optional[int]): Maximum number of rounds for boosting. Default to 20. 
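For the XGBoost-style estimators, a hedged sketch using only parameters documented above; `bigframes.ml.ensemble.XGBRegressor` is assumed to expose them as constructor arguments, and `X_train`, `y_train`, `X_new` are hypothetical existing BigQuery DataFrames:

.. code-block::

    from bigframes.ml.ensemble import XGBRegressor

    # X_train, y_train and X_new are assumed to be existing BigQuery DataFrames.
    model = XGBRegressor(booster="gbtree", max_iterations=20, learning_rate=0.3)
    model.fit(X_train, y_train)
    predictions = model.predict(X_new)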
- min_rel_progress: Optional[float] + min_rel_progress (Optional[float]): Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. - enable_global_explain: Optional[bool] + enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. - xgboost_version: Optional[str] - Specifies the Xgboost version for model training. Default to "0.9". + xgboost_version (Optional[str]): + Specifies the Xgboost version for model training. Default to "0.9". Possible values: "0.9", "1.1". """ @@ -101,45 +106,45 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): XGBoost classifier model. Args: - num_parallel_tree: Optional[int] + num_parallel_tree (Optional[int]): Number of parallel trees constructed during each iteration. Default to 1. - booster: Optional[str] + booster (Optional[str]): Specify which booster to use: gbtree or dart. Default to "gbtree". - dart_normalized_type": Optional[str] + dart_normalized_type (Optional[str]): Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE". - tree_method: Optional[str] + tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx", "hist". - min_child_weight : Optional[float] + min_child_weight (Optional[float]): Minimum sum of instance weight(hessian) needed in a child. Default to 1. - colsample_bytree : Optional[float] + colsample_bytree (Optional[float]): Subsample ratio of columns when constructing each tree. Default to 1.0. - colsample_bylevel : Optional[float] + colsample_bylevel (Optional[float]): Subsample ratio of columns for each level. Default to 1.0. - colsample_bynode : Optional[float] + colsample_bynode (Optional[float]): Subsample ratio of columns for each split. Default to 1.0. - gamma : Optional[float] + gamma (Optional[float]): (min_split_loss) Minimum loss reduction required to make a further partition on a leaf node of the tree. Default to 0.0. - max_depth : Optional[int] + max_depth (Optional[int]): Maximum tree depth for base learners. Default to 6. - subsample : Optional[float] + subsample (Optional[float]): Subsample ratio of the training instance. Default to 1.0. - reg_alpha : Optional[float] + reg_alpha (Optional[float]): L1 regularization term on weights (xgb's alpha). Default to 0.0. - reg_lambda : Optional[float] + reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop: Optional[bool] + early_stop (Optional[bool]): Whether training should stop after the first iteration. Default to True. - learning_rate: Optional[float] + learning_rate (Optional[float]): Boosting learning rate (xgb's "eta"). Default to 0.3. - max_iterations: Optional[int] + max_iterations (Optional[int]): Maximum number of rounds for boosting. Default to 20. - min_rel_progress: Optional[float] + min_rel_progress (Optional[float]): Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. - enable_global_explain: Optional[bool] + enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. 
- xgboost_version: Optional[str] + xgboost_version (Optional[str]): Specifies the Xgboost version for model training. Default to "0.9". Possible values: "0.9", "1.1". """
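Analogously for the classifier, a sketch using only the parameters documented above; the training and prediction frames are again hypothetical BigQuery DataFrames:

.. code-block::

    from bigframes.ml.ensemble import XGBClassifier

    # X_train, y_train and X_new are assumed to be existing BigQuery DataFrames.
    classifier = XGBClassifier(max_depth=6, early_stop=True, min_rel_progress=0.01)
    classifier.fit(X_train, y_train)
    labels = classifier.predict(X_new)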