Skip to content

Commit

Permalink
feat: add XGBoostModel (#363)
Browse files Browse the repository at this point in the history
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes internal #321809936 🦕
  • Loading branch information
ashleyxuu authored Feb 7, 2024
1 parent 443db22 commit d5518b2
Show file tree
Hide file tree
Showing 11 changed files with 287 additions and 13 deletions.
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ Create estimators for imported models by using the `bigframes.ml.imported module
to import Open Neural Network Exchange (ONNX) models.
* Use the `TensorFlowModel class <https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.imported.TensorFlowModel>`_
to import TensorFlow models.
* Use the `XGBoostModel class <https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.imported.XGBoostModel>`_
to import XGBoost models.

**Linear models**

Expand Down
30 changes: 30 additions & 0 deletions bigframes/ml/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,33 @@ def create_imported_model(
)

return self._create_model_with_sql(session=session, sql=sql)

def create_xgboost_imported_model(
    self,
    session: bigframes.Session,
    input: Mapping[str, str] = {},
    output: Mapping[str, str] = {},
    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
) -> BqmlModel:
    """Create a session-temporary imported XGBoost model in BQML.

    The model is created with a CREATE OR REPLACE MODEL statement whose
    target is a model reference built from the session's anonymous dataset.

    Args:
        session:
            session whose anonymous dataset hosts the model reference and
            which is handed to the model-creation step
        input:
            input schema for imported xgboost models
        output:
            output schema for imported xgboost models
        options:
            a dict of options to configure the model. Generates a BQML
            OPTIONS clause

    Returns:
        a BqmlModel, wrapping a trained model in BigQuery
    """
    target_ref = self._create_model_ref(session._anonymous_dataset)
    sql_generator = self._model_creation_sql_generator

    creation_sql = sql_generator.create_xgboost_imported_model(
        model_ref=target_ref,
        input=input,
        output=output,
        options=options,
    )
    return self._create_model_with_sql(session=session, sql=creation_sql)
11 changes: 11 additions & 0 deletions bigframes/ml/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@
_BASE_SQL_GENERATOR = sql.BaseSqlGenerator()
_BQML_MODEL_FACTORY = core.BqmlModelFactory()

_SUPPORTED_DTYPES = (
"bool",
"string",
"int64",
"float64",
"array<bool>",
"array<string>",
"array<int64>",
"array<float64>",
)


def base_sql_generator() -> sql.BaseSqlGenerator:
"""Base SQL Generator."""
Expand Down
117 changes: 116 additions & 1 deletion bigframes/ml/imported.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@

from __future__ import annotations

from typing import cast, Optional, Union
from typing import cast, Mapping, Optional, Union

from google.cloud import bigquery

import bigframes
from bigframes.core import log_adapter
from bigframes.ml import base, core, globals, utils
from bigframes.ml.globals import _SUPPORTED_DTYPES
import bigframes.pandas as bpd


Expand Down Expand Up @@ -176,3 +177,117 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel:

new_model = self._bqml_model.copy(model_name, replace)
return new_model.session.read_gbq_model(model_name)


@log_adapter.class_logger
class XGBoostModel(base.Predictor):
    """Imported XGBoost model.

    .. warning::

        Imported XGBoost models have several limitations. See:
        https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations

    Args:
        session (BigQuery Session):
            BQ session to create the model. When omitted, the global session
            is used.
        input (Dict, default {}):
            Specify the model input schema information when you
            create the XGBoost model. The input should be the format of
            {field_name: field_type}. Input is optional only if feature_names
            and feature_types are both specified in the model file. Supported types
            are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
        output (Dict, default {}):
            Specify the model output schema information when you
            create the XGBoost model. The output should be the format of
            {field_name: field_type}. Output is optional only if feature_names
            and feature_types are both specified in the model file. Supported types
            are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
        model_path (str):
            Cloud Storage path that holds the model files."""

    def __init__(
        self,
        session: Optional[bigframes.Session] = None,
        input: Mapping[str, str] = {},
        output: Mapping[str, str] = {},
        model_path: Optional[str] = None,
    ):
        self.session = session or bpd.get_global_session()
        self.model_path = model_path
        # Store private copies: the `{}` defaults in the signature are shared
        # between calls, so keeping a reference would let a mutation of one
        # instance's schema leak into every other default-constructed instance.
        # An explicit None is treated the same as an empty schema.
        self.input = dict(input) if input else {}
        self.output = dict(output) if output else {}
        # The BQML model is created lazily, on the first predict()/to_gbq().
        self._bqml_model: Optional[core.BqmlModel] = None
        self._bqml_model_factory = globals.bqml_model_factory()

    def _create_bqml_model(self):
        """Create the BQML model, validating any user-supplied schema first.

        Raises:
            ValueError: if a field type in ``input`` or ``output`` is not one
                of the supported dtypes.
        """
        options = {"model_type": "XGBOOST", "model_path": self.model_path}

        # No schema given at all: use the generic imported-model statement.
        if not self.input and not self.output:
            return self._bqml_model_factory.create_imported_model(
                session=self.session, options=options
            )

        for schema in (self.input, self.output):
            for field_type in schema.values():
                if field_type not in _SUPPORTED_DTYPES:
                    raise ValueError(
                        f"field_type {field_type} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}."
                    )

        return self._bqml_model_factory.create_xgboost_imported_model(
            session=self.session,
            input=self.input,
            output=self.output,
            options=options,
        )

    def _ensure_bqml_model(self) -> core.BqmlModel:
        """Return the underlying BQML model, creating it on first use.

        Raises:
            ValueError: if the model has not been created yet and no
                ``model_path`` was provided.
        """
        if not self._bqml_model:
            if self.model_path is None:
                raise ValueError("Model GCS path must be provided.")
            self._bqml_model = self._create_bqml_model()
        return cast(core.BqmlModel, self._bqml_model)

    @classmethod
    def _from_bq(
        cls, session: bigframes.Session, model: bigquery.Model
    ) -> XGBoostModel:
        """Wrap an existing BigQuery XGBOOST model in an XGBoostModel."""
        assert model.model_type == "XGBOOST"

        # No model_path is needed: the BQML model already exists and is
        # attached directly, so it is never re-created from Cloud Storage.
        xgboost_model = cls(session=session, model_path=None)
        xgboost_model._bqml_model = core.BqmlModel(session, model)
        return xgboost_model

    def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
        """Predict the result from input DataFrame.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                Input DataFrame or Series, schema is defined by the model.

        Returns:
            bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.

        Raises:
            ValueError: if the model has to be created but ``model_path`` is
                missing, or the declared schema uses an unsupported type."""
        bqml_model = self._ensure_bqml_model()

        (X,) = utils.convert_to_dataframe(X)

        return bqml_model.predict(X)

    def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel:
        """Save the model to BigQuery.

        Args:
            model_name (str):
                the name of the model.
            replace (bool, default False):
                whether to replace if the model already exists. Default to False.

        Returns:
            XGBoostModel: saved model.

        Raises:
            ValueError: if the model has to be created but ``model_path`` is
                missing, or the declared schema uses an unsupported type."""
        bqml_model = self._ensure_bqml_model()

        new_model = bqml_model.copy(model_name, replace)
        return new_model.session.read_gbq_model(model_name)
2 changes: 2 additions & 0 deletions bigframes/ml/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"RANDOM_FOREST_CLASSIFIER": ensemble.RandomForestClassifier,
"TENSORFLOW": imported.TensorFlowModel,
"ONNX": imported.ONNXModel,
"XGBOOST": imported.XGBoostModel,
}
)

Expand Down Expand Up @@ -72,6 +73,7 @@ def from_bq(
ensemble.RandomForestClassifier,
imported.TensorFlowModel,
imported.ONNXModel,
imported.XGBoostModel,
llm.PaLM2TextGenerator,
llm.PaLM2TextEmbeddingGenerator,
pipeline.Pipeline,
Expand Down
12 changes: 1 addition & 11 deletions bigframes/ml/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,9 @@
from bigframes import clients
from bigframes.core import log_adapter
from bigframes.ml import base, core, globals, utils
from bigframes.ml.globals import _SUPPORTED_DTYPES
import bigframes.pandas as bpd

_SUPPORTED_DTYPES = (
"bool",
"string",
"int64",
"float64",
"array<bool>",
"array<string>",
"array<int64>",
"array<float64>",
)

_REMOTE_MODEL_STATUS = "remote_model_status"


Expand Down
18 changes: 18 additions & 0 deletions bigframes/ml/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,24 @@ def create_imported_model(
parts.append(self.options(**options))
return "\n".join(parts)

def create_xgboost_imported_model(
    self,
    model_ref: google.cloud.bigquery.ModelReference,
    input: Mapping[str, str] = {},
    output: Mapping[str, str] = {},
    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
) -> str:
    """Encode the CREATE OR REPLACE MODEL statement for a BQML imported XGBoost model.

    The statement header is followed by one line per non-empty argument, in
    the fixed order input, output, options; each line is rendered by the
    generator method of the same name. Empty mappings are skipped.
    """
    statement = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"]

    clause_specs = (
        (self.input, input),
        (self.output, output),
        (self.options, options),
    )
    for render_clause, clause_kwargs in clause_specs:
        if clause_kwargs:
            statement.append(render_clause(**clause_kwargs))

    return "\n".join(statement)


class ModelManipulationSqlGenerator(BaseSqlGenerator):
"""Sql generator for manipulating a model entity. Model name is the full model path of project_id.dataset_id.model_id."""
Expand Down
2 changes: 2 additions & 0 deletions docs/templates/toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@
uid: bigframes.ml.imported.ONNXModel
- name: TensorFlowModel
uid: bigframes.ml.imported.TensorFlowModel
- name: XGBoostModel
uid: bigframes.ml.imported.XGBoostModel
name: imported
- items:
- name: Overview
Expand Down
40 changes: 40 additions & 0 deletions tests/system/small/ml/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,29 @@ def onnx_iris_pandas_df():
)


@pytest.fixture(scope="session")
def xgboost_iris_pandas_df():
    """Data matching the iris dataset."""
    # Three sample rows, stored column-wise.
    columns = {
        "sepal_length": [4.9, 5.1, 34.7],
        "sepal_width": [3.0, 5.1, 24.7],
        "petal_length": [1.4, 1.5, 13.3],
        "petal_width": [0.4, 0.2, 18.3],
    }
    return pd.DataFrame(columns)


@pytest.fixture(scope="session")
def onnx_iris_df(session, onnx_iris_pandas_df):
    """Load the ONNX iris pandas sample into the test session."""
    loaded = session.read_pandas(onnx_iris_pandas_df)
    return loaded


@pytest.fixture(scope="session")
def xgboost_iris_df(session, xgboost_iris_pandas_df):
    """Load the XGBoost iris pandas sample into the test session."""
    loaded = session.read_pandas(xgboost_iris_pandas_df)
    return loaded


@pytest.fixture(scope="session")
def llm_text_df(session, llm_text_pandas_df):
    """Load the LLM text pandas sample into the test session."""
    loaded = session.read_pandas(llm_text_pandas_df)
    return loaded
Expand Down Expand Up @@ -322,6 +340,11 @@ def imported_onnx_model_path() -> str:
return "gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx"


@pytest.fixture(scope="session")
def imported_xgboost_array_model_path() -> str:
    """Cloud Storage location of the XGBoost model file used by the tests."""
    model_uri = "gs://bigframes-dev-testing/xgboost-testdata/model.bst"
    return model_uri


@pytest.fixture(scope="session")
def imported_tensorflow_model(
session, imported_tensorflow_model_path
Expand All @@ -346,3 +369,20 @@ def imported_onnx_model(session, imported_onnx_model_path) -> imported.ONNXModel
session=session,
model_path=imported_onnx_model_path,
)


@pytest.fixture(scope="session")
def imported_xgboost_model(
    session, imported_xgboost_array_model_path
) -> imported.XGBoostModel:
    """XGBoostModel over the test model file with an explicit float64 schema."""
    # Every feature is declared as float64; dict.fromkeys keeps this key order.
    feature_schema = dict.fromkeys(
        ("petal_length", "petal_width", "sepal_length", "sepal_width"),
        "float64",
    )
    label_schema = {"predicted_label": "float64"}

    return imported.XGBoostModel(
        session=session,
        input=feature_schema,
        output=label_schema,
        model_path=imported_xgboost_array_model_path,
    )
42 changes: 41 additions & 1 deletion tests/system/small/ml/test_imported.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_onnx_create_model(imported_onnx_model):


def test_onnx_create_model_default_session(imported_onnx_model_path):
    """Constructing an ONNXModel without an explicit session must succeed."""
    # Build the ONNX wrapper only: a TensorFlowModel over an ONNX path would
    # exercise the wrong class and be discarded immediately anyway.
    model = imported.ONNXModel(model_path=imported_onnx_model_path)
    assert model is not None


Expand Down Expand Up @@ -100,3 +100,43 @@ def test_onnx_model_to_gbq(imported_onnx_model: imported.ONNXModel, dataset_id:
imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model", replace=True)
with pytest.raises(google.api_core.exceptions.Conflict):
imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model")


def test_xgboost_create_model(imported_xgboost_model):
    """The imported_xgboost_model fixture yields a model object without raising."""
    assert imported_xgboost_model is not None


def test_xgboost_create_model_default_session(imported_xgboost_array_model_path):
    """Constructing an XGBoostModel without an explicit session must succeed."""
    default_session_model = imported.XGBoostModel(
        model_path=imported_xgboost_array_model_path
    )
    assert default_session_model is not None


def test_xgboost_model_predict(imported_xgboost_model, xgboost_iris_df):
    """Predictions on the iris sample match the expected per-row arrays."""
    actual = imported_xgboost_model.predict(xgboost_iris_df).to_pandas()
    assert actual.shape == (3, 5)

    # One expected array per input row, in row order.
    expected_labels = [
        np.array([0.00362173, 0.01580198, 0.98057634]),
        np.array([0.00349651, 0.00999565, 0.98650789]),
        np.array([0.00561748, 0.0108124, 0.98357016]),
    ]
    expected = pd.DataFrame(
        {"predicted_label": expected_labels},
        index=pd.Index([0, 1, 2], dtype="Int64"),
    )

    # Compare only the label column, loosely: approximate values, any dtype.
    pd.testing.assert_frame_equal(
        actual[["predicted_label"]],
        expected,
        check_exact=False,
        check_dtype=False,
        atol=0.1,
    )


def test_xgboost_model_to_gbq(
    imported_xgboost_model: imported.XGBoostModel, dataset_id: str
):
    """Saving with replace=True succeeds; saving again without it conflicts."""
    saved_name = f"{dataset_id}.test_xgboost_model"

    imported_xgboost_model.to_gbq(saved_name, replace=True)

    # The model now exists, so a second save without replace must be rejected.
    with pytest.raises(google.api_core.exceptions.Conflict):
        imported_xgboost_model.to_gbq(saved_name)
Loading

0 comments on commit d5518b2

Please sign in to comment.