Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add XGBoostModel #363

Merged
merged 8 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ Create estimators for imported models by using the `bigframes.ml.imported module
to import Open Neural Network Exchange (ONNX) models.
* Use the `TensorFlowModel class <https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.imported.TensorFlowModel>`_
to import TensorFlow models.
* Use the `XGBoostModel class <https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.imported.XGBoostModel>`_
to import XGBoost models.

**Linear models**

Expand Down
30 changes: 30 additions & 0 deletions bigframes/ml/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,33 @@ def create_imported_model(
)

return self._create_model_with_sql(session=session, sql=sql)

def create_xgboost_imported_model(
    self,
    session: bigframes.Session,
    input: Mapping[str, str] = {},
    output: Mapping[str, str] = {},
    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
) -> BqmlModel:
    """Create an imported XGBoost BQML model in the session's anonymous dataset.

    The model is created by running a CREATE OR REPLACE MODEL statement
    produced by the model-creation SQL generator.

    Args:
        session:
            session whose anonymous dataset receives the model and which
            is used to run the statement
        input:
            input schema for imported xgboost models
        output:
            output schema for imported xgboost models
        options:
            a dict of options to configure the model. Generates a BQML
            OPTIONS clause

    Returns: a BqmlModel, wrapping the created model in BigQuery
    """
    # The model reference is allocated first, inside the anonymous dataset.
    target_ref = self._create_model_ref(session._anonymous_dataset)

    creation_sql = self._model_creation_sql_generator.create_xgboost_imported_model(
        model_ref=target_ref,
        input=input,
        output=output,
        options=options,
    )

    return self._create_model_with_sql(session=session, sql=creation_sql)
11 changes: 11 additions & 0 deletions bigframes/ml/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@
_BASE_SQL_GENERATOR = sql.BaseSqlGenerator()
_BQML_MODEL_FACTORY = core.BqmlModelFactory()

# Field type names accepted when validating schema mappings. XGBoostModel in
# bigframes.ml.imported rejects any input/output field_type not listed here;
# bigframes.ml.remote imports this tuple as well.
_SUPPORTED_DTYPES = (
"bool",
"string",
"int64",
"float64",
"array<bool>",
"array<string>",
"array<int64>",
"array<float64>",
)


def base_sql_generator() -> sql.BaseSqlGenerator:
"""Base SQL Generator."""
Expand Down
117 changes: 116 additions & 1 deletion bigframes/ml/imported.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@

from __future__ import annotations

from typing import cast, Optional, Union
from typing import cast, Mapping, Optional, Union

from google.cloud import bigquery

import bigframes
from bigframes.core import log_adapter
from bigframes.ml import base, core, globals, utils
from bigframes.ml.globals import _SUPPORTED_DTYPES
import bigframes.pandas as bpd


Expand Down Expand Up @@ -176,3 +177,117 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel:

new_model = self._bqml_model.copy(model_name, replace)
return new_model.session.read_gbq_model(model_name)


@log_adapter.class_logger
class XGBoostModel(base.Predictor):
    """Imported XGBoost model.

    .. warning::

        Imported XGBoost models have several limitations. See:
        https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations

    Args:
        session (BigQuery Session):
            BQ session to create the model. If None, the global default
            session is used.
        input (Dict, default {}):
            Specify the model input schema information when you
            create the XGBoost model. The input should be the format of
            {field_name: field_type}. Input is optional only if feature_names
            and feature_types are both specified in the model file. Supported types
            are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
        output (Dict, default {}):
            Specify the model output schema information when you
            create the XGBoost model. The output should be the format of
            {field_name: field_type}. Output is optional only if feature_names
            and feature_types are both specified in the model file. Supported types
            are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
        model_path (str):
            Cloud Storage path that holds the model files."""

    def __init__(
        self,
        session: Optional[bigframes.Session] = None,
        input: Mapping[str, str] = {},
        output: Mapping[str, str] = {},
        model_path: Optional[str] = None,
    ):
        self.session = session or bpd.get_global_session()
        self.model_path = model_path
        # Store copies so that instances never alias the shared default
        # mapping (or a caller-owned mapping that may be mutated later).
        self.input = dict(input)
        self.output = dict(output)
        self._bqml_model: Optional[core.BqmlModel] = None
        self._bqml_model_factory = globals.bqml_model_factory()

    def _create_bqml_model(self):
        """Create the BQML model from ``model_path`` and the optional schemas.

        Raises:
            ValueError: if any input/output field type is not in
                ``_SUPPORTED_DTYPES``.
        """
        options = {"model_type": "XGBOOST", "model_path": self.model_path}

        # Without any schema, the generic imported-model statement is used.
        if not self.input and not self.output:
            return self._bqml_model_factory.create_imported_model(
                session=self.session, options=options
            )

        for schema in (self.input, self.output):
            for field_type in schema.values():
                if field_type not in _SUPPORTED_DTYPES:
                    raise ValueError(
                        f"field_type {field_type} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}."
                    )

        return self._bqml_model_factory.create_xgboost_imported_model(
            session=self.session,
            input=self.input,
            output=self.output,
            options=options,
        )

    def _get_or_create_bqml_model(self) -> core.BqmlModel:
        """Return the underlying BQML model, creating it on first use.

        Raises:
            ValueError: if the model has not been created yet and no
                ``model_path`` was provided.
        """
        if not self._bqml_model:
            if self.model_path is None:
                raise ValueError("Model GCS path must be provided.")
            self._bqml_model = self._create_bqml_model()
        self._bqml_model = cast(core.BqmlModel, self._bqml_model)
        return self._bqml_model

    @classmethod
    def _from_bq(
        cls, session: bigframes.Session, model: bigquery.Model
    ) -> XGBoostModel:
        """Wrap an existing BigQuery model of type XGBOOST."""
        assert model.model_type == "XGBOOST"

        xgboost_model = cls(session=session, model_path=None)
        xgboost_model._bqml_model = core.BqmlModel(session, model)
        return xgboost_model

    def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
        """Predict the result from input DataFrame.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                Input DataFrame or Series, schema is defined by the model.

        Returns:
            bigframes.dataframe.DataFrame: Output DataFrame, schema is defined by the model.

        Raises:
            ValueError: if the model has to be created and ``model_path`` is None."""

        bqml_model = self._get_or_create_bqml_model()

        (X,) = utils.convert_to_dataframe(X)

        return bqml_model.predict(X)

    def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel:
        """Save the model to BigQuery.

        Args:
            model_name (str):
                the name of the model.
            replace (bool, default False):
                whether to replace if the model already exists. Default to False.

        Returns:
            XGBoostModel: saved model.

        Raises:
            ValueError: if the model has to be created and ``model_path`` is None."""
        bqml_model = self._get_or_create_bqml_model()

        new_model = bqml_model.copy(model_name, replace)
        return new_model.session.read_gbq_model(model_name)
2 changes: 2 additions & 0 deletions bigframes/ml/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"RANDOM_FOREST_CLASSIFIER": ensemble.RandomForestClassifier,
"TENSORFLOW": imported.TensorFlowModel,
"ONNX": imported.ONNXModel,
"XGBOOST": imported.XGBoostModel,
}
)

Expand Down Expand Up @@ -72,6 +73,7 @@ def from_bq(
ensemble.RandomForestClassifier,
imported.TensorFlowModel,
imported.ONNXModel,
imported.XGBoostModel,
llm.PaLM2TextGenerator,
llm.PaLM2TextEmbeddingGenerator,
pipeline.Pipeline,
Expand Down
12 changes: 1 addition & 11 deletions bigframes/ml/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,9 @@
from bigframes import clients
from bigframes.core import log_adapter
from bigframes.ml import base, core, globals, utils
from bigframes.ml.globals import _SUPPORTED_DTYPES
import bigframes.pandas as bpd

# NOTE(review): this tuple has the same contents as _SUPPORTED_DTYPES in
# bigframes.ml.globals, which is imported just above; rebinding the name here
# would shadow that import. The diff header for this file reports 11
# deletions, so these lines look like the removed side of the hunk - confirm.
_SUPPORTED_DTYPES = (
"bool",
"string",
"int64",
"float64",
"array<bool>",
"array<string>",
"array<int64>",
"array<float64>",
)

_REMOTE_MODEL_STATUS = "remote_model_status"


Expand Down
18 changes: 18 additions & 0 deletions bigframes/ml/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,24 @@ def create_imported_model(
parts.append(self.options(**options))
return "\n".join(parts)

def create_xgboost_imported_model(
    self,
    model_ref: google.cloud.bigquery.ModelReference,
    input: Mapping[str, str] = {},
    output: Mapping[str, str] = {},
    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
) -> str:
    """Encode the CREATE OR REPLACE MODEL statement for an imported XGBoost model.

    The INPUT, OUTPUT and OPTIONS clauses are each emitted only when the
    corresponding mapping is non-empty; clauses are joined with newlines.
    """
    header = f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"

    # Each optional clause contributes either one line or nothing.
    input_clause = [self.input(**input)] if input else []
    output_clause = [self.output(**output)] if output else []
    options_clause = [self.options(**options)] if options else []

    return "\n".join([header, *input_clause, *output_clause, *options_clause])


class ModelManipulationSqlGenerator(BaseSqlGenerator):
"""Sql generator for manipulating a model entity. Model name is the full model path of project_id.dataset_id.model_id."""
Expand Down
2 changes: 2 additions & 0 deletions docs/templates/toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@
uid: bigframes.ml.imported.ONNXModel
- name: TensorFlowModel
uid: bigframes.ml.imported.TensorFlowModel
- name: XGBoostModel
uid: bigframes.ml.imported.XGBoostModel
name: imported
- items:
- name: Overview
Expand Down
40 changes: 40 additions & 0 deletions tests/system/small/ml/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,29 @@ def onnx_iris_pandas_df():
)


@pytest.fixture(scope="session")
def xgboost_iris_pandas_df():
    """Three rows of iris-style measurements as a pandas DataFrame."""
    measurements = {
        "sepal_length": [4.9, 5.1, 34.7],
        "sepal_width": [3.0, 5.1, 24.7],
        "petal_length": [1.4, 1.5, 13.3],
        "petal_width": [0.4, 0.2, 18.3],
    }
    return pd.DataFrame(measurements)


@pytest.fixture(scope="session")
def onnx_iris_df(session, onnx_iris_pandas_df):
    """Load the ONNX iris pandas data through the session's read_pandas."""
    loaded = session.read_pandas(onnx_iris_pandas_df)
    return loaded


@pytest.fixture(scope="session")
def xgboost_iris_df(session, xgboost_iris_pandas_df):
    """Load the XGBoost iris pandas data through the session's read_pandas."""
    loaded = session.read_pandas(xgboost_iris_pandas_df)
    return loaded


@pytest.fixture(scope="session")
def llm_text_df(session, llm_text_pandas_df):
    """Load the LLM text pandas data through the session's read_pandas."""
    loaded = session.read_pandas(llm_text_pandas_df)
    return loaded
Expand Down Expand Up @@ -322,6 +340,11 @@ def imported_onnx_model_path() -> str:
return "gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx"


@pytest.fixture(scope="session")
def imported_xgboost_array_model_path() -> str:
    """Cloud Storage URI of the XGBoost model file used by the tests."""
    model_uri = "gs://bigframes-dev-testing/xgboost-testdata/model.bst"
    return model_uri


@pytest.fixture(scope="session")
def imported_tensorflow_model(
session, imported_tensorflow_model_path
Expand All @@ -346,3 +369,20 @@ def imported_onnx_model(session, imported_onnx_model_path) -> imported.ONNXModel
session=session,
model_path=imported_onnx_model_path,
)


@pytest.fixture(scope="session")
def imported_xgboost_model(
    session, imported_xgboost_array_model_path
) -> imported.XGBoostModel:
    """XGBoostModel built with an explicit input and output schema."""
    input_schema = {
        "petal_length": "float64",
        "petal_width": "float64",
        "sepal_length": "float64",
        "sepal_width": "float64",
    }
    output_schema = {"predicted_label": "float64"}
    return imported.XGBoostModel(
        session=session,
        input=input_schema,
        output=output_schema,
        model_path=imported_xgboost_array_model_path,
    )
42 changes: 41 additions & 1 deletion tests/system/small/ml/test_imported.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_onnx_create_model(imported_onnx_model):


def test_onnx_create_model_default_session(imported_onnx_model_path):
    """An ONNXModel can be constructed without passing an explicit session."""
    # The ONNX path must be paired with ONNXModel; the earlier TensorFlowModel
    # construction was a dead store using the wrong class and is dropped.
    model = imported.ONNXModel(model_path=imported_onnx_model_path)
    assert model is not None


Expand Down Expand Up @@ -100,3 +100,43 @@ def test_onnx_model_to_gbq(imported_onnx_model: imported.ONNXModel, dataset_id:
imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model", replace=True)
with pytest.raises(google.api_core.exceptions.Conflict):
imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model")


def test_xgboost_create_model(imported_xgboost_model):
    """Building the fixture model must succeed and yield an object."""
    created = imported_xgboost_model
    assert created is not None


def test_xgboost_create_model_default_session(imported_xgboost_array_model_path):
    """An XGBoostModel can be constructed without passing an explicit session."""
    default_session_model = imported.XGBoostModel(
        model_path=imported_xgboost_array_model_path
    )
    assert default_session_model is not None


def test_xgboost_model_predict(imported_xgboost_model, xgboost_iris_df):
    """Predictions have the expected shape and approximately the expected labels."""
    predictions = imported_xgboost_model.predict(xgboost_iris_df).to_pandas()
    assert predictions.shape == (3, 5)

    # One array of values per input row.
    expected_labels = [
        np.array([0.00362173, 0.01580198, 0.98057634]),
        np.array([0.00349651, 0.00999565, 0.98650789]),
        np.array([0.00561748, 0.0108124, 0.98357016]),
    ]
    expected = pd.DataFrame(
        {"predicted_label": expected_labels},
        index=pd.Index([0, 1, 2], dtype="Int64"),
    )

    actual = predictions[["predicted_label"]]
    pd.testing.assert_frame_equal(
        actual,
        expected,
        check_exact=False,
        check_dtype=False,
        atol=0.1,
    )


def test_xgboost_model_to_gbq(
    imported_xgboost_model: imported.XGBoostModel, dataset_id: str
):
    """Saving with replace=True succeeds; saving again without it conflicts."""
    target_name = f"{dataset_id}.test_xgboost_model"
    imported_xgboost_model.to_gbq(target_name, replace=True)
    # The model now exists, so a second save without replace must fail.
    with pytest.raises(google.api_core.exceptions.Conflict):
        imported_xgboost_model.to_gbq(target_name)
Loading