Skip to content

Commit

Permalink
feat!: rename ml model params (#491)
Browse files Browse the repository at this point in the history
Includes the following changes:
* renaming min_rel_progress -> tol, to be consistent with sklearn
* early_stop can no longer be set by callers; it is always set to True
* renaming num_parallel_tree -> n_estimators, to be consistent with sklearn
* renaming class_weights -> class_weight, to be consistent with sklearn
* renaming learn_rate -> learning_rate, to be consistent with sklearn
* PCA n_components now also accepts a float (a value in (0, 1) is sent as the explained-variance ratio) or None; the default is now None
  • Loading branch information
GarrettWu authored Mar 23, 2024
1 parent ae586e0 commit 65c6f47
Show file tree
Hide file tree
Showing 13 changed files with 205 additions and 195 deletions.
1 change: 0 additions & 1 deletion bigframes/ml/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
"init_col": "kmeansInitializationColumn",
"distance_type": "distanceType",
"max_iter": "maxIterations",
"early_stop": "earlyStop",
"tol": "minRelativeProgress",
}

Expand Down
29 changes: 23 additions & 6 deletions bigframes/ml/decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class PCA(

def __init__(
self,
n_components: int = 3,
n_components: Optional[Union[int, float]] = None,
*,
svd_solver: Literal["full", "randomized", "auto"] = "auto",
):
Expand All @@ -56,13 +56,31 @@ def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA:
last_fitting = model.training_runs[-1]["trainingOptions"]
if "numPrincipalComponents" in last_fitting:
kwargs["n_components"] = int(last_fitting["numPrincipalComponents"])
if "pcaExplainedVarianceRatio" in last_fitting:
kwargs["n_components"] = float(last_fitting["pcaExplainedVarianceRatio"])
if "pcaSolver" in last_fitting:
kwargs["svd_solver"] = str(last_fitting["pcaSolver"])

new_pca = cls(**kwargs)
new_pca._bqml_model = core.BqmlModel(session, model)
return new_pca

@property
def _bqml_options(self) -> dict:
    """Build the option mapping handed to BQML when the PCA model is created.

    The mapping always carries ``model_type`` and ``pca_solver``. Exactly one
    component-selection option is added on top, chosen by the value of
    ``n_components``:

    * strictly between 0 and 1: sent as ``pca_explained_variance_ratio``
      (as a float);
    * 1 or greater: sent as ``num_principal_components`` (as an int).

    Any other value (zero or negative) adds no component-selection option.

    Returns:
        dict: the BQML training options for this estimator.

    Raises:
        AssertionError: if ``n_components`` is still ``None`` when the
            property is read.
    """
    bqml_options: dict = {"model_type": "PCA", "pca_solver": self.svd_solver}

    n_components = self.n_components
    # The value must already be resolved to a number at this point.
    assert n_components is not None

    if n_components >= 1:
        # Whole-number request: an explicit count of principal components.
        bqml_options["num_principal_components"] = int(n_components)
    elif 0 < n_components:
        # Fraction in (0, 1): share of variance the components must explain.
        bqml_options["pca_explained_variance_ratio"] = float(n_components)

    return bqml_options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand All @@ -71,14 +89,13 @@ def _fit(
) -> PCA:
(X,) = utils.convert_to_dataframe(X)

# To mimic sklearn's behavior
if self.n_components is None:
self.n_components = min(X.shape)
self._bqml_model = self._bqml_model_factory.create_model(
X_train=X,
transforms=transforms,
options={
"model_type": "PCA",
"num_principal_components": self.n_components,
"pca_solver": self.svd_solver,
},
options=self._bqml_options,
)
return self

Expand Down
69 changes: 30 additions & 39 deletions bigframes/ml/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
_BQML_PARAMS_MAPPING = {
"booster": "boosterType",
"tree_method": "treeMethod",
"early_stop": "earlyStop",
"colsample_bytree": "colsampleBylevel",
"colsample_bylevel": "colsampleBytree",
"colsample_bynode": "colsampleBynode",
Expand All @@ -40,8 +39,8 @@
"reg_alpha": "l1Regularization",
"reg_lambda": "l2Regularization",
"learning_rate": "learnRate",
"min_rel_progress": "minRelativeProgress",
"num_parallel_tree": "numParallelTree",
"tol": "minRelativeProgress",
"n_estimators": "numParallelTree",
"min_tree_child_weight": "minTreeChildWeight",
"max_depth": "maxTreeDepth",
"max_iterations": "maxIterations",
Expand All @@ -57,7 +56,7 @@ class XGBRegressor(

def __init__(
self,
num_parallel_tree: int = 1,
n_estimators: int = 1,
*,
booster: Literal["gbtree", "dart"] = "gbtree",
dart_normalized_type: Literal["tree", "forest"] = "tree",
Expand All @@ -71,14 +70,13 @@ def __init__(
subsample: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 1.0,
early_stop: float = True,
learning_rate: float = 0.3,
max_iterations: int = 20,
min_rel_progress: float = 0.01,
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
):
self.num_parallel_tree = num_parallel_tree
self.n_estimators = n_estimators
self.booster = booster
self.dart_normalized_type = dart_normalized_type
self.tree_method = tree_method
Expand All @@ -91,10 +89,9 @@ def __init__(
self.subsample = subsample
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.early_stop = early_stop
self.learning_rate = learning_rate
self.max_iterations = max_iterations
self.min_rel_progress = min_rel_progress
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self._bqml_model: Optional[core.BqmlModel] = None
Expand Down Expand Up @@ -127,7 +124,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
return {
"model_type": "BOOSTED_TREE_REGRESSOR",
"data_split_method": "NO_SPLIT",
"num_parallel_tree": self.num_parallel_tree,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"booster_type": self.booster,
"tree_method": self.tree_method,
"min_tree_child_weight": self.min_tree_child_weight,
Expand All @@ -139,10 +137,9 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"subsample": self.subsample,
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"early_stop": self.early_stop,
"learn_rate": self.learning_rate,
"max_iterations": self.max_iterations,
"min_rel_progress": self.min_rel_progress,
"min_rel_progress": self.tol,
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
}
Expand Down Expand Up @@ -215,7 +212,7 @@ class XGBClassifier(

def __init__(
self,
num_parallel_tree: int = 1,
n_estimators: int = 1,
*,
booster: Literal["gbtree", "dart"] = "gbtree",
dart_normalized_type: Literal["tree", "forest"] = "tree",
Expand All @@ -229,14 +226,13 @@ def __init__(
subsample: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 1.0,
early_stop: bool = True,
learning_rate: float = 0.3,
max_iterations: int = 20,
min_rel_progress: float = 0.01,
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
):
self.num_parallel_tree = num_parallel_tree
self.n_estimators = n_estimators
self.booster = booster
self.dart_normalized_type = dart_normalized_type
self.tree_method = tree_method
Expand All @@ -249,10 +245,9 @@ def __init__(
self.subsample = subsample
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.early_stop = early_stop
self.learning_rate = learning_rate
self.max_iterations = max_iterations
self.min_rel_progress = min_rel_progress
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self._bqml_model: Optional[core.BqmlModel] = None
Expand Down Expand Up @@ -285,7 +280,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
return {
"model_type": "BOOSTED_TREE_CLASSIFIER",
"data_split_method": "NO_SPLIT",
"num_parallel_tree": self.num_parallel_tree,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"booster_type": self.booster,
"tree_method": self.tree_method,
"min_tree_child_weight": self.min_tree_child_weight,
Expand All @@ -297,10 +293,9 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"subsample": self.subsample,
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"early_stop": self.early_stop,
"learn_rate": self.learning_rate,
"max_iterations": self.max_iterations,
"min_rel_progress": self.min_rel_progress,
"min_rel_progress": self.tol,
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
}
Expand Down Expand Up @@ -371,7 +366,7 @@ class RandomForestRegressor(

def __init__(
self,
num_parallel_tree: int = 100,
n_estimators: int = 100,
*,
tree_method: Literal["auto", "exact", "approx", "hist"] = "auto",
min_tree_child_weight: int = 1,
Expand All @@ -383,12 +378,11 @@ def __init__(
subsample=0.8,
reg_alpha=0.0,
reg_lambda=1.0,
early_stop=True,
min_rel_progress=0.01,
tol=0.01,
enable_global_explain=False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
):
self.num_parallel_tree = num_parallel_tree
self.n_estimators = n_estimators
self.tree_method = tree_method
self.min_tree_child_weight = min_tree_child_weight
self.colsample_bytree = colsample_bytree
Expand All @@ -399,8 +393,7 @@ def __init__(
self.subsample = subsample
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.early_stop = early_stop
self.min_rel_progress = min_rel_progress
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self._bqml_model: Optional[core.BqmlModel] = None
Expand Down Expand Up @@ -432,7 +425,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"""The model options as they will be set for BQML"""
return {
"model_type": "RANDOM_FOREST_REGRESSOR",
"num_parallel_tree": self.num_parallel_tree,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"tree_method": self.tree_method,
"min_tree_child_weight": self.min_tree_child_weight,
"colsample_bytree": self.colsample_bytree,
Expand All @@ -443,8 +437,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"subsample": self.subsample,
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"early_stop": self.early_stop,
"min_rel_progress": self.min_rel_progress,
"min_rel_progress": self.tol,
"data_split_method": "NO_SPLIT",
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
Expand Down Expand Up @@ -536,7 +529,7 @@ class RandomForestClassifier(

def __init__(
self,
num_parallel_tree: int = 100,
n_estimators: int = 100,
*,
tree_method: Literal["auto", "exact", "approx", "hist"] = "auto",
min_tree_child_weight: int = 1,
Expand All @@ -548,12 +541,11 @@ def __init__(
subsample: float = 0.8,
reg_alpha: float = 0.0,
reg_lambda: float = 1.0,
early_stop=True,
min_rel_progress: float = 0.01,
tol: float = 0.01,
enable_global_explain=False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
):
self.num_parallel_tree = num_parallel_tree
self.n_estimators = n_estimators
self.tree_method = tree_method
self.min_tree_child_weight = min_tree_child_weight
self.colsample_bytree = colsample_bytree
Expand All @@ -564,8 +556,7 @@ def __init__(
self.subsample = subsample
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.early_stop = early_stop
self.min_rel_progress = min_rel_progress
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self._bqml_model: Optional[core.BqmlModel] = None
Expand Down Expand Up @@ -597,7 +588,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"""The model options as they will be set for BQML"""
return {
"model_type": "RANDOM_FOREST_CLASSIFIER",
"num_parallel_tree": self.num_parallel_tree,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"tree_method": self.tree_method,
"min_tree_child_weight": self.min_tree_child_weight,
"colsample_bytree": self.colsample_bytree,
Expand All @@ -608,8 +600,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"subsample": self.subsample,
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"early_stop": self.early_stop,
"min_rel_progress": self.min_rel_progress,
"min_rel_progress": self.tol,
"data_split_method": "NO_SPLIT",
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
Expand Down
Loading

0 comments on commit 65c6f47

Please sign in to comment.