diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index ac5b6f1e3b..1035def54d 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -33,7 +33,6 @@ "init_col": "kmeansInitializationColumn", "distance_type": "distanceType", "max_iter": "maxIterations", - "early_stop": "earlyStop", "tol": "minRelativeProgress", } diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 36fa28e141..475b4a046f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -37,7 +37,7 @@ class PCA( def __init__( self, - n_components: int = 3, + n_components: Optional[Union[int, float]] = None, *, svd_solver: Literal["full", "randomized", "auto"] = "auto", ): @@ -56,6 +56,8 @@ def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: last_fitting = model.training_runs[-1]["trainingOptions"] if "numPrincipalComponents" in last_fitting: kwargs["n_components"] = int(last_fitting["numPrincipalComponents"]) + if "pcaExplainedVarianceRatio" in last_fitting: + kwargs["n_components"] = float(last_fitting["pcaExplainedVarianceRatio"]) if "pcaSolver" in last_fitting: kwargs["svd_solver"] = str(last_fitting["pcaSolver"]) @@ -63,6 +65,22 @@ def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: new_pca._bqml_model = core.BqmlModel(session, model) return new_pca + @property + def _bqml_options(self) -> dict: + """The model options as they will be set for BQML""" + options: dict = { + "model_type": "PCA", + "pca_solver": self.svd_solver, + } + + assert self.n_components is not None + if 0 < self.n_components < 1: + options["pca_explained_variance_ratio"] = float(self.n_components) + elif self.n_components >= 1: + options["num_principal_components"] = int(self.n_components) + + return options + def _fit( self, X: Union[bpd.DataFrame, bpd.Series], @@ -71,14 +89,13 @@ def _fit( ) -> PCA: (X,) = utils.convert_to_dataframe(X) + # To mimic sklearn's behavior + if self.n_components is None: + self.n_components = min(X.shape) self._bqml_model = self._bqml_model_factory.create_model( X_train=X, transforms=transforms, - options={ - "model_type": "PCA", - "num_principal_components": self.n_components, - "pca_solver": self.svd_solver, - }, + options=self._bqml_options, ) return self diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 23b227de67..72ea600c58 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -31,7 +31,6 @@ _BQML_PARAMS_MAPPING = { "booster": "boosterType", "tree_method": "treeMethod", - "early_stop": "earlyStop", "colsample_bytree": "colsampleBylevel", "colsample_bylevel": "colsampleBytree", "colsample_bynode": "colsampleBynode", @@ -40,8 +39,8 @@ "reg_alpha": "l1Regularization", "reg_lambda": "l2Regularization", "learning_rate": "learnRate", - "min_rel_progress": "minRelativeProgress", - "num_parallel_tree": "numParallelTree", + "tol": "minRelativeProgress", + "n_estimators": "numParallelTree", "min_tree_child_weight": "minTreeChildWeight", "max_depth": "maxTreeDepth", "max_iterations": "maxIterations", @@ -57,7 +56,7 @@ class XGBRegressor( def __init__( self, - num_parallel_tree: int = 1, + n_estimators: int = 1, *, booster: Literal["gbtree", "dart"] = "gbtree", dart_normalized_type: Literal["tree", "forest"] = "tree", @@ -71,14 +70,13 @@ def __init__( subsample: float = 1.0, reg_alpha: float = 0.0, reg_lambda: float = 1.0, - early_stop: float = True, learning_rate: float = 0.3, max_iterations: int = 20, - min_rel_progress: float = 0.01, + tol: float = 0.01, enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): - self.num_parallel_tree = num_parallel_tree + self.n_estimators = n_estimators self.booster = booster self.dart_normalized_type = dart_normalized_type self.tree_method = tree_method @@ -91,10 +89,9 @@ def __init__( self.subsample = subsample self.reg_alpha = reg_alpha self.reg_lambda = reg_lambda - self.early_stop = early_stop self.learning_rate = learning_rate self.max_iterations = max_iterations - self.min_rel_progress = min_rel_progress + self.tol = tol self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None @@ -127,7 +124,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: return { "model_type": "BOOSTED_TREE_REGRESSOR", "data_split_method": "NO_SPLIT", - "num_parallel_tree": self.num_parallel_tree, + "early_stop": True, + "num_parallel_tree": self.n_estimators, "booster_type": self.booster, "tree_method": self.tree_method, "min_tree_child_weight": self.min_tree_child_weight, @@ -139,10 +137,9 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "subsample": self.subsample, "l1_reg": self.reg_alpha, "l2_reg": self.reg_lambda, - "early_stop": self.early_stop, "learn_rate": self.learning_rate, "max_iterations": self.max_iterations, - "min_rel_progress": self.min_rel_progress, + "min_rel_progress": self.tol, "enable_global_explain": self.enable_global_explain, "xgboost_version": self.xgboost_version, } @@ -215,7 +212,7 @@ class XGBClassifier( def __init__( self, - num_parallel_tree: int = 1, + n_estimators: int = 1, *, booster: Literal["gbtree", "dart"] = "gbtree", dart_normalized_type: Literal["tree", "forest"] = "tree", @@ -229,14 +226,13 @@ def __init__( subsample: float = 1.0, reg_alpha: float = 0.0, reg_lambda: float = 1.0, - early_stop: bool = True, learning_rate: float = 0.3, max_iterations: int = 20, - min_rel_progress: float = 0.01, + tol: float = 0.01, enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): - self.num_parallel_tree = num_parallel_tree + self.n_estimators = n_estimators self.booster = booster self.dart_normalized_type = dart_normalized_type self.tree_method = tree_method @@ -249,10 +245,9 @@ def __init__( self.subsample = subsample self.reg_alpha = reg_alpha self.reg_lambda = reg_lambda - self.early_stop = early_stop self.learning_rate = learning_rate self.max_iterations = max_iterations - self.min_rel_progress = min_rel_progress + self.tol = tol self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None @@ -285,7 +280,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: return { "model_type": "BOOSTED_TREE_CLASSIFIER", "data_split_method": "NO_SPLIT", - "num_parallel_tree": self.num_parallel_tree, + "early_stop": True, + "num_parallel_tree": self.n_estimators, "booster_type": self.booster, "tree_method": self.tree_method, "min_tree_child_weight": self.min_tree_child_weight, @@ -297,10 +293,9 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "subsample": self.subsample, "l1_reg": self.reg_alpha, "l2_reg": self.reg_lambda, - "early_stop": self.early_stop, "learn_rate": self.learning_rate, "max_iterations": self.max_iterations, - "min_rel_progress": self.min_rel_progress, + "min_rel_progress": self.tol, "enable_global_explain": self.enable_global_explain, "xgboost_version": self.xgboost_version, } @@ -371,7 +366,7 @@ class RandomForestRegressor( def __init__( self, - num_parallel_tree: int = 100, + n_estimators: int = 100, *, tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, @@ -383,12 +378,11 @@ def __init__( subsample=0.8, reg_alpha=0.0, reg_lambda=1.0, - early_stop=True, - min_rel_progress=0.01, + tol=0.01, enable_global_explain=False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): - self.num_parallel_tree = num_parallel_tree + self.n_estimators = n_estimators self.tree_method = tree_method self.min_tree_child_weight = min_tree_child_weight self.colsample_bytree = colsample_bytree @@ -399,8 +393,7 @@ def __init__( self.subsample = subsample self.reg_alpha = reg_alpha self.reg_lambda = reg_lambda - self.early_stop = early_stop - self.min_rel_progress = min_rel_progress + self.tol = tol self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None @@ -432,7 +425,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: """The model options as they will be set for BQML""" return { "model_type": "RANDOM_FOREST_REGRESSOR", - "num_parallel_tree": self.num_parallel_tree, + "early_stop": True, + "num_parallel_tree": self.n_estimators, "tree_method": self.tree_method, "min_tree_child_weight": self.min_tree_child_weight, "colsample_bytree": self.colsample_bytree, @@ -443,8 +437,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "subsample": self.subsample, "l1_reg": self.reg_alpha, "l2_reg": self.reg_lambda, - "early_stop": self.early_stop, - "min_rel_progress": self.min_rel_progress, + "min_rel_progress": self.tol, "data_split_method": "NO_SPLIT", "enable_global_explain": self.enable_global_explain, "xgboost_version": self.xgboost_version, @@ -536,7 +529,7 @@ class RandomForestClassifier( def __init__( self, - num_parallel_tree: int = 100, + n_estimators: int = 100, *, tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, @@ -548,12 +541,11 @@ def __init__( subsample: float = 0.8, reg_alpha: float = 0.0, reg_lambda: float = 1.0, - early_stop=True, - min_rel_progress: float = 0.01, + tol: float = 0.01, enable_global_explain=False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): - self.num_parallel_tree = num_parallel_tree + self.n_estimators = n_estimators self.tree_method = tree_method self.min_tree_child_weight = min_tree_child_weight self.colsample_bytree = colsample_bytree @@ -564,8 +556,7 @@ def __init__( self.subsample = subsample self.reg_alpha = reg_alpha self.reg_lambda = reg_lambda - self.early_stop = early_stop - self.min_rel_progress = min_rel_progress + self.tol = tol self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None @@ -597,7 +588,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: """The model options as they will be set for BQML""" return { "model_type": "RANDOM_FOREST_CLASSIFIER", - "num_parallel_tree": self.num_parallel_tree, + "early_stop": True, + "num_parallel_tree": self.n_estimators, "tree_method": self.tree_method, "min_tree_child_weight": self.min_tree_child_weight, "colsample_bytree": self.colsample_bytree, @@ -608,8 +600,7 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: "subsample": self.subsample, "l1_reg": self.reg_alpha, "l2_reg": self.reg_lambda, - "early_stop": self.early_stop, - "min_rel_progress": self.min_rel_progress, + "min_rel_progress": self.tol, "data_split_method": "NO_SPLIT", "enable_global_explain": self.enable_global_explain, "xgboost_version": self.xgboost_version, diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index b3db04df05..c0abe77b9f 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -35,13 +35,10 @@ "l1_reg": "l1Regularization", "l2_reg": "l2Regularization", "max_iterations": "maxIterations", - "learn_rate_strategy": "learnRateStrategy", - "learn_rate": "learnRate", - "early_stop": "earlyStop", - # To rename to tol. - "min_rel_progress": "minRelativeProgress", + "learning_rate_strategy": "learnRateStrategy", + "learning_rate": "learnRate", "tol": "minRelativeProgress", - "ls_init_learn_rate": "initialLearnRate", + "ls_init_learning_rate": "initialLearnRate", "warm_start": "warmStart", "calculate_p_values": "calculatePValues", "enable_global_explain": "enableGlobalExplain", @@ -67,11 +64,10 @@ def __init__( l2_reg: float = 0.0, max_iterations: int = 20, warm_start: bool = False, - learn_rate: Optional[float] = None, - learn_rate_strategy: Literal["line_search", "constant"] = "line_search", - early_stop: bool = True, - min_rel_progress: float = 0.01, - ls_init_learn_rate: Optional[float] = None, + learning_rate: Optional[float] = None, + learning_rate_strategy: Literal["line_search", "constant"] = "line_search", + tol: float = 0.01, + ls_init_learning_rate: Optional[float] = None, calculate_p_values: bool = False, enable_global_explain: bool = False, ): @@ -81,11 +77,10 @@ def __init__( self.l2_reg = l2_reg self.max_iterations = max_iterations self.warm_start = warm_start - self.learn_rate = learn_rate - self.learn_rate_strategy = learn_rate_strategy - self.early_stop = early_stop - self.min_rel_progress = min_rel_progress - self.ls_init_learn_rate = ls_init_learn_rate + self.learning_rate = learning_rate + self.learning_rate_strategy = learning_rate_strategy + self.tol = tol + self.ls_init_learning_rate = ls_init_learning_rate self.calculate_p_values = calculate_p_values self.enable_global_explain = enable_global_explain self._bqml_model: Optional[core.BqmlModel] = None @@ -110,7 +105,7 @@ def _from_bq( # Convert types kwargs[bf_param] = ( float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learn_rate", "ls_init_learn_rate"] + if bf_param in ["l1_reg", "learning_rate", "ls_init_learning_rate"] else type(bf_value)(last_fitting[bqml_param]) ) @@ -128,18 +123,17 @@ def _bqml_options(self) -> dict: "fit_intercept": self.fit_intercept, "l2_reg": self.l2_reg, "max_iterations": self.max_iterations, - "learn_rate_strategy": self.learn_rate_strategy, - "early_stop": self.early_stop, - "min_rel_progress": self.min_rel_progress, + "learn_rate_strategy": self.learning_rate_strategy, + "min_rel_progress": self.tol, "calculate_p_values": self.calculate_p_values, "enable_global_explain": self.enable_global_explain, } if self.l1_reg is not None: options["l1_reg"] = self.l1_reg - if self.learn_rate is not None: - options["learn_rate"] = self.learn_rate - if self.ls_init_learn_rate is not None: - options["ls_init_learn_rate"] = self.ls_init_learn_rate + if self.learning_rate is not None: + options["learn_rate"] = self.learning_rate + if self.ls_init_learning_rate is not None: + options["ls_init_learn_rate"] = self.ls_init_learning_rate # Even presenting warm_start returns error for NORMAL_EQUATION optimizer if self.warm_start: options["warm_start"] = self.warm_start @@ -210,7 +204,7 @@ class LogisticRegression( bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression.__doc__ ) - # TODO(ashleyxu) support class_weights in the constructor. + # TODO(ashleyxu) support class_weight in the constructor. def __init__( self, *, @@ -222,13 +216,13 @@ def __init__( l2_reg: float = 0.0, max_iterations: int = 20, warm_start: bool = False, - learn_rate: Optional[float] = None, - learn_rate_strategy: Literal["line_search", "constant"] = "line_search", + learning_rate: Optional[float] = None, + learning_rate_strategy: Literal["line_search", "constant"] = "line_search", tol: float = 0.01, - ls_init_learn_rate: Optional[float] = None, + ls_init_learning_rate: Optional[float] = None, calculate_p_values: bool = False, enable_global_explain: bool = False, - class_weights: Optional[Union[Literal["balanced"], Dict[str, float]]] = None, + class_weight: Optional[Union[Literal["balanced"], Dict[str, float]]] = None, ): self.optimize_strategy = optimize_strategy self.fit_intercept = fit_intercept @@ -236,14 +230,14 @@ def __init__( self.l2_reg = l2_reg self.max_iterations = max_iterations self.warm_start = warm_start - self.learn_rate = learn_rate - self.learn_rate_strategy = learn_rate_strategy + self.learning_rate = learning_rate + self.learning_rate_strategy = learning_rate_strategy self.tol = tol - self.ls_init_learn_rate = ls_init_learn_rate + self.ls_init_learning_rate = ls_init_learning_rate self.calculate_p_values = calculate_p_values self.enable_global_explain = enable_global_explain - self.class_weights = class_weights - self._auto_class_weight = class_weights == "balanced" + self.class_weight = class_weight + self._auto_class_weight = class_weight == "balanced" self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() @@ -264,14 +258,14 @@ def _from_bq( # Convert types kwargs[bf_param] = ( float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learn_rate", "ls_init_learn_rate"] + if bf_param in ["l1_reg", "learning_rate", "ls_init_learning_rate"] else type(bf_value)(last_fitting[bqml_param]) ) if last_fitting["autoClassWeights"]: - kwargs["class_weights"] = "balanced" - # TODO(ashleyxu) support class_weights in the constructor. + kwargs["class_weight"] = "balanced" + # TODO(ashleyxu) support class_weight in the constructor. # if "labelClassWeights" in last_fitting: - # kwargs["class_weights"] = last_fitting["labelClassWeights"] + # kwargs["class_weight"] = last_fitting["labelClassWeights"] new_logistic_regression = cls(**kwargs) new_logistic_regression._bqml_model = core.BqmlModel(session, model) @@ -288,19 +282,19 @@ def _bqml_options(self) -> dict: "optimize_strategy": self.optimize_strategy, "l2_reg": self.l2_reg, "max_iterations": self.max_iterations, - "learn_rate_strategy": self.learn_rate_strategy, + "learn_rate_strategy": self.learning_rate_strategy, "min_rel_progress": self.tol, "calculate_p_values": self.calculate_p_values, "enable_global_explain": self.enable_global_explain, - # TODO(ashleyxu): support class_weights (struct array as dict in our API) - # "class_weights": self.class_weights, + # TODO(ashleyxu): support class_weight (struct array as dict in our API) + # "class_weight": self.class_weight, } if self.l1_reg is not None: options["l1_reg"] = self.l1_reg - if self.learn_rate is not None: - options["learn_rate"] = self.learn_rate - if self.ls_init_learn_rate is not None: - options["ls_init_learn_rate"] = self.ls_init_learn_rate + if self.learning_rate is not None: + options["learn_rate"] = self.learning_rate + if self.ls_init_learning_rate is not None: + options["ls_init_learn_rate"] = self.ls_init_learning_rate # Even presenting warm_start returns error for NORMAL_EQUATION optimizer if self.warm_start: options["warm_start"] = self.warm_start @@ -362,10 +356,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LogisticRegression: if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") - # TODO(ashleyxu): support class_weights (struct array as dict in our API) - if self.class_weights not in (None, "balanced"): + # TODO(ashleyxu): support class_weight (struct array as dict in our API) + if self.class_weight not in (None, "balanced"): raise NotImplementedError( - f"class_weights is not supported yet. {constants.FEEDBACK_LINK}" + f"class_weight is not supported yet. {constants.FEEDBACK_LINK}" ) new_model = self._bqml_model.copy(model_name, replace) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 7932536e0c..264b95a92e 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -155,3 +155,37 @@ def test_decomposition_configure_fit_score_predict_params( ) assert reloaded_model.n_components == 5 assert reloaded_model.svd_solver == "RANDOMIZED" + + +def test_decomposition_configure_fit_load_float_component( + penguins_df_default_index, dataset_id +): + model = decomposition.PCA(n_components=0.2) + model.fit(penguins_df_default_index) + + # save, load, check n_components to ensure configuration was kept + reloaded_model = model.to_gbq( + f"{dataset_id}.temp_configured_pca_model", replace=True + ) + assert ( + f"{dataset_id}.temp_configured_pca_model" + in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.n_components == 0.2 + + +def test_decomposition_configure_fit_load_none_component( + penguins_df_default_index, dataset_id +): + model = decomposition.PCA(n_components=None) + model.fit(penguins_df_default_index) + + # save, load, check n_components. Here n_components is the column size of the training input. + reloaded_model = model.to_gbq( + f"{dataset_id}.temp_configured_pca_model", replace=True + ) + assert ( + f"{dataset_id}.temp_configured_pca_model" + in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.n_components == 7 diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index b98d7a757c..2403644a42 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -20,7 +20,7 @@ import bigframes.ml.ensemble -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() @@ -64,7 +64,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_xgbregressor_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -75,14 +75,14 @@ def test_xgbregressor_dart_booster_multiple_params( colsample_bytree=0.95, colsample_bylevel=0.95, colsample_bynode=0.95, - num_parallel_tree=2, + n_estimators=2, max_depth=4, subsample=0.95, reg_alpha=0.0001, reg_lambda=0.0001, learning_rate=0.015, max_iterations=4, - min_rel_progress=0.02, + tol=0.02, ) df = penguins_df_default_index.dropna().sample(n=70) @@ -126,20 +126,19 @@ def test_xgbregressor_dart_booster_multiple_params( assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 assert reloaded_model.colsample_bynode == 0.95 - assert reloaded_model.early_stop is True assert reloaded_model.subsample == 0.95 assert reloaded_model.reg_alpha == 0.0001 assert reloaded_model.reg_lambda == 0.0001 assert reloaded_model.learning_rate == 0.015 assert reloaded_model.max_iterations == 4 - assert reloaded_model.min_rel_progress == 0.02 + assert reloaded_model.tol == 0.02 assert reloaded_model.gamma == 0.0 assert reloaded_model.max_depth == 4 assert reloaded_model.min_tree_child_weight == 2 - assert reloaded_model.num_parallel_tree == 2 + assert reloaded_model.n_estimators == 2 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBClassifier() @@ -179,7 +178,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) -# @pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -190,14 +189,14 @@ def test_xgbclassifier_dart_booster_multiple_params( colsample_bytree=0.95, colsample_bylevel=0.95, colsample_bynode=0.95, - num_parallel_tree=2, + n_estimators=2, max_depth=4, subsample=0.95, reg_alpha=0.0001, reg_lambda=0.0001, learning_rate=0.015, max_iterations=4, - min_rel_progress=0.02, + tol=0.02, ) df = penguins_df_default_index.dropna().sample(n=70) @@ -240,20 +239,19 @@ def test_xgbclassifier_dart_booster_multiple_params( assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 assert reloaded_model.colsample_bynode == 0.95 - assert reloaded_model.early_stop is True assert reloaded_model.subsample == 0.95 assert reloaded_model.reg_alpha == 0.0001 assert reloaded_model.reg_lambda == 0.0001 assert reloaded_model.learning_rate == 0.015 assert reloaded_model.max_iterations == 4 - assert reloaded_model.min_rel_progress == 0.02 + assert reloaded_model.tol == 0.02 assert reloaded_model.gamma == 0.0 assert reloaded_model.max_depth == 4 assert reloaded_model.min_tree_child_weight == 2 - assert reloaded_model.num_parallel_tree == 2 + assert reloaded_model.n_estimators == 2 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_randomforestregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor() @@ -294,7 +292,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_randomforestregressor_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestRegressor( tree_method="auto", @@ -302,12 +300,12 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase colsample_bytree=0.95, colsample_bylevel=0.95, colsample_bynode=0.95, - num_parallel_tree=90, + n_estimators=90, max_depth=14, subsample=0.95, reg_alpha=0.0001, reg_lambda=0.0001, - min_rel_progress=0.02, + tol=0.02, ) df = penguins_df_default_index.dropna().sample(n=70) @@ -349,19 +347,18 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 assert reloaded_model.colsample_bynode == 0.95 - assert reloaded_model.early_stop is True assert reloaded_model.subsample == 0.95 assert reloaded_model.reg_alpha == 0.0001 assert reloaded_model.reg_lambda == 0.0001 - assert reloaded_model.min_rel_progress == 0.02 + assert reloaded_model.tol == 0.02 assert reloaded_model.gamma == 0.0 assert reloaded_model.max_depth == 14 assert reloaded_model.min_tree_child_weight == 2 - assert reloaded_model.num_parallel_tree == 90 + assert reloaded_model.n_estimators == 90 assert reloaded_model.enable_global_explain is False -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_randomforestclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier() @@ -401,7 +398,7 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_randomforestclassifier_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier( tree_method="AUTO", @@ -409,12 +406,12 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas colsample_bytree=0.95, colsample_bylevel=0.95, colsample_bynode=0.95, - num_parallel_tree=90, + n_estimators=90, max_depth=14, subsample=0.95, reg_alpha=0.0001, reg_lambda=0.0001, - min_rel_progress=0.02, + tol=0.02, ) df = penguins_df_default_index.dropna().sample(n=70) @@ -455,13 +452,12 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 assert reloaded_model.colsample_bynode == 0.95 - assert reloaded_model.early_stop is True assert reloaded_model.subsample == 0.95 assert reloaded_model.reg_alpha == 0.0001 assert reloaded_model.reg_lambda == 0.0001 - assert reloaded_model.min_rel_progress == 0.02 + assert reloaded_model.tol == 0.02 assert reloaded_model.gamma == 0.0 assert reloaded_model.max_depth == 14 assert reloaded_model.min_tree_child_weight == 2 - assert reloaded_model.num_parallel_tree == 90 + assert reloaded_model.n_estimators == 90 assert reloaded_model.enable_global_explain is False diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 3616cc4dd0..99121e4a31 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -58,15 +58,14 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" assert reloaded_model.fit_intercept is True assert reloaded_model.calculate_p_values is False - assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False assert reloaded_model.l1_reg is None assert reloaded_model.l2_reg == 0.0 - assert reloaded_model.learn_rate is None - assert reloaded_model.learn_rate_strategy == "line_search" - assert reloaded_model.ls_init_learn_rate is None + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None assert reloaded_model.max_iterations == 20 - assert reloaded_model.min_rel_progress == 0.01 + assert reloaded_model.tol == 0.01 def test_linear_regression_customized_params_fit_score( @@ -75,12 +74,12 @@ def test_linear_regression_customized_params_fit_score( model = bigframes.ml.linear_model.LinearRegression( fit_intercept=False, l2_reg=0.2, - min_rel_progress=0.02, + tol=0.02, l1_reg=0.2, max_iterations=30, optimize_strategy="batch_gradient_descent", - learn_rate_strategy="constant", - learn_rate=0.2, + learning_rate_strategy="constant", + learning_rate=0.2, ) df = penguins_df_default_index.dropna() @@ -121,15 +120,14 @@ def test_linear_regression_customized_params_fit_score( assert reloaded_model.optimize_strategy == "BATCH_GRADIENT_DESCENT" assert reloaded_model.fit_intercept is False assert reloaded_model.calculate_p_values is False - assert reloaded_model.early_stop is True assert reloaded_model.enable_global_explain is False assert reloaded_model.l1_reg == 0.2 assert reloaded_model.l2_reg == 0.2 - assert reloaded_model.ls_init_learn_rate is None + assert reloaded_model.ls_init_learning_rate is None assert reloaded_model.max_iterations == 30 - assert reloaded_model.min_rel_progress == 0.02 - assert reloaded_model.learn_rate_strategy == "CONSTANT" - assert reloaded_model.learn_rate == 0.2 + assert reloaded_model.tol == 0.02 + assert reloaded_model.learning_rate_strategy == "CONSTANT" + assert reloaded_model.learning_rate == 0.2 # TODO(garrettwu): add tests for param warm_start. Requires a trained model. @@ -177,7 +175,7 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data in reloaded_model._bqml_model.model_name ) assert reloaded_model.fit_intercept is True - assert reloaded_model.class_weights is None + assert reloaded_model.class_weight is None def test_logistic_regression_customized_params_fit_score( @@ -185,14 +183,14 @@ def test_logistic_regression_customized_params_fit_score( ): model = bigframes.ml.linear_model.LogisticRegression( fit_intercept=False, - class_weights="balanced", + class_weight="balanced", l2_reg=0.2, tol=0.02, l1_reg=0.2, max_iterations=30, optimize_strategy="batch_gradient_descent", - learn_rate_strategy="constant", - learn_rate=0.2, + learning_rate_strategy="constant", + learning_rate=0.2, ) df = penguins_df_default_index.dropna() X_train = df[ @@ -234,12 +232,13 @@ def test_logistic_regression_customized_params_fit_score( # TODO(garrettwu) optimize_strategy isn't logged in BQML # assert reloaded_model.optimize_strategy == "BATCH_GRADIENT_DESCENT" assert reloaded_model.fit_intercept is False + assert reloaded_model.class_weight == "balanced" assert reloaded_model.calculate_p_values is False assert reloaded_model.enable_global_explain is False assert reloaded_model.l1_reg == 0.2 assert reloaded_model.l2_reg == 0.2 - assert reloaded_model.ls_init_learn_rate is None + assert reloaded_model.ls_init_learning_rate is None assert reloaded_model.max_iterations == 30 assert reloaded_model.tol == 0.02 - assert reloaded_model.learn_rate_strategy == "CONSTANT" - assert reloaded_model.learn_rate == 0.2 + assert reloaded_model.learning_rate_strategy == "CONSTANT" + assert reloaded_model.learning_rate == 0.2 diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index c7c4437a6e..bcb220b107 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -105,7 +105,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="auto_strategy",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="auto_strategy",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -115,7 +115,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="auto_strategy",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="auto_strategy",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -157,14 +157,14 @@ def test_logistic_regression_params_fit( ): model = linear_model.LogisticRegression( fit_intercept=False, - class_weights="balanced", + class_weight="balanced", l2_reg=0.2, tol=0.02, l1_reg=0.2, max_iterations=30, optimize_strategy="batch_gradient_descent", - learn_rate_strategy="constant", - learn_rate=0.2, + learning_rate_strategy="constant", + learning_rate=0.2, ) model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 25d67f64c4..dcce75d1d9 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -20,21 +20,11 @@ class PCA(BaseEstimator, metaclass=ABCMeta): """Principal component analysis (PCA). - Linear dimensionality reduction using Singular Value Decomposition of the - data to project it to a lower dimensional space. The input data is centered - but not scaled for each feature before applying the SVD. - - It uses the LAPACK implementation of the full SVD or a randomized truncated - SVD by the method of Halko et al. 2009, depending on the shape of the input - data and the number of components to extract. - - It can also use the scipy.sparse.linalg ARPACK implementation of the - truncated SVD. - Args: - n_components (Optional[int], default 3): - Number of components to keep. if n_components is not set all components - are kept. + n_components (int, float or None, default None): + Number of components to keep. + If n_components is not set all components are kept. n_components = min(n_samples, n_features). + If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. svd_solver ("full", "randomized" or "auto", default "auto"): The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 63c62274fd..53a211dd7f 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -91,7 +91,7 @@ class RandomForestRegressor(ForestRegressor): to improve the predictive accuracy and control over-fitting. Args: - num_parallel_tree (Optional[int]): + n_estimators (Optional[int]): Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2. tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to @@ -116,10 +116,8 @@ class RandomForestRegressor(ForestRegressor): L1 regularization term on weights (xgb's alpha). Default to 0.0. reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop (Optional[bool]): - Whether training should stop after the first iteration. Default to True. - min_rel_progress (Optional[float]): - Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. + tol (Optional[float]): + Minimum relative loss improvement necessary to continue training. Default to 0.01. enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. xgboost_version (Optional[str]): @@ -158,7 +156,7 @@ class RandomForestClassifier(ForestClassifier): improve the predictive accuracy and control over-fitting. Args: - num_parallel_tree (Optional[int]): + n_estimators (Optional[int]): Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2. tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to @@ -183,10 +181,8 @@ class RandomForestClassifier(ForestClassifier): L1 regularization term on weights (xgb's alpha). Default to 0.0. reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop (Optional[bool]): - Whether training should stop after the first iteration. Default to True. - min_rel_progress (Optional[float]): - Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. + tol (Optional[float]): + Minimum relative loss improvement necessary to continue training. Default to 0.01. enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. xgboost_version (Optional[str]): diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 7c7473e713..a845b782c0 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -79,16 +79,14 @@ class LinearRegression(RegressorMixin, LinearModel): The maximum number of training iterations or steps. Default to 20. warm_start (bool, default False): Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run. Default to False. - learn_rate (float or None, default None): - The learn rate for gradient descent when learn_rate_strategy='constant'. If unset, value 0.1 is used. If learn_rate_strategy='line_search', an error is returned. - learn_rate_strategy (str, default "line_search"): + learning_rate (float or None, default None): + The learn rate for gradient descent when learning_rate_strategy='constant'. If unset, value 0.1 is used. If learning_rate_strategy='line_search', an error is returned. + learning_rate_strategy (str, default "line_search"): The strategy for specifying the learning rate during training. Default to "line_search". - early_stop (bool, default True): - Whether training should stop after the first iteration in which the relative loss improvement is less than the value specified for min_rel_progress. Default to True. - min_rel_progress (float, default 0.01): + tol (float, default 0.01): The minimum relative loss improvement that is necessary to continue training when EARLY_STOP is set to true. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue. Default to 0.01. - ls_init_learn_rate (float or None, default None): - Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. + ls_init_learning_rate (float or None, default None): + Sets the initial learning rate that learning_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. calculate_p_values (bool, default False): Specifies whether to compute p-values and standard errors during training. Default to False. enable_global_explain (bool, default False): diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 4cca3b136b..88ff32ea06 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -31,7 +31,7 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): fit_intercept (default True): Default True. Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - class_weights (dict or 'balanced', default None): + class_weight (dict or 'balanced', default None): Default None. Weights associated with classes in the form ``{class_label: weight}``.If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to @@ -47,14 +47,14 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): The maximum number of training iterations or steps. Default to 20. warm_start (bool, default False): Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run. Default to False. - learn_rate (float or None, default None): - The learn rate for gradient descent when learn_rate_strategy='constant'. If unset, value 0.1 is used. If learn_rate_strategy='line_search', an error is returned. - learn_rate_strategy (str, default "line_search"): + learning_rate (float or None, default None): + The learn rate for gradient descent when learning_rate_strategy='constant'. If unset, value 0.1 is used. If learning_rate_strategy='line_search', an error is returned. + learning_rate_strategy (str, default "line_search"): The strategy for specifying the learning rate during training. Default to "line_search". tol (float, default 0.01): The minimum relative loss improvement that is necessary to continue training when EARLY_STOP is set to true. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue. Default to 0.01. - ls_init_learn_rate (float or None, default None): - Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. + ls_init_learning_rate (float or None, default None): + Sets the initial learning rate that learning_rate_strategy='line_search' uses. This option can only be used if line_search is specified. If unset, value 0.1 is used. calculate_p_values (bool, default False): Specifies whether to compute p-values and standard errors during training. Default to False. enable_global_explain (bool, default False): diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index dfd0ba7356..250e34dc2c 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -55,7 +55,7 @@ class XGBRegressor(XGBModel, XGBRegressorBase): XGBoost regression model. Args: - num_parallel_tree (Optional[int]): + n_estimators (Optional[int]): Number of parallel trees constructed during each iteration. Default to 1. booster (Optional[str]): Specify which booster to use: gbtree or dart. Default to "gbtree". @@ -84,14 +84,12 @@ class XGBRegressor(XGBModel, XGBRegressorBase): L1 regularization term on weights (xgb's alpha). Default to 0.0. reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop (Optional[bool]): - Whether training should stop after the first iteration. Default to True. learning_rate (Optional[float]): Boosting learning rate (xgb's "eta"). Default to 0.3. max_iterations (Optional[int]): Maximum number of rounds for boosting. Default to 20. - min_rel_progress (Optional[float]): - Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. + tol (Optional[float]): + Minimum relative loss improvement necessary to continue training. Default to 0.01. enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. xgboost_version (Optional[str]): @@ -104,7 +102,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): XGBoost classifier model. Args: - num_parallel_tree (Optional[int]): + n_estimators (Optional[int]): Number of parallel trees constructed during each iteration. Default to 1. booster (Optional[str]): Specify which booster to use: gbtree or dart. Default to "gbtree". @@ -133,14 +131,12 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase): L1 regularization term on weights (xgb's alpha). Default to 0.0. reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop (Optional[bool]): - Whether training should stop after the first iteration. Default to True. learning_rate (Optional[float]): Boosting learning rate (xgb's "eta"). Default to 0.3. max_iterations (Optional[int]): Maximum number of rounds for boosting. Default to 20. - min_rel_progress (Optional[float]): - Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. + tol (Optional[float]): + Minimum relative loss improvement necessary to continue training. Default to 0.01. enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. xgboost_version (Optional[str]):