From 39ecd8250ae05509c896a02fd586528afe897823 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 18 Dec 2021 12:56:43 -0600 Subject: [PATCH 01/12] feat: refit additional kwargs for dataset and predict --- python-package/lightgbm/basic.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 64f1cb31edaa..5a314c79cd28 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3503,7 +3503,7 @@ def predict(self, data, start_iteration=0, num_iteration=None, raw_score, pred_leaf, pred_contrib, data_has_header, is_reshape) - def refit(self, data, label, decay_rate=0.9, **kwargs): + def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for_dataset=None): """Refit the existing Booster by new data. Parameters @@ -3516,9 +3516,10 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): decay_rate : float, optional (default=0.9) Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. - **kwargs - Other parameters for refit. - These parameters will be passed to ``predict`` method. + kwargs_for_predict: dict, optional (default=None) + parameters passed to ``predict`` method. + kwargs_for_dataset: dict, optional (default=None) + additional parameters passed to ``Dataset`` class. The keys data, label and params should not be contained. Returns ------- @@ -3527,7 +3528,9 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): """ if self.__set_objective_to_none: raise LightGBMError('Cannot refit due to null objective function.') - predictor = self._to_predictor(deepcopy(kwargs)) + kwargs_for_predict = {} if kwargs_for_predict is None else kwargs_for_predict + kwargs_for_dataset = {} if kwargs_for_dataset is None else kwargs_for_dataset + predictor = self._to_predictor(deepcopy(kwargs_for_predict)) leaf_preds = predictor.predict(data, -1, pred_leaf=True) nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) @@ -3540,7 +3543,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): default_value=None ) new_params["linear_tree"] = bool(out_is_linear.value) - train_set = Dataset(data, label, params=new_params) + train_set = Dataset(data, label, params=new_params, **kwargs_for_dataset) new_params['refit_decay_rate'] = decay_rate new_booster = Booster(new_params, train_set) # Copy models From 5f01c83156803c5cb9cbe99c5abf43f581aaffc0 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 18 Dec 2021 12:57:25 -0600 Subject: [PATCH 02/12] test: kwargs for refit method --- tests/python_package_test/test_engine.py | 52 +++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 2d0ed6d86293..44dd5538f119 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -22,7 +22,6 @@ decreasing_generator = itertools.count(0, -1) - def dummy_obj(preds, train_data): return np.ones(preds.shape), np.ones(preds.shape) @@ -50,6 +49,24 @@ def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) +@pytest.fixture +def artifacts_for_refit_kwargs(): + X = np.array([1, 2, 2]).reshape((3, 1)) + label = np.array([1, 2, 3]) + data = lgb.basic.Dataset(X, label) + booster = lgb.engine.train( + { + "min_data_in_bin": 1, + "min_data_in_leaf": 1, + "learning_rate": 1, + "boost_from_average": False, + }, + data, + num_boost_round=2, + ) + return (X, label, booster) + + def test_binary(): X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1545,6 +1562,39 @@ def test_refit(): assert err_pred > new_err_pred +def test_refit_kwargs_for_predict(artifacts_for_refit_kwargs): + # check refit accepts kwargs_for_predict + X, label, booster = artifacts_for_refit_kwargs + kwargs_for_dataset = { + "weight": [1.0, 0.0, 1.0], + "reference": None, + "group": None, + "init_score": None, + "feature_name": "auto", + "categorical_feature": "auto", + "free_raw_data": True + } + booster_refit = booster.refit( + X, label, kwargs_for_dataset=kwargs_for_dataset + ) + pred = booster_refit.predict(X) + assert pred.shape == (3, ) + + +def test_refit_kwargs_for_dataset(artifacts_for_refit_kwargs): + # check refit accepts kwargs_for_dataset + X, label, booster = artifacts_for_refit_kwargs + kwargs_for_predict = { + "num_iteration": 0, + "raw_score": False, + } + booster_refit = booster.refit( + X, label, kwargs_for_predict=kwargs_for_predict + ) + pred = booster_refit.predict(X) + assert pred.shape == (3, ) + + def test_mape_rf(): X, y = load_boston(return_X_y=True) params = { From bf7af5f0c9bb90b13e53d2b36727c9b564e4a875 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 18 Dec 2021 20:09:22 -0600 Subject: [PATCH 03/12] fix: __init__ got multiple values for argument --- python-package/lightgbm/basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 5a314c79cd28..4a6e5594f1f8 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3519,7 +3519,8 @@ def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for kwargs_for_predict: dict, optional (default=None) parameters passed to ``predict`` method. kwargs_for_dataset: dict, optional (default=None) - additional parameters passed to ``Dataset`` class. The keys data, label and params should not be contained. + additional parameters passed to ``Dataset`` class. If the parameters ``data, label, params`` are contained, they + are removed. Returns ------- @@ -3543,6 +3544,8 @@ def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for default_value=None ) new_params["linear_tree"] = bool(out_is_linear.value) + for arg in ['data', 'label', 'params']: + kwargs_for_dataset.pop(arg, None) train_set = Dataset(data, label, params=new_params, **kwargs_for_dataset) new_params['refit_decay_rate'] = decay_rate new_booster = Booster(new_params, train_set) From f14d5228dc6c791fa671dc2087f8774223c4130d Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 18 Dec 2021 21:22:52 -0600 Subject: [PATCH 04/12] fix: pycodestyle E302 error --- tests/python_package_test/test_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 44dd5538f119..4f7e39806955 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -22,6 +22,7 @@ decreasing_generator = itertools.count(0, -1) + def dummy_obj(preds, train_data): return np.ones(preds.shape), np.ones(preds.shape) From 5f6245e45cdc11bc7a03fc6eaf70dba6ebd5fb8e Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Dec 2021 21:04:48 -0600 Subject: [PATCH 05/12] refactor: dataset_params to avoid breaking change --- python-package/lightgbm/basic.py | 18 +++---- tests/python_package_test/test_engine.py | 60 ++++++++---------------- 2 files changed, 28 insertions(+), 50 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 4a6e5594f1f8..1628e2087fb9 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3503,7 +3503,7 @@ def predict(self, data, start_iteration=0, num_iteration=None, raw_score, pred_leaf, pred_contrib, data_has_header, is_reshape) - def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for_dataset=None): + def refit(self, data, label, decay_rate=0.9, dataset_params=None, **kwargs): """Refit the existing Booster by new data. Parameters @@ -3516,11 +3516,12 @@ def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for decay_rate : float, optional (default=0.9) Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. - kwargs_for_predict: dict, optional (default=None) - parameters passed to ``predict`` method. - kwargs_for_dataset: dict, optional (default=None) + dataset_params: dict, optional (default=None) additional parameters passed to ``Dataset`` class. If the parameters ``data, label, params`` are contained, they are removed. + **kwargs + Other parameters for refit. + These parameters will be passed to ``predict`` method. Returns ------- @@ -3529,9 +3530,8 @@ def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for """ if self.__set_objective_to_none: raise LightGBMError('Cannot refit due to null objective function.') - kwargs_for_predict = {} if kwargs_for_predict is None else kwargs_for_predict - kwargs_for_dataset = {} if kwargs_for_dataset is None else kwargs_for_dataset - predictor = self._to_predictor(deepcopy(kwargs_for_predict)) + dataset_params = {} if dataset_params is None else dataset_params + predictor = self._to_predictor(deepcopy(kwargs)) leaf_preds = predictor.predict(data, -1, pred_leaf=True) nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) @@ -3545,8 +3545,8 @@ def refit(self, data, label, decay_rate=0.9, kwargs_for_predict=None, kwargs_for ) new_params["linear_tree"] = bool(out_is_linear.value) for arg in ['data', 'label', 'params']: - kwargs_for_dataset.pop(arg, None) - train_set = Dataset(data, label, params=new_params, **kwargs_for_dataset) + dataset_params.pop(arg, None) + train_set = Dataset(data, label, params=new_params, **dataset_params) new_params['refit_decay_rate'] = decay_rate new_booster = Booster(new_params, train_set) # Copy models diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4f7e39806955..d1f7a59814ee 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -50,24 +50,6 @@ def categorize(continuous_x): return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) -@pytest.fixture -def artifacts_for_refit_kwargs(): - X = np.array([1, 2, 2]).reshape((3, 1)) - label = np.array([1, 2, 3]) - data = lgb.basic.Dataset(X, label) - booster = lgb.engine.train( - { - "min_data_in_bin": 1, - "min_data_in_leaf": 1, - "learning_rate": 1, - "boost_from_average": False, - }, - data, - num_boost_round=2, - ) - return (X, label, booster) - - def test_binary(): X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1563,11 +1545,21 @@ def test_refit(): assert err_pred > new_err_pred -def test_refit_kwargs_for_predict(artifacts_for_refit_kwargs): - # check refit accepts kwargs_for_predict - X, label, booster = artifacts_for_refit_kwargs - kwargs_for_dataset = { - "weight": [1.0, 0.0, 1.0], +def test_refit_dataset_params(): + # check refit accepts dataset_params + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + lgb_train = lgb.Dataset(X_train, y_train) + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'verbose': -1, + 'min_data': 10 + } + gbm = lgb.train(params, lgb_train, num_boost_round=20) + non_weight_err_pred = log_loss(y_test, gbm.predict(X_test)) + dataset_params = { + "weight": np.abs(np.random.normal(size=y_train.shape)), "reference": None, "group": None, "init_score": None, @@ -1575,25 +1567,11 @@ def test_refit_kwargs_for_predict(artifacts_for_refit_kwargs): "categorical_feature": "auto", "free_raw_data": True } - booster_refit = booster.refit( - X, label, kwargs_for_dataset=kwargs_for_dataset - ) - pred = booster_refit.predict(X) - assert pred.shape == (3, ) - - -def test_refit_kwargs_for_dataset(artifacts_for_refit_kwargs): - # check refit accepts kwargs_for_dataset - X, label, booster = artifacts_for_refit_kwargs - kwargs_for_predict = { - "num_iteration": 0, - "raw_score": False, - } - booster_refit = booster.refit( - X, label, kwargs_for_predict=kwargs_for_predict + new_gbm = gbm.refit( + X_train, y_train, dataset_params=dataset_params ) - pred = booster_refit.predict(X) - assert pred.shape == (3, ) + weight_err_pred = log_loss(y_test, new_gbm.predict(X_test)) + assert weight_err_pred != non_weight_err_pred def test_mape_rf(): From 3e860dfa5310712eb0f35626ca640b60957696ef Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 22 Dec 2021 22:52:15 -0600 Subject: [PATCH 06/12] refactor: expose all Dataset params in refit --- python-package/lightgbm/basic.py | 57 ++++++++++++++++++++---- tests/python_package_test/test_engine.py | 19 ++++---- 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 1628e2087fb9..6bfea0923870 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3503,7 +3503,20 @@ def predict(self, data, start_iteration=0, num_iteration=None, raw_score, pred_leaf, pred_contrib, data_has_header, is_reshape) - def refit(self, data, label, decay_rate=0.9, dataset_params=None, **kwargs): + def refit( + self, + data, + label, + decay_rate=0.9, + reference=None, + weight=None, + group=None, + init_score=None, + feature_name='auto', + categorical_feature='auto', + free_raw_data=True, + **kwargs + ): """Refit the existing Booster by new data. Parameters @@ -3516,9 +3529,29 @@ def refit(self, data, label, decay_rate=0.9, dataset_params=None, **kwargs): decay_rate : float, optional (default=0.9) Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. - dataset_params: dict, optional (default=None) - additional parameters passed to ``Dataset`` class. If the parameters ``data, label, params`` are contained, they - are removed. + reference : Dataset or None, optional (default=None) + reference for ``data``. + If this is Dataset for validation, training data should be used as reference. + weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + Weight for each ``data`` instance. + group : list, numpy 1-D array, pandas Series or None, optional (default=None) + Group/query size for ``data``. + init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) + Init score for ``data``. + feature_name : list of strings or 'auto', optional (default="auto") + Feature names for ``data``. + If 'auto' and data is pandas DataFrame, data columns names are used. + categorical_feature : list of strings or int, or 'auto', optional (default="auto") + Categorical features for ``data``. + If list of int, interpreted as indices. + If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). + If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. + All values in categorical features should be less than int32 max value (2147483647). + Large values could be memory consuming. Consider using consecutive integers starting from zero. + All negative values in categorical features will be treated as missing values. + The output cannot be monotonically constrained with respect to a categorical feature. + free_raw_data : bool, optional (default=True) + If True, raw data is freed after constructing inner Dataset for ``data``. **kwargs Other parameters for refit. These parameters will be passed to ``predict`` method. @@ -3530,7 +3563,6 @@ def refit(self, data, label, decay_rate=0.9, dataset_params=None, **kwargs): """ if self.__set_objective_to_none: raise LightGBMError('Cannot refit due to null objective function.') - dataset_params = {} if dataset_params is None else dataset_params predictor = self._to_predictor(deepcopy(kwargs)) leaf_preds = predictor.predict(data, -1, pred_leaf=True) nrow, ncol = leaf_preds.shape @@ -3544,9 +3576,18 @@ def refit(self, data, label, decay_rate=0.9, dataset_params=None, **kwargs): default_value=None ) new_params["linear_tree"] = bool(out_is_linear.value) - for arg in ['data', 'label', 'params']: - dataset_params.pop(arg, None) - train_set = Dataset(data, label, params=new_params, **dataset_params) + train_set = Dataset( + data=data, + label=label, + reference=reference, + weight=weight, + group=group, + init_score=init_score, + feature_name=feature_name, + categorical_feature=categorical_feature, + params=new_params, + free_raw_data=free_raw_data, + ) new_params['refit_decay_rate'] = decay_rate new_booster = Booster(new_params, train_set) # Copy models diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d1f7a59814ee..b6e3ad941f95 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1558,17 +1558,16 @@ def test_refit_dataset_params(): } gbm = lgb.train(params, lgb_train, num_boost_round=20) non_weight_err_pred = log_loss(y_test, gbm.predict(X_test)) - dataset_params = { - "weight": np.abs(np.random.normal(size=y_train.shape)), - "reference": None, - "group": None, - "init_score": None, - "feature_name": "auto", - "categorical_feature": "auto", - "free_raw_data": True - } new_gbm = gbm.refit( - X_train, y_train, dataset_params=dataset_params + data=X_train, + label=y_train, + weight=np.abs(np.random.normal(size=y_train.shape)), + reference=None, + group=None, + init_score=None, + feature_name="auto", + categorical_feature="auto", + free_raw_data=True ) weight_err_pred = log_loss(y_test, new_gbm.predict(X_test)) assert weight_err_pred != non_weight_err_pred From 11d75eca89584308801012cfdaa218c87dfae852 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 25 Dec 2021 13:39:33 -0600 Subject: [PATCH 07/12] feat: dataset_params updates new_params --- python-package/lightgbm/basic.py | 8 +++++- tests/python_package_test/test_engine.py | 31 +++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 6bfea0923870..4c1021ea252a 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3514,6 +3514,7 @@ def refit( init_score=None, feature_name='auto', categorical_feature='auto', + dataset_params=None, free_raw_data=True, **kwargs ): @@ -3550,7 +3551,9 @@ def refit( Large values could be memory consuming. Consider using consecutive integers starting from zero. All negative values in categorical features will be treated as missing values. The output cannot be monotonically constrained with respect to a categorical feature. - free_raw_data : bool, optional (default=True) + dataset_params : dict or None, optional (default=None) + Other parameters for Dataset ``data``. + free_raw_data : bool, optional (default=True) If True, raw data is freed after constructing inner Dataset for ``data``. **kwargs Other parameters for refit. @@ -3563,6 +3566,8 @@ def refit( """ if self.__set_objective_to_none: raise LightGBMError('Cannot refit due to null objective function.') + if dataset_params is None: + dataset_params = {} predictor = self._to_predictor(deepcopy(kwargs)) leaf_preds = predictor.predict(data, -1, pred_leaf=True) nrow, ncol = leaf_preds.shape @@ -3576,6 +3581,7 @@ def refit( default_value=None ) new_params["linear_tree"] = bool(out_is_linear.value) + new_params.update(dataset_params) train_set = Dataset( data=data, label=label, diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b6e3ad941f95..7daa9678911c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1550,23 +1550,48 @@ def test_refit_dataset_params(): X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) - params = { + train_params = { 'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, 'min_data': 10 } - gbm = lgb.train(params, lgb_train, num_boost_round=20) + gbm = lgb.train(train_params, lgb_train, num_boost_round=20) non_weight_err_pred = log_loss(y_test, gbm.predict(X_test)) + dataset_params = { + 'linear_tree': False, + 'max_bin': 255, + 'max_bin_by_feature': None, + 'min_data_in_bin': 3, + 'bin_construct_sample_cnt': 200000, + 'data_random_seed': 1, + 'is_enable_sparse': True, + 'enable_bundle': True, + 'use_missing': True, + 'zero_as_missing': False, + 'feature_pre_filter': True, + 'pre_partition': False, + 'two_round': False, + 'header': False, + 'label_column': "", + 'weight_column': "", + 'group_column': "", + 'ignore_column': "", + 'forcedbins_filename': "", + 'save_binary': False, + 'precise_float_parser': False, + 'parser_config_file': "", + } new_gbm = gbm.refit( data=X_train, label=y_train, - weight=np.abs(np.random.normal(size=y_train.shape)), + weight=np.random.rand(y_train.shape[0]), reference=None, group=None, init_score=None, feature_name="auto", categorical_feature="auto", + dataset_params=dataset_params, free_raw_data=True ) weight_err_pred = log_loss(y_test, new_gbm.predict(X_test)) From dad32783b7f49bf03c49971093de11327d5143cc Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 28 Dec 2021 08:57:15 -0600 Subject: [PATCH 08/12] fix: remove unnecessary params to test --- python-package/lightgbm/basic.py | 4 +-- tests/python_package_test/test_engine.py | 34 ++++-------------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 4c1021ea252a..6d316ad7869c 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3532,9 +3532,9 @@ def refit( will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. reference : Dataset or None, optional (default=None) reference for ``data``. - If this is Dataset for validation, training data should be used as reference. weight : list, numpy 1-D array, pandas Series or None, optional (default=None) - Weight for each ``data`` instance. + Weight for each ``data`` instance. Weight should be non-negative values because the Hessian + value multiplied by weight is supposed to be non-negative. group : list, numpy 1-D array, pandas Series or None, optional (default=None) Group/query size for ``data``. init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 7daa9678911c..b667fe5ab54c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1554,45 +1554,21 @@ def test_refit_dataset_params(): 'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, - 'min_data': 10 + 'min_data': 10, + 'seed': 123 } gbm = lgb.train(train_params, lgb_train, num_boost_round=20) non_weight_err_pred = log_loss(y_test, gbm.predict(X_test)) dataset_params = { - 'linear_tree': False, - 'max_bin': 255, - 'max_bin_by_feature': None, - 'min_data_in_bin': 3, - 'bin_construct_sample_cnt': 200000, - 'data_random_seed': 1, - 'is_enable_sparse': True, - 'enable_bundle': True, - 'use_missing': True, - 'zero_as_missing': False, - 'feature_pre_filter': True, - 'pre_partition': False, - 'two_round': False, - 'header': False, - 'label_column': "", - 'weight_column': "", - 'group_column': "", - 'ignore_column': "", - 'forcedbins_filename': "", - 'save_binary': False, - 'precise_float_parser': False, - 'parser_config_file': "", + 'max_bin': 260, + 'min_data_in_bin': 5, + 'data_random_seed': 123, } new_gbm = gbm.refit( data=X_train, label=y_train, weight=np.random.rand(y_train.shape[0]), - reference=None, - group=None, - init_score=None, - feature_name="auto", - categorical_feature="auto", dataset_params=dataset_params, - free_raw_data=True ) weight_err_pred = log_loss(y_test, new_gbm.predict(X_test)) assert weight_err_pred != non_weight_err_pred From 4bbd86a696dbd3a55bd00e854a524c6dcf1ffb6a Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 28 Dec 2021 16:32:55 -0600 Subject: [PATCH 09/12] test: parameters input are the same --- tests/python_package_test/test_engine.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b667fe5ab54c..be86cb4ebd37 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1552,13 +1552,12 @@ def test_refit_dataset_params(): lgb_train = lgb.Dataset(X_train, y_train) train_params = { 'objective': 'binary', - 'metric': 'binary_logloss', 'verbose': -1, - 'min_data': 10, 'seed': 123 } - gbm = lgb.train(train_params, lgb_train, num_boost_round=20) + gbm = lgb.train(train_params, lgb_train, num_boost_round=10) non_weight_err_pred = log_loss(y_test, gbm.predict(X_test)) + refit_weight = np.random.rand(y_train.shape[0]) dataset_params = { 'max_bin': 260, 'min_data_in_bin': 5, @@ -1567,11 +1566,17 @@ def test_refit_dataset_params(): new_gbm = gbm.refit( data=X_train, label=y_train, - weight=np.random.rand(y_train.shape[0]), + weight=refit_weight, dataset_params=dataset_params, ) weight_err_pred = log_loss(y_test, new_gbm.predict(X_test)) + train_set_params = new_gbm.train_set.get_params() + stored_weights = new_gbm.train_set.get_weight() assert weight_err_pred != non_weight_err_pred + assert train_set_params["max_bin"] == 260 + assert train_set_params["min_data_in_bin"] == 5 + assert train_set_params["data_random_seed"] == 123 + np.testing.assert_allclose(stored_weights, refit_weight, verbose=True) def test_mape_rf(): From 0198d3ffa1e3658c2bc36bfa5050103eb10a3b3f Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 30 Dec 2021 10:04:57 -0600 Subject: [PATCH 10/12] docs: address StrikeRUS changes --- python-package/lightgbm/basic.py | 14 +++++++++----- tests/python_package_test/test_engine.py | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 6d316ad7869c..f2390b29641e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3531,21 +3531,25 @@ def refit( Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. reference : Dataset or None, optional (default=None) - reference for ``data``. + Reference for ``data``. weight : list, numpy 1-D array, pandas Series or None, optional (default=None) Weight for each ``data`` instance. Weight should be non-negative values because the Hessian value multiplied by weight is supposed to be non-negative. group : list, numpy 1-D array, pandas Series or None, optional (default=None) Group/query size for ``data``. - init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) Init score for ``data``. - feature_name : list of strings or 'auto', optional (default="auto") + feature_name : list of str or 'auto', optional (default="auto") Feature names for ``data``. If 'auto' and data is pandas DataFrame, data columns names are used. - categorical_feature : list of strings or int, or 'auto', optional (default="auto") + categorical_feature : list of str or int, or 'auto', optional (default="auto") Categorical features for ``data``. If list of int, interpreted as indices. - If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). + If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used. All values in categorical features should be less than int32 max value (2147483647). Large values could be memory consuming. Consider using consecutive integers starting from zero. diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index be86cb4ebd37..44db6f8b20fc 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1576,7 +1576,7 @@ def test_refit_dataset_params(): assert train_set_params["max_bin"] == 260 assert train_set_params["min_data_in_bin"] == 5 assert train_set_params["data_random_seed"] == 123 - np.testing.assert_allclose(stored_weights, refit_weight, verbose=True) + np.testing.assert_allclose(stored_weights, refit_weight) def test_mape_rf(): From 935fdde812424b027e6d949cd48f454c23fdc778 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 8 Jan 2022 10:51:11 -0600 Subject: [PATCH 11/12] test: refit test changes in train dataset --- python-package/lightgbm/basic.py | 2 +- tests/python_package_test/test_engine.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index f2390b29641e..4883c4fba605 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3543,7 +3543,7 @@ def refit( where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) Init score for ``data``. - feature_name : list of str or 'auto', optional (default="auto") + feature_name : list of str, or 'auto', optional (default="auto") Feature names for ``data``. If 'auto' and data is pandas DataFrame, data columns names are used. categorical_feature : list of str or int, or 'auto', optional (default="auto") diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 44db6f8b20fc..26963836aaac 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1548,28 +1548,27 @@ def test_refit(): def test_refit_dataset_params(): # check refit accepts dataset_params X, y = load_breast_cancer(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) - lgb_train = lgb.Dataset(X_train, y_train) + lgb_train = lgb.Dataset(X, y) train_params = { 'objective': 'binary', 'verbose': -1, 'seed': 123 } gbm = lgb.train(train_params, lgb_train, num_boost_round=10) - non_weight_err_pred = log_loss(y_test, gbm.predict(X_test)) - refit_weight = np.random.rand(y_train.shape[0]) + non_weight_err_pred = log_loss(y, gbm.predict(X)) + refit_weight = np.random.rand(y.shape[0]) dataset_params = { 'max_bin': 260, 'min_data_in_bin': 5, 'data_random_seed': 123, } new_gbm = gbm.refit( - data=X_train, - label=y_train, + data=X, + label=y, weight=refit_weight, dataset_params=dataset_params, ) - weight_err_pred = log_loss(y_test, new_gbm.predict(X_test)) + weight_err_pred = log_loss(y, new_gbm.predict(X)) train_set_params = new_gbm.train_set.get_params() stored_weights = new_gbm.train_set.get_weight() assert weight_err_pred != non_weight_err_pred From cbc5e006f83393b629bfd4e30321a7525df3b97a Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 15 Jan 2022 08:13:26 -0600 Subject: [PATCH 12/12] test: set init_score and decay_rate to zero --- tests/python_package_test/test_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 26963836aaac..280f19af989a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1548,7 +1548,7 @@ def test_refit(): def test_refit_dataset_params(): # check refit accepts dataset_params X, y = load_breast_cancer(return_X_y=True) - lgb_train = lgb.Dataset(X, y) + lgb_train = lgb.Dataset(X, y, init_score=np.zeros(y.size)) train_params = { 'objective': 'binary', 'verbose': -1, @@ -1567,6 +1567,7 @@ def test_refit_dataset_params(): label=y, weight=refit_weight, dataset_params=dataset_params, + decay_rate=0.0, ) weight_err_pred = log_loss(y, new_gbm.predict(X)) train_set_params = new_gbm.train_set.get_params()