From c84b7f965e8dbbb6929f7f49e16a4efbe7cc221a Mon Sep 17 00:00:00 2001 From: nyanp Date: Wed, 20 Apr 2022 23:23:49 +0900 Subject: [PATCH 01/18] implement save/load methods for CVBooster --- python-package/lightgbm/engine.py | 97 +++++++++++++++++++++++- tests/python_package_test/test_engine.py | 33 ++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 61ef4494648f..9e7fc643dea4 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -2,6 +2,7 @@ """Library with training routines of LightGBM.""" import collections import copy +import json from operator import attrgetter from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -266,7 +267,8 @@ class CVBooster: Auxiliary data structure to hold and redirect all boosters of ``cv`` function. This class has the same methods as Booster class. - All method calls are actually performed for underlying Boosters and then all returned results are returned in a list. + All method calls, except for saving and loading the model, are actually performed for underlying Boosters and + then all returned results are returned in a list. Attributes ---------- @@ -276,18 +278,42 @@ class CVBooster: The best iteration of fitted model. """ - def __init__(self): + def __init__(self, model_file=None): """Initialize the CVBooster. Generally, no need to instantiate manually. + + Parameters + ---------- + model_file : str, pathlib.Path or None, optional (default=None) + Path to the CVBooster model file. """ self.boosters = [] self.best_iteration = -1 + if model_file is not None: + with open(str(model_file), "r") as file: + self._from_dict(json.load(file)) + def _append(self, booster): """Add a booster to CVBooster.""" self.boosters.append(booster) + def _from_dict(self, models): + """Load CVBooster from dict""" + self.best_iteration = models["best_iteration"] + self.boosters = [] + for model_str in models["boosters"]: + self.boosters.append(Booster(model_str=model_str)) + + def _to_dict(self, num_iteration, start_iteration, importance_type): + """Serialize CVBooster to dict""" + models_str = [] + for booster in self.boosters: + models_str.append(booster.model_to_string(num_iteration=num_iteration, start_iteration=start_iteration, + importance_type=importance_type)) + return {"boosters": models_str, "best_iteration": self.best_iteration} + def __getattr__(self, name): """Redirect methods call of CVBooster.""" def handler_function(*args, **kwargs): @@ -298,6 +324,73 @@ def handler_function(*args, **kwargs): return ret return handler_function + def model_from_string(self, model_str): + """Load CVBooster from a string. + + Parameters + ---------- + model_str : str + Model will be loaded from this string. + + Returns + ------- + self : CVBooster + Loaded CVBooster object. + """ + self._from_dict(json.loads(model_str)) + return self + + def model_to_string(self, num_iteration=None, start_iteration=0, importance_type='split'): + """Save CVBooster to string. + + Parameters + ---------- + num_iteration : int or None, optional (default=None) + Index of the iteration that should be saved. + If None, if the best iteration exists, it is saved; otherwise, all iterations are saved. + If <= 0, all iterations are saved. + start_iteration : int, optional (default=0) + Start index of the iteration that should be saved. + importance_type : str, optional (default="split") + What type of feature importance should be saved. + If "split", result contains numbers of times the feature is used in a model. + If "gain", result contains total gains of splits which use the feature. + + Returns + ------- + str_repr : str + String representation of CVBooster. + """ + return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type)) + + def save_model(self, filename, num_iteration=None, start_iteration=0, importance_type='split'): + """Save CVBoosters to file. + + Parameters + ---------- + filename : str or pathlib.Path + Filename to save Booster. + num_iteration : int or None, optional (default=None) + Index of the iteration that should be saved. + If None, if the best iteration exists, it is saved; otherwise, all iterations are saved. + If <= 0, all iterations are saved. + start_iteration : int, optional (default=0) + Start index of the iteration that should be saved. + importance_type : str, optional (default="split") + What type of feature importance should be saved. + If "split", result contains numbers of times the feature is used in a model. + If "gain", result contains total gains of splits which use the feature. + + Returns + ------- + self : CVBooster + Returns self. + """ + with open(filename, "w") as file: + json.dump(self._to_dict(num_iteration, start_iteration, importance_type), file) + + return self + def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratified=True, shuffle=True, eval_train_metric=False): diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 1b202b413a2b..a8e8164edb0c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1068,6 +1068,39 @@ def test_cvbooster(): assert ret < 0.15 +def test_cvbooster_save_load(): + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'verbose': -1, + } + nfold = 3 + lgb_train = lgb.Dataset(X_train, y_train) + + def predict(cv_booster): + return np.array(cv_booster.predict(X_test)) + + cv_res = lgb.cv(params, lgb_train, + num_boost_round=25, + nfold=nfold, + callbacks=[lgb.early_stopping(stopping_rounds=5)], + return_cvbooster=True) + cvbooster = cv_res['cvbooster'] + + ret_origin = predict(cvbooster) + other_ret = [] + + cvbooster.save_model('lgb.model') + + other_ret.append(predict(lgb.CVBooster(model_file='lgb.model'))) + other_ret.append(predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string()))) + + for ret in other_ret: + np.testing.assert_array_equal(ret_origin, ret) + + def test_feature_name(): X_train, y_train = make_synthetic_regression() params = {'verbose': -1} From 32a973268b7620d6a4b0685ca0a48001ce183fe5 Mon Sep 17 00:00:00 2001 From: nyanp Date: Thu, 21 Apr 2022 00:30:55 +0900 Subject: [PATCH 02/18] fix comment --- python-package/lightgbm/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 9e7fc643dea4..fd6d1d4b081a 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -300,14 +300,14 @@ def _append(self, booster): self.boosters.append(booster) def _from_dict(self, models): - """Load CVBooster from dict""" + """Load CVBooster from dict.""" self.best_iteration = models["best_iteration"] self.boosters = [] for model_str in models["boosters"]: self.boosters.append(Booster(model_str=model_str)) def _to_dict(self, num_iteration, start_iteration, importance_type): - """Serialize CVBooster to dict""" + """Serialize CVBooster to dict.""" models_str = [] for booster in self.boosters: models_str.append(booster.model_to_string(num_iteration=num_iteration, start_iteration=start_iteration, From 972d09bbbfa8d6f8541b2434c335c11ac6168407 Mon Sep 17 00:00:00 2001 From: nyanp Date: Fri, 22 Apr 2022 19:45:38 +0900 Subject: [PATCH 03/18] add type hint to CVBooster Co-authored-by: James Lamb --- python-package/lightgbm/engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index fd6d1d4b081a..cec8cb3fbb9b 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -278,7 +278,10 @@ class CVBooster: The best iteration of fitted model. """ - def __init__(self, model_file=None): + def __init__( + self, + model_file: Optional[str, pathlib.Path] = None + ): """Initialize the CVBooster. Generally, no need to instantiate manually. From dbc75252b2b986027c917e47aa7f732604760630 Mon Sep 17 00:00:00 2001 From: nyanp Date: Fri, 22 Apr 2022 19:46:47 +0900 Subject: [PATCH 04/18] add type hint to CVBooster Co-authored-by: James Lamb --- python-package/lightgbm/engine.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index cec8cb3fbb9b..0475beeb3da9 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -302,14 +302,14 @@ def _append(self, booster): """Add a booster to CVBooster.""" self.boosters.append(booster) - def _from_dict(self, models): + def _from_dict(self, models: Dict[str, any]) -> None: """Load CVBooster from dict.""" self.best_iteration = models["best_iteration"] self.boosters = [] for model_str in models["boosters"]: self.boosters.append(Booster(model_str=model_str)) - def _to_dict(self, num_iteration, start_iteration, importance_type): + def _to_dict(self, num_iteration: int, start_iteration: int, importance_type: str) -> Dict[str, Any]: """Serialize CVBooster to dict.""" models_str = [] for booster in self.boosters: @@ -327,7 +327,7 @@ def handler_function(*args, **kwargs): return ret return handler_function - def model_from_string(self, model_str): + def model_from_string(self, model_str: str) -> "CVBooster": """Load CVBooster from a string. Parameters @@ -343,7 +343,12 @@ def model_from_string(self, model_str): self._from_dict(json.loads(model_str)) return self - def model_to_string(self, num_iteration=None, start_iteration=0, importance_type='split'): + def model_to_string( + self, + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = 'split' + ) -> str: """Save CVBooster to string. Parameters @@ -366,7 +371,13 @@ def model_to_string(self, num_iteration=None, start_iteration=0, importance_type """ return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type)) - def save_model(self, filename, num_iteration=None, start_iteration=0, importance_type='split'): + def save_model( + self, + filename: Union[str, pathlib.Path], + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = 'split' + ) -> "CVBooster": """Save CVBoosters to file. Parameters From d7e760fa16e2802377ed3a26a7d87265bc457031 Mon Sep 17 00:00:00 2001 From: nyanp Date: Fri, 22 Apr 2022 19:50:32 +0900 Subject: [PATCH 05/18] Apply suggestions from code review Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a8e8164edb0c..e385a93d47ef 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1083,22 +1083,20 @@ def predict(cv_booster): return np.array(cv_booster.predict(X_test)) cv_res = lgb.cv(params, lgb_train, - num_boost_round=25, + num_boost_round=10, nfold=nfold, callbacks=[lgb.early_stopping(stopping_rounds=5)], return_cvbooster=True) cvbooster = cv_res['cvbooster'] - ret_origin = predict(cvbooster) - other_ret = [] + preds = predict(cvbooster) cvbooster.save_model('lgb.model') + preds_from_file = predict(lgb.CVBooster(model_file='lgb.model'))) + preds_from_string = predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string()))) - other_ret.append(predict(lgb.CVBooster(model_file='lgb.model'))) - other_ret.append(predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string()))) - - for ret in other_ret: - np.testing.assert_array_equal(ret_origin, ret) + np.testing.assert_array_equal(preds, preds_from_file) + np.testing.assert_array_equal(preds, preds_from_string) def test_feature_name(): From 5376074af6b30af794cdaafedb32142309ba29aa Mon Sep 17 00:00:00 2001 From: nyanp Date: Fri, 22 Apr 2022 22:57:03 +0900 Subject: [PATCH 06/18] fix type error --- python-package/lightgbm/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 0475beeb3da9..bbc003a73a63 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -280,7 +280,7 @@ class CVBooster: def __init__( self, - model_file: Optional[str, pathlib.Path] = None + model_file: Optional[str, Path] = None ): """Initialize the CVBooster. @@ -373,7 +373,7 @@ def model_to_string( def save_model( self, - filename: Union[str, pathlib.Path], + filename: Union[str, Path], num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = 'split' From 902dad94c72c6e29f364b45eb67a257ff20c87b4 Mon Sep 17 00:00:00 2001 From: nyanp Date: Fri, 22 Apr 2022 23:24:08 +0900 Subject: [PATCH 07/18] fix type hint --- python-package/lightgbm/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index bbc003a73a63..6a586df226f7 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -280,7 +280,7 @@ class CVBooster: def __init__( self, - model_file: Optional[str, Path] = None + model_file: Optional[Union[str, Path]] = None ): """Initialize the CVBooster. From f3c9ada0662aa519f1635e522d4556a765dd5405 Mon Sep 17 00:00:00 2001 From: nyanp Date: Fri, 22 Apr 2022 23:25:48 +0900 Subject: [PATCH 08/18] Support for CVBooster serialization by pickle --- python-package/lightgbm/engine.py | 6 ++++++ tests/python_package_test/test_engine.py | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 6a586df226f7..eaa965c121a0 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -327,6 +327,12 @@ def handler_function(*args, **kwargs): return ret return handler_function + def __getstate__(self) -> Dict[str, Any]: + return vars(self) + + def __setstate__(self, state: Dict[str, Any]) -> None: + vars(self).update(state) + def model_from_string(self, model_str: str) -> "CVBooster": """Load CVBooster from a string. diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index e385a93d47ef..26b390d146e2 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1068,7 +1068,7 @@ def test_cvbooster(): assert ret < 0.15 -def test_cvbooster_save_load(): +def test_cvbooster_save_load(tmp_path): X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { @@ -1091,12 +1091,21 @@ def predict(cv_booster): preds = predict(cvbooster) - cvbooster.save_model('lgb.model') - preds_from_file = predict(lgb.CVBooster(model_file='lgb.model'))) - preds_from_string = predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string()))) + model_path_txt = str(tmp_path / 'lgb.model') + model_path_pkl = str(tmp_path / 'lgb.pkl') + + cvbooster.save_model(model_path_txt) + with open(model_path_pkl, 'wb') as f: + pickle.dump(cvbooster, f) + + preds_from_file = predict(lgb.CVBooster(model_file=model_path_txt)) + preds_from_string = predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string())) + with open(model_path_pkl, 'rb') as f: + preds_from_pkl = predict(pickle.load(f)) np.testing.assert_array_equal(preds, preds_from_file) np.testing.assert_array_equal(preds, preds_from_string) + np.testing.assert_array_equal(preds, preds_from_pkl) def test_feature_name(): From 78a6109af6830c4f5924889eb3954fae3c5755d4 Mon Sep 17 00:00:00 2001 From: nyanp Date: Tue, 26 Apr 2022 20:22:54 +0900 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 56c9f7866f5b..cd04b4a3ca8c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1101,8 +1101,8 @@ def predict(cv_booster): cvbooster.save_model(model_path_txt) with open(model_path_pkl, 'wb') as f: pickle.dump(cvbooster, f) - - preds_from_file = predict(lgb.CVBooster(model_file=model_path_txt)) +del cvboost + preds_from_txt_file = predict(lgb.CVBooster(model_file=model_path_txt)) preds_from_string = predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string())) with open(model_path_pkl, 'rb') as f: preds_from_pkl = predict(pickle.load(f)) From c933d86c0a988aec6acae88b52f4a9bac714d65c Mon Sep 17 00:00:00 2001 From: nyanp Date: Tue, 26 Apr 2022 20:27:54 +0900 Subject: [PATCH 10/18] remove inner function --- tests/python_package_test/test_engine.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index cd04b4a3ca8c..a5ab4ecd5881 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1083,9 +1083,6 @@ def test_cvbooster_save_load(tmp_path): nfold = 3 lgb_train = lgb.Dataset(X_train, y_train) - def predict(cv_booster): - return np.array(cv_booster.predict(X_test)) - cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=nfold, @@ -1093,7 +1090,7 @@ def predict(cv_booster): return_cvbooster=True) cvbooster = cv_res['cvbooster'] - preds = predict(cvbooster) + preds = cvbooster.predict(X_test) model_path_txt = str(tmp_path / 'lgb.model') model_path_pkl = str(tmp_path / 'lgb.pkl') @@ -1101,13 +1098,15 @@ def predict(cv_booster): cvbooster.save_model(model_path_txt) with open(model_path_pkl, 'wb') as f: pickle.dump(cvbooster, f) -del cvboost - preds_from_txt_file = predict(lgb.CVBooster(model_file=model_path_txt)) - preds_from_string = predict(lgb.CVBooster().model_from_string(cvbooster.model_to_string())) + + del cvbooster + + preds_from_txt_file = lgb.CVBooster(model_file=model_path_txt).predict(X_test) + preds_from_string = lgb.CVBooster().model_from_string(cvbooster.model_to_string()).predict(X_test) with open(model_path_pkl, 'rb') as f: - preds_from_pkl = predict(pickle.load(f)) + preds_from_pkl = pickle.load(f).predict(X_test) - np.testing.assert_array_equal(preds, preds_from_file) + np.testing.assert_array_equal(preds, preds_from_txt_file) np.testing.assert_array_equal(preds, preds_from_string) np.testing.assert_array_equal(preds, preds_from_pkl) From 3ec7fd02ebefa64b560135b14fcd3f44ab0affda Mon Sep 17 00:00:00 2001 From: nyanp Date: Tue, 26 Apr 2022 23:06:48 +0900 Subject: [PATCH 11/18] add test for joblib, cloudpickle --- tests/python_package_test/test_callback.py | 18 +-------- tests/python_package_test/test_engine.py | 44 ++++++++++++++++------ tests/python_package_test/utils.py | 18 +++++++++ 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/tests/python_package_test/test_callback.py b/tests/python_package_test/test_callback.py index 1a101fd6799b..d75b9309918c 100644 --- a/tests/python_package_test/test_callback.py +++ b/tests/python_package_test/test_callback.py @@ -3,23 +3,7 @@ import lightgbm as lgb -from .utils import pickle_obj, unpickle_obj - -SERIALIZERS = ["pickle", "joblib", "cloudpickle"] - - -def pickle_and_unpickle_object(obj, serializer): - with lgb.basic._TempFile() as tmp_file: - pickle_obj( - obj=obj, - filepath=tmp_file.name, - serializer=serializer - ) - obj_from_disk = unpickle_obj( - filepath=tmp_file.name, - serializer=serializer - ) - return obj_from_disk +from .utils import SERIALIZERS, pickle_and_unpickle_object, pickle_obj, unpickle_obj def reset_feature_fraction(boosting_round): diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a5ab4ecd5881..1368c7f512c1 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -19,8 +19,9 @@ import lightgbm as lgb -from .utils import (dummy_obj, load_boston, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, - make_synthetic_regression, mse_obj, sklearn_multiclass_custom_objective, softmax) +from .utils import (SERIALIZERS, dummy_obj, load_boston, load_breast_cancer, load_digits, load_iris, logistic_sigmoid, + make_synthetic_regression, mse_obj, pickle_and_unpickle_object, sklearn_multiclass_custom_objective, + softmax) decreasing_generator = itertools.count(0, -1) @@ -1089,26 +1090,45 @@ def test_cvbooster_save_load(tmp_path): callbacks=[lgb.early_stopping(stopping_rounds=5)], return_cvbooster=True) cvbooster = cv_res['cvbooster'] - preds = cvbooster.predict(X_test) model_path_txt = str(tmp_path / 'lgb.model') - model_path_pkl = str(tmp_path / 'lgb.pkl') cvbooster.save_model(model_path_txt) - with open(model_path_pkl, 'wb') as f: - pickle.dump(cvbooster, f) - + model_string = cvbooster.model_to_string() del cvbooster preds_from_txt_file = lgb.CVBooster(model_file=model_path_txt).predict(X_test) - preds_from_string = lgb.CVBooster().model_from_string(cvbooster.model_to_string()).predict(X_test) - with open(model_path_pkl, 'rb') as f: - preds_from_pkl = pickle.load(f).predict(X_test) - + preds_from_string = lgb.CVBooster().model_from_string(model_string).predict(X_test) np.testing.assert_array_equal(preds, preds_from_txt_file) np.testing.assert_array_equal(preds, preds_from_string) - np.testing.assert_array_equal(preds, preds_from_pkl) + + +@pytest.mark.parametrize('serializer', SERIALIZERS) +def test_cvbooster_picklable(serializer): + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + params = { + 'objective': 'binary', + 'metric': 'binary_logloss', + 'verbose': -1, + } + nfold = 3 + lgb_train = lgb.Dataset(X_train, y_train) + + cv_res = lgb.cv(params, lgb_train, + num_boost_round=10, + nfold=nfold, + callbacks=[lgb.early_stopping(stopping_rounds=5)], + return_cvbooster=True) + cvbooster = cv_res['cvbooster'] + preds = cvbooster.predict(X_test) + + cvbooster_from_disk = pickle_and_unpickle_object(obj=cvbooster, serializer=serializer) + del cvbooster + + preds_from_disk = cvbooster_from_disk.predict(X_test) + np.testing.assert_array_equal(preds, preds_from_disk) def test_feature_name(): diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 472343091566..2060f41b35b3 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -8,6 +8,10 @@ import sklearn.datasets from sklearn.utils import check_random_state +import lightgbm as lgb + +SERIALIZERS = ["pickle", "joblib", "cloudpickle"] + @lru_cache(maxsize=None) def load_boston(**kwargs): @@ -175,3 +179,17 @@ def unpickle_obj(filepath, serializer): return cloudpickle.load(f) else: raise ValueError(f'Unrecognized serializer type: {serializer}') + + +def pickle_and_unpickle_object(obj, serializer): + with lgb.basic._TempFile() as tmp_file: + pickle_obj( + obj=obj, + filepath=tmp_file.name, + serializer=serializer + ) + obj_from_disk = unpickle_obj( + filepath=tmp_file.name, + serializer=serializer + ) + return obj_from_disk From d511b5c925e10d4d9c61ee9011427e24e9e194f4 Mon Sep 17 00:00:00 2001 From: nyanp Date: Tue, 24 May 2022 23:11:36 +0900 Subject: [PATCH 12/18] Apply suggestions from code review Co-authored-by: James Lamb --- python-package/lightgbm/engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 0dc6bb0f3b38..d6031c575b5a 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -303,7 +303,7 @@ def _append(self, booster): """Add a booster to CVBooster.""" self.boosters.append(booster) - def _from_dict(self, models: Dict[str, any]) -> None: + def _from_dict(self, models: Dict[str, Any]) -> None: """Load CVBooster from dict.""" self.best_iteration = models["best_iteration"] self.boosters = [] @@ -356,7 +356,7 @@ def model_to_string( start_iteration: int = 0, importance_type: str = 'split' ) -> str: - """Save CVBooster to string. + """Save CVBooster to JSON string. Parameters ---------- @@ -374,7 +374,7 @@ def model_to_string( Returns ------- str_repr : str - String representation of CVBooster. + JSON string representation of CVBooster. """ return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type)) @@ -385,7 +385,7 @@ def save_model( start_iteration: int = 0, importance_type: str = 'split' ) -> "CVBooster": - """Save CVBoosters to file. + """Save CVBoosters to a file as JSON text. Parameters ---------- From dd88fccf593a1f600d8ea1f9cdb0f23ad46bb630 Mon Sep 17 00:00:00 2001 From: nyanp Date: Sun, 29 May 2022 23:41:55 +0900 Subject: [PATCH 13/18] explicitly state which methods are to be overridden --- python-package/lightgbm/engine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index d6031c575b5a..7826142f92c5 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -268,9 +268,13 @@ class CVBooster: Auxiliary data structure to hold and redirect all boosters of ``cv`` function. This class has the same methods as Booster class. - All method calls, except for saving and loading the model, are actually performed for underlying Boosters and + All method calls, except for the following methods, are actually performed for underlying Boosters and then all returned results are returned in a list. + - model_from_string + - model_to_string + - save_model + Attributes ---------- boosters : list of Booster From dfe32913e2975b243bc211785278a7f3dda5e07f Mon Sep 17 00:00:00 2001 From: nyanp Date: Tue, 19 Jul 2022 20:00:35 +0900 Subject: [PATCH 14/18] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/engine.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index d0328e892df8..d46bbd3bf4eb 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -272,14 +272,14 @@ def train( class CVBooster: """CVBooster in LightGBM. - Auxiliary data structure to hold and redirect all boosters of ``cv`` function. + Auxiliary data structure to hold and redirect all boosters of ``cv()`` function. This class has the same methods as Booster class. All method calls, except for the following methods, are actually performed for underlying Boosters and then all returned results are returned in a list. - - model_from_string - - model_to_string - - save_model + - ``model_from_string()`` + - ``model_to_string()`` + - ``save_model()`` Attributes ---------- @@ -306,7 +306,7 @@ def __init__( self.best_iteration = -1 if model_file is not None: - with open(str(model_file), "r") as file: + with open(model_file, "r") as file: self._from_dict(json.load(file)) def _append(self, booster: Booster) -> None: @@ -318,9 +318,9 @@ def _from_dict(self, models: Dict[str, Any]) -> None: self.best_iteration = models["best_iteration"] self.boosters = [] for model_str in models["boosters"]: - self.boosters.append(Booster(model_str=model_str)) + self._append(Booster(model_str=model_str)) - def _to_dict(self, num_iteration: int, start_iteration: int, importance_type: str) -> Dict[str, Any]: + def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]: """Serialize CVBooster to dict.""" models_str = [] for booster in self.boosters: @@ -400,7 +400,7 @@ def save_model( Parameters ---------- filename : str or pathlib.Path - Filename to save Booster. + Filename to save CVBooster. num_iteration : int or None, optional (default=None) Index of the iteration that should be saved. If None, if the best iteration exists, it is saved; otherwise, all iterations are saved. From cf52d45098621771de0f6d5bed3c4fc835af4d3a Mon Sep 17 00:00:00 2001 From: nyanp Date: Sat, 30 Jul 2022 12:48:23 +0900 Subject: [PATCH 15/18] remove comment --- python-package/lightgbm/engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index d46bbd3bf4eb..ad2d7d14b1fe 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -295,8 +295,6 @@ def __init__( ): """Initialize the CVBooster. - Generally, no need to instantiate manually. - Parameters ---------- model_file : str, pathlib.Path or None, optional (default=None) From 26365003532865bdf751684f7beb06b4074279e2 Mon Sep 17 00:00:00 2001 From: nyanp Date: Sat, 30 Jul 2022 13:09:31 +0900 Subject: [PATCH 16/18] test best_iteration --- tests/python_package_test/test_engine.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c10d7adabddd..9137ea7f11ff 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1092,6 +1092,7 @@ def test_cvbooster_save_load(tmp_path): return_cvbooster=True) cvbooster = cv_res['cvbooster'] preds = cvbooster.predict(X_test) + best_iteration = cvbooster.best_iteration model_path_txt = str(tmp_path / 'lgb.model') @@ -1099,10 +1100,11 @@ def test_cvbooster_save_load(tmp_path): model_string = cvbooster.model_to_string() del cvbooster - preds_from_txt_file = lgb.CVBooster(model_file=model_path_txt).predict(X_test) - preds_from_string = lgb.CVBooster().model_from_string(model_string).predict(X_test) - np.testing.assert_array_equal(preds, preds_from_txt_file) - np.testing.assert_array_equal(preds, preds_from_string) + cvbooster_from_txt_file = lgb.CVBooster(model_file=model_path_txt) + cvbooster_from_string = lgb.CVBooster().model_from_string(model_string) + for cvbooster_loaded in [cvbooster_from_txt_file, cvbooster_from_string]: + assert best_iteration == cvbooster_loaded.best_iteration + np.testing.assert_array_equal(preds, cvbooster_loaded.predict(X_test)) @pytest.mark.parametrize('serializer', SERIALIZERS) From c3a493d327ad3fc65ed308bb38e66e7c2add5fb5 Mon Sep 17 00:00:00 2001 From: nyanp Date: Thu, 4 Aug 2022 22:56:26 +0900 Subject: [PATCH 17/18] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/engine.py | 2 +- tests/python_package_test/test_engine.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index ad2d7d14b1fe..2cc050362d35 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -393,7 +393,7 @@ def save_model( start_iteration: int = 0, importance_type: str = 'split' ) -> "CVBooster": - """Save CVBoosters to a file as JSON text. + """Save CVBooster to a file as JSON text. Parameters ---------- diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 33fc6f041047..4c2ae4cb8b12 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1076,7 +1076,7 @@ def test_cvbooster(): def test_cvbooster_save_load(tmp_path): X, y = load_breast_cancer(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -1110,7 +1110,7 @@ def test_cvbooster_save_load(tmp_path): @pytest.mark.parametrize('serializer', SERIALIZERS) def test_cvbooster_picklable(serializer): X, y = load_breast_cancer(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -1126,10 +1126,13 @@ def test_cvbooster_picklable(serializer): return_cvbooster=True) cvbooster = cv_res['cvbooster'] preds = cvbooster.predict(X_test) +best_iteration = cvbooster.best_iteration cvbooster_from_disk = pickle_and_unpickle_object(obj=cvbooster, serializer=serializer) del cvbooster + assert best_iteration == cvbooster_from_disk.best_iteration + preds_from_disk = cvbooster_from_disk.predict(X_test) np.testing.assert_array_equal(preds, preds_from_disk) From 0b7ad39578488049252b7928844254c544fdb7d0 Mon Sep 17 00:00:00 2001 From: nyanp Date: Wed, 10 Aug 2022 07:20:47 +0900 Subject: [PATCH 18/18] Update tests/python_package_test/test_engine.py Co-authored-by: Nikita Titov --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4c2ae4cb8b12..e2877a76a549 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1126,7 +1126,7 @@ def test_cvbooster_picklable(serializer): return_cvbooster=True) cvbooster = cv_res['cvbooster'] preds = cvbooster.predict(X_test) -best_iteration = cvbooster.best_iteration + best_iteration = cvbooster.best_iteration cvbooster_from_disk = pickle_and_unpickle_object(obj=cvbooster, serializer=serializer) del cvbooster