From 24b7df07ea0ffcba4e00b693c7108a9c3c3ef6f9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 17 Apr 2022 15:54:41 +0000 Subject: [PATCH 01/10] consider number of features from model in continual training --- include/LightGBM/dataset_loader.h | 4 ++++ src/io/dataset_loader.cpp | 18 +++++++++++++++--- src/io/parser.cpp | 20 ++++++++++++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index e4c93e182d96..02e1049a8a67 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -16,6 +16,8 @@ namespace LightGBM { class DatasetLoader { public: + LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename, const int num_features_from_init_model); + LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename); LIGHTGBM_EXPORT ~DatasetLoader(); @@ -95,6 +97,8 @@ class DatasetLoader { std::unordered_set categorical_features_; /*! \brief Whether to store raw feature values */ bool store_raw_; + /*! \brief number of features from init model */ + int num_features_from_init_model_; }; } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 1c76a9627376..f772fbd6662b 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -18,7 +18,19 @@ namespace LightGBM { using json11::Json; DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename) - :config_(io_config), random_(config_.data_random_seed), predict_fun_(predict_fun), num_class_(num_class) { + :DatasetLoader(io_config, predict_fun, num_class, filename, 0) {} + +DatasetLoader::DatasetLoader( + const Config& io_config, + const PredictFunction& predict_fun, + int num_class, + const char* filename, + const int num_features_from_init_model) + :config_(io_config), + random_(config_.data_random_seed), + predict_fun_(predict_fun), + num_class_(num_class), + num_features_from_init_model_(num_features_from_init_model) { label_idx_ = 0; weight_idx_ = NO_SPECIFIC; group_idx_ = NO_SPECIFIC; @@ -218,7 +230,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac bool is_load_from_binary = false; if (bin_filename.size() == 0) { dataset->parser_config_str_ = Parser::GenerateParserConfigStr(filename, config_.parser_config_file.c_str(), config_.header, label_idx_); - auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, 0, label_idx_, + auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, num_features_from_init_model_, label_idx_, config_.precise_float_parser, dataset->parser_config_str_)); if (parser == nullptr) { Log::Fatal("Could not recognize data format of %s", filename); @@ -305,7 +317,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, } auto bin_filename = CheckCanLoadFromBin(filename); if (bin_filename.size() == 0) { - auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, 0, label_idx_, + auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, num_features_from_init_model_, label_idx_, config_.precise_float_parser, train_data->parser_config_str_)); if (parser == nullptr) { Log::Fatal("Could not recognize data format of %s", filename); diff --git a/src/io/parser.cpp b/src/io/parser.cpp index 68e4cdb8116b..d32b3c4c4fc4 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -270,12 +270,32 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features if (type == DataType::LIBSVM) { output_label_index = GetLabelIdxForLibsvm(lines[0], num_features, label_idx); ret.reset(new LibSVMParser(output_label_index, num_col, atof)); + if (num_features > 0) { + // num_features is specified by loaded model + // this can happen when loading a model for prediction + // or loading a model for continual training + if(ret->NumFeatures() > num_features) { + Log::Warning("Number of features found in LibSVM file (%d) is greater than number of features found in loaded model (%d).", + ret->NumFeatures(), num_features); + } else { + Log::Warning("Number of features found in LivSVM file (%d) is smaller than number of features found in loaded model (%d).", + ret->NumFeatures(), num_features); + } + } } else if (type == DataType::TSV) { output_label_index = GetLabelIdxForTSV(lines[0], num_features, label_idx); ret.reset(new TSVParser(output_label_index, num_col, atof)); + if (num_features > 0 && num_features != ret->NumFeatures()) { + Log::Fatal("Number of features in TSV file (%d) mismatches the number of features (%d) in loaded model.", + ret->NumFeatures(), num_features); + } } else if (type == DataType::CSV) { output_label_index = GetLabelIdxForCSV(lines[0], num_features, label_idx); ret.reset(new CSVParser(output_label_index, num_col, atof)); + if (num_features > 0 && num_features != ret->NumFeatures()) { + Log::Fatal("Number of features in CSV file (%d) mismatches the number of features (%d) in loaded model.", + ret->NumFeatures(), num_features); + } } if (output_label_index < 0 && label_idx >= 0) { From 442c03a9571f7a9cff3366ee451c2e9b29732b95 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Apr 2022 03:34:39 +0000 Subject: [PATCH 02/10] rollback changes --- include/LightGBM/dataset_loader.h | 4 ---- src/io/dataset_loader.cpp | 18 +++--------------- src/io/parser.cpp | 20 -------------------- 3 files changed, 3 insertions(+), 39 deletions(-) diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 02e1049a8a67..e4c93e182d96 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -16,8 +16,6 @@ namespace LightGBM { class DatasetLoader { public: - LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename, const int num_features_from_init_model); - LIGHTGBM_EXPORT DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename); LIGHTGBM_EXPORT ~DatasetLoader(); @@ -97,8 +95,6 @@ class DatasetLoader { std::unordered_set categorical_features_; /*! \brief Whether to store raw feature values */ bool store_raw_; - /*! \brief number of features from init model */ - int num_features_from_init_model_; }; } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f772fbd6662b..1c76a9627376 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -18,19 +18,7 @@ namespace LightGBM { using json11::Json; DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename) - :DatasetLoader(io_config, predict_fun, num_class, filename, 0) {} - -DatasetLoader::DatasetLoader( - const Config& io_config, - const PredictFunction& predict_fun, - int num_class, - const char* filename, - const int num_features_from_init_model) - :config_(io_config), - random_(config_.data_random_seed), - predict_fun_(predict_fun), - num_class_(num_class), - num_features_from_init_model_(num_features_from_init_model) { + :config_(io_config), random_(config_.data_random_seed), predict_fun_(predict_fun), num_class_(num_class) { label_idx_ = 0; weight_idx_ = NO_SPECIFIC; group_idx_ = NO_SPECIFIC; @@ -230,7 +218,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac bool is_load_from_binary = false; if (bin_filename.size() == 0) { dataset->parser_config_str_ = Parser::GenerateParserConfigStr(filename, config_.parser_config_file.c_str(), config_.header, label_idx_); - auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, num_features_from_init_model_, label_idx_, + auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, 0, label_idx_, config_.precise_float_parser, dataset->parser_config_str_)); if (parser == nullptr) { Log::Fatal("Could not recognize data format of %s", filename); @@ -317,7 +305,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, } auto bin_filename = CheckCanLoadFromBin(filename); if (bin_filename.size() == 0) { - auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, num_features_from_init_model_, label_idx_, + auto parser = std::unique_ptr(Parser::CreateParser(filename, config_.header, 0, label_idx_, config_.precise_float_parser, train_data->parser_config_str_)); if (parser == nullptr) { Log::Fatal("Could not recognize data format of %s", filename); diff --git a/src/io/parser.cpp b/src/io/parser.cpp index d32b3c4c4fc4..68e4cdb8116b 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -270,32 +270,12 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features if (type == DataType::LIBSVM) { output_label_index = GetLabelIdxForLibsvm(lines[0], num_features, label_idx); ret.reset(new LibSVMParser(output_label_index, num_col, atof)); - if (num_features > 0) { - // num_features is specified by loaded model - // this can happen when loading a model for prediction - // or loading a model for continual training - if(ret->NumFeatures() > num_features) { - Log::Warning("Number of features found in LibSVM file (%d) is greater than number of features found in loaded model (%d).", - ret->NumFeatures(), num_features); - } else { - Log::Warning("Number of features found in LivSVM file (%d) is smaller than number of features found in loaded model (%d).", - ret->NumFeatures(), num_features); - } - } } else if (type == DataType::TSV) { output_label_index = GetLabelIdxForTSV(lines[0], num_features, label_idx); ret.reset(new TSVParser(output_label_index, num_col, atof)); - if (num_features > 0 && num_features != ret->NumFeatures()) { - Log::Fatal("Number of features in TSV file (%d) mismatches the number of features (%d) in loaded model.", - ret->NumFeatures(), num_features); - } } else if (type == DataType::CSV) { output_label_index = GetLabelIdxForCSV(lines[0], num_features, label_idx); ret.reset(new CSVParser(output_label_index, num_col, atof)); - if (num_features > 0 && num_features != ret->NumFeatures()) { - Log::Fatal("Number of features in CSV file (%d) mismatches the number of features (%d) in loaded model.", - ret->NumFeatures(), num_features); - } } if (output_label_index < 0 && label_idx >= 0) { From 2887ac0a913decf86f4f6772a0f554c3fba5ddf5 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Apr 2022 04:58:39 +0000 Subject: [PATCH 03/10] handle mismatches only within gbdt_model_text.cpp --- src/boosting/gbdt_model_text.cpp | 37 ++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 73c3ea98d3f6..74dbad1e3aed 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -104,7 +104,16 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - pairs.emplace_back(feature_importances_int, feature_names_[i]); + Log::Warning("i = %d, feature_names_.size() = %d", i, feature_names_.size()); + if (i < feature_names_.size()) { + pairs.emplace_back(feature_importances_int, feature_names_[i]); + } else { + // with LibSVM format and continual training, the number of features in dataset can be fewer than in the intial model + // in that case FeatureImportance returns with the number of features in the intial model + std::stringstream str_buf; + str_buf << "Column_" << i << "_from_init_model"; + pairs.emplace_back(feature_importances_int, str_buf.str()); + } } } str_buf << '\n' << "\"feature_importances\":" << "{"; @@ -377,7 +386,15 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - pairs.emplace_back(feature_importances_int, feature_names_[i]); + if (i < feature_names_.size()) { + pairs.emplace_back(feature_importances_int, feature_names_[i]); + } else { + // with LibSVM format and continual training, the number of features in dataset can be fewer than in the intial model + // in that case FeatureImportance returns with the number of features in the intial model + std::stringstream str_buf; + str_buf << "Column_" << i << "_from_init_model"; + pairs.emplace_back(feature_importances_int, str_buf.str()); + } } } // sort the importance @@ -636,10 +653,14 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + const int real_feature_index = models_[iter]->split_feature(split_idx); #ifdef DEBUG - CHECK_GE(models_[iter]->split_feature(split_idx), 0); + CHECK_GE(real_feature_index, 0); #endif - feature_importances[models_[iter]->split_feature(split_idx)] += 1.0; + if (static_cast(real_feature_index) >= feature_importances.size()) { + feature_importances.resize(real_feature_index + 1); + } + feature_importances[real_feature_index] += 1.0; } } } @@ -647,10 +668,14 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + const int real_feature_index = models_[iter]->split_feature(split_idx); #ifdef DEBUG - CHECK_GE(models_[iter]->split_feature(split_idx), 0); + CHECK_GE(real_feature_index, 0); #endif - feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx); + if (static_cast(real_feature_index) >= feature_importances.size()) { + feature_importances.resize(real_feature_index + 1); + } + feature_importances[real_feature_index] += models_[iter]->split_gain(split_idx); } } } From 0a20a132667114415bfc353f49cf138264edd6e4 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 5 May 2022 15:14:06 +0000 Subject: [PATCH 04/10] remove useless debug information --- src/boosting/gbdt_model_text.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 74dbad1e3aed..fa6ee02f1634 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -104,7 +104,6 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - Log::Warning("i = %d, feature_names_.size() = %d", i, feature_names_.size()); if (i < feature_names_.size()) { pairs.emplace_back(feature_importances_int, feature_names_[i]); } else { From bda5bdbad8331f1fe70a87a9fd9ea872c30e3cb5 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 9 May 2022 04:28:33 +0000 Subject: [PATCH 05/10] add a test case --- tests/python_package_test/test_engine.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 1b202b413a2b..ed1601ed0851 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -966,6 +966,26 @@ def test_continue_train_multiclass(): assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret) +def test_continue_train_different_feature_size(): + np.random.seed(0) + train_X = np.random.randn(100, 10) + train_y = np.sum(train_X, axis=1) + train_data = lgb.Dataset(train_X, label=train_y) + params = { + "objective": "regression", + "num_trees": 10, + "num_leaves": 31, + "verbose": 2, + 'predict_disable_shape_check': True, + } + model = lgb.train(train_set=train_data, params=params) + + train_X_cont = np.random.rand(100, 5) + train_y_cont = np.sum(train_X_cont, axis=1) + train_data_cont = lgb.Dataset(train_X_cont, label=train_y_cont) + lgb.train(train_set=train_data_cont, params=params, init_model=model) + + def test_cv(): X_train, y_train = make_synthetic_regression() params = {'verbose': -1} From 7e7190ffa340498505ad4456f56b31e1cef3825b Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Tue, 10 May 2022 08:53:28 +0800 Subject: [PATCH 06/10] Update tests/python_package_test/test_engine.py Co-authored-by: Nikita Titov --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index ed1601ed0851..087a852f0616 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -975,7 +975,7 @@ def test_continue_train_different_feature_size(): "objective": "regression", "num_trees": 10, "num_leaves": 31, - "verbose": 2, + "verbose": -1, 'predict_disable_shape_check': True, } model = lgb.train(train_set=train_data, params=params) From c823137f038bafe0e8e3e5a8c1825637f885c514 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 10 May 2022 02:13:14 +0000 Subject: [PATCH 07/10] provide warning instead of enlarging feature importances --- src/boosting/gbdt_model_text.cpp | 28 +++++++++++++----------- tests/python_package_test/test_engine.py | 9 ++++++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index fa6ee02f1634..72d4c45cb284 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -104,15 +104,7 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - if (i < feature_names_.size()) { - pairs.emplace_back(feature_importances_int, feature_names_[i]); - } else { - // with LibSVM format and continual training, the number of features in dataset can be fewer than in the intial model - // in that case FeatureImportance returns with the number of features in the intial model - std::stringstream str_buf; - str_buf << "Column_" << i << "_from_init_model"; - pairs.emplace_back(feature_importances_int, str_buf.str()); - } + pairs.emplace_back(feature_importances_int, feature_names_[i]); } } str_buf << '\n' << "\"feature_importances\":" << "{"; @@ -648,6 +640,8 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty } std::vector feature_importances(max_feature_idx_ + 1, 0.0); + bool warn_about_feature_number = false; + int max_feature_index_found = -1; if (importance_type == 0) { for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { @@ -657,9 +651,11 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty CHECK_GE(real_feature_index, 0); #endif if (static_cast(real_feature_index) >= feature_importances.size()) { - feature_importances.resize(real_feature_index + 1); + warn_about_feature_number = true; + max_feature_index_found = real_feature_index; + } else { + feature_importances[real_feature_index] += 1.0; } - feature_importances[real_feature_index] += 1.0; } } } @@ -672,15 +668,21 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty CHECK_GE(real_feature_index, 0); #endif if (static_cast(real_feature_index) >= feature_importances.size()) { - feature_importances.resize(real_feature_index + 1); + warn_about_feature_number = true; + max_feature_index_found = real_feature_index; + } else { + feature_importances[real_feature_index] += models_[iter]->split_gain(split_idx); } - feature_importances[real_feature_index] += models_[iter]->split_gain(split_idx); } } } } else { Log::Fatal("Unknown importance type: only support split=0 and gain=1"); } + if (warn_about_feature_number) { + Log::Warning("%d features found in continually trained model, but at least %d features found in initial model.", + static_cast(feature_importances.size(), max_feature_index_found + 1)); + } return feature_importances; } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 80200b92843b..11c17a57f59d 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -970,7 +970,7 @@ def test_continue_train_multiclass(): assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret) -def test_continue_train_different_feature_size(): +def test_continue_train_different_feature_size(capsys): np.random.seed(0) train_X = np.random.randn(100, 10) train_y = np.sum(train_X, axis=1) @@ -979,7 +979,7 @@ def test_continue_train_different_feature_size(): "objective": "regression", "num_trees": 10, "num_leaves": 31, - "verbose": 2, + "verbose": -1, 'predict_disable_shape_check': True, } model = lgb.train(train_set=train_data, params=params) @@ -987,7 +987,12 @@ def test_continue_train_different_feature_size(): train_X_cont = np.random.rand(100, 5) train_y_cont = np.sum(train_X_cont, axis=1) train_data_cont = lgb.Dataset(train_X_cont, label=train_y_cont) + params.update({"verbose": 2}) lgb.train(train_set=train_data_cont, params=params, init_model=model) + captured = capsys.readouterr() + assert captured.out.find("features found in continually trained model, but at least") != -1 + #feature_importance = new_model.feature_importance() + #assert len(feature_importance) == 10 def test_cv(): From 3e525381e192e4de6180b91001dcb15924f9b61a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 10 May 2022 02:15:24 +0000 Subject: [PATCH 08/10] remove useless comments --- src/boosting/gbdt_model_text.cpp | 10 +--------- tests/python_package_test/test_engine.py | 2 -- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 72d4c45cb284..41f4e1df3097 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -377,15 +377,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - if (i < feature_names_.size()) { - pairs.emplace_back(feature_importances_int, feature_names_[i]); - } else { - // with LibSVM format and continual training, the number of features in dataset can be fewer than in the intial model - // in that case FeatureImportance returns with the number of features in the intial model - std::stringstream str_buf; - str_buf << "Column_" << i << "_from_init_model"; - pairs.emplace_back(feature_importances_int, str_buf.str()); - } + pairs.emplace_back(feature_importances_int, feature_names_[i]); } } // sort the importance diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 11c17a57f59d..273c63b7fe89 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -991,8 +991,6 @@ def test_continue_train_different_feature_size(capsys): lgb.train(train_set=train_data_cont, params=params, init_model=model) captured = capsys.readouterr() assert captured.out.find("features found in continually trained model, but at least") != -1 - #feature_importance = new_model.feature_importance() - #assert len(feature_importance) == 10 def test_cv(): From 23e99913630f5f9fa6bb8cfffd78ff1209894267 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 11 May 2022 03:17:56 +0000 Subject: [PATCH 09/10] ensure the warning is raised in test case --- src/boosting/gbdt_model_text.cpp | 12 ++++++++---- tests/python_package_test/test_engine.py | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 41f4e1df3097..12a0bf7cfed4 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -644,7 +644,9 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty #endif if (static_cast(real_feature_index) >= feature_importances.size()) { warn_about_feature_number = true; - max_feature_index_found = real_feature_index; + if (real_feature_index > max_feature_index_found) { + max_feature_index_found = real_feature_index; + } } else { feature_importances[real_feature_index] += 1.0; } @@ -661,7 +663,9 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty #endif if (static_cast(real_feature_index) >= feature_importances.size()) { warn_about_feature_number = true; - max_feature_index_found = real_feature_index; + if (real_feature_index > max_feature_index_found) { + max_feature_index_found = real_feature_index; + } } else { feature_importances[real_feature_index] += models_[iter]->split_gain(split_idx); } @@ -672,8 +676,8 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty Log::Fatal("Unknown importance type: only support split=0 and gain=1"); } if (warn_about_feature_number) { - Log::Warning("%d features found in continually trained model, but at least %d features found in initial model.", - static_cast(feature_importances.size(), max_feature_index_found + 1)); + Log::Warning("Only %d features found in dataset for continual training, but at least %d features found in initial model.", + static_cast(feature_importances.size()), max_feature_index_found + 1); } return feature_importances; } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 273c63b7fe89..6d5579180090 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -972,8 +972,8 @@ def test_continue_train_multiclass(): def test_continue_train_different_feature_size(capsys): np.random.seed(0) - train_X = np.random.randn(100, 10) - train_y = np.sum(train_X, axis=1) + train_X = np.hstack([np.ones(800).reshape(-1, 8), np.arange(200, 0, -1).reshape(-1, 2)]) + train_y = np.sum(train_X[:, -2:], axis=1) train_data = lgb.Dataset(train_X, label=train_y) params = { "objective": "regression", @@ -990,7 +990,7 @@ def test_continue_train_different_feature_size(capsys): params.update({"verbose": 2}) lgb.train(train_set=train_data_cont, params=params, init_model=model) captured = capsys.readouterr() - assert captured.out.find("features found in continually trained model, but at least") != -1 + assert captured.out.find("features found in dataset for continual training, but at least") != -1 def test_cv(): From d719d78d867532267e4c7f28fa7bb6ba6f273eb0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 30 May 2022 03:36:45 +0000 Subject: [PATCH 10/10] add suggestion for the warning --- src/boosting/gbdt_model_text.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 12a0bf7cfed4..695eae3c70cb 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -678,6 +678,7 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty if (warn_about_feature_number) { Log::Warning("Only %d features found in dataset for continual training, but at least %d features found in initial model.", static_cast(feature_importances.size()), max_feature_index_found + 1); + Log::Warning("Please check the number of features used in continual training."); } return feature_importances; }