From 691aa043f7dbd8bf4ce819e39f9fa1d016bebe9c Mon Sep 17 00:00:00 2001
From: gAldeia
Date: Fri, 8 Nov 2024 20:15:08 -0300
Subject: [PATCH] Fixed sample weights not working in lexicase

---
 pybrush/EstimatorInterface.py |  6 ++---
 src/engine.cpp                | 13 +++++++----
 src/eval/evaluation.cpp       |  4 ++--
 src/ind/individual.h          |  2 +-
 src/params.h                  | 44 +++++++++++++++++++++--------------
 src/pop/population.cpp        |  5 ++++
 src/pop/population.h          | 16 ++++++-------
 src/selection/lexicase.cpp    |  4 ++++
 8 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/pybrush/EstimatorInterface.py b/pybrush/EstimatorInterface.py
index 8e5210cd..38403078 100644
--- a/pybrush/EstimatorInterface.py
+++ b/pybrush/EstimatorInterface.py
@@ -221,8 +221,8 @@ def _wrap_parameters(self, y, **extra_kwargs):
         params = Parameters()

         # Setting up the classification or regression problem
-        params.classification = self.mode == "classification"
-        if params.classification:
+        if self.mode == "classification":
+            params.classification = True
             params.set_n_classes(y)
             params.set_class_weights(y)
             params.set_sample_weights(y)
@@ -256,7 +256,7 @@ def _wrap_parameters(self, y, **extra_kwargs):
         params.max_time = self.max_time

         # Sampling probabilities
-        params.weights_init=self.weights_init
+        params.weights_init = self.weights_init
         params.bandit = self.bandit

         params.mutation_probs = self.mutation_probs
diff --git a/src/engine.cpp b/src/engine.cpp
index d62d2552..e201956b 100644
--- a/src/engine.cpp
+++ b/src/engine.cpp
@@ -335,14 +335,17 @@ void Engine::run(Dataset &data)
     this->init();

     if (params.load_population != "")
     {
+        // std::cout << "Loading population from: " << params.load_population << std::endl;
         this->pop.load(params.load_population);
         // invalidating all individuals
-        // for (auto& individual : this->pop.individuals) {
-        //     if (individual != nullptr) {
-        //         individual->set_is_fitted(false);
-        //     }
-        // }
+        for (auto& individual : this->pop.individuals) {
+            if (individual != nullptr) {
+                individual->set_is_fitted(false);
+                // std::cout << "Invalidated individual with ID: " << individual->id << std::endl;
+            }
+        }
+        // std::cout << "Population loaded and individuals invalidated." << std::endl;
     }
     else
         this->pop.init(this->ss, this->params);
diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp
index 16e6cb6f..1d528fc6 100644
--- a/src/eval/evaluation.cpp
+++ b/src/eval/evaluation.cpp
@@ -63,8 +63,8 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data,
     VectorXf val_errors;
     f_v = S.score(ind, validation, val_errors, params);

-    if (val)
-        ind.error = val_errors;
+    // if (val) // never use validation data here. This is used in lexicase selection
+    //     ind.error = val_errors;
 }

 // This is what is going to determine the weights for the individual's fitness
diff --git a/src/ind/individual.h b/src/ind/individual.h
index f1b8e1a5..6737efcb 100644
--- a/src/ind/individual.h
+++ b/src/ind/individual.h
@@ -30,7 +30,7 @@ class Individual{

     // storing what changed in relation to parent inside variation
     string variation = "born"; // spontaneous generation (born), crossover, or which type of mutation

-    vector sampled_nodes; // nodes that were sampled in mutation
+    vector sampled_nodes = {}; // nodes that were sampled in mutation

     VectorXf error; ///< training error (used in lexicase selectors)
diff --git a/src/params.h b/src/params.h
index 63fdd004..7f8279da 100644
--- a/src/params.h
+++ b/src/params.h
@@ -64,14 +64,14 @@ struct Parameters

     string scorer="mse"; ///< actual loss function used, determined by error

-    vector classes; ///< class labels
+    vector classes = vector(); ///< class labels
     vector class_weights = vector(); ///< weights for each class
     vector sample_weights = vector(); ///< weights for each sample

     // for creating dataset from X and y in Engine::fit. Ignored if
     // the user uses a dataset
-    bool classification;
-    unsigned int n_classes;
+    bool classification = false;
+    unsigned int n_classes = 0;

     // validation partition
     bool shuffle_split = false;
@@ -188,31 +188,36 @@ struct Parameters
     bool get_weights_init(){ return weights_init; };
     void set_n_classes(const ArrayXf& y){
-        vector uc = unique( ArrayXi(y.cast<int>()) );
-
-        if (int(uc.at(0)) != 0)
-            HANDLE_ERROR_THROW("Class labels must start at 0");
-
-        vector cont_classes(uc.size());
-        iota(cont_classes.begin(), cont_classes.end(), 0);
-        for (int i = 0; i < cont_classes.size(); ++i)
+        if (classification)
         {
-            if ( int(uc.at(i)) != cont_classes.at(i))
-                HANDLE_ERROR_THROW("Class labels must be contiguous");
+            vector uc = unique( ArrayXi(y.cast<int>()) );
+
+            if (int(uc.at(0)) != 0)
+                HANDLE_ERROR_THROW("Class labels must start at 0");
+
+            vector cont_classes(uc.size());
+            iota(cont_classes.begin(), cont_classes.end(), 0);
+            for (int i = 0; i < cont_classes.size(); ++i)
+            {
+                if ( int(uc.at(i)) != cont_classes.at(i))
+                    HANDLE_ERROR_THROW("Class labels must be contiguous");
+            }
+            n_classes = uc.size();
+            // classes = uc;
         }
-        n_classes = uc.size();
     };

     void set_class_weights(const ArrayXf& y){
         class_weights.resize(n_classes); // set_n_classes must be called first
         for (unsigned i = 0; i < n_classes; ++i){
             class_weights.at(i) = float((y.cast<int>().array() == i).count())/y.size();
-            class_weights.at(i) = (1 - class_weights.at(i))*float(n_classes);
+            class_weights.at(i) = (1.0 - class_weights.at(i))*float(n_classes);
         }
     };

     void set_sample_weights(const ArrayXf& y){
-        sample_weights.clear(); // set_class_weights must be called first
-        for (unsigned i = 0; i < y.size(); ++i)
-            sample_weights.push_back(class_weights.at(int(y(i))));
+        sample_weights.resize(0); // set_class_weights must be called first
+        if (!class_weights.empty())
+            for (unsigned i = 0; i < y.size(); ++i)
+                sample_weights.push_back(class_weights.at(int(y(i))));
     };
     unsigned int get_n_classes(){ return n_classes; };
@@ -261,6 +266,9 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Parameters,
     mig_prob,
     classification,
     n_classes,
+    classes, // TODO: get rid of this parameter? For some reason, when I remove it (or set it to any value) the load population starts to fail with regression
+    class_weights,
+    sample_weights,
     validation_size,
     feature_names,
     batch_size,
diff --git a/src/pop/population.cpp b/src/pop/population.cpp
index c7cf6e16..28245828 100644
--- a/src/pop/population.cpp
+++ b/src/pop/population.cpp
@@ -124,13 +124,18 @@ void Population::load(string filename)

     std::string line;
     indata >> line;

+    // std::cout << "Debug: Read line from file " << std::endl;
+
     json j = json::parse(line);
     from_json(j, *this);

+    // std::cout << "Debug: Parsed JSON successfully." << std::endl;
+
     logger.log("Loaded population from " + filename + " of size = " + to_string(this->size()),1);

     indata.close();
+    // std::cout << "Debug: Closed input file." << std::endl;
 }

 /// update individual vector size and island indexes
diff --git a/src/pop/population.h b/src/pop/population.h
index ef005857..afa4560a 100644
--- a/src/pop/population.h
+++ b/src/pop/population.h
@@ -97,14 +97,14 @@ class Population{
     };
 };

-NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(
-    Population, individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
-NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(
-    Population, individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
-NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(
-    Population, individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
-NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(
-    Population, individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
+NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population,
+    individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
+NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population,
+    individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
+NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population,
+    individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);
+NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population,
+    individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity);

 }// Pop
 }// Brush
diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp
index be68cf98..c04527ff 100644
--- a/src/selection/lexicase.cpp
+++ b/src/selection/lexicase.cpp
@@ -73,6 +73,10 @@ vector Lexicase::select(Population& pop, int island,
     vector cases; // cases (samples)
     if (params.classification && !params.class_weights.empty())
     {
+        // NOTE: when calling lexicase, make sure `errors` comes from the
+        // training data and not from the validation data. This is because
+        // the sample-weight indexes are based on the train partition
+
         // for classification problems, weight case selection
         // by class weights
         cases.resize(0);
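
Note: the weighting scheme this patch guards and serializes (Parameters::set_class_weights and Parameters::set_sample_weights above) gives each class a weight of (1 - class frequency) * n_classes and assigns each sample the weight of its training-set class label, which is why lexicase must be fed errors computed on the train partition. The following is a minimal NumPy sketch of that arithmetic for illustration only; it is not part of the patch, and the function name is invented here.

    import numpy as np

    def class_and_sample_weights(y):
        # Mirrors Parameters::set_class_weights / set_sample_weights:
        # each class weight is (1 - class frequency) * n_classes, and each
        # sample inherits the weight of its (training) class label.
        y = y.astype(int)
        n_classes = len(np.unique(y))
        class_weights = np.array(
            [(1.0 - np.mean(y == c)) * n_classes for c in range(n_classes)])
        sample_weights = class_weights[y]  # indexed by training labels
        return class_weights, sample_weights

    # Imbalanced binary example: the minority class gets the larger weight.
    y = np.array([0, 0, 0, 1])
    print(class_and_sample_weights(y))
    # class_weights -> [0.5, 1.5]; sample_weights -> [0.5, 0.5, 0.5, 1.5]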