From c1e4366771b79bae334dd8f328ee12467b9de85e Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Tue, 26 Sep 2023 09:27:17 -0400
Subject: [PATCH 1/3] Initial structure to implement an Island GA. taskflow added as dependency.

---
 environment.yml |  1 +
 src/brushGA.cpp | 11 ++++++++
 src/brushGA.h   | 69 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/selection.h | 17 ++++++++++++
 4 files changed, 98 insertions(+)
 create mode 100644 src/brushGA.cpp
 create mode 100644 src/brushGA.h
 create mode 100644 src/selection.h

diff --git a/environment.yml b/environment.yml
index 53e9d812..8e6a62d6 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,6 +10,7 @@ dependencies:
   - gxx >= 12.0
   - ninja
   - ceres-solver
+  - taskflow
   - pybind11 #=2.6.2
   - pytest #=6.2.4
   - pydot
diff --git a/src/brushGA.cpp b/src/brushGA.cpp
new file mode 100644
index 00000000..1b16a4d7
--- /dev/null
+++ b/src/brushGA.cpp
@@ -0,0 +1,11 @@
+#include "brushGA.h"
+#include
+
+
+using namespace Brush;
+
+/// @brief initialize BrushGA object for fitting.
+void BrushGA::init()
+{
+
+}
diff --git a/src/brushGA.h b/src/brushGA.h
new file mode 100644
index 00000000..02d0c536
--- /dev/null
+++ b/src/brushGA.h
@@ -0,0 +1,69 @@
+/* Brush
+copyright 2020 William La Cava
+license: GNU/GPL v3
+*/
+
+#ifndef BrushGA_H
+#define BrushGA_H
+
+#include "init.h"
+#include "taskflow/taskflow.hpp"
+
+// TODO: improve the includes (why do the lines below not work?)
+// #include "variation.h"
+// #include "selection.h"
+
+// using namespace selection;
+// using namespace variation;
+
+namespace Brush
+{
+
+class BrushGA{
+public:
+
+    BrushGA(){}
+    /// destructor
+    ~BrushGA(){}
+
+    void init();
+
+    // getters and setters for GA configuration.
+    // getters and setters for the best solution found after evolution
+    // predict, transform, predict_proba, etc.
+    // get statistics
+    // load and save best individuals
+    // logger, save to file
+    // execution archive
+    // random state control
+    // score functions
+    // fit methods (this will run the evolution), run a single generation
+private:
+    // attributes (hyperparameters)
+    // update best
+    // calculate/print stats
+};
+
+int main(){
+
+    tf::Executor executor;
+    tf::Taskflow taskflow;
+
+    auto [A, B, C, D] = taskflow.emplace( // create four tasks
+        [] () { std::cout << "TaskA\n"; },
+        [] () { std::cout << "TaskB\n"; },
+        [] () { std::cout << "TaskC\n"; },
+        [] () { std::cout << "TaskD\n"; }
+    );
+
+    A.precede(B, C); // A runs before B and C
+    D.succeed(B, C); // D runs after B and C
+
+    executor.run(taskflow).wait();
+
+    return 0;
+}
+
+} // Brush
+
+#endif
diff --git a/src/selection.h b/src/selection.h
new file mode 100644
index 00000000..3ce1f849
--- /dev/null
+++ b/src/selection.h
@@ -0,0 +1,17 @@
+/* Brush
+copyright 2020 William La Cava
+license: GNU/GPL v3
+*/
+
+#ifndef SELECTION_H
+#define SELECTION_H
+
+namespace selection {
+
+class SelectorBase {
+public:
+private:
+};
+
+} // selection
+#endif
\ No newline at end of file

From 2a579b9cc7e7135dbf04e44987be4cb70f8b5b9a Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Sun, 8 Oct 2023 15:53:10 -0400
Subject: [PATCH 2/3] Python implementation of island model.
 Fixed API and tests.

---
 src/brush/deap_api/__init__.py    |   2 +
 src/brush/deap_api/nsga2island.py | 143 ++++++++++++++++++++++++++++++
 src/brush/estimator.py            |  25 ++++--
 tests/python/test_brush.py        |  11 ++-
 4 files changed, 172 insertions(+), 9 deletions(-)
 create mode 100644 src/brush/deap_api/nsga2island.py

diff --git a/src/brush/deap_api/__init__.py b/src/brush/deap_api/__init__.py
index b2b2dfa8..6cbb48db 100644
--- a/src/brush/deap_api/__init__.py
+++ b/src/brush/deap_api/__init__.py
@@ -1,2 +1,4 @@
 from .nsga2 import nsga2
+from .ga import ga
+from .nsga2island import nsga2island
 from .utils import DeapIndividual
\ No newline at end of file
diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py
new file mode 100644
index 00000000..c976b211
--- /dev/null
+++ b/src/brush/deap_api/nsga2island.py
@@ -0,0 +1,143 @@
+from deap import tools
+from deap.benchmarks.tools import diversity, convergence, hypervolume
+import numpy as np
+import functools
+
+
+def nsga2island(toolbox, NGEN, MU, N_ISLANDS, MIGPX, CXPB, use_batch, verbosity, rnd_flt):
+    # NGEN = 250
+    # MU = 100
+    # CXPB = 0.9
+    # N_ISLANDS: number of independent islands. Islands are controlled by indexes.
+    #   Setting N_ISLANDS=1 would be the same as the original nsga2.
+    # rnd_flt: random number generator to sample crossover prob
+
+    def calculate_statistics(ind):
+        on_train = ind.fitness.values
+        on_val = toolbox.evaluateValidation(ind)
+
+        return (*on_train, *on_val)
+
+    stats = tools.Statistics(calculate_statistics)
+
+    stats.register("avg", np.mean, axis=0)
+    stats.register("med", np.median, axis=0)
+    stats.register("std", np.std, axis=0)
+    stats.register("min", np.min, axis=0)
+    stats.register("max", np.max, axis=0)
+
+    logbook = tools.Logbook()
+    logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \
+                     "med (O1 train, O2 train, O1 val, O2 val)", \
+                     "std (O1 train, O2 train, O1 val, O2 val)", \
+                     "min (O1 train, O2 train, O1 val, O2 val)", \
+                     "max (O1 train, O2 train, O1 val, O2 val)"
+
+    # Tuples with start and end indexes for each island. Number of individuals
+    # in each island can slightly differ if N_ISLANDS is not a divisor of MU
+    island_indexes = [((i*MU)//N_ISLANDS, ((i+1)*MU)//N_ISLANDS)
+                      for i in range(N_ISLANDS)]
+
+    print("island_indexes", island_indexes)
+    pop = toolbox.population(n=MU)
+
+    fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop)
+    for ind, fit in zip(pop, fitnesses):
+        ind.fitness.values = fit
+
+    survived = []
+    for (idx_start, idx_end) in island_indexes:
+        survived_parents = toolbox.survive(pop[idx_start:idx_end],
+                                           idx_end-idx_start)
+        survived.extend(survived_parents)
+    pop = survived
+
+    record = stats.compile(pop)
+    logbook.record(gen=0, evals=len(pop), **record)
+
+    if verbosity > 0:
+        print(logbook.stream)
+
+    # Begin the generational process
+    for gen in range(1, NGEN):
+        batch = toolbox.getBatch() # batch will be a random subset only if it was not
+                                   # defined as the size of the train set. Every time
+                                   # this function is called, a new random batch is generated.
+
+        if (use_batch): # recalculate the fitness for the parents
+            # use_batch is true only when batch_size is smaller than the train set size.
+            # If we're using batch, we need to re-evaluate every model (without
+            # changing its weights).
+            # evaluateValidation doesn't fit the weights.
+            fitnesses = toolbox.map(
+                functools.partial(toolbox.evaluateValidation, data=batch), pop)
+
+            for ind, fit in zip(pop, fitnesses):
+                ind.fitness.values = fit
+
+        # Vary the population inside each island
+        parents = []
+        for (idx_start, idx_end) in island_indexes:
+            island_parents = toolbox.select(pop[idx_start:idx_end],
+                                            idx_end-idx_start)
+            parents.extend(island_parents)
+
+        offspring = [] # Will have the same size as pop
+        for (idx_start, idx_end) in island_indexes:
+            for ind1, ind2 in zip(parents[idx_start:idx_end:2],
+                                  parents[idx_start+1:idx_end:2]
+                                  ):
+                off1, off2 = None, None
+                if rnd_flt() < CXPB: # either mutation or crossover
+                    off1, off2 = toolbox.mate(ind1, ind2)
+                else:
+                    off1 = toolbox.mutate(ind1)
+                    off2 = toolbox.mutate(ind2)
+
+                # Inserting parent if variation failed
+                offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)])
+                offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)])
+
+        # Evaluate (instead of evaluateValidation) to fit the weights of the offspring
+        fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring)
+        if (use_batch): # calculating objectives based on batch
+            fitnesses = toolbox.map(
+                functools.partial(toolbox.evaluateValidation, data=batch), offspring)
+
+        for ind, fit in zip(offspring, fitnesses):
+            ind.fitness.values = fit
+
+        # Select the next generation population
+        new_pop = []
+        for (idx_start, idx_end) in island_indexes:
+            island_new_pop = toolbox.survive(pop[idx_start:idx_end] \
+                                             +offspring[idx_start:idx_end],
+                                             idx_end-idx_start)
+            new_pop.extend(island_new_pop)
+
+        # Migration to fill up the islands for the next generation
+        pop = []
+        for (idx_start, idx_end) in island_indexes:
+            other_islands = list(range(0, idx_start)) + list(range(idx_end, MU))
+            for idx_individual in range(idx_start, idx_end):
+                if rnd_flt() < MIGPX: # replace by someone not from the same island
+                    idx_other_individual = other_islands[
+                        int(rnd_flt() * len(other_islands))]
+                    pop.append(new_pop[idx_other_individual])
+                else:
+                    pop.append(new_pop[idx_individual])
+        print(len(pop))
+        for p in pop:
+            print(type(p), p)
+        record = stats.compile(pop)
+        logbook.record(gen=gen,
+                       evals=len(offspring)+(len(pop) if use_batch else 0), **record)
+
+        if verbosity > 0:
+            print(logbook.stream)
+
+    if verbosity > 0:
+        print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0]))
+
+    archive = tools.ParetoFront()
+    archive.update(pop)
+
+    return archive, logbook
\ No newline at end of file
diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index 5f07eabc..6249d2f9 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -13,7 +13,7 @@
 # from tqdm import tqdm
 from types import NoneType
 import _brush
-from .deap_api import nsga2, ga, DeapIndividual
+from .deap_api import nsga2, ga, nsga2island, DeapIndividual
 # from _brush import Dataset, SearchSpace


@@ -38,6 +38,12 @@ class BrushEstimator(BaseEstimator):
         Maximum depth of GP trees in the GP program. Use 0 for no limit.
     max_size : int, default 0
         Maximum number of nodes in a tree. Use 0 for no limit.
+    n_islands : int, default 5
+        Number of independent islands to use in the evolutionary framework.
+        Ignored if `algorithm!="nsga2island"`.
+    mig_prob : float, default 0.05
+        Probability of a migration occurring between two random islands at the
+        end of a generation, must be between 0 and 1.
     cx_prob : float, default 1/7
         Probability of applying the crossover variation when generating the offspring,
         must be between 0 and 1.
@@ -59,7 +65,7 @@ class BrushEstimator(BaseEstimator):
     initialization : {"grow", "full"}, default "grow"
         Strategy to create the initial population. If `full`, then every expression is created
         with `max_size` nodes. If `grow`, size will be uniformly distributed.
-    algorithm : {"nsga2", "ga"}, default "nsga2"
+    algorithm : {"nsga2island", "nsga2", "ga"}, default "nsga2island"
         Which Evolutionary Algorithm framework to use to evolve the population.
     validation_size : float, default 0.0
         Percentage of samples to use as a hold-out partition. These samples are used
@@ -106,12 +112,14 @@ def __init__(
         verbosity=0,
         max_depth=3,
         max_size=20,
+        n_islands=5,
+        mig_prob=0.05,
         cx_prob= 1/7,
         mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6,
                             "toggle_weight_on":1/6, "toggle_weight_off":1/6},
         functions: list[str]|dict[str,float] = {},
         initialization="grow",
-        algorithm="nsga2",
+        algorithm="nsga2island",
         random_state=None,
         validation_size: float = 0.0,
         batch_size: float = 1.0
@@ -123,6 +131,8 @@ def __init__(
         self.mode=mode
         self.max_depth=max_depth
         self.max_size=max_size
+        self.n_islands=n_islands
+        self.mig_prob=mig_prob
         self.cx_prob=cx_prob
         self.mutation_options=mutation_options
         self.functions=functions
@@ -155,7 +165,7 @@ def _setup_toolbox(self, data_train, data_validation):
         # When solving multi-objective problems, selection and survival must
         # support this feature. This means that these selection operators must
         # accept a tuple of fitnesses as argument)
-        if self.algorithm=="nsga2":
+        if self.algorithm=="nsga2" or self.algorithm=="nsga2island":
             toolbox.register("select", tools.selTournamentDCD)
             toolbox.register("survive", tools.selNSGA2)
         elif self.algorithm=="ga":
             toolbox.register("select", tools.selTournament, tournsize=3)
             def offspring(pop, MU): return pop[-MU:]
             toolbox.register("survive", offspring)
@@ -232,7 +242,12 @@ def fit(self, X, y):
         self.search_space_ = _brush.SearchSpace(self.train_, self.functions_)
         self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_)

-        if self.algorithm=="nsga2":
+        if self.algorithm=="nsga2island":
+            self.archive_, self.logbook_ = nsga2island(
+                self.toolbox_, self.max_gen, self.pop_size, self.n_islands,
+                self.mig_prob, self.cx_prob,
+                (0.0
Date: Wed, 11 Oct 2023 12:48:07 -0400
Subject: [PATCH 3/3] Option to use nsga or ga with island divisions

---
 src/brush/deap_api/__init__.py    | 1 -
 src/brush/deap_api/nsga2island.py | 5 +----
 src/brush/estimator.py            | 6 +++---
 src/variation.h                   | 1 +
 tests/cpp/test_program.cpp        | 8 ++++++++
 tests/python/test_brush.py        | 2 ++
 6 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/brush/deap_api/__init__.py b/src/brush/deap_api/__init__.py
index 6cbb48db..b011dec5 100644
--- a/src/brush/deap_api/__init__.py
+++ b/src/brush/deap_api/__init__.py
@@ -1,4 +1,3 @@
 from .nsga2 import nsga2
-from .ga import ga
 from .nsga2island import nsga2island
 from .utils import DeapIndividual
\ No newline at end of file
diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py
index c976b211..d215d3f7 100644
--- a/src/brush/deap_api/nsga2island.py
+++ b/src/brush/deap_api/nsga2island.py
@@ -38,7 +38,6 @@ def calculate_statistics(ind):
     island_indexes = [((i*MU)//N_ISLANDS, ((i+1)*MU)//N_ISLANDS)
                       for i in range(N_ISLANDS)]

-    print("island_indexes", island_indexes)
     pop = toolbox.population(n=MU)

     fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop)
@@ -125,9 +124,7 @@ def calculate_statistics(ind):
                     pop.append(new_pop[idx_other_individual])
                 else:
                     pop.append(new_pop[idx_individual])
-        print(len(pop))
-        for p in pop:
-            print(type(p), p)
+
         record = stats.compile(pop)
         logbook.record(gen=gen,
                        evals=len(offspring)+(len(pop) if use_batch else 0), **record)
diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index 5c810d70..a5520067 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -66,7 +66,7 @@ class BrushEstimator(BaseEstimator):
     initialization : {"grow", "full"}, default "grow"
         Strategy to create the initial population. If `full`, then every expression is created
         with `max_size` nodes. If `grow`, size will be uniformly distributed.
-    algorithm : {"nsga2island", "nsga2", "ga"}, default "nsga2island"
+    algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2island"
         Which Evolutionary Algorithm framework to use to evolve the population.
     validation_size : float, default 0.0
         Percentage of samples to use as a hold-out partition. These samples are used
@@ -169,7 +169,7 @@ def _setup_toolbox(self, data_train, data_validation):
         if self.algorithm=="nsga2" or self.algorithm=="nsga2island":
             toolbox.register("select", tools.selTournamentDCD)
             toolbox.register("survive", tools.selNSGA2)
-        elif self.algorithm=="ga":
+        elif self.algorithm=="ga" or self.algorithm=="gaisland":
             toolbox.register("select", tools.selTournament, tournsize=3)
             def offspring(pop, MU): return pop[-MU:]
             toolbox.register("survive", offspring)
@@ -250,7 +250,7 @@ def fit(self, X, y):
         self.search_space_ = _brush.SearchSpace(self.train_, self.functions_)
         self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_)

-        if self.algorithm=="nsga2island":
+        if self.algorithm=="nsga2island" or self.algorithm=="gaisland":
             self.archive_, self.logbook_ = nsga2island(
                 self.toolbox_, self.max_gen, self.pop_size, self.n_islands,
                 self.mig_prob, self.cx_prob,
diff --git a/src/variation.h b/src/variation.h
index 22d42a77..90421011 100644
--- a/src/variation.h
+++ b/src/variation.h
@@ -680,6 +680,7 @@ std::optional> cross(const Program& root, const Program& other)
     // fmt::print("other_spot : {}\n",other_spot.node->data);

     // swap subtrees at child_spot and other_spot
+    // TODO: do I need to delete the removed node?
     child.Tree.move_ontop(child_spot, other_spot);
     return child;
 }
diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp
index de7084f9..1509ac02 100644
--- a/tests/cpp/test_program.cpp
+++ b/tests/cpp/test_program.cpp
@@ -31,6 +31,14 @@ TEST(Program, MakeRegressor)
         );

         ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) );
+
+        // probabilities are the same
+        vector PRG_weights(PRG.Tree.size());
+        std::transform(PRG.Tree.begin(), PRG.Tree.end(),
+                       PRG_weights.begin(),
+                       [](const auto& n){ return n.get_prob_change(); }
+                       );
+        ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) );
     }
 }
diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py
index 1c7f759e..c9b55103 100644
--- a/tests/python/test_brush.py
+++ b/tests/python/test_brush.py
@@ -46,9 +46,11 @@ def regression_setup():

 @pytest.mark.parametrize('setup,algorithm', [('classification_setup', 'nsga2island'),
                                              ('classification_setup', 'nsga2'      ),
+                                             ('classification_setup', 'gaisland'   ),
                                              ('classification_setup', 'ga'         ),
                                              ('regression_setup',     'nsga2island'),
                                              ('regression_setup',     'nsga2'      ),
+                                             ('regression_setup',     'gaisland'   ),
                                              ('regression_setup',     'ga'         )])
 def test_fit(setup, algorithm, brush_args, request):
     """Testing common utilities related to fitting and generic brush estimator.
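
Editor's note on the island bookkeeping introduced in these patches: below is a minimal, standalone Python sketch of the partitioning and migration logic from nsga2island.py, using the same names (MU, N_ISLANDS, MIGPX, rnd_flt). The helper names island_slices and migrate, the use of random.random in place of the estimator's random-float generator, and the guard for the single-island case are illustrative assumptions, not part of the patches.

import random

def island_slices(MU, N_ISLANDS):
    # Contiguous (start, end) index pairs over one flat population;
    # island sizes differ by at most one when N_ISLANDS does not divide MU.
    return [((i * MU) // N_ISLANDS, ((i + 1) * MU) // N_ISLANDS)
            for i in range(N_ISLANDS)]

def migrate(new_pop, island_indexes, MIGPX, rnd_flt=random.random):
    # With probability MIGPX each slot is refilled by an individual drawn
    # from a different island; otherwise the survivor keeps its slot.
    MU = len(new_pop)
    pop = []
    for (idx_start, idx_end) in island_indexes:
        other_islands = list(range(0, idx_start)) + list(range(idx_end, MU))
        for idx_individual in range(idx_start, idx_end):
            if other_islands and rnd_flt() < MIGPX:
                idx_other = other_islands[int(rnd_flt() * len(other_islands))]
                pop.append(new_pop[idx_other])
            else:
                pop.append(new_pop[idx_individual])
    return pop

# Example: 10 individuals split over 3 islands -> [(0, 3), (3, 6), (6, 10)]
slices = island_slices(10, 3)
print(slices)
print(migrate(list(range(10)), slices, MIGPX=0.5))

Unlike the patched code, the sketch skips migration when other_islands is empty (N_ISLANDS=1), where indexing into the empty list would otherwise fail.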