Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Island ga implementation in python (cpp version is WIP) #50

Merged
merged 5 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies:
- gxx >= 12.0
- ninja
- ceres-solver
- taskflow
- pybind11 #=2.6.2
- pytest #=6.2.4
- pydot
Expand Down
1 change: 1 addition & 0 deletions src/brush/deap_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .nsga2 import nsga2
from .nsga2island import nsga2island
from .utils import DeapIndividual
140 changes: 140 additions & 0 deletions src/brush/deap_api/nsga2island.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from deap import tools
from deap.benchmarks.tools import diversity, convergence, hypervolume
import numpy as np
import functools


def nsga2island(toolbox, NGEN: int, MU: int, N_ISLANDS: int, MIGPX: float,
                CXPB: float, use_batch: bool, verbosity: int, rnd_flt):
    """Run an island-model NSGA-II evolutionary loop.

    The population of size ``MU`` is partitioned into ``N_ISLANDS`` contiguous
    index ranges. Selection, variation and survival happen independently per
    island each generation; afterwards each slot may be replaced (with
    probability ``MIGPX``) by a random individual from another island.

    Parameters
    ----------
    toolbox : deap.base.Toolbox
        Must provide ``population``, ``evaluate``, ``evaluateValidation``,
        ``select``, ``survive``, ``mate``, ``mutate``, ``Clone``, ``map`` and
        ``getBatch`` (registered by the caller, see estimator._setup_toolbox).
    NGEN : int
        Number of generations to run (generation 0 is the initial population).
    MU : int
        Total population size summed over all islands.
    N_ISLANDS : int
        Number of independent islands; ``N_ISLANDS=1`` behaves like plain nsga2.
    MIGPX : float
        Per-individual migration probability applied at the end of a generation.
    CXPB : float
        Probability of crossover; otherwise both parents are mutated.
    use_batch : bool
        When True, fitnesses are recomputed on a fresh mini-batch each
        generation via ``evaluateValidation`` (which does not refit weights).
    verbosity : int
        Print the logbook stream when > 0.
    rnd_flt : callable
        Zero-argument callable returning a float in [0, 1).

    Returns
    -------
    (archive, logbook) : (deap.tools.ParetoFront, deap.tools.Logbook)
        Pareto archive of the final population and the per-generation stats.
    """
    # NGEN = 250
    # MU = 100
    # CXPB = 0.9
    # N_ISLANDS: number of independent islands. Islands are controlled by indexes.
    # setting N_ISLANDS=1 would be the same as the original nsga2
    # rnd_flt: random number generator to sample crossover prob

    def calculate_statistics(ind):
        # Each logbook row aggregates train objectives followed by validation
        # objectives, hence the "(O1 train, O2 train, O1 val, O2 val)" headers.
        on_train = ind.fitness.values
        on_val = toolbox.evaluateValidation(ind)

        return (*on_train, *on_val)

    stats = tools.Statistics(calculate_statistics)

    stats.register("avg", np.mean, axis=0)
    stats.register("med", np.median, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \
                     "med (O1 train, O2 train, O1 val, O2 val)", \
                     "std (O1 train, O2 train, O1 val, O2 val)", \
                     "min (O1 train, O2 train, O1 val, O2 val)", \
                     "max (O1 train, O2 train, O1 val, O2 val)"

    # Tuples with start and end indexes for each island. Number of individuals
    # in each island can slightly differ if N_ISLANDS is not a divisor of MU
    island_indexes = [((i*MU)//N_ISLANDS, ((i+1)*MU)//N_ISLANDS)
                      for i in range(N_ISLANDS)]

    pop = toolbox.population(n=MU)

    # NOTE(review): if toolbox.map is the lazy builtin map, this generator is
    # consumed by the zip below, which is what actually triggers evaluation —
    # confirm toolbox.map is registered to something that executes eagerly or
    # that every downstream consumer exhausts it.
    fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop)
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit

    # Initial per-island survival pass so rank/crowding attributes are set
    # before the first selection. NOTE(review): assumes toolbox.survive
    # (e.g. selNSGA2) assigns those attributes — verify against registration.
    survived = []
    for (idx_start, idx_end) in island_indexes:
        survived_parents = toolbox.survive(pop[idx_start:idx_end],
                                           idx_end-idx_start)
        survived.extend(survived_parents)
    pop = survived

    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)

    if verbosity > 0:
        print(logbook.stream)

    # Begin the generational process
    for gen in range(1, NGEN):
        batch = toolbox.getBatch() # batch will be a random subset only if it was not
                                   # defined as the size of the train set. everytime
                                   # this function is called, a new random batch is generated.

        if (use_batch): # recalculate the fitness for the parents
            # use_batch is True when batch_size is smaller than the train set
            # (caller passes 0.0 < batch_size < 1.0) — NOTE(review): the
            # original comment here stated the opposite; confirm with caller.
            # If we're using batch, we need to re-evaluate every model (without
            # changing its weights). evaluateValidation doesnt fit the weights
            fitnesses = toolbox.map(
                functools.partial(toolbox.evaluateValidation, data=batch), pop)

            for ind, fit in zip(pop, fitnesses):
                ind.fitness.values = fit

        # Vary the population inside each island
        parents = []
        for (idx_start, idx_end) in island_indexes:
            island_parents = toolbox.select(pop[idx_start:idx_end],
                                            idx_end-idx_start)
            parents.extend(island_parents)

        offspring = [] # Will have the same size as pop
        for (idx_start, idx_end) in island_indexes:
            # Pair consecutive parents (even, odd) within the island.
            # NOTE(review): if an island holds an odd number of parents, zip
            # drops the last one and len(offspring) < MU; the survive() call
            # below still returns idx_end-idx_start per island, but the
            # offspring[idx_start:idx_end] slices then shift across islands —
            # confirm MU/N_ISLANDS parity assumptions.
            for ind1, ind2 in zip(parents[idx_start:idx_end:2],
                                  parents[idx_start+1:idx_end:2]
                                  ):
                off1, off2 = None, None
                if rnd_flt() < CXPB: # either mutation or crossover
                    off1, off2 = toolbox.mate(ind1, ind2)
                else:
                    off1 = toolbox.mutate(ind1)
                    off2 = toolbox.mutate(ind2)

                # Inserting parent if mutation failed
                offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)])
                offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)])

        # Evaluate (instead of evaluateValidation) to fit the weights of the offspring
        # NOTE(review): when use_batch is True the evaluate() result is
        # immediately overwritten below; if toolbox.map is lazy, evaluate()
        # may never run and weights would not be fit — confirm.
        fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring)
        if (use_batch): #calculating objectives based on batch
            fitnesses = toolbox.map(
                functools.partial(toolbox.evaluateValidation, data=batch), offspring)

        for ind, fit in zip(offspring, fitnesses):
            ind.fitness.values = fit

        # Select the next generation population: per-island (mu+lambda)
        # survival over parents plus that island's offspring slice.
        new_pop = []
        for (idx_start, idx_end) in island_indexes:
            island_new_pop = toolbox.survive(pop[idx_start:idx_end] \
                                             +offspring[idx_start:idx_end],
                                             idx_end-idx_start)
            new_pop.extend(island_new_pop)

        # Migration to fill up the islands for the next generation
        pop = []
        for (idx_start, idx_end) in island_indexes:
            # Candidate donors: every index outside the current island.
            other_islands = list(range(0, idx_start)) + list(range(idx_end, MU))
            for idx_individual in range(idx_start, idx_end):
                if rnd_flt() < MIGPX: # replace by someone not from the same island
                    idx_other_individual = other_islands[
                        int(rnd_flt() * len(other_islands))]
                    pop.append(new_pop[idx_other_individual])
                else:
                    pop.append(new_pop[idx_individual])

        record = stats.compile(pop)
        # evals counts offspring evaluations, plus the parent re-evaluations
        # performed when batching is enabled.
        logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record)

        if verbosity > 0:
            print(logbook.stream)

    if verbosity > 0:
        # Reference point [1000.0, 50.0] is hard-coded — NOTE(review): presumes
        # both objectives stay below these bounds; verify for new objectives.
        print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0]))

    archive = tools.ParetoFront()
    archive.update(pop)

    return archive, logbook
34 changes: 26 additions & 8 deletions src/brush/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# from tqdm import tqdm
from types import NoneType
import _brush
from .deap_api import nsga2, DeapIndividual
from .deap_api import nsga2, nsga2island, DeapIndividual
# from _brush import Dataset, SearchSpace


Expand All @@ -39,6 +39,12 @@ class BrushEstimator(BaseEstimator):
Maximum depth of GP trees in the GP program. Use 0 for no limit.
max_size : int, default 0
Maximum number of nodes in a tree. Use 0 for no limit.
n_islands : int, default 5
Number of independent islands to use in evolutionary framework.
Ignored if `algorithm!="nsga2island"`.
mig_prob : float, default 0.05
Probability of a migration occurring between two random islands at the
end of a generation; must be between 0 and 1.
cx_prob : float, default 1/7
Probability of applying the crossover variation when generating the offspring,
must be between 0 and 1.
Expand All @@ -60,7 +66,7 @@ class BrushEstimator(BaseEstimator):
initialization : {"grow", "full"}, default "grow"
Strategy to create the initial population. If `full`, then every expression is created
with `max_size` nodes. If `grow`, size will be uniformly distributed.
algorithm : {"nsga2", "ga"}, default "nsga2"
algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2island"
Which Evolutionary Algorithm framework to use to evolve the population.
validation_size : float, default 0.0
Percentage of samples to use as a hold-out partition. These samples are used
Expand Down Expand Up @@ -107,12 +113,14 @@ def __init__(
verbosity=0,
max_depth=3,
max_size=20,
n_islands=5,
mig_prob=0.05,
cx_prob= 1/7,
mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6,
"toggle_weight_on":1/6, "toggle_weight_off":1/6},
functions: list[str]|dict[str,float] = {},
initialization="grow",
algorithm="nsga2",
algorithm="nsga2island",
random_state=None,
validation_size: float = 0.0,
batch_size: float = 1.0
Expand All @@ -124,6 +132,8 @@ def __init__(
self.mode=mode
self.max_depth=max_depth
self.max_size=max_size
self.n_islands=n_islands
self.mig_prob=mig_prob
self.cx_prob=cx_prob
self.mutation_options=mutation_options
self.functions=functions
Expand Down Expand Up @@ -156,10 +166,10 @@ def _setup_toolbox(self, data_train, data_validation):
# When solving multi-objective problems, selection and survival must
# support this feature. This means that these selection operators must
# accept a tuple of fitnesses as argument)
if self.algorithm=="nsga2":
if self.algorithm=="nsga2" or self.algorithm=="nsga2island":
toolbox.register("select", tools.selTournamentDCD)
toolbox.register("survive", tools.selNSGA2)
elif self.algorithm=="ga":
elif self.algorithm=="ga" or self.algorithm=="gaisland":
toolbox.register("select", tools.selTournament, tournsize=3)
def offspring(pop, MU): return pop[-MU:]
toolbox.register("survive", offspring)
Expand Down Expand Up @@ -249,9 +259,17 @@ def fit(self, X, y):
self.search_space_ = _brush.SearchSpace(self.train_, self.functions_)
self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_)

self.archive_, self.logbook_ = nsga2(
self.toolbox_, self.max_gen, self.pop_size, self.cx_prob,
(0.0<self.batch_size<1.0), self.verbosity, _brush.rnd_flt)
if self.algorithm=="nsga2island" or self.algorithm=="gaisland":
self.archive_, self.logbook_ = nsga2island(
self.toolbox_, self.max_gen, self.pop_size, self.n_islands,
self.mig_prob, self.cx_prob,
(0.0<self.batch_size<1.0), self.verbosity, _brush.rnd_flt)
elif self.algorithm=="nsga2" or self.algorithm=="ga":
# nsga2 and ga differ in the toolbox
self.archive_, self.logbook_ = nsga2(
self.toolbox_, self.max_gen, self.pop_size, self.cx_prob,
(0.0<self.batch_size<1.0), self.verbosity, _brush.rnd_flt)


final_ind_idx = 0

Expand Down
11 changes: 11 additions & 0 deletions src/brushGA.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include "brushGA.h"
#include <iostream>


using namespace Brush;

/// @brief Initialize the BrushGA object for fitting.
/// Currently a stub — the C++ island GA is work-in-progress (see brushGA.h).
/// NOTE(review): original comment said "Feat object", apparently copied from
/// the FEAT project.
void BrushGA::init()
{

}
69 changes: 69 additions & 0 deletions src/brushGA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/* Brush
copyright 2020 William La Cava
license: GNU/GPL v3
*/

#ifndef BrushGA_H
#define BrushGA_H

#include "init.h"
#include "taskflow/taskflow.hpp"

// TODO: improve the includes (why does this lines below does not work?)
// #include "variation.h"
// #include "selection.h"

// using namespace selection;
// using namespace variation;

namespace Brush
{

/// Skeleton of the C++ island-GA runner (WIP port of the Python
/// nsga2island implementation). Only construction and init() exist so far;
/// the commented lists below outline the planned public API and internals.
class BrushGA{
public:

    BrushGA(){}
    /// destructor
    ~BrushGA(){}

    /// Initialize the GA before fitting (stub; defined in brushGA.cpp).
    void init();

    //getters and setters for GA configuration.
    // getters and setters for the best solution found after evolution
    // predict, transform, predict_proba, etc.
    // get statistics
    // load and save best individuals
    // logger, save to file
    // execution archive
    // random state control
    // score functions
    // fit methods (this will run the evolution), run a single generation
private:
    // attributes (hyperparameters)
    // update best
    // calculate/print stats
};

int main(){

tf::Executor executor;
tf::Taskflow taskflow;

auto [A, B, C, D] = taskflow.emplace( // create four tasks
[] () { std::cout << "TaskA\n"; },
[] () { std::cout << "TaskB\n"; },
[] () { std::cout << "TaskC\n"; },
[] () { std::cout << "TaskD\n"; }
);

A.precede(B, C); // A runs before B and C
D.succeed(B, C); // D runs after B and C

executor.run(taskflow).wait();

return 0;
}

} // Brush

#endif
17 changes: 17 additions & 0 deletions src/selection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/* Brush
copyright 2020 William La Cava
license: GNU/GPL v3
*/

#ifndef SELECTION_H
#define SELECTION_H

namespace selection {

// Base class for selection operators (placeholder for the WIP C++ port of
// the Python toolbox's select/survive operators; no interface defined yet).
class SelectorBase {
public:
private:
};

} // selection
#endif
1 change: 1 addition & 0 deletions src/variation.h
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,7 @@ std::optional<Program<T>> cross(const Program<T>& root, const Program<T>& other)

// fmt::print("other_spot : {}\n",other_spot.node->data);
// swap subtrees at child_spot and other_spot
// TODO: do I need to delete the removed node?
child.Tree.move_ontop(child_spot, other_spot);
return child;
}
Expand Down
1 change: 1 addition & 0 deletions tests/cpp/test_program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ TEST(Program, MakeRegressor)
);

ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) );

fmt::print("Models have the same representation\n");

// weights didnt changed
Expand Down
13 changes: 9 additions & 4 deletions tests/python/test_brush.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,21 @@ def regression_setup():
return brush.BrushRegressor, X, y

@pytest.mark.parametrize('setup,algorithm',
[('classification_setup', 'nsga2'),
('classification_setup', 'ga' ),
('regression_setup', 'nsga2'),
('regression_setup', 'ga' )])
[('classification_setup', 'nsga2island'),
('classification_setup', 'nsga2' ),
('classification_setup', 'gaisland' ),
('classification_setup', 'ga' ),
('regression_setup', 'nsga2island'),
('regression_setup', 'nsga2' ),
('regression_setup', 'gaisland' ),
('regression_setup', 'ga' )])
def test_fit(setup, algorithm, brush_args, request):
"""Testing common utilities related to fitting and generic brush estimator.
"""

Estimator, X, y = request.getfixturevalue(setup)

brush_args["algorithm"] = algorithm
try:
est = Estimator(**brush_args)
est.fit(X, y)
Expand Down
Loading