From ddc280c461612ff26b36f2c4d07db73f368fd9ae Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 11 Aug 2023 14:19:45 -0400 Subject: [PATCH 001/199] Regressor now uses MSE (instead of squashed version of the metric) --- src/brush/estimator.py | 50 +++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 78f94d27..cf0b1d7b 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -128,9 +128,7 @@ def _setup_toolbox(self, data_train, data_validation): # Minimizing/maximizing problem: negative/positive weight, respectively. # Our classification is using the error as a metric # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness - creator.create("FitnessMulti", base.Fitness, weights=(+1.0,-1.0)) - - # TODO: make this weights attributes of each derivate class (creator is global) + creator.create("FitnessMulti", base.Fitness, weights=self.weights) # create Individual class, inheriting from self.Individual with a fitness attribute creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) @@ -221,17 +219,23 @@ def fit(self, X, y): self.archive_ = archive self.logbook_ = logbook - # Selecting the best estimator using validation data and multi-criteria decision making - points = np.array([self.toolbox_.evaluateValidation(ind) for ind in self.archive_]) - points = points*np.array([+1.0,-1.0]) #Multiply by the weights TODO: use weights here instead of hardcoded + closest_idx = 0 + if self.validation_size==0.0: + # Selecting the best estimator using training data + # (train data==val data if validation_size is set to 0.0) + # and multi-criteria decision making + points = np.array([self.toolbox_.evaluateValidation(ind) for ind in self.archive_]) - # Normalizing - min_vals = np.min(points, axis=0) - max_vals = np.max(points, axis=0) - points = (points - min_vals) / (max_vals - min_vals) - - reference = np.array([0, 0]) - closest_idx = np.argmin( np.linalg.norm(points - reference, axis=1) ) + #Multiply by the weights so reference can be agnostic of min/max problems + points = points*np.array(self.weights) + + # Normalizing + min_vals = np.min(points, axis=0) + max_vals = np.max(points, axis=0) + points = (points - min_vals) / (max_vals - min_vals) + + reference = np.array([0, 0]) + closest_idx = np.argmin( np.linalg.norm(points - reference, axis=1) ) self.best_estimator_ = self.archive_[closest_idx].prg @@ -290,6 +294,7 @@ def predict(self, X): def get_params(self): return {k:v for k,v in self.__dict__.items() if not k.endswith('_')} + class BrushClassifier(BrushEstimator,ClassifierMixin): """Brush for classification. @@ -310,13 +315,16 @@ class BrushClassifier(BrushEstimator,ClassifierMixin): def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) + # Weight of each objective (+ for maximization, - for minimization) + self.weights = (+1.0,-1.0) + def _fitness_validation(self, ind, data: _brush.Dataset): + # Fitness without fitting the expression, used with validation data return ( # (accuracy, size) (data.y==ind.prg.predict(data)).sum() / data.y.shape[0], ind.prg.size() ) - def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.fit(data) return ( # (accuracy, size) @@ -379,15 +387,17 @@ class BrushRegressor(BrushEstimator, RegressorMixin): def __init__(self, **kwargs): super().__init__(mode='regressor',**kwargs) + # Weight of each objective (+ for maximization, - for minimization) + self.weights = (-1.0,-1.0) def _fitness_validation(self, ind, data: _brush.Dataset): + # Fitness without fitting the expression, used with validation data + MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf MSE = np.inf - # We are squash the error and making it a maximization problem - return ( 1/(1+MSE), ind.prg.size() ) - + return ( MSE, ind.prg.size() ) def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.fit(data) @@ -396,9 +406,7 @@ def _fitness_function(self, ind, data: _brush.Dataset): if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf MSE = np.inf - # We are squash the error and making it a maximization problem - return ( 1/(1+MSE), ind.prg.size() ) - + return ( MSE, ind.prg.size() ) def _make_individual(self): if self.initialization not in ["grow", "full"]: @@ -410,8 +418,6 @@ def _make_individual(self): self.max_depth, (0 if self.initialization=='grow' else self.max_size)) ) - - # Under development # class BrushRepresenter(BrushEstimator, TransformerMixin): # """Brush for representation learning. From 9939d0716329e5cc0d358643e07a9a469ea66c90 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 11 Aug 2023 16:08:44 -0400 Subject: [PATCH 002/199] Fixed wrong use of validation partition and MDCM --- src/brush/estimator.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index cf0b1d7b..df9326bd 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -219,25 +219,32 @@ def fit(self, X, y): self.archive_ = archive self.logbook_ = logbook - closest_idx = 0 - if self.validation_size==0.0: + final_ind_idx = 0 + + # Each individual is a point in the Multi-Objective space. We multiply + # the fitness by the weights so greater numbers are always better + points = np.array([self.toolbox_.evaluateValidation(ind) for ind in self.archive_]) + points = points*np.array(self.weights) + + if self.validation_size==0.0: # Using the multi-criteria decision making on training data # Selecting the best estimator using training data # (train data==val data if validation_size is set to 0.0) # and multi-criteria decision making - points = np.array([self.toolbox_.evaluateValidation(ind) for ind in self.archive_]) - - #Multiply by the weights so reference can be agnostic of min/max problems - points = points*np.array(self.weights) # Normalizing min_vals = np.min(points, axis=0) max_vals = np.max(points, axis=0) points = (points - min_vals) / (max_vals - min_vals) - reference = np.array([0, 0]) - closest_idx = np.argmin( np.linalg.norm(points - reference, axis=1) ) + # Reference should be best value each obj. can have (after normalization) + reference = np.array([1, 1]) + + # closest to the reference + final_ind_idx = np.argmin( np.linalg.norm(points - reference, axis=1) ) + else: # Best in obj.1 (loss) in validation data + final_ind_idx = np.argmax( points[:, 0] ) - self.best_estimator_ = self.archive_[closest_idx].prg + self.best_estimator_ = self.archive_[final_ind_idx].prg if self.verbosity > 0: print(f'best model {self.best_estimator_.get_model()}'+ From dc328bb33357cfc48a783dba1833932caf981a5e Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 15 Aug 2023 15:18:53 -0400 Subject: [PATCH 003/199] Added mutation trace back Im not good at merging stuff --- src/variation.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/variation.h b/src/variation.h index dcfd8288..55839def 100644 --- a/src/variation.h +++ b/src/variation.h @@ -236,6 +236,23 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS auto options = PARAMS["mutation_options"].get>(); + // whether we should write everything that happened inside the method + if (PARAMS.value("write_mutation_trace", false)==true) { + // Default fields of the trace. Initialize with default values, which are + // gradually changed throughout the execution of the method. + PARAMS["mutation_trace"] = json({ + {"parent", child.get_model("compact", true)}, + {"spot_weights", weights}, + {"mutation_weights", options}, + // default values, to be changed in case mutation works + {"spot", "not selected"}, + {"mutation", "not selected"}, + {"child", "failed to generate"}, + {"status", "initialized weight vectors"}, + {"success", "false"} + }); + } + if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { return w<=0.0; })) @@ -246,6 +263,12 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), weights.begin(), weights.end()); + // whether we should write everything that happened inside the method + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["spot"] = spot.node->get_model(false); + PARAMS["mutation_trace"]["status"] = "sampled the mutation spot"; + } + if (std::all_of(options.begin(), options.end(), [](const auto& kv) { return kv.second<=0.0; })) @@ -284,12 +307,30 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS // apply the mutation and check if it succeeded bool success = it->second(child.Tree, spot, SS); + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["mutation"] = choice; + PARAMS["mutation_trace"]["status"] = "sampled and aplied the mutation"; + + if (success) + PARAMS["mutation_trace"]["child"] = child.get_model("compact", true); + } + if (success && ( (child.size() <= PARAMS["max_size"].get() ) && (child.depth() <= PARAMS["max_depth"].get()) )){ + // success is true only if mutation returned a valid program + if (PARAMS.value("write_mutation_trace", false)==true) + PARAMS["mutation_trace"]["success"] = true; + return child; } else { + + // here we have a string in PARAMS["mutation_trace"]["child"], + // but success is false since it didnt return an valid program + if (PARAMS.value("write_mutation_trace", false)==true) + PARAMS["mutation_trace"]["status"] = "children exceeds max_size or max_depth"; + return std::nullopt; } }; From c6ccde48f9a6d4da51708d5fd91fe8ccdf26884e Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 15 Aug 2023 17:10:25 -0400 Subject: [PATCH 004/199] Uniform weight initialization between mutation options and cx --- src/brush/estimator.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 2039577d..fd4913af 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -38,11 +38,20 @@ class BrushEstimator(BaseEstimator): Maximum depth of GP trees in the GP program. Use 0 for no limit. max_size : int, default 0 Maximum number of nodes in a tree. Use 0 for no limit. - cx_prob : float, default 0.9 - Probability of applying the crossover variation when generating the offspring - mutation_options : dict, default {"point":0.2, "insert":0.2, "delete":0.2, "subtree":0.2, "toggle_weight_on":0.1, "toggle_weight_off":0.1} + cx_prob : float, default 1/7 + Probability of applying the crossover variation when generating the offspring, + must be between 0 and 1. + Given that there are `n` mutations, and either crossover or mutation is + used to generate each individual in the offspring (but not both at the + same time), we want to have by default an uniform probability between + crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and + `1/n` for each mutation, we can achieve an uniform distribution. + mutation_options : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} A dictionary with keys naming the types of mutation and floating point - values specifying the fraction of total mutations to do with that method. + values specifying the fraction of total mutations to do with that method. + The probability of having a mutation is `(1-cx_prob)` and, in case the mutation + is applied, then each mutation option is sampled based on the probabilities + defined in `mutation_options`. The set of probabilities should add up to 1.0. functions: dict[str,float] or list[str], default {} A dictionary with keys naming the function set and values giving the probability of sampling them, or a list of functions which will be weighted uniformly. @@ -95,8 +104,9 @@ def __init__( verbosity=0, max_depth=3, max_size=20, - cx_prob=0.9, - mutation_options = {"point":0.2, "insert":0.2, "delete":0.2, "subtree":0.2, "toggle_weight_on":0.1, "toggle_weight_off":0.1}, + cx_prob= 1/7, + mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, + "toggle_weight_on":1/6, "toggle_weight_off":1/6}, functions: list[str]|dict[str,float] = {}, initialization="grow", random_state=None, From f29d94d3abeccc892c43c6f7009f3aaf9e75d9b9 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 30 Aug 2023 17:39:50 -0400 Subject: [PATCH 005/199] If mutation/cx fails, then the parent is inserted in offspring --- src/brush/deap_api/nsga2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index b569ad47..77d1bbce 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -69,15 +69,15 @@ def calculate_statistics(ind): for ind1, ind2 in zip(parents[::2], parents[1::2]): off1, off2 = None, None - if rnd_flt() < CXPB: + if rnd_flt() < CXPB: # either mutation or crossover off1, off2 = toolbox.mate(ind1, ind2) else: off1 = toolbox.mutate(ind1) off2 = toolbox.mutate(ind2) - # avoid inserting empty solutions - if off1 is not None: offspring.extend([off1]) - if off2 is not None: offspring.extend([off2]) + # Inserting parent if mutation failed + offspring.append(off1 if off1 is not None else ind1) + offspring.append(off2 if off2 is not None else ind2) # archive.update(offspring) # Evaluate the individuals with an invalid fitness From 9d1d153fb8f7994d49489524a02c1229e9f6107c Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 30 Aug 2023 17:40:42 -0400 Subject: [PATCH 006/199] Marked get_model functions as const --- src/program/program.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/program/program.h b/src/program/program.h index d1330cc2..631bb7a2 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -343,7 +343,7 @@ template struct Program * @param pretty currently unused. * @return string the model in string form. */ - string get_model(string fmt="compact", bool pretty=false) + string get_model(string fmt="compact", bool pretty=false) const { auto head = Tree.begin(); if (fmt=="tree") @@ -359,7 +359,7 @@ template struct Program * @param extras extra code passed to the beginning of the dot code. * @return string the model in dot language. */ - string get_dot_model(string extras="") + string get_dot_model(string extras="") const { // TODO: make the node names their hash or index, and the node label the nodetype name. // ref: https://stackoverflow.com/questions/10579041/graphviz-create-new-node-with-this-same-label#10579155 @@ -381,7 +381,6 @@ template struct Program const auto& parent = iter.node; // const auto& parent_data = iter.node->data; - string parent_id = get_id(parent); // if (Is(parent_data.node_type)) // parent_id = parent_data.get_name(false); @@ -390,7 +389,6 @@ template struct Program // } // // parent_id = parent_id.substr(2); - // if the first node is weighted, make a dummy output node so that the // first node's weight can be shown if (i==0 && parent->data.get_is_weighted()) @@ -401,7 +399,6 @@ template struct Program parent_id, parent->data.W ); - } // add the node @@ -459,7 +456,6 @@ template struct Program head_label, tail_label ); - } else{ out += fmt::format("\"{}\" -> \"{}\" [label=\"{}\"];\n", From 29ae79efc65c547a934c0f8aaebea0eb3f2571c9 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 30 Aug 2023 17:41:41 -0400 Subject: [PATCH 007/199] Additional check before doing PTC2 --- src/search_space.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/search_space.h b/src/search_space.h index ac751a65..1778b183 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -631,9 +631,11 @@ T RandomDequeue(std::vector& Q) template P SearchSpace::make_program(int max_d, int max_size) { + // this is what makes `make_program` create uniformly distributed + // individuals to feed initial population if (max_d == 0) max_d = PARAMS["max_depth"].get(); - if (max_size == 0) + if (max_size == 0) max_size = r.rnd_int(1, PARAMS["max_size"].get()); DataType root_type = DataTypeEnum::value; @@ -641,7 +643,7 @@ P SearchSpace::make_program(int max_d, int max_size) // ProgramType program_type = ProgramTypeEnum::value; auto Tree = tree(); - if (max_size == 1) + if (max_size == 1 || max_d == 1) { // auto root = Tree.insert(Tree.begin(), sample_terminal(root_type)); From fdf0469971f7c8906ffc86fd5c3571a4e28dea9c Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 30 Aug 2023 17:42:26 -0400 Subject: [PATCH 008/199] Mutation classes. bug fix in subtree mutation This commit changes how mutation works by selecting the spot **after** selecting the mutation. Previously, spot was being selected without taking into account which mutation was going to happen. Now, mutations are derivations of a baseclass that implements at least two methods: find_spots and mutate. find_spots will return the weights of nodes that can be selected to apply the mutation. The idea here is to have more robust mutation, avoiding a lot of nullopt returns. mutate will actually change the expression, based on a given spot. Now, the mutation function takes care of selecting the node and performing the checks to determine if the search space holds an alternative to apply the mutation. Oh, it also tak care of writing the mutation trace (which I'm thinking about making it an official feature, instead of just a debuging tool). --- src/variation.h | 514 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 361 insertions(+), 153 deletions(-) diff --git a/src/variation.h b/src/variation.h index 55839def..03267966 100644 --- a/src/variation.h +++ b/src/variation.h @@ -26,165 +26,377 @@ license: GNU/GPL v3 * */ namespace variation { + +class MutationBase { +public: + using Iter = tree::pre_order_iterator; + + MutationBase(const SearchSpace& SS, size_t max_size, size_t max_depth) + : SS_(SS) + , max_size_(max_size) + , max_depth_(max_depth) + { + } + + template + auto find_spots(const Program& prog) const -> vector // override for custom behavior + { + // It is important to use prog.Tree.size instead of prog.size(). The + // later takes into account node weights (coefficients), but we want + // weight of nodes that are actually in the tree structure + vector weights(prog.Tree.size()); + + // by default, mutation can happen anywhere, based on node weights + std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + [&](const auto& n){ + return n.get_prob_change();}); + + // Should have same size as prog.Tree.size, even if all weights <= 0.0 + return weights; + } + + virtual auto mutate_inplace(tree& Tree, Iter spot) const -> bool = 0; -typedef tree::pre_order_iterator Iter; + auto SS() const -> SearchSpace { return SS_; } + auto max_size() const -> size_t { return max_size_; } + auto max_depth() const -> size_t{ return max_depth_; } +private: + SearchSpace SS_; // where to sample nodes to change the program + + // constrains + size_t max_size_; + size_t max_depth_; +}; /// @brief replace node with same typed node +/// @param prog the program /// @param Tree the program tree /// @param spot an iterator to the node that is being mutated /// @param SS the search space to sample a node like `spot` /// @return boolean indicating the success (true) or fail (false) of the operation -inline bool point_mutation(tree& Tree, Iter spot, const SearchSpace& SS) +class PointMutation : public MutationBase { - // cout << "point mutation\n"; +public: + explicit PointMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } - // get_node_like will sample a similar node based on node_map_weights or - // terminal_weights, and maybe will return a Node. - std::optional newNode = SS.get_node_like(spot.node->data); + auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + { + // cout << "point mutation\n"; + + // get_node_like will sample a similar node based on node_map_weights or + // terminal_weights, and maybe will return a Node. + optional newNode = SS().get_node_like(spot.node->data); - if (!newNode) // newNode == std::nullopt - return false; + if (!newNode) // overload to check if newNode == nullopt + return false; - // if optional contains a Node, we access its contained value - Tree.replace(spot, *newNode); + // if optional contains a Node, we access its contained value + Tree.replace(spot, *newNode); - return true; -} + return true; + } +}; /// @brief insert a node with spot as a child +/// @param prog the program /// @param Tree the program tree /// @param spot an iterator to the node that is being mutated /// @param SS the search space to sample a node like `spot` /// @return boolean indicating the success (true) or fail (false) of the operation -inline bool insert_mutation(tree& Tree, Iter spot, const SearchSpace& SS) +class InsertMutation : public MutationBase { - // cout << "insert mutation\n"; - auto spot_type = spot.node->data.ret_type; - - // pick a random compatible node to insert (with probabilities given by - // node_map_weights). The `-1` represents the node being inserted. - // Ideally, it should always find at least one match (the same node - // used as a reference when calling the function). However, we have a - // size restriction, which will be relaxed here (just as it is in the PTC2 - // algorithm). This mutation can create a new expression that exceeds the - // maximum size by the highest arity among the operators. - std::optional n = SS.sample_op_with_arg(spot_type, spot_type, true, - PARAMS["max_size"].get()-Tree.size()-1); - - if (!n) // there is no operator with compatible arguments - return false; - - // make node n wrap the subtree at the chosen spot - auto parent_node = Tree.wrap(spot, *n); - - // now fill the arguments of n appropriately - bool spot_filled = false; - for (auto a: (*n).arg_types) +public: + explicit InsertMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) { - if (spot_filled) - { - // if spot is in its child position, append children. - // TODO: reminding that sample_terminal may fail as well - auto opt = SS.sample_terminal(a); + } + + template + auto find_spots(const Program& prog) const -> vector + { + vector weights(prog.Tree.size()); + + if (prog.size() < max_size()) { + auto prog_iter = prog.Tree.begin(); + std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + [&](const auto& n){ + size_t d = prog.depth_to_reach( prog_iter ); - if (!opt) - return false; + std::advance(prog_iter, 1); - Tree.append_child(parent_node, opt.value()); + if (d < max_depth()) + return n.get_prob_change(); + else + return 0.0f; + }); } - // if types match, treat this spot as filled by the spot node - else if (a == spot_type) - spot_filled = true; - // otherwise, add siblings before spot node else { - auto opt = SS.sample_terminal(a); - - if (!opt) - return false; - - Tree.insert(spot, opt.value()); + // fill the vector with zeros, since we're already at max_size + std::fill(weights.begin(), weights.end(), 0.0f); } - } + + return weights; + } - return true; -} + auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + { + // cout << "insert mutation\n"; + auto spot_type = spot.node->data.ret_type; + + // pick a random compatible node to insert (with probabilities given by + // node_map_weights). The `-1` represents the node being inserted. + // Ideally, it should always find at least one match (the same node + // used as a reference when calling the function). However, we have a + // size restriction, which will be relaxed here (just as it is in the PTC2 + // algorithm). This mutation can create a new expression that exceeds the + // maximum size by the highest arity among the operators. + std::optional n = SS().sample_op_with_arg(spot_type, spot_type, true, + max_size()-Tree.size()-1); + + if (!n) // there is no operator with compatible arguments + return false; + + // make node n wrap the subtree at the chosen spot + auto parent_node = Tree.wrap(spot, *n); + + // now fill the arguments of n appropriately + bool spot_filled = false; + for (auto a: (*n).arg_types) + { + if (spot_filled) + { + // if spot is in its child position, append children. + // TODO: reminding that sample_terminal may fail as well + auto opt = SS().sample_terminal(a); + + if (!opt) + return false; + + Tree.append_child(parent_node, opt.value()); + } + // if types match, treat this spot as filled by the spot node + else if (a == spot_type) + spot_filled = true; + // otherwise, add siblings before spot node + else { + auto opt = SS().sample_terminal(a); + + if (!opt) + return false; + + Tree.insert(spot, opt.value()); + } + } + + return true; + } +}; /// @brief delete subtree and replace it with a terminal of the same return type +/// @param prog the program /// @param Tree the program tree /// @param spot an iterator to the node that is being mutated /// @param SS the search space to sample a node like `spot` /// @return boolean indicating the success (true) or fail (false) of the operation -inline bool delete_mutation(tree& Tree, Iter spot, const SearchSpace& SS) +class DeleteMutation : public MutationBase { - // cout << "delete mutation\n"; +public: + explicit DeleteMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } - // sample_terminal will sample based on terminal_weights. If it succeeds, - // then the new terminal will be in `opt.value()` - auto opt = SS.sample_terminal(spot.node->data.ret_type); - - if (!opt) // there is no terminal with compatible arguments - return false; + auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + { + // cout << "delete mutation\n"; + + // sample_terminal will sample based on terminal_weights. If it succeeds, + // then the new terminal will be in `opt.value()` + auto opt = SS().sample_terminal(spot.node->data.ret_type); + + if (!opt) // there is no terminal with compatible arguments + return false; - Tree.erase_children(spot); + Tree.erase_children(spot); - Tree.replace(spot, opt.value()); + Tree.replace(spot, opt.value()); - return true; + return true; + } }; -/// @brief toggle the node's weight ON. +/// @brief toggle the node's weight ON +/// @param prog the program /// @param Tree the program tree /// @param spot an iterator to the node that is being mutated /// @param SS the search space (unused) /// @return boolean indicating the success (true) or fail (false) of the operation -inline bool toggle_weight_on_mutation(tree& Tree, Iter spot, const SearchSpace& SS) +class ToggleWeightOnMutation : public MutationBase { - if (spot.node->data.get_is_weighted()==true // cant turn on whats already on - || !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean) - return false; // false indicates that mutation failed and should return std::nullopt +public: + explicit ToggleWeightOnMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } - spot.node->data.set_is_weighted(true); - return true; -} + template + auto find_spots(const Program& prog) const -> vector + { + vector weights(prog.Tree.size()); + + if (prog.size() < max_size()) { + std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + [&](const auto& n){ + // only weighted nodes can be toggled off + if (!n.node->data.get_is_weighted()) + return n.get_prob_change(); + else + return 0.0; + }); + } + else { + // fill the vector with zeros, since we're already at max_size + std::fill(weights.begin(), weights.end(), 0.0f); + } + + return weights; + } + + auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + { + // cout << "toggle_weight_on mutation\n"; -/// @brief toggle the node's weight OFF. + if (spot.node->data.get_is_weighted()==true // cant turn on whats already on + || !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean) + return false; // false indicates that mutation failed and should return std::nullopt + + spot.node->data.set_is_weighted(true); + return true; + } +}; + +/// @brief toggle the node's weight OFF +/// @param prog the program /// @param Tree the program tree /// @param spot an iterator to the node that is being mutated /// @param SS the search space (unused) /// @return boolean indicating the success (true) or fail (false) of the operation -inline bool toggle_weight_off_mutation(tree& Tree, Iter spot, const SearchSpace& SS) +class ToggleWeightOffMutation : public MutationBase { - if (spot.node->data.get_is_weighted()==false) - return false; +public: + explicit ToggleWeightOffMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } + + template + auto find_spots(const Program& prog) const -> vector + { + vector weights(prog.Tree.size()); - spot.node->data.set_is_weighted(false); - return true; -} + std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + [&](const auto& n){ + if (n.node->data.get_is_weighted()) + return n.get_prob_change(); + else + return 0.0; + }); + + return weights; + } + + auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + { + // cout << "toggle_weight_off mutation\n"; + + if (spot.node->data.get_is_weighted()==false) + return false; + + spot.node->data.set_is_weighted(false); + return true; + } +}; /// @brief replaces the subtree rooted in `spot` +/// @param prog the program /// @param Tree the program tree /// @param spot an iterator to the node that is being mutated /// @param SS the search space to generate a compatible subtree /// @return boolean indicating the success (true) or fail (false) of the operation -inline bool subtree_mutation(tree& Tree, Iter spot, const SearchSpace& SS) +class SubtreeMutation : public MutationBase { - auto spot_type = spot.node->data.ret_type; - auto max_size = PARAMS["max_size"].get() - (Tree.size() - Tree.size(spot)); - auto max_depth = PARAMS["max_depth"].get() - (Tree.depth(spot)); +public: + explicit SubtreeMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) // TODO: change order size and depth + { + } + + // TODO: make different private functions to find spots and use them. theres too much copy and paste here + template + auto find_spots(const Program& prog) const -> vector + { + vector weights(prog.Tree.size()); + + if (prog.size() < max_size()) { + auto prog_iter = prog.Tree.begin(); + std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + [&](const auto& n){ + size_t d = prog.depth_to_reach( prog_iter ); + + std::advance(prog_iter, 1); + + if (d < max_depth()) + return n.get_prob_change(); + else + return 0.0f; + }); + } + else { + // fill the vector with zeros, since we're already at max_size + std::fill(weights.begin(), weights.end(), 0.0f); + } + + return weights; + } + + auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + { + // cout << "subtree mutation\n"; + + // check if we exceeded the size/depth constrains (without subtracting, + // to avoid overflow cases if the user sets max_size smaller than arity + // of smallest operator. The overflow would happen when calculating d and + // s in the following lines, to choose the PTC2 limits) + if ( max_size() <= (Tree.size() - Tree.size(spot)) + || max_depth() <= Tree.depth(spot) ) + return false; - // sample subtree uses PTC2, which operates on depth and size of the tree - // (and not on the program!). we shoudn't care for weights here - auto subtree = SS.sample_subtree(spot.node->data, max_depth, max_size); + auto spot_type = spot.node->data.ret_type; - if (!subtree) // there is no terminal with compatible arguments - return false; + // d and s must be compatible with PTC2 --- they should be based on + // tree structure, not program structure + size_t d = max_depth() - Tree.depth(spot); + size_t s = max_size() - (Tree.size() - Tree.size(spot)); - // if optional contains a Node, we access its contained value - Tree.erase_children(spot); - Tree.replace(spot, subtree.value().begin()); + s = r.rnd_int(1, s); - return true; -} + // sample subtree uses PTC2, which operates on depth and size of the tree + // (and not on the program!). we shoudn't care for weights here + auto subtree = SS().sample_subtree(spot.node->data, d, s); + + if (!subtree) // there is no terminal with compatible arguments + return false; + + // if optional contains a Node, we access its contained value + Tree.erase_children(spot); + Tree.replace(spot, subtree.value().begin()); + + return true; + } +}; /** * @brief Stochastically mutate a program. @@ -222,18 +434,6 @@ inline bool subtree_mutation(tree& Tree, Iter spot, const SearchSpace& SS) template std::optional> mutate(const Program& parent, const SearchSpace& SS) { - // all mutation validation and setup should be done here. Specific mutaiton - // functions are intended to work on the program tree thus cannot access - // program functions and attributes. - Program child(parent); - - // choose location by weighted sampling of program - vector weights(child.Tree.size()); - std::transform(child.Tree.begin(), child.Tree.end(), - weights.begin(), - [](const auto& n){ return n.get_prob_change(); } - ); - auto options = PARAMS["mutation_options"].get>(); // whether we should write everything that happened inside the method @@ -241,74 +441,82 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS // Default fields of the trace. Initialize with default values, which are // gradually changed throughout the execution of the method. PARAMS["mutation_trace"] = json({ - {"parent", child.get_model("compact", true)}, - {"spot_weights", weights}, + {"parent", parent.get_model("compact", true)}, {"mutation_weights", options}, // default values, to be changed in case mutation works - {"spot", "not selected"}, {"mutation", "not selected"}, + {"spot_weights", "not calculated"}, + {"spot", "not selected"}, {"child", "failed to generate"}, {"status", "initialized weight vectors"}, {"success", "false"} }); } - - if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { - return w<=0.0; - })) - { // There is no spot that has a probability to be selected + if (std::all_of(options.begin(), options.end(), + [](const auto& kv) { return kv.second<=0.0; }) + ) + { // No mutation can be successfully applied to this solution return std::nullopt; } - auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), - weights.begin(), weights.end()); + // choose a valid mutation option + string choice = r.random_choice(options); - // whether we should write everything that happened inside the method - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["spot"] = spot.node->get_model(false); - PARAMS["mutation_trace"]["status"] = "sampled the mutation spot"; + // TODO: this could be improved + std::unique_ptr mutation; + if (choice == "point") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "insert") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "delete") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "toggle_weight_on") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "toggle_weight_off") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "subtree") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else { + std::string msg = fmt::format("{} not a valid mutation choice", choice); + HANDLE_ERROR_THROW(msg); } - if (std::all_of(options.begin(), options.end(), [](const auto& kv) { - return kv.second<=0.0; + if (PARAMS.value("write_mutation_trace", false)==true) + PARAMS["mutation_trace"]["mutation"] = choice; + + // choose location by weighted sampling of program + auto weights = mutation->find_spots(parent); + + if (PARAMS.value("write_mutation_trace", false)==true) + PARAMS["mutation_trace"]["spot_weights"] = weights; + + if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { + return w<=0.0; })) - { // No mutation can be successfully applied to this solution + { // There is no spot that has a probability to be selected return std::nullopt; } - - // choose a valid mutation option - string choice = r.random_choice(options); + + // if we got this far, mutation is going to happen + Program child(parent); - // std::cout << "mutation configuration (choice was " << choice << "):" << std::endl; - // for (const auto& [k, v] : options) - // std::cout << " - " << k << " : " << v << std::endl; + // apply the mutation and check if it succeeded + auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), // TODO: get weights from mutation + weights.begin(), weights.end()); // Every mutation here works inplace, so they return bool instead of // std::optional to indicare the result of their manipulation over the // program tree. Here we call the mutation function and return the result - using MutationFunc = std::function&, Iter, const SearchSpace&)>; - - std::map mutations{ - {"insert", insert_mutation}, - {"delete", delete_mutation}, - {"point", point_mutation}, - {"subtree", subtree_mutation}, - {"toggle_weight_on", toggle_weight_on_mutation}, - {"toggle_weight_off", toggle_weight_off_mutation} - }; - - // Try to find the mutation function based on the choice - auto it = mutations.find(choice); - if (it == mutations.end()) { - std::string msg = fmt::format("{} not a valid mutation choice", choice); - HANDLE_ERROR_THROW(msg); - } - - // apply the mutation and check if it succeeded - bool success = it->second(child.Tree, spot, SS); + bool success = mutation->mutate_inplace(child.Tree, spot); if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["mutation"] = choice; + PARAMS["mutation_trace"]["spot"] = choice; PARAMS["mutation_trace"]["status"] = "sampled and aplied the mutation"; if (success) @@ -329,7 +537,7 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS // here we have a string in PARAMS["mutation_trace"]["child"], // but success is false since it didnt return an valid program if (PARAMS.value("write_mutation_trace", false)==true) - PARAMS["mutation_trace"]["status"] = "children exceeds max_size or max_depth"; + PARAMS["mutation_trace"]["status"] = "mutation returned child, but it exceeds max_size or max_depth"; return std::nullopt; } From 1e35e2db1c530f34c5651e6f657a7a459fa545a8 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 30 Aug 2023 17:46:32 -0400 Subject: [PATCH 009/199] Updated tests --- tests/cpp/test_data.cpp | 10 ++++++---- tests/cpp/test_variation.cpp | 29 +++++++++++++++-------------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index 09893c2c..628358fe 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -29,8 +29,9 @@ TEST(Data, MixedVariableTypes) { // We need to set at least the mutation options (and respective // probabilities) in order to call PRG.predict() + PARAMS["write_mutation_trace"] = true; PARAMS["mutation_options"] = { - {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight_on", 0.125}, {"toggle_weight_off", 0.125} + {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} }; MatrixXf X(5,3); @@ -59,14 +60,14 @@ TEST(Data, MixedVariableTypes) dt.print(); SS.print(); - for (int d = 1; d < 5; ++d) - for (int s = 1; s < 5; ++s) + for (size_t d = 5; d < 10; ++d) + for (size_t s = 5; s < 20; ++s) { PARAMS["max_size"] = s; PARAMS["max_depth"] = d; - RegressorProgram PRG = SS.make_regressor(d, s); + RegressorProgram PRG = SS.make_regressor(s-4, d-4); fmt::print( "=================================================\n" "Tree model for depth = {}, size= {}: {}\n", @@ -95,6 +96,7 @@ TEST(Data, MixedVariableTypes) if (!opt){ fmt::print("Mutation failed to create a child\n"); + fmt::print("{}", PARAMS["mutation_trace"].get().dump()); } else { auto Child = opt.value(); diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index d0eb9bcf..861a0420 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -113,11 +113,9 @@ TEST(Operators, InsertMutationWorks) TEST(Operators, Mutation) { - // test mutation - // TODO: set random seed - + PARAMS["write_mutation_trace"] = true; PARAMS["mutation_options"] = { - {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"subtree", 0.0}, {"toggle_weight_on", 0.125}, {"toggle_weight_off", 0.125} + {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} }; MatrixXf X(10,2); @@ -136,9 +134,9 @@ TEST(Operators, Mutation) SearchSpace SS; SS.init(data); + int successes = 0; for (int d = 1; d < 10; ++d) { - int successes = 0; for (int s = 1; s < 10; ++s) { fmt::print("d={},s={}\n",d,s); @@ -166,6 +164,7 @@ TEST(Operators, Mutation) d, s, PRG.get_model("compact", true) ); + fmt::print("{}", PARAMS["mutation_trace"].get().dump()); } else { successes += 1; @@ -185,15 +184,16 @@ TEST(Operators, Mutation) y_pred = Child.predict(data); } } - // since x1 and x2 have same type, we shoudn't get fails - ASSERT_TRUE(successes > 0); } + // since x1 and x2 have same type, we shoudn't get fails + ASSERT_TRUE(successes > 0); } TEST(Operators, MutationSizeAndDepthLimit) { + PARAMS["write_mutation_trace"] = true; PARAMS["mutation_options"] = { - {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"subtree", 0.0}, {"toggle_weight_on", 0.125}, {"toggle_weight_off", 0.125} + {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} }; MatrixXf X(10,2); @@ -216,9 +216,9 @@ TEST(Operators, MutationSizeAndDepthLimit) // prod operator --> arity 4 int max_arity = 4; + int successes = 0; for (int d = 5; d < 15; ++d) { - int successes = 0; for (int s = 5; s < 15; ++s) { PARAMS["max_size"] = s; @@ -245,6 +245,7 @@ TEST(Operators, MutationSizeAndDepthLimit) d, s, PRG.get_model("compact", true) ); + fmt::print("{}", PARAMS["mutation_trace"].get().dump()); } else { successes += 1; @@ -282,8 +283,8 @@ TEST(Operators, MutationSizeAndDepthLimit) ASSERT_TRUE(Child.depth() <= d); } } - ASSERT_TRUE(successes > 0); } + ASSERT_TRUE(successes > 0); } TEST(Operators, Crossover) @@ -304,9 +305,9 @@ TEST(Operators, Crossover) SearchSpace SS; SS.init(data); + int successes = 0; for (int d = 1; d < 10; ++d) { - int successes = 0; for (int s = 1; s < 10; ++s) { RegressorProgram PRG1 = SS.make_regressor(d, s); @@ -358,8 +359,8 @@ TEST(Operators, Crossover) auto child_pred1 = Child.predict(data); } } - ASSERT_TRUE(successes > 0); } + ASSERT_TRUE(successes > 0); } TEST(Operators, CrossoverSizeAndDepthLimit) @@ -384,9 +385,9 @@ TEST(Operators, CrossoverSizeAndDepthLimit) // prod operator --> arity 4 int max_arity = 4; + int successes = 0; for (int d = 5; d < 15; ++d) { - int successes = 0; for (int s = 5; s < 15; ++s) { PARAMS["max_size"] = s; @@ -445,6 +446,6 @@ TEST(Operators, CrossoverSizeAndDepthLimit) ASSERT_TRUE(Child.depth() <= d); } } - ASSERT_TRUE(successes > 0); } + ASSERT_TRUE(successes > 0); } \ No newline at end of file From 2aac8b0d90d6c2c186b80b29bee21ad47856dbcf Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 31 Aug 2023 17:01:58 -0400 Subject: [PATCH 010/199] Switched `append(value)` to `extend([value])` --- src/brush/deap_api/nsga2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index 77d1bbce..3f5b11b1 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -76,8 +76,8 @@ def calculate_statistics(ind): off2 = toolbox.mutate(ind2) # Inserting parent if mutation failed - offspring.append(off1 if off1 is not None else ind1) - offspring.append(off2 if off2 is not None else ind2) + offspring.extend([off1 if off1 is not None else ind1]) + offspring.extend([off2 if off2 is not None else ind2]) # archive.update(offspring) # Evaluate the individuals with an invalid fitness From d6e97784818b4c4591188c5548aace700d4ff96d Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 31 Aug 2023 17:02:53 -0400 Subject: [PATCH 011/199] Better way to make sure PTC2, make_program and subtree will work --- src/search_space.cpp | 22 +++++----- src/search_space.h | 101 +++++++++++++++++++++++++++++++------------ 2 files changed, 85 insertions(+), 38 deletions(-) diff --git a/src/search_space.cpp b/src/search_space.cpp index 4ea0b518..1eafbb06 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -120,15 +120,15 @@ vector generate_terminals(const Dataset& d) return sum / count; }; - auto cXf = Node(NodeType::Constant, Signature{}, true, "C"); + auto cXf = Node(NodeType::Constant, Signature{}, true, "Cf"); cXf.set_prob_change(signature_avg(cXf.ret_type)); terminals.push_back(cXf); - auto cXi = Node(NodeType::Constant, Signature{}, true, "C"); + auto cXi = Node(NodeType::Constant, Signature{}, true, "Ci"); cXi.set_prob_change(signature_avg(cXi.ret_type)); terminals.push_back(cXi); - auto cXb = Node(NodeType::Constant, Signature{}, false, "C"); + auto cXb = Node(NodeType::Constant, Signature{}, false, "Cb"); cXb.set_prob_change(signature_avg(cXb.ret_type)); terminals.push_back(cXb); @@ -263,8 +263,11 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // Tree.append_child(qspot, sample_terminal(t)); auto opt = sample_terminal(t); - while (!opt) - opt = sample_terminal(t); + + // if it returned optional, then there's nothing to sample based on weights. + // We'll force sampling again with uniform probs + if (!opt) + opt = sample_terminal(t, true); // If we successfully get a terminal, use it n = opt.value(); @@ -280,8 +283,8 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // TreeIter new_spot = Tree.append_child(qspot, n); // qspot = n; - while (!opt) - opt = sample_op(t); + if (!opt) + opt = sample_terminal(t, true); n = opt.value(); @@ -321,9 +324,8 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // auto newspot = Tree.replace(qspot, sample_terminal(t)); auto opt = sample_terminal(t); - while (!opt) { - opt = sample_terminal(t); - } + if (!opt) + opt = sample_terminal(t, true); n = opt.value(); diff --git a/src/search_space.h b/src/search_space.h index 1778b183..088957b0 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -312,7 +312,7 @@ struct SearchSpace /// @brief Get a random terminal /// @return `std::optional` that may contain a terminal Node. - std::optional sample_terminal() const + std::optional sample_terminal(bool force_return=false) const { //TODO: match terminal args_type (probably '{}' or something?) // make a separate terminal_map @@ -320,17 +320,24 @@ struct SearchSpace // We'll make terminal types to have its weights proportional to the // DataTypes Weights they hold vector data_type_weights(terminal_weights.size()); - std::transform( - terminal_weights.begin(), - terminal_weights.end(), - data_type_weights.begin(), - [](const auto& tw){ - return std::reduce(tw.second.begin(), tw.second.end()); } - ); - - if (!has_solution_space(data_type_weights.begin(), - data_type_weights.end())) - return std::nullopt; + if (force_return) + { + std::fill(data_type_weights.begin(), data_type_weights.end(), 1.0f); + } + else + { + std::transform( + terminal_weights.begin(), + terminal_weights.end(), + data_type_weights.begin(), + [](const auto& tw){ + return std::reduce(tw.second.begin(), tw.second.end()); } + ); + + if (!has_solution_space(data_type_weights.begin(), + data_type_weights.end())) + return std::nullopt; + } // If we got this far, then it is garanteed that we'll return something // The match take into account datatypes with non-zero weights @@ -341,16 +348,32 @@ struct SearchSpace data_type_weights.end() ); - return *r.select_randomly( - match.second.begin(), match.second.end(), - terminal_weights.at(match.first).begin(), - terminal_weights.at(match.first).end() - ); + // theres always a constant of each data type + vector match_weights(match.second.size()); + if (force_return) + { + std::fill(match_weights.begin(), match_weights.end(), 1.0f); + } + else + { + std::transform( + terminal_weights.at(match.first).begin(), + terminal_weights.at(match.first).end(), + match_weights.begin(), + [](const auto& w){ return w; }); + + if (!has_solution_space(match_weights.begin(), + match_weights.end())) + return std::nullopt; + } + + return *r.select_randomly(match.second.begin(), match.second.end(), + match_weights.begin(), match_weights.end()); }; /// @brief Get a random terminal with return type `R` /// @return `std::optional` that may contain a terminal Node of type `R`. - std::optional sample_terminal(DataType R) const + std::optional sample_terminal(DataType R, bool force_return=false) const { // should I keep doing this check? // if (terminal_map.find(R) == terminal_map.end()){ @@ -358,16 +381,33 @@ struct SearchSpace // HANDLE_ERROR_THROW(msg); // } + // If there's at least one constant for every data type, its always possible to force sample_terminal to return something + // TODO: try to combine with above function - if ( (terminal_map.find(R) == terminal_map.end()) - || (!has_solution_space(terminal_weights.at(R).begin(), - terminal_weights.at(R).end())) ) + vector match_weights(terminal_weights.at(R).size()); + if (force_return) + { + std::fill(match_weights.begin(), match_weights.end(), 1.0f); + } + else + { + std::transform( + terminal_weights.at(R).begin(), + terminal_weights.at(R).end(), + match_weights.begin(), + [](const auto& w){ return w; } + ); + + if ( (terminal_map.find(R) == terminal_map.end()) + || (!has_solution_space(match_weights.begin(), + match_weights.end())) ) return std::nullopt; - + } + return *r.select_randomly(terminal_map.at(R).begin(), - terminal_map.at(R).end(), - terminal_weights.at(R).begin(), - terminal_weights.at(R).end()); + terminal_map.at(R).end(), + match_weights.begin(), + match_weights.end()); }; /// @brief get an operator matching return type `ret`. @@ -376,6 +416,8 @@ struct SearchSpace std::optional sample_op(DataType ret) const { // check(ret); + if (node_map.find(ret) == node_map.end()) + return std::nullopt; //TODO: match terminal args_type (probably '{}' or something?) auto ret_match = node_map.at(ret); @@ -408,6 +450,8 @@ struct SearchSpace std::optional sample_op(NodeType type, DataType R) { // check(R); + if (node_map.find(R) == node_map.end()) + return std::nullopt; auto ret_match = node_map.at(R); @@ -649,6 +693,8 @@ P SearchSpace::make_program(int max_d, int max_size) // We can only have a terminal here, but the terminal must be compatible auto opt = sample_terminal(root_type); + if (!opt) + opt = sample_terminal(root_type, true); if (!opt){ auto msg = fmt::format("Program with size=1 could not be created. " @@ -682,9 +728,8 @@ P SearchSpace::make_program(int max_d, int max_size) else { // we start with a non-terminal (can be replaced inside PTC2 though, if max_size==1) auto opt = sample_op(root_type); - while (!opt) { - opt = sample_op(root_type); - } + if (!opt) + opt = sample_terminal(root_type, true); root = opt.value(); } From 9044d5a9ef02de400d66ba60d8d20734ca8ba353 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 31 Aug 2023 17:03:18 -0400 Subject: [PATCH 012/199] Improved find_spot and removed template in base class --- src/variation.h | 170 +++++++++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 73 deletions(-) diff --git a/src/variation.h b/src/variation.h index 03267966..ef98333d 100644 --- a/src/variation.h +++ b/src/variation.h @@ -38,28 +38,39 @@ class MutationBase { { } - template - auto find_spots(const Program& prog) const -> vector // override for custom behavior + virtual auto find_spots(tree& Tree) const -> vector { - // It is important to use prog.Tree.size instead of prog.size(). The - // later takes into account node weights (coefficients), but we want - // weight of nodes that are actually in the tree structure - vector weights(prog.Tree.size()); + vector weights(Tree.size()); // by default, mutation can happen anywhere, based on node weights - std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), - [&](const auto& n){ - return n.get_prob_change();}); + std::transform(Tree.begin(), Tree.end(), weights.begin(), + [&](const auto& n){ return n.get_prob_change();}); // Should have same size as prog.Tree.size, even if all weights <= 0.0 return weights; } - virtual auto mutate_inplace(tree& Tree, Iter spot) const -> bool = 0; + virtual auto operator()(tree& Tree, Iter spot) const -> bool = 0; auto SS() const -> SearchSpace { return SS_; } auto max_size() const -> size_t { return max_size_; } auto max_depth() const -> size_t{ return max_depth_; } +protected: + static size_t size_with_weights(tree& Tree, bool include_weight=true) + { + size_t acc = 0; + + std::for_each(Tree.begin(), Tree.end(), + [include_weight, &acc](auto& node){ + ++acc; // the node operator or terminal + + if (include_weight && node.get_is_weighted()==true) + acc += 2; // weight and multiplication, if enabled + }); + + return acc; + } + private: SearchSpace SS_; // where to sample nodes to change the program @@ -82,7 +93,7 @@ class PointMutation : public MutationBase { } - auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "point mutation\n"; @@ -114,34 +125,37 @@ class InsertMutation : public MutationBase { } - template - auto find_spots(const Program& prog) const -> vector + auto find_spots(tree& Tree) const -> vector override { - vector weights(prog.Tree.size()); + vector weights; - if (prog.size() < max_size()) { - auto prog_iter = prog.Tree.begin(); - std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + if (size_with_weights(Tree) < max_size()) { + Iter iter = Tree.begin(); + std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), [&](const auto& n){ - size_t d = prog.depth_to_reach( prog_iter ); - - std::advance(prog_iter, 1); - - if (d < max_depth()) - return n.get_prob_change(); - else - return 0.0f; + size_t d = 1+Tree.depth(iter); + std::advance(iter, 1); + + // check if SS holds an operator to avoid failing `check` in sample_op_with_arg + if ((d >= max_depth()) + || (SS().node_map.find(n.ret_type) == SS().node_map.end())) { + return 0.0f; + } + else { + return n.get_prob_change(); + } }); } else { // fill the vector with zeros, since we're already at max_size + weights.resize(Tree.size()); std::fill(weights.begin(), weights.end(), 0.0f); } return weights; } - auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "insert mutation\n"; auto spot_type = spot.node->data.ret_type; @@ -209,7 +223,7 @@ class DeleteMutation : public MutationBase { } - auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "delete mutation\n"; @@ -242,19 +256,19 @@ class ToggleWeightOnMutation : public MutationBase { } - template - auto find_spots(const Program& prog) const -> vector + auto find_spots(tree& Tree) const -> vector override { - vector weights(prog.Tree.size()); + vector weights(Tree.size()); - if (prog.size() < max_size()) { - std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + if (size_with_weights(Tree) < max_size()) { + std::transform(Tree.begin(), Tree.end(), weights.begin(), [&](const auto& n){ // only weighted nodes can be toggled off - if (!n.node->data.get_is_weighted()) + if (!n.get_is_weighted() + && IsWeighable(n.ret_type)) return n.get_prob_change(); else - return 0.0; + return 0.0f; }); } else { @@ -265,7 +279,7 @@ class ToggleWeightOnMutation : public MutationBase return weights; } - auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "toggle_weight_on mutation\n"; @@ -292,23 +306,23 @@ class ToggleWeightOffMutation : public MutationBase { } - template - auto find_spots(const Program& prog) const -> vector + auto find_spots(tree& Tree) const -> vector override { - vector weights(prog.Tree.size()); + vector weights(Tree.size()); - std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), + std::transform(Tree.begin(), Tree.end(), weights.begin(), [&](const auto& n){ - if (n.node->data.get_is_weighted()) + if (n.get_is_weighted() + && IsWeighable(n.ret_type)) return n.get_prob_change(); else - return 0.0; + return 0.0f; }); return weights; } - auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "toggle_weight_off mutation\n"; @@ -335,34 +349,37 @@ class SubtreeMutation : public MutationBase } // TODO: make different private functions to find spots and use them. theres too much copy and paste here - template - auto find_spots(const Program& prog) const -> vector + auto find_spots(tree& Tree) const -> vector override { - vector weights(prog.Tree.size()); - - if (prog.size() < max_size()) { - auto prog_iter = prog.Tree.begin(); - std::transform(prog.Tree.begin(), prog.Tree.end(), weights.begin(), - [&](const auto& n){ - size_t d = prog.depth_to_reach( prog_iter ); + vector weights; - std::advance(prog_iter, 1); + auto node_map = SS().node_map; - if (d < max_depth()) - return n.get_prob_change(); + if (size_with_weights(Tree) < max_size()) { + Iter iter = Tree.begin(); + std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), + [&](const auto& n){ + size_t d = 1+Tree.depth(iter); + std::advance(iter, 1); + + // we need to make sure there's some node to start the subtree + if ((d >= max_depth()) + || (SS().node_map.find(n.ret_type) == SS().node_map.end()) + || (SS().node_map.find(n.ret_type) == SS().node_map.end()) ) + return 0.0f; else - return 0.0f; + return n.get_prob_change(); }); } else { - // fill the vector with zeros, since we're already at max_size + weights.resize(Tree.size()); std::fill(weights.begin(), weights.end(), 0.0f); } return weights; } - auto mutate_inplace(tree& Tree, Iter spot) const -> bool override + auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "subtree mutation\n"; @@ -487,14 +504,18 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS HANDLE_ERROR_THROW(msg); } - if (PARAMS.value("write_mutation_trace", false)==true) + if (PARAMS.value("write_mutation_trace", false)==true) { PARAMS["mutation_trace"]["mutation"] = choice; + } + + Program child(parent); // choose location by weighted sampling of program - auto weights = mutation->find_spots(parent); + auto weights = mutation->find_spots(child.Tree); - if (PARAMS.value("write_mutation_trace", false)==true) + if (PARAMS.value("write_mutation_trace", false)==true) { PARAMS["mutation_trace"]["spot_weights"] = weights; + } if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { return w<=0.0; @@ -502,25 +523,26 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS { // There is no spot that has a probability to be selected return std::nullopt; } - - // if we got this far, mutation is going to happen - Program child(parent); // apply the mutation and check if it succeeded - auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), // TODO: get weights from mutation - weights.begin(), weights.end()); + auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), + weights.begin(), weights.end()); + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["spot"] = spot.node->get_model(false); + PARAMS["mutation_trace"]["status"] = "sampled the spot"; + } + // Every mutation here works inplace, so they return bool instead of // std::optional to indicare the result of their manipulation over the // program tree. Here we call the mutation function and return the result - bool success = mutation->mutate_inplace(child.Tree, spot); + bool success = (*mutation)(child.Tree, spot); if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["spot"] = choice; - PARAMS["mutation_trace"]["status"] = "sampled and aplied the mutation"; - + PARAMS["mutation_trace"]["status"] = "aplied the mutation"; if (success) PARAMS["mutation_trace"]["child"] = child.get_model("compact", true); + } if (success @@ -528,17 +550,19 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS && (child.depth() <= PARAMS["max_depth"].get()) )){ // success is true only if mutation returned a valid program - if (PARAMS.value("write_mutation_trace", false)==true) + if (PARAMS.value("write_mutation_trace", false)==true) { PARAMS["mutation_trace"]["success"] = true; + } return child; } else { // here we have a string in PARAMS["mutation_trace"]["child"], // but success is false since it didnt return an valid program - if (PARAMS.value("write_mutation_trace", false)==true) + if (PARAMS.value("write_mutation_trace", false)==true) { PARAMS["mutation_trace"]["status"] = "mutation returned child, but it exceeds max_size or max_depth"; - + //fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); + } return std::nullopt; } }; @@ -625,7 +649,7 @@ std::optional> cross(const Program& root, const Program& other) return n.get_prob_change(); else // setting the weight to zero to indicate a non-feasible crossover point - return float(0.0); + return 0.0f; } ); From 10827205f17fbf0eec7326463f32c18b044c47e7 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 31 Aug 2023 17:03:53 -0400 Subject: [PATCH 013/199] Printing more informations for debug --- tests/cpp/test_data.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index 628358fe..e6306179 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -63,15 +63,18 @@ TEST(Data, MixedVariableTypes) for (size_t d = 5; d < 10; ++d) for (size_t s = 5; s < 20; ++s) { - + fmt::print( + "=================================================\n" + "depth={}, size={}. ", d, s + ); + PARAMS["max_size"] = s; PARAMS["max_depth"] = d; RegressorProgram PRG = SS.make_regressor(s-4, d-4); + fmt::print( - "=================================================\n" - "Tree model for depth = {}, size= {}: {}\n", - d, s, PRG.get_model("compact", true) + "Tree model: {}\n", PRG.get_model("compact", true) ); // visualizing detailed information for the model @@ -96,7 +99,7 @@ TEST(Data, MixedVariableTypes) if (!opt){ fmt::print("Mutation failed to create a child\n"); - fmt::print("{}", PARAMS["mutation_trace"].get().dump()); + fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); } else { auto Child = opt.value(); From cb8e3ecc256fe86e2fbce7c26aedc338cad275d8 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Sep 2023 16:46:43 -0400 Subject: [PATCH 014/199] Bug fixes. Avoiding re-fit. cloning expressions --- src/brush/deap_api/nsga2.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index 3f5b11b1..710cf234 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -33,12 +33,9 @@ def calculate_statistics(ind): pop = toolbox.population(n=MU) - batch = toolbox.getBatch() # everytime this function is called, a new random batch is generated - # OBS: evaluate calls fit in the individual. It is different from using it to predict. The # function evaluateValidation don't call the fit - fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop) - + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit @@ -54,9 +51,14 @@ def calculate_statistics(ind): # Begin the generational process for gen in range(1, NGEN): - if (use_batch): #batch will be random only if it is not the size of the entire train set. In this case, we dont need to reevaluate the whole pop - batch = toolbox.getBatch() - fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop) + batch = toolbox.getBatch() # batch will be a random subset only if it was not defined as the size of the train set. + # everytime this function is called, a new random batch is generated. + if (use_batch): # recalculate the fitness for the parents + # use_batch is false if batch_size is different from train set size. + # If we're using batch, we need to re-evaluate every model (without changing its weights). + # evaluateValidation doesnt fit the weights + fitnesses = toolbox.map( + functools.partial(toolbox.evaluateValidation, data=batch), pop) for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit @@ -76,18 +78,20 @@ def calculate_statistics(ind): off2 = toolbox.mutate(ind2) # Inserting parent if mutation failed - offspring.extend([off1 if off1 is not None else ind1]) - offspring.extend([off2 if off2 is not None else ind2]) - - # archive.update(offspring) - # Evaluate the individuals with an invalid fitness - invalid_ind = [ind for ind in offspring if not ind.fitness.valid] - fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), invalid_ind) - for ind, fit in zip(invalid_ind, fitnesses): + offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)]) + offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)]) + + # Evaluate (instead of evaluateValidation) to fit the weights of the offspring + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) + if (use_batch): #calculating objectives based on batch + fitnesses = toolbox.map(functools.partial(toolbox.evaluateValidation, data=batch), offspring) + + for ind, fit in zip(offspring, fitnesses): ind.fitness.values = fit # Select the next generation population pop = toolbox.survive(pop + offspring, MU) + record = stats.compile(pop) logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) From 732a57972dd5fd39b0ab491cc54173114d3f88a0 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Sep 2023 16:48:27 -0400 Subject: [PATCH 015/199] Simple interface to clone a program and return a copy --- src/bindings/bind_programs.h | 2 ++ src/program/program.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index 96a36b71..abdd9d76 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -52,6 +52,8 @@ void bind_program(py::module& m, string name) .def("mutate", &T::mutate, py::return_value_policy::automatic, "Performs one attempt to stochastically mutate the program and generate a child") .def("set_search_space", &T::set_search_space) + //.def("copy", &T::copy<>, py::return_value_policy::copy) + .def("copy", [](const T& self){ T clone(self); return clone; }) .def(py::pickle( [](const T &p) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ diff --git a/src/program/program.h b/src/program/program.h index 631bb7a2..188cbdd7 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -82,6 +82,8 @@ template struct Program SSref = std::optional>{s}; } + Program copy() { return Program(*this); } + inline void set_search_space(const std::reference_wrapper s) { SSref = std::optional>{s}; From 20ef18d8fa4e3afd6444a4779e7785d39825fe55 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Sep 2023 16:53:29 -0400 Subject: [PATCH 016/199] Testing clone method --- tests/cpp/test_program.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index 3fb2de8b..de7084f9 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -19,10 +19,18 @@ TEST(Program, MakeRegressor) RegressorProgram PRG = SS.make_regressor(d, s); fmt::print( "=================================================\n" - "Tree model for depth = {}, size= {}: {}\n" - "=================================================\n", + "Tree model for depth = {}, size= {}: {}\n", d, s, PRG.get_model("compact", true) ); + + auto clone = PRG.copy(); + fmt::print( + "Copy of the original model: {}\n" + "=================================================\n", + clone.get_model("compact", true) + ); + + ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) ); } } From e9424a79be6eedb3aa3af99db240066036ed4932 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Sep 2023 16:54:09 -0400 Subject: [PATCH 017/199] Implementation of simple GA with tournament selection --- src/brush/deap_api/ga.py | 85 ++++++++++++++++++++++++++++++++++++++ src/brush/estimator.py | 33 ++++++++++----- tests/python/test_brush.py | 8 +++- 3 files changed, 114 insertions(+), 12 deletions(-) create mode 100644 src/brush/deap_api/ga.py diff --git a/src/brush/deap_api/ga.py b/src/brush/deap_api/ga.py new file mode 100644 index 00000000..be91ae93 --- /dev/null +++ b/src/brush/deap_api/ga.py @@ -0,0 +1,85 @@ +from deap import tools +from deap.benchmarks.tools import diversity, convergence, hypervolume +import numpy as np +import functools + + +def ga(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): + def calculate_statistics(ind): + return (*ind.fitness.values, *toolbox.evaluateValidation(ind)) + + stats = tools.Statistics(calculate_statistics) + + stats.register("avg", np.mean, axis=0) + stats.register("med", np.median, axis=0) + stats.register("std", np.std, axis=0) + stats.register("min", np.min, axis=0) + stats.register("max", np.max, axis=0) + + logbook = tools.Logbook() + logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \ + "med (O1 train, O2 train, O1 val, O2 val)", \ + "std (O1 train, O2 train, O1 val, O2 val)", \ + "min (O1 train, O2 train, O1 val, O2 val)", \ + "max (O1 train, O2 train, O1 val, O2 val)" + + pop = toolbox.population(n=MU) + + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) + for ind, fit in zip(pop, fitnesses): + ind.fitness.values = fit + + record = stats.compile(pop) + logbook.record(gen=0, evals=len(pop), **record) + + if verbosity > 0: + print(logbook.stream) + + # Begin the generational process + for gen in range(1, NGEN): + batch = toolbox.getBatch() + if (use_batch): + fitnesses = toolbox.map( + functools.partial(toolbox.evaluateValidation, data=batch), pop) + + for ind, fit in zip(pop, fitnesses): + ind.fitness.values = fit + + # Vary the population + parents = toolbox.select(pop, len(pop)) + + offspring = [] + for ind1, ind2 in zip(parents[::2], parents[1::2]): + off1, off2 = None, None + if rnd_flt() < CXPB: + off1, off2 = toolbox.mate(ind1, ind2) + else: + off1 = toolbox.mutate(ind1) + off2 = toolbox.mutate(ind2) + + offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)]) + offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)]) + + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) + if (use_batch): + fitnesses = toolbox.map(functools.partial(toolbox.evaluateValidation, data=batch), offspring) + + for ind, fit in zip(offspring, fitnesses): + ind.fitness.values = fit + + # Select the next generation population with offspring strategy + pop = offspring + + record = stats.compile(pop) + logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) + + if verbosity > 0: + print(logbook.stream) + + if verbosity > 0: + print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) + + archive = tools.ParetoFront() + archive.update(pop) + + return archive, logbook \ No newline at end of file diff --git a/src/brush/estimator.py b/src/brush/estimator.py index fd4913af..5f07eabc 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -13,7 +13,7 @@ # from tqdm import tqdm from types import NoneType import _brush -from .deap_api import nsga2, DeapIndividual +from .deap_api import nsga2, ga, DeapIndividual # from _brush import Dataset, SearchSpace @@ -59,6 +59,8 @@ class BrushEstimator(BaseEstimator): initialization : {"grow", "full"}, default "grow" Strategy to create the initial population. If `full`, then every expression is created with `max_size` nodes. If `grow`, size will be uniformly distributed. + algorithm : {"nsga2", "ga"}, default "nsga2" + Which Evolutionary Algorithm framework to use to evolve the population. validation_size : float, default 0.0 Percentage of samples to use as a hold-out partition. These samples are used to calculate statistics during evolution, but not used to train the models. @@ -109,6 +111,7 @@ def __init__( "toggle_weight_on":1/6, "toggle_weight_off":1/6}, functions: list[str]|dict[str,float] = {}, initialization="grow", + algorithm="nsga2", random_state=None, validation_size: float = 0.0, batch_size: float = 1.0 @@ -116,6 +119,7 @@ def __init__( self.pop_size=pop_size self.max_gen=max_gen self.verbosity=verbosity + self.algorithm=algorithm self.mode=mode self.max_depth=max_depth self.max_size=max_size @@ -142,6 +146,8 @@ def _setup_toolbox(self, data_train, data_validation): # create Individual class, inheriting from self.Individual with a fitness attribute creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) + + toolbox.register("Clone", lambda ind: creator.Individual(ind.prg.copy())) toolbox.register("mate", self._crossover) toolbox.register("mutate", self._mutate) @@ -149,8 +155,12 @@ def _setup_toolbox(self, data_train, data_validation): # When solving multi-objective problems, selection and survival must # support this feature. This means that these selection operators must # accept a tuple of fitnesses as argument) - toolbox.register("select", tools.selTournamentDCD) - toolbox.register("survive", tools.selNSGA2) + if self.algorithm=="nsga2": + toolbox.register("select", tools.selTournamentDCD) + toolbox.register("survive", tools.selNSGA2) + elif self.algorithm=="ga": + toolbox.register("select", tools.selTournament, tournsize=3) + toolbox.register("survive", tools.selNSGA2) # toolbox.population will return a list of elements by calling toolbox.individual toolbox.register("createRandom", self._make_individual) @@ -222,12 +232,15 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) - archive, logbook = nsga2( - self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, - (0.0 Date: Tue, 26 Sep 2023 09:27:17 -0400 Subject: [PATCH 018/199] Initial structure to implement an Island GA. taskflow added as depend. --- environment.yml | 1 + src/brushGA.cpp | 11 ++++++++ src/brushGA.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ src/selection.h | 17 ++++++++++++ 4 files changed, 98 insertions(+) create mode 100644 src/brushGA.cpp create mode 100644 src/brushGA.h create mode 100644 src/selection.h diff --git a/environment.yml b/environment.yml index 53e9d812..8e6a62d6 100644 --- a/environment.yml +++ b/environment.yml @@ -10,6 +10,7 @@ dependencies: - gxx >= 12.0 - ninja - ceres-solver + - taskflow - pybind11 #=2.6.2 - pytest #=6.2.4 - pydot diff --git a/src/brushGA.cpp b/src/brushGA.cpp new file mode 100644 index 00000000..1b16a4d7 --- /dev/null +++ b/src/brushGA.cpp @@ -0,0 +1,11 @@ +#include "brushGA.h" +#include + + +using namespace Brush; + +/// @brief initialize Feat object for fitting. +void BrushGA::init() +{ + +} diff --git a/src/brushGA.h b/src/brushGA.h new file mode 100644 index 00000000..02d0c536 --- /dev/null +++ b/src/brushGA.h @@ -0,0 +1,69 @@ +/* Brush +copyright 2020 William La Cava +license: GNU/GPL v3 +*/ + +#ifndef BrushGA_H +#define BrushGA_H + +#include "init.h" +#include "taskflow/taskflow.hpp" + +// TODO: improve the includes (why does this lines below does not work?) +// #include "variation.h" +// #include "selection.h" + +// using namespace selection; +// using namespace variation; + +namespace Brush +{ + +class BrushGA{ +public: + + BrushGA(){} + /// destructor + ~BrushGA(){} + + void init(); + + //getters and setters for GA configuration. + // getters and setters for the best solution found after evolution + // predict, transform, predict_proba, etc. + // get statistics + // load and save best individuals + // logger, save to file + // execution archive + // random state control + // score functions + // fit methods (this will run the evolution), run a single generation +private: + // attributes (hyperparameters) + // update best + // calculate/print stats +}; + +int main(){ + + tf::Executor executor; + tf::Taskflow taskflow; + + auto [A, B, C, D] = taskflow.emplace( // create four tasks + [] () { std::cout << "TaskA\n"; }, + [] () { std::cout << "TaskB\n"; }, + [] () { std::cout << "TaskC\n"; }, + [] () { std::cout << "TaskD\n"; } + ); + + A.precede(B, C); // A runs before B and C + D.succeed(B, C); // D runs after B and C + + executor.run(taskflow).wait(); + + return 0; +} + +} // Brush + +#endif diff --git a/src/selection.h b/src/selection.h new file mode 100644 index 00000000..3ce1f849 --- /dev/null +++ b/src/selection.h @@ -0,0 +1,17 @@ +/* Brush +copyright 2020 William La Cava +license: GNU/GPL v3 +*/ + +#ifndef SELECTION_H +#define SELECTION_H + +namespace selection { + +class SelectorBase { +public: +private: +}; + +} // selection +#endif \ No newline at end of file From 058579bc6c3dd99131abf4b6262a6b5d9fd69ad1 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 26 Sep 2023 13:49:06 -0400 Subject: [PATCH 019/199] Changed GA so it doesnt require a new file --- src/brush/deap_api/ga.py | 85 ---------------------------------------- src/brush/estimator.py | 14 +++---- 2 files changed, 5 insertions(+), 94 deletions(-) delete mode 100644 src/brush/deap_api/ga.py diff --git a/src/brush/deap_api/ga.py b/src/brush/deap_api/ga.py deleted file mode 100644 index be91ae93..00000000 --- a/src/brush/deap_api/ga.py +++ /dev/null @@ -1,85 +0,0 @@ -from deap import tools -from deap.benchmarks.tools import diversity, convergence, hypervolume -import numpy as np -import functools - - -def ga(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): - def calculate_statistics(ind): - return (*ind.fitness.values, *toolbox.evaluateValidation(ind)) - - stats = tools.Statistics(calculate_statistics) - - stats.register("avg", np.mean, axis=0) - stats.register("med", np.median, axis=0) - stats.register("std", np.std, axis=0) - stats.register("min", np.min, axis=0) - stats.register("max", np.max, axis=0) - - logbook = tools.Logbook() - logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \ - "med (O1 train, O2 train, O1 val, O2 val)", \ - "std (O1 train, O2 train, O1 val, O2 val)", \ - "min (O1 train, O2 train, O1 val, O2 val)", \ - "max (O1 train, O2 train, O1 val, O2 val)" - - pop = toolbox.population(n=MU) - - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - record = stats.compile(pop) - logbook.record(gen=0, evals=len(pop), **record) - - if verbosity > 0: - print(logbook.stream) - - # Begin the generational process - for gen in range(1, NGEN): - batch = toolbox.getBatch() - if (use_batch): - fitnesses = toolbox.map( - functools.partial(toolbox.evaluateValidation, data=batch), pop) - - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # Vary the population - parents = toolbox.select(pop, len(pop)) - - offspring = [] - for ind1, ind2 in zip(parents[::2], parents[1::2]): - off1, off2 = None, None - if rnd_flt() < CXPB: - off1, off2 = toolbox.mate(ind1, ind2) - else: - off1 = toolbox.mutate(ind1) - off2 = toolbox.mutate(ind2) - - offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)]) - offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)]) - - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) - if (use_batch): - fitnesses = toolbox.map(functools.partial(toolbox.evaluateValidation, data=batch), offspring) - - for ind, fit in zip(offspring, fitnesses): - ind.fitness.values = fit - - # Select the next generation population with offspring strategy - pop = offspring - - record = stats.compile(pop) - logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) - - if verbosity > 0: - print(logbook.stream) - - if verbosity > 0: - print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) - - archive = tools.ParetoFront() - archive.update(pop) - - return archive, logbook \ No newline at end of file diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 5f07eabc..40d62672 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -160,7 +160,8 @@ def _setup_toolbox(self, data_train, data_validation): toolbox.register("survive", tools.selNSGA2) elif self.algorithm=="ga": toolbox.register("select", tools.selTournament, tournsize=3) - toolbox.register("survive", tools.selNSGA2) + def offspring(pop, MU): return pop[-MU:] + toolbox.register("survive", offspring) # toolbox.population will return a list of elements by calling toolbox.individual toolbox.register("createRandom", self._make_individual) @@ -232,14 +233,9 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) - if self.algorithm=="nsga2": - self.archive_, self.logbook_ = nsga2( - self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, - (0.0 Date: Wed, 27 Sep 2023 14:32:54 -0400 Subject: [PATCH 020/199] Fixed include of non-existing file --- src/brush/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 40d62672..f3dd2719 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -13,7 +13,7 @@ # from tqdm import tqdm from types import NoneType import _brush -from .deap_api import nsga2, ga, DeapIndividual +from .deap_api import nsga2, DeapIndividual # from _brush import Dataset, SearchSpace From f238ebd401b0349f3eaee23ce2877498f3e03837 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 27 Sep 2023 16:17:09 -0400 Subject: [PATCH 021/199] Changed `get_params` to have same arguments as base class from sklearn --- src/brush/estimator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index f3dd2719..a564ab8b 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -318,8 +318,15 @@ def predict(self, X): # # return [self._create_deap_individual_(p) for p in programs] # return programs - def get_params(self): - return {k:v for k,v in self.__dict__.items() if not k.endswith('_')} + def get_params(self, deep=True): + out = dict() + for (key, value) in self.__dict__.items(): + if not key.endswith('_'): + if deep and hasattr(value, "get_params") and not isinstance(value, type): + deep_items = value.get_params().items() + out.update((key + "__" + k, val) for k, val in deep_items) + out[key] = value + return out class BrushClassifier(BrushEstimator,ClassifierMixin): From 91ba36737f3b2845f7b0efc4bc74ca5900ca213b Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 28 Sep 2023 10:08:36 -0400 Subject: [PATCH 022/199] Fixed bug fit with a dataframe but predict with an np.array --- src/brush/estimator.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index a564ab8b..b5cf14a3 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -213,7 +213,13 @@ def fit(self, X, y): if self.random_state is not None: _brush.set_random_state(self.random_state) - self.data_ = self._make_data(X,y, validation_size=self.validation_size) + self.feature_names_ = [] + if isinstance(X, pd.DataFrame): + self.feature_names_ = X.columns.to_list() + + self.data_ = self._make_data(X, y, + feature_names=self.feature_names_, + validation_size=self.validation_size) # set n classes if relevant if self.mode=="classification": @@ -237,7 +243,6 @@ def fit(self, X, y): self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, (0.0 Date: Thu, 28 Sep 2023 16:52:06 -0400 Subject: [PATCH 023/199] Predict now use types from training data Now we can make predictions with a single sample, and also without having to specify the feature names used in training. This changes are made since brush's search space and dispatch table use these information to evaluate an expression tree. --- src/bindings/bind_dataset.cpp | 61 ++++++++----------------------- src/brush/estimator.py | 39 +++++++++++++------- src/data/data.cpp | 67 +++++++++++++++++++++++++++++++++-- src/data/data.h | 30 +++++++++++++++- 4 files changed, 135 insertions(+), 62 deletions(-) diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp index 872750d5..ade036dc 100644 --- a/src/bindings/bind_dataset.cpp +++ b/src/bindings/bind_dataset.cpp @@ -9,77 +9,46 @@ namespace nl = nlohmann; void bind_dataset(py::module & m) { py::class_(m, "Dataset") - - // construct from X - // .def(py::init &>()) - // construct from X (and optional validation and batch sizes) with constructor 3. - .def(py::init([](const Ref& X, - const float validation_size=0.0, - const float batch_size=1.0){ - return br::Data::Dataset( - X, {}, validation_size, batch_size); - }), - py::arg("X"), - py::arg("validation_size") = 0.0, - py::arg("batch_size") = 1.0 - ) - // construct from X, feature names - // .def(py::init< - // const Ref&, - // const vector& - // >() - // ) // construct from X, feature names (and optional validation and batch sizes) with constructor 3. .def(py::init([](const Ref& X, - const vector& feature_names, + const vector& feature_names=vector(), const float validation_size=0.0, const float batch_size=1.0){ return br::Data::Dataset( X, feature_names, validation_size, batch_size); }), py::arg("X"), - py::arg("feature_names"), + py::arg("feature_names") = vector(), py::arg("validation_size") = 0.0, py::arg("batch_size") = 1.0 ) - - // construct from X, y arrays - // .def(py::init &, Ref &>()) - // construct from X, y arrays (and optional validation and batch sizes) with constructor 2. + // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2. .def(py::init([](const Ref& X, const Ref& y, + const vector& feature_names=vector(), const float validation_size=0.0, const float batch_size=1.0){ return br::Data::Dataset( - X, y, {}, {}, false, validation_size, batch_size); + X, y, feature_names, {}, false, validation_size, batch_size); }), py::arg("X"), py::arg("y"), + py::arg("feature_names") = vector(), py::arg("validation_size") = 0.0, py::arg("batch_size") = 1.0 ) - - // construct from X, y, feature names - // .def(py::init< - // const Ref&, - // const Ref&, - // const vector& - // >() - // ) - // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2. + // construct from X, feature names, but copying the feature types from a + // reference dataset with constructor 4. Useful for predicting (specially + // because the user can provide a single element matrix, or an array with + // no feature names). .def(py::init([](const Ref& X, - const Ref& y, - const vector& feature_names, - const float validation_size=0.0, - const float batch_size=1.0){ - return br::Data::Dataset( - X, y, feature_names, {}, false, validation_size, batch_size); + const br::Data::Dataset& ref_dataset, + const vector& feature_names){ + return br::Data::Dataset(X, ref_dataset, feature_names); }), py::arg("X"), - py::arg("y"), - py::arg("feature_names"), - py::arg("validation_size") = 0.0, - py::arg("batch_size") = 1.0 + py::arg("ref_dataset"), + py::arg("feature_names") ) .def_readwrite("y", &br::Data::Dataset::y) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index b5cf14a3..f3baa2c8 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -288,25 +288,30 @@ def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): y = y.values if isinstance(X, pd.DataFrame): X = X.values - if isinstance(y, NoneType): - return _brush.Dataset(X, - feature_names=feature_names, validation_size=validation_size) - else: - return _brush.Dataset(X, y, - feature_names=feature_names, validation_size=validation_size) - + assert isinstance(X, np.ndarray) - # if there is no label, don't include it in library call to Dataset if isinstance(y, NoneType): - return _brush.Dataset(X,feature_names=feature_names, validation_size=validation_size) + return _brush.Dataset(X=X, + feature_names=feature_names, validation_size=validation_size) - return _brush.Dataset(X, y, feature_names=feature_names, validation_size=validation_size) + return _brush.Dataset(X=X, y=y, + feature_names=feature_names, validation_size=validation_size) def predict(self, X): """Predict using the best estimator in the archive. """ - data = self._make_data(X, feature_names=self.feature_names_) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = _brush.Dataset(X=X, ref_dataset=self.data_, + feature_names=self.feature_names_) + + # data = self._make_data(X, feature_names=self.feature_names_) + return self.best_estimator_.predict(data) # def _setup_population(self): @@ -403,7 +408,17 @@ def predict_proba(self, X): classes corresponds to that in the attribute :term:`classes_`. """ - data = self._make_data(X) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = _brush.Dataset(X=X, ref_dataset=self.data_, + feature_names=self.feature_names_) + + # data = self._make_data(X, feature_names=self.feature_names_) + return self.best_estimator_.predict_proba(data) class BrushRegressor(BrushEstimator, RegressorMixin): diff --git a/src/data/data.cpp b/src/data/data.cpp index b80668df..7521c668 100644 --- a/src/data/data.cpp +++ b/src/data/data.cpp @@ -100,7 +100,17 @@ State check_type(const ArrayXf& x) } } return tmp; +} +template +State cast_type(const ArrayXf& x, const StateRef& x_ref) +{ + if (std::holds_alternative(x_ref)) + return ArrayXi(x.cast()); + else if (std::holds_alternative(x_ref)) + return ArrayXb(x.cast()); + + return x; } /// return a slice of the data using indices idx @@ -222,9 +232,9 @@ void Dataset::set_batch_size(float new_size) { /// turns input data into a feature map map Dataset::make_features(const ArrayXXf& X, - const map& Z, - const vector& vn - ) + const map& Z, + const vector& vn + ) { // fmt::print("Dataset::make_features()\n"); map tmp_features; @@ -265,6 +275,57 @@ map Dataset::make_features(const ArrayXXf& X, return tmp_features; }; +/// turns input into a feature map, with feature types copied from a reference +map Dataset::copy_and_make_features(const ArrayXXf& X, + const Dataset& ref_dataset, + const vector& vn + ) +{ + vector var_names; + if (vn.empty()) + { + for (int i = 0; i < X.cols(); ++i) + { + string v = "x_"+to_string(i); + var_names.push_back(v); + } + } + else + { + if (vn.size() != X.cols()) + HANDLE_ERROR_THROW( + fmt::format("Variable names and data size mismatch: " + "{} variable names and {} features in X", + vn.size(), + X.cols() + ) + ); + var_names = vn; + } + + if (ref_dataset.features.size() != vn.size()) + HANDLE_ERROR_THROW( + fmt::format("Reference dataset with incompatible number of variables: " + "Reference has {} variable names, but X has {}", + ref_dataset.features.size(), + vn.size() + ) + ); + + map tmp_features; + for (int i = 0; i < X.cols(); ++i) + { + State tmp = cast_type( + X.col(i).array(), + ref_dataset.features.at(var_names.at(i)) + ); + + tmp_features[var_names.at(i)] = tmp; + } + + return tmp_features; +}; + ostream& operator<<(ostream& os, DataType dt) { os << DataTypeName[dt]; diff --git a/src/data/data.h b/src/data/data.h index 629c02b5..358f0856 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -36,6 +36,10 @@ namespace Data /// determines data types of columns of matrix X. State check_type(const ArrayXf& x); DataType StateType(const State& arg); + +template +State cast_type(const ArrayXf& x, const StateRef& x_ref); + /////////////////////////////////////////////////////////////////////////////// /*! @@ -94,6 +98,12 @@ class Dataset const vector& vn = {} ); + /// turns input into a feature map, with feature types copied from a reference + map copy_and_make_features(const ArrayXXf& X, + const Dataset& ref_dataset, + const vector& vn = {} + ); + /// 1. initialize data from a map. Dataset(std::map& d, const Ref& y_ = ArrayXf(), @@ -134,13 +144,31 @@ class Dataset /// 3. initialize data from X and feature names Dataset(const ArrayXXf& X, const vector& vn, float validation_size = 0.0, - float batch_size = 1.0) + float batch_size = 1.0 + ) : classification(false) , features(make_features(X,map{},vn)) , validation_size(validation_size) , use_validation(validation_size > 0.0 && validation_size < 1.0) , batch_size(batch_size) , use_batch(batch_size > 0.0 && batch_size < 1.0) + { + init(); + Xref = optional>{X}; + } + + //// 4. initialize data from X, but feature types are copied from a + //// reference dataset. Useful for bypass Brush's type sniffer and + //// doing predictions with small number of samples + Dataset(const ArrayXXf& X, const Dataset& ref_dataset, + const vector& vn + ) + : classification(false) + , features(copy_and_make_features(X,ref_dataset,vn)) + , validation_size(0.0) + , use_validation(false) + , batch_size(1.0) + , use_batch(false) { init(); Xref = optional>{X}; From d4324a03978f70f4b5757002ef586de22c7cac10 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 29 Sep 2023 09:58:02 -0400 Subject: [PATCH 024/199] `predict_proba` returns 2d array for binary classification This is aligned with standard scikit interface for predict proba --- src/bindings/bind_programs.h | 2 +- src/brush/estimator.py | 14 +++++++++++++- tests/python/test_brush.py | 16 ++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index abdd9d76..f18e188e 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -76,7 +76,7 @@ void bind_program(py::module& m, string name) "predict from Dataset object") .def("predict_proba", static_cast &X)>(&T::predict_proba), - "fit from X,y data"); + "predict from X data"); } } \ No newline at end of file diff --git a/src/brush/estimator.py b/src/brush/estimator.py index f3baa2c8..957938b4 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -5,6 +5,7 @@ control of the underlying GP objects. """ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.utils.validation import check_is_fitted # from sklearn.metrics import mean_squared_error import numpy as np import pandas as pd @@ -302,6 +303,8 @@ def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): def predict(self, X): """Predict using the best estimator in the archive. """ + check_is_fitted(self) + if isinstance(X, pd.DataFrame): X = X.values @@ -408,6 +411,8 @@ def predict_proba(self, X): classes corresponds to that in the attribute :term:`classes_`. """ + + check_is_fitted(self) if isinstance(X, pd.DataFrame): X = X.values @@ -419,7 +424,14 @@ def predict_proba(self, X): # data = self._make_data(X, feature_names=self.feature_names_) - return self.best_estimator_.predict_proba(data) + prob = self.best_estimator_.predict_proba(data) + + if self.n_classes_ <= 2: + prob = np.hstack( (np.ones(X.shape[0]).reshape(-1,1), prob.reshape(-1,1)) ) + prob[:, 0] -= prob[:, 1] + + return prob + class BrushRegressor(BrushEstimator, RegressorMixin): """Brush for regression. diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py index 894fae6c..3cc191ba 100644 --- a/tests/python/test_brush.py +++ b/tests/python/test_brush.py @@ -63,6 +63,22 @@ def test_fit(setup, algorithm, brush_args, request): except Exception as e: pytest.fail(f"Unexpected Exception caught: {e}") logging.error(traceback.format_exc()) + +@pytest.mark.parametrize('setup', + [('classification_setup'), + ('multiclass_classification_setup')]) +def test_predict_proba(setup, brush_args, request): + + Estimator, X, y = request.getfixturevalue(setup) + + est = Estimator(**brush_args) + est.fit(X, y) + + y_prob = est.predict_proba(X) + assert len(y_prob.shape) == 2, "predict_proba should be 2-dimensional" + assert y_prob.shape[1] >= 2, \ + "every class should have its own column (even for binary clf)" + # def test_random_state(): # TODO: make it work From 52611bb396346feea4b5f32ba1eb64f67e8adf6f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 29 Sep 2023 10:35:18 -0400 Subject: [PATCH 025/199] Fixed wrong comparison when throwing an error while copying from a reference dataset --- src/data/data.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/data.cpp b/src/data/data.cpp index 7521c668..858ba126 100644 --- a/src/data/data.cpp +++ b/src/data/data.cpp @@ -303,12 +303,12 @@ map Dataset::copy_and_make_features(const ArrayXXf& X, var_names = vn; } - if (ref_dataset.features.size() != vn.size()) + if (ref_dataset.features.size() != var_names.size()) HANDLE_ERROR_THROW( fmt::format("Reference dataset with incompatible number of variables: " "Reference has {} variable names, but X has {}", ref_dataset.features.size(), - vn.size() + var_names.size() ) ); From f42590d3f6d729f2708ca8da7992adc3b3dd6392 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 2 Oct 2023 16:58:54 -0400 Subject: [PATCH 026/199] Bug fix - binary clf programs being created without `logistic` as root --- src/program/node.h | 2 +- src/search_space.h | 67 +++++++++++++++------------------------------- src/variation.h | 1 - 3 files changed, 23 insertions(+), 47 deletions(-) diff --git a/src/program/node.h b/src/program/node.h index cf4541ef..a5238c46 100644 --- a/src/program/node.h +++ b/src/program/node.h @@ -239,7 +239,7 @@ struct Node { //TODO revisit float get_prob_change() const { return fixed ? 0.0 : this->prob_change;}; void set_prob_change(float w){ if (!fixed) this->prob_change = w;}; - float get_prob_keep() const { return 1-this->prob_change;}; + float get_prob_keep() const { return fixed ? 1.0 : 1.0-this->prob_change;}; inline void set_feature(string f){ feature = f; }; inline string get_feature() const { return feature; }; diff --git a/src/search_space.h b/src/search_space.h index 088957b0..9e763de2 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -686,55 +686,32 @@ P SearchSpace::make_program(int max_d, int max_size) ProgramType program_type = P::program_type; // ProgramType program_type = ProgramTypeEnum::value; - auto Tree = tree(); - if (max_size == 1 || max_d == 1) - { - // auto root = Tree.insert(Tree.begin(), sample_terminal(root_type)); + // building the root node for each program case. We give the root, and it + // fills the rest of the tree + Node root; - // We can only have a terminal here, but the terminal must be compatible - auto opt = sample_terminal(root_type); + // building the root node for each program case + if (P::program_type == ProgramType::BinaryClassifier) + { + root = get(NodeType::Logistic, DataType::ArrayF, Signature()); + root.set_prob_change(0.0); + root.fixed=true; + } + else if (P::program_type == ProgramType::MulticlassClassifier) + { + root = get(NodeType::Softmax, DataType::MatrixF, Signature()); + root.set_prob_change(0.0); + root.fixed=true; + } + else { + // we start with a non-terminal (can be replaced inside PTC2 though, if max_size==1) + auto opt = sample_op(root_type); if (!opt) opt = sample_terminal(root_type, true); - - if (!opt){ - auto msg = fmt::format("Program with size=1 could not be created. " - "The search space does not contain any terminal with data type {}./n", - root_type); - HANDLE_ERROR_THROW(msg); - } - - Tree.insert(Tree.begin(), opt.value()); - } - else {// Our program can (and will) be grater than 1 node - - // building the root node for each program case. We give the root, and it - // fills the rest of the tree - Node root; - - // building the root node for each program case - if (P::program_type == ProgramType::BinaryClassifier) - { - root = get(NodeType::Logistic, DataType::ArrayF, Signature()); - root.set_prob_change(0.0); - root.fixed=true; - - } - else if (P::program_type == ProgramType::MulticlassClassifier) - { - root = get(NodeType::Softmax, DataType::MatrixF, Signature()); - root.set_prob_change(0.0); - root.fixed=true; - } - else { - // we start with a non-terminal (can be replaced inside PTC2 though, if max_size==1) - auto opt = sample_op(root_type); - if (!opt) - opt = sample_terminal(root_type, true); - root = opt.value(); - } - - Tree = PTC2(root, max_d, max_size); + root = opt.value(); } + + auto Tree = PTC2(root, max_d, max_size); return P(*this,Tree); }; diff --git a/src/variation.h b/src/variation.h index ef98333d..22d72a5f 100644 --- a/src/variation.h +++ b/src/variation.h @@ -542,7 +542,6 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS PARAMS["mutation_trace"]["status"] = "aplied the mutation"; if (success) PARAMS["mutation_trace"]["child"] = child.get_model("compact", true); - } if (success From 9a6c7f1c64807773d99b07880345d5c472fdaf41 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 3 Oct 2023 09:35:53 -0400 Subject: [PATCH 027/199] New fix. Some classification programs still being modified --- src/program/node.cpp | 2 -- src/program/node.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/program/node.cpp b/src/program/node.cpp index 2c632249..9e7d3167 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -199,8 +199,6 @@ void from_json(const json &j, Node& p) if (j.contains("prob_change")) j.at("prob_change").get_to(p.prob_change); - else - p.prob_change=1.0; // if node has a ret_type and arg_types, get them. if not we need to make diff --git a/src/program/node.h b/src/program/node.h index a5238c46..56af0512 100644 --- a/src/program/node.h +++ b/src/program/node.h @@ -238,7 +238,7 @@ struct Node { // getters and setters //TODO revisit float get_prob_change() const { return fixed ? 0.0 : this->prob_change;}; - void set_prob_change(float w){ if (!fixed) this->prob_change = w;}; + void set_prob_change(float w){ this->prob_change = w;}; float get_prob_keep() const { return fixed ? 1.0 : 1.0-this->prob_change;}; inline void set_feature(string f){ feature = f; }; From 74f9d3aa0c0ef01cf0f6a6ca9e249da9424cd8d6 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 3 Oct 2023 12:44:06 -0400 Subject: [PATCH 028/199] Improved counting nodes --- src/program/program.h | 13 ++++- src/variation.h | 17 +++++- tests/cpp/test_variation.cpp | 104 +++++++++++++++++++++++++++++++++-- 3 files changed, 125 insertions(+), 9 deletions(-) diff --git a/src/program/program.h b/src/program/program.h index 188cbdd7..a2ad5206 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -99,8 +99,17 @@ template struct Program [include_weight, &acc](auto& node){ ++acc; // the node operator or terminal - if (include_weight && node.get_is_weighted()==true) - acc += 2; // weight and multiplication, if enabled + // SplitBest has an optimizable decision tree consisting of 3 nodes + // (terminal, arithmetic comparison, value) that needs to be taken + // into account + if (Is(node.node_type)) + acc += 3; + + if ( (include_weight && node.get_is_weighted()==true) + && Isnt(node.node_type) ) + // weighted constants still count as 1 (simpler than constant terminals) + // Taking into account the weight and multiplication, if enabled + acc += 2; }); return acc; diff --git a/src/variation.h b/src/variation.h index 22d72a5f..22d42a77 100644 --- a/src/variation.h +++ b/src/variation.h @@ -58,14 +58,27 @@ class MutationBase { protected: static size_t size_with_weights(tree& Tree, bool include_weight=true) { + // re-implementation of int Node::size(bool include_weight=true) meant + // to work with the tree instead of brush's programs. + // TODO: find a better way to have this function available to mutations + // and avoid repeated functions size_t acc = 0; std::for_each(Tree.begin(), Tree.end(), [include_weight, &acc](auto& node){ ++acc; // the node operator or terminal - if (include_weight && node.get_is_weighted()==true) - acc += 2; // weight and multiplication, if enabled + // SplitBest has an optimizable decision tree consisting of 3 nodes + // (terminal, arithmetic comparison, value) that needs to be taken + // into account + if (Is(node.node_type)) + acc += 3; + + if ( (include_weight && node.get_is_weighted()==true) + && Isnt(node.node_type) ) + // Taking into account the weight and multiplication, if enabled. + // weighted constants still count as 1 (simpler than constant terminals) + acc += 2; }); return acc; diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index 861a0420..f6e93b37 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -4,7 +4,101 @@ #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" -TEST(Operators, InsertMutationWorks) +TEST(Variation, FixedRootDoesntChange) +{ + PARAMS["mutation_options"] = { + {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} + }; + PARAMS["max_size"] = 20; + PARAMS["max_depth"] = 10; + + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0; + + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); + + auto logistic_hash = Signature().hash(); + + for (int d = 1; d < 10; ++d) + { + for (int s = 1; s < 10; ++s) + { + int successes = 0; + for (int attempt = 0; attempt < 10; ++attempt) + { + // different program types changes how predict works (and the rettype of predict) + ClassifierProgram PRG = SS.make_classifier(d, s); + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model 1: {}\n", + d, s, + PRG.get_model("compact", true) + ); + + Node root = *(PRG.Tree.begin()); + ASSERT_TRUE(root.node_type == NodeType::Logistic); + ASSERT_TRUE(root.ret_type == DataType::ArrayF); + ASSERT_TRUE(root.sig_hash == logistic_hash); + ASSERT_TRUE(root.get_prob_change()==0.0); + ASSERT_TRUE(root.fixed==true); + + auto opt_mutation = PRG.mutate(); + if (opt_mutation) + { + successes += 1; + auto Mut_Child = opt_mutation.value(); + fmt::print("After mutation : {}\n", + Mut_Child.get_model("compact", true)); + + Node mut_child_root = *(Mut_Child.Tree.begin()); + ASSERT_TRUE(mut_child_root.node_type == NodeType::Logistic); + ASSERT_TRUE(mut_child_root.ret_type == DataType::ArrayF); + ASSERT_TRUE(mut_child_root.sig_hash == logistic_hash); + ASSERT_TRUE(mut_child_root.get_prob_change()==0.0); + ASSERT_TRUE(mut_child_root.fixed==true); + } + + ClassifierProgram PRG2 = SS.make_classifier(d, s); + auto opt_cx = PRG.cross(PRG2); + if (opt_cx) + { + successes += 1; + auto CX_Child = opt_cx.value(); + fmt::print("After crossover: {}\n", + CX_Child.get_model("compact", true)); + + Node cx_child_root = *(CX_Child.Tree.begin()); + ASSERT_TRUE(cx_child_root.node_type == NodeType::Logistic); + ASSERT_TRUE(cx_child_root.ret_type == DataType::ArrayF); + ASSERT_TRUE(cx_child_root.sig_hash == logistic_hash); + ASSERT_TRUE(cx_child_root.get_prob_change()==0.0); + ASSERT_TRUE(cx_child_root.fixed==true); + } + + // root remained unchanged + ASSERT_TRUE(root.node_type == NodeType::Logistic); + ASSERT_TRUE(root.ret_type == DataType::ArrayF); + ASSERT_TRUE(root.sig_hash == logistic_hash); + ASSERT_TRUE(root.get_prob_change()==0.0); + ASSERT_TRUE(root.fixed==true); + } + ASSERT_TRUE(successes > 0); + } + } +} + +TEST(Variation, InsertMutationWorks) { // TODO: this tests could be parameterized. // To understand design implementation of this test, check Mutation test @@ -111,7 +205,7 @@ TEST(Operators, InsertMutationWorks) ASSERT_TRUE(successes > 0); } -TEST(Operators, Mutation) +TEST(Variation, Mutation) { PARAMS["write_mutation_trace"] = true; PARAMS["mutation_options"] = { @@ -189,7 +283,7 @@ TEST(Operators, Mutation) ASSERT_TRUE(successes > 0); } -TEST(Operators, MutationSizeAndDepthLimit) +TEST(Variation, MutationSizeAndDepthLimit) { PARAMS["write_mutation_trace"] = true; PARAMS["mutation_options"] = { @@ -287,7 +381,7 @@ TEST(Operators, MutationSizeAndDepthLimit) ASSERT_TRUE(successes > 0); } -TEST(Operators, Crossover) +TEST(Variation, Crossover) { MatrixXf X(10,2); ArrayXf y(10); @@ -363,7 +457,7 @@ TEST(Operators, Crossover) ASSERT_TRUE(successes > 0); } -TEST(Operators, CrossoverSizeAndDepthLimit) +TEST(Variation, CrossoverSizeAndDepthLimit) { MatrixXf X(10,2); ArrayXf y(10); From 8f2b33283b68eeac6849bbbee3541b2796ae6930 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 3 Oct 2023 15:37:29 -0400 Subject: [PATCH 029/199] Changed max arity in test_variation --- src/program/functions.h | 1 - src/program/program.h | 17 +++++++++++++---- tests/cpp/test_variation.cpp | 14 +++++++------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/program/functions.h b/src/program/functions.h index ff5acc8f..b6a959bf 100644 --- a/src/program/functions.h +++ b/src/program/functions.h @@ -202,7 +202,6 @@ namespace Brush t.row(i).maxCoeff(&idx(i)); return idx; } - }; template<> diff --git a/src/program/program.h b/src/program/program.h index a2ad5206..93db3e54 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -107,8 +107,8 @@ template struct Program if ( (include_weight && node.get_is_weighted()==true) && Isnt(node.node_type) ) + // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) - // Taking into account the weight and multiplication, if enabled acc += 2; }); @@ -130,13 +130,22 @@ template struct Program // Then make the second one point to the next sibling eit.skip_children(); ++eit; - + // calculate tree size for each node until reach next sibling while(it!=eit) { ++acc; // counting the node operator/terminal - if (include_weight && it.node->data.get_is_weighted()==true) - acc += 2; // weight and multiplication, if enabled + // SplitBest has an optimizable decision tree consisting of 3 nodes + // (terminal, arithmetic comparison, value) that needs to be taken + // into account + if (Is(it.node->data.node_type)) + acc += 3; + + if ( (include_weight && it.node->data.get_is_weighted()==true) + && Isnt(it.node->data.node_type) ) + // Taking into account the weight and multiplication, if enabled. + // weighted constants still count as 1 (simpler than constant terminals) + acc += 2; ++it; } diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index f6e93b37..eae5e67d 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -305,10 +305,10 @@ TEST(Variation, MutationSizeAndDepthLimit) SearchSpace SS; SS.init(data); - - // split operator --> arity 3 - // prod operator --> arity 4 - int max_arity = 4; + + // prod operator --> arity 4: prod(T1, T2, T3) + // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) + int max_arity = 6; int successes = 0; for (int d = 5; d < 15; ++d) @@ -475,9 +475,9 @@ TEST(Variation, CrossoverSizeAndDepthLimit) SearchSpace SS; SS.init(data); - // split operator --> arity 3 - // prod operator --> arity 4 - int max_arity = 4; + // prod operator --> arity 4: prod(T1, T2, T3) + // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) + int max_arity = 6; int successes = 0; for (int d = 5; d < 15; ++d) From f0fa367b9a9970b87504c0547a86e5adb7fd35b1 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 4 Oct 2023 10:49:38 -0400 Subject: [PATCH 030/199] MeanLabel node --- src/program/nodetype.cpp | 3 +++ src/program/nodetype.h | 28 ++++++++++++++++++++-------- src/program/operator.h | 31 +++++++++++++++++++++++++++++++ src/program/signatures.h | 7 +++++++ tests/cpp/test_data.cpp | 7 ++++--- 5 files changed, 65 insertions(+), 11 deletions(-) diff --git a/src/program/nodetype.cpp b/src/program/nodetype.cpp index b58302a7..d6332d8c 100644 --- a/src/program/nodetype.cpp +++ b/src/program/nodetype.cpp @@ -66,6 +66,9 @@ std::map NodeNameType = { {"SplitBest", NodeType::SplitBest}, {"SplitOn", NodeType::SplitOn}, + //mean label + {"MeanLabel", NodeType::MeanLabel}, + // leaves {"Constant", NodeType::Constant}, {"Terminal", NodeType::Terminal}, diff --git a/src/program/nodetype.h b/src/program/nodetype.h index b1e9ce4f..de4cd9ff 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -72,6 +72,8 @@ enum class NodeType : uint64_t { //split SplitBest = 1UL << 35UL, SplitOn = 1UL << 36UL, + // mean label of a split + MeanLabel = 1UL << 37UL, // these ones change type /* Equals = 1UL << 39UL, */ /* LessThan = 1UL << 40UL, */ @@ -79,14 +81,14 @@ enum class NodeType : uint64_t { /* Leq = 1UL << 42UL, */ /* Geq = 1UL << 43UL, */ // leaves - Constant = 1UL << 37UL, - Terminal = 1UL << 38UL, - ArgMax = 1UL << 39UL, - Count = 1UL << 40UL, + Constant = 1UL << 38UL, + Terminal = 1UL << 39UL, + ArgMax = 1UL << 40UL, + Count = 1UL << 41UL, // custom - CustomUnaryOp = 1UL << 41UL, - CustomBinaryOp = 1UL << 42UL, - CustomSplit = 1UL << 43UL + CustomUnaryOp = 1UL << 42UL, + CustomBinaryOp = 1UL << 43UL, + CustomSplit = 1UL << 44UL // boolean // And = 1UL << 37UL, // Or = 1UL << 38UL, @@ -98,7 +100,7 @@ enum class NodeType : uint64_t { using UnderlyingNodeType = std::underlying_type_t; struct NodeTypes { // magic number keeping track of the number of different node types - static constexpr size_t Count = 39; + static constexpr size_t Count = 40; static constexpr size_t OpCount = Count-2; // returns the index of the given type in the NodeType enum @@ -200,6 +202,9 @@ NLOHMANN_JSON_SERIALIZE_ENUM( NodeType, { {NodeType::SplitBest,"SplitBest" }, {NodeType::SplitOn,"SplitOn" }, + //mean label + {NodeType::MeanLabel,"MeanLabel" }, + // leaves {NodeType::Constant,"Constant" }, {NodeType::Terminal,"Terminal" }, @@ -265,6 +270,7 @@ static constexpr bool BinaryOp = is_in_v; + template static constexpr bool AssociativeBinaryOp = is_in_v; + +template +static constexpr bool NullaryOp = is_in_v; + // // TODO: make this work // template // concept Transformer = requires(NT n, size_t ArgCount) diff --git a/src/program/operator.h b/src/program/operator.h index 195afc6a..f59205fe 100644 --- a/src/program/operator.h +++ b/src/program/operator.h @@ -305,6 +305,37 @@ struct Operator }; }; +//////////////////////////////////////////////////////////////////////////// +// MeanLabel overload +template +struct Operator +{ + using RetType = typename S::RetType; + using W = typename S::WeightType; + + RetType fit(const Dataset& d, TreeNode& tn) const { + tn.data.W = d.y.mean(); + return predict(d, tn); + }; + + template + RetType predict(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const + { + Scalar w = util::get_weight(tn, weights); + if constexpr (N == 1) + return RetType::Constant(d.get_n_samples(), w); + else + return RetType::Constant(d.get_n_samples(), d.get_n_features(), w); + }; + + RetType eval(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const { + if constexpr (Fit) + return fit(d,tn); + else + return predict(d,tn,weights); + }; +}; + //////////////////////////////////////////////////////////////////////////// // Operator overloads // Split diff --git a/src/program/signatures.h b/src/program/signatures.h index a46e58a9..78cd2440 100644 --- a/src/program/signatures.h +++ b/src/program/signatures.h @@ -378,5 +378,12 @@ struct Signatures{ using type = decltype(std::tuple_cat(unaryTuple(), naryTuple())); }; + +template +struct Signatures>>{ + using type = std::tuple< + Signature + >; + }; } // namespace Brush #endif \ No newline at end of file diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index e6306179..417fc84d 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -48,9 +48,10 @@ TEST(Data, MixedVariableTypes) y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 unordered_map user_ops = { - {"Add", 1}, - {"Sub", 1}, - {"SplitOn", 1} + {"Add", 1.0}, + {"Sub", 1.0}, + {"SplitOn", 1.0}, + {"MeanLabel", 1.0} }; Dataset dt(X, y); From 217014674bfeab1abeefa1ade9a5b1d6172bf576 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 5 Oct 2023 11:10:46 -0400 Subject: [PATCH 031/199] Fixed `algorithm`==`ga` not returning the best individual --- src/brush/deap_api/nsga2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index 710cf234..809be792 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -89,9 +89,12 @@ def calculate_statistics(ind): for ind, fit in zip(offspring, fitnesses): ind.fitness.values = fit - # Select the next generation population + # Select the next generation population (no sorting before this step, as + # survive==offspring will cut it in half) pop = toolbox.survive(pop + offspring, MU) + pop.sort(key=lambda x: x.fitness, reverse=True) + record = stats.compile(pop) logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) From 547bfdc266744b30c267767e5cfb91ac9f326662 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 5 Oct 2023 11:11:13 -0400 Subject: [PATCH 032/199] Added multiclass dataset (currently brush can do only binary clf) --- .../datasets/d_analcatdata_happiness.csv | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 docs/examples/datasets/d_analcatdata_happiness.csv diff --git a/docs/examples/datasets/d_analcatdata_happiness.csv b/docs/examples/datasets/d_analcatdata_happiness.csv new file mode 100644 index 00000000..b18d1319 --- /dev/null +++ b/docs/examples/datasets/d_analcatdata_happiness.csv @@ -0,0 +1,61 @@ +Years_of_schooling,Siblings,Count,target +0,0,15.0,0 +1,0,34.0,0 +2,0,36.0,0 +3,0,22.0,0 +0,1,61.0,0 +1,1,31.0,0 +2,1,60.0,0 +3,1,46.0,0 +0,2,25.0,0 +1,2,26.0,0 +2,2,35.0,0 +3,2,45.0,0 +0,3,30.0,0 +1,3,13.0,0 +2,3,8.0,0 +3,3,18.0,0 +0,4,14.0,0 +1,4,3.0,0 +2,4,3.0,0 +3,4,4.0,0 +0,0,17.0,1 +1,0,53.0,1 +2,0,70.0,1 +3,0,67.0,1 +0,1,79.0,1 +1,1,60.0,1 +2,1,96.0,1 +3,1,45.0,1 +0,2,40.0,1 +1,2,31.0,1 +2,2,63.0,1 +3,2,74.0,1 +0,3,39.0,1 +1,3,24.0,1 +2,3,7.0,1 +3,3,15.0,1 +0,4,15.0,1 +1,4,9.0,1 +2,4,2.0,1 +3,4,1.0,1 +0,0,7.0,2 +1,0,20.0,2 +2,0,23.0,2 +3,0,16.0,2 +0,1,36.0,2 +1,1,5.0,2 +2,1,12.0,2 +3,1,11.0,2 +0,2,12.0,2 +1,2,7.0,2 +2,2,5.0,2 +3,2,10.0,2 +0,3,4.0,2 +1,3,4.0,2 +2,3,3.0,2 +3,3,2.0,2 +0,4,1.0,2 +1,4,2.0,2 +2,4,0.0,2 +3,4,1.0,2 From c25feddbdfbed19dd618a07f42c82c643724a45d Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 5 Oct 2023 12:44:30 -0400 Subject: [PATCH 033/199] Changed MeanLabel to be a terminal (not an operator) --- src/program/node.cpp | 4 ++++ src/program/node.h | 2 +- src/program/nodetype.cpp | 4 +--- src/program/nodetype.h | 9 +++------ src/program/program.h | 6 +++--- src/program/signatures.h | 4 ++-- src/search_space.cpp | 8 +++++++- src/search_space.h | 6 +++--- src/variation.h | 2 +- tests/cpp/test_search_space.cpp | 12 ++++++------ 10 files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/program/node.cpp b/src/program/node.cpp index 9e7d3167..e984b730 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -31,6 +31,10 @@ auto Node::get_name(bool include_weight) const noexcept -> std::string { return fmt::format("{:.2f}", W); } + else if (Is(node_type) && include_weight) + { + return fmt::format("MeanLabel"); + } else if (is_weighted && include_weight) return fmt::format("{:.2f}*{}",W,name); return name; diff --git a/src/program/node.h b/src/program/node.h index 56af0512..3f9aa41d 100644 --- a/src/program/node.h +++ b/src/program/node.h @@ -264,7 +264,7 @@ template inline auto Isnt(NodeType nt) -> bool { return !((nt == T) || ...); } inline auto IsLeaf(NodeType nt) noexcept -> bool { - return Is(nt); + return Is(nt); } inline auto IsCommutative(NodeType nt) noexcept -> bool { diff --git a/src/program/nodetype.cpp b/src/program/nodetype.cpp index d6332d8c..6c9c7946 100644 --- a/src/program/nodetype.cpp +++ b/src/program/nodetype.cpp @@ -66,10 +66,8 @@ std::map NodeNameType = { {"SplitBest", NodeType::SplitBest}, {"SplitOn", NodeType::SplitOn}, - //mean label - {"MeanLabel", NodeType::MeanLabel}, - // leaves + {"MeanLabel", NodeType::MeanLabel}, {"Constant", NodeType::Constant}, {"Terminal", NodeType::Terminal}, diff --git a/src/program/nodetype.h b/src/program/nodetype.h index de4cd9ff..374cd732 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -72,8 +72,6 @@ enum class NodeType : uint64_t { //split SplitBest = 1UL << 35UL, SplitOn = 1UL << 36UL, - // mean label of a split - MeanLabel = 1UL << 37UL, // these ones change type /* Equals = 1UL << 39UL, */ /* LessThan = 1UL << 40UL, */ @@ -81,6 +79,7 @@ enum class NodeType : uint64_t { /* Leq = 1UL << 42UL, */ /* Geq = 1UL << 43UL, */ // leaves + MeanLabel = 1UL << 37UL, Constant = 1UL << 38UL, Terminal = 1UL << 39UL, ArgMax = 1UL << 40UL, @@ -101,7 +100,7 @@ using UnderlyingNodeType = std::underlying_type_t; struct NodeTypes { // magic number keeping track of the number of different node types static constexpr size_t Count = 40; - static constexpr size_t OpCount = Count-2; + static constexpr size_t OpCount = Count-3; // subtracting leaves // returns the index of the given type in the NodeType enum static auto GetIndex(NodeType type) -> size_t @@ -202,10 +201,8 @@ NLOHMANN_JSON_SERIALIZE_ENUM( NodeType, { {NodeType::SplitBest,"SplitBest" }, {NodeType::SplitOn,"SplitOn" }, - //mean label - {NodeType::MeanLabel,"MeanLabel" }, - // leaves + {NodeType::MeanLabel,"MeanLabel" }, {NodeType::Constant,"Constant" }, {NodeType::Terminal,"Terminal" }, diff --git a/src/program/program.h b/src/program/program.h index 93db3e54..523a0b90 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -106,7 +106,7 @@ template struct Program acc += 3; if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) + && Isnt(node.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; @@ -142,7 +142,7 @@ template struct Program acc += 3; if ( (include_weight && it.node->data.get_is_weighted()==true) - && Isnt(it.node->data.node_type) ) + && Isnt(it.node->data.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; @@ -422,7 +422,7 @@ template struct Program } // add the node - bool is_constant = Is(parent->data.node_type); + bool is_constant = Is(parent->data.node_type); string node_label = parent->data.get_name(is_constant); if (Is(parent->data.node_type)){ diff --git a/src/program/signatures.h b/src/program/signatures.h index 78cd2440..28b36db3 100644 --- a/src/program/signatures.h +++ b/src/program/signatures.h @@ -382,8 +382,8 @@ struct Signatures{ template struct Signatures>>{ using type = std::tuple< - Signature - >; + Signature + >; }; } // namespace Brush #endif \ No newline at end of file diff --git a/src/search_space.cpp b/src/search_space.cpp index 1eafbb06..10f0eede 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -121,7 +121,8 @@ vector generate_terminals(const Dataset& d) }; auto cXf = Node(NodeType::Constant, Signature{}, true, "Cf"); - cXf.set_prob_change(signature_avg(cXf.ret_type)); + float floats_avg_weights = signature_avg(cXf.ret_type); + cXf.set_prob_change(floats_avg_weights); terminals.push_back(cXf); auto cXi = Node(NodeType::Constant, Signature{}, true, "Ci"); @@ -132,6 +133,11 @@ vector generate_terminals(const Dataset& d) cXb.set_prob_change(signature_avg(cXb.ret_type)); terminals.push_back(cXb); + // mean label node + auto meanlabel = Node(NodeType::MeanLabel, Signature{}, true, "MeanLabel"); + meanlabel.set_prob_change(floats_avg_weights); + terminals.push_back(meanlabel); + return terminals; }; diff --git a/src/search_space.h b/src/search_space.h index 9e763de2..820bc95c 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -545,7 +545,7 @@ struct SearchSpace /// @return `std::optional` that may contain a Node std::optional get_node_like(Node node) const { - if (Is(node.node_type)){ + if (Is(node.node_type)){ return sample_terminal(node.ret_type); } @@ -578,7 +578,7 @@ struct SearchSpace tree PTC2(Node root, int max_d, int max_size) const; template - requires (!is_in_v) + requires (!is_in_v) static constexpr std::optional CreateNode( const auto& unique_data_types, bool use_all, @@ -632,7 +632,7 @@ struct SearchSpace const vector& unique_data_types ) { - if (Is(NT)) + if (Is(NT)) return; bool use_all = user_ops.size() == 0; auto name = NodeTypeName.at(NT); diff --git a/src/variation.h b/src/variation.h index 22d42a77..3c375549 100644 --- a/src/variation.h +++ b/src/variation.h @@ -75,7 +75,7 @@ class MutationBase { acc += 3; if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) + && Isnt(node.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp index 02eaaf19..6b58375b 100644 --- a/tests/cpp/test_search_space.cpp +++ b/tests/cpp/test_search_space.cpp @@ -25,10 +25,10 @@ TEST(SearchSpace, Initialization) // different weights to check if searchspace is initialized correctnly unordered_map user_ops = { - {"Add", 1}, - {"Sub", 1}, - {"Div", .5}, - {"Mul", 0.5} + {"Add", 1}, + {"Sub", 1}, + {"Div", .5}, + {"Mul", 0.5} }; SearchSpace SS; @@ -40,8 +40,8 @@ TEST(SearchSpace, Initialization) // dtable_predict.print(); // manually calculated. last value is the avg of prev values - ArrayXf expected_weights_Xf(4); // 4 elements (x3, x4, x5 and c) - expected_weights_Xf << 0.80240685, 0.19270448, 0.5994426, 0.531518; + ArrayXf expected_weights_Xf(4); // 5 elements (x3, x4, x5, c, meanLabel) + expected_weights_Xf << 0.80240685, 0.19270448, 0.5994426, 0.531518, 0.531518; auto actual_weights_f = SS.terminal_weights.at(DataType::ArrayF); Eigen::Map actual_weights_Xf(actual_weights_f.data(), actual_weights_f.size()); From 2a579b9cc7e7135dbf04e44987be4cb70f8b5b9a Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sun, 8 Oct 2023 15:53:10 -0400 Subject: [PATCH 034/199] python implementation of island model. fixed api and tests --- src/brush/deap_api/__init__.py | 2 + src/brush/deap_api/nsga2island.py | 143 ++++++++++++++++++++++++++++++ src/brush/estimator.py | 25 ++++-- tests/python/test_brush.py | 11 ++- 4 files changed, 172 insertions(+), 9 deletions(-) create mode 100644 src/brush/deap_api/nsga2island.py diff --git a/src/brush/deap_api/__init__.py b/src/brush/deap_api/__init__.py index b2b2dfa8..6cbb48db 100644 --- a/src/brush/deap_api/__init__.py +++ b/src/brush/deap_api/__init__.py @@ -1,2 +1,4 @@ from .nsga2 import nsga2 +from .ga import ga +from .nsga2island import nsga2island from .utils import DeapIndividual \ No newline at end of file diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py new file mode 100644 index 00000000..c976b211 --- /dev/null +++ b/src/brush/deap_api/nsga2island.py @@ -0,0 +1,143 @@ +from deap import tools +from deap.benchmarks.tools import diversity, convergence, hypervolume +import numpy as np +import functools + + +def nsga2island(toolbox, NGEN, MU, N_ISLANDS, MIGPX, CXPB, use_batch, verbosity, rnd_flt): + # NGEN = 250 + # MU = 100 + # CXPB = 0.9 + # N_ISLANDS: number of independent islands. Islands are controled by indexes. + # setting N_ISLANDS=1 would be the same as the original nsga2 + # rnd_flt: random number generator to sample crossover prob + + def calculate_statistics(ind): + on_train = ind.fitness.values + on_val = toolbox.evaluateValidation(ind) + + return (*on_train, *on_val) + + stats = tools.Statistics(calculate_statistics) + + stats.register("avg", np.mean, axis=0) + stats.register("med", np.median, axis=0) + stats.register("std", np.std, axis=0) + stats.register("min", np.min, axis=0) + stats.register("max", np.max, axis=0) + + logbook = tools.Logbook() + logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \ + "med (O1 train, O2 train, O1 val, O2 val)", \ + "std (O1 train, O2 train, O1 val, O2 val)", \ + "min (O1 train, O2 train, O1 val, O2 val)", \ + "max (O1 train, O2 train, O1 val, O2 val)" + + # Tuples with start and end indexes for each island. Number of individuals + # in each island can slightly differ if N_ISLANDS is not a divisor of MU + island_indexes = [((i*MU)//N_ISLANDS, ((i+1)*MU)//N_ISLANDS) + for i in range(N_ISLANDS)] + + print("island_indexes", island_indexes) + pop = toolbox.population(n=MU) + + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) + for ind, fit in zip(pop, fitnesses): + ind.fitness.values = fit + + survived = [] + for (idx_start, idx_end) in island_indexes: + survived_parents = toolbox.survive(pop[idx_start:idx_end], + idx_end-idx_start) + survived.extend(survived_parents) + pop = survived + + record = stats.compile(pop) + logbook.record(gen=0, evals=len(pop), **record) + + if verbosity > 0: + print(logbook.stream) + + # Begin the generational process + for gen in range(1, NGEN): + batch = toolbox.getBatch() # batch will be a random subset only if it was not + # defined as the size of the train set. everytime + # this function is called, a new random batch is generated. + + if (use_batch): # recalculate the fitness for the parents + # use_batch is false if batch_size is different from train set size. + # If we're using batch, we need to re-evaluate every model (without + # changing its weights). evaluateValidation doesnt fit the weights + fitnesses = toolbox.map( + functools.partial(toolbox.evaluateValidation, data=batch), pop) + + for ind, fit in zip(pop, fitnesses): + ind.fitness.values = fit + + # Vary the population inside each island + parents = [] + for (idx_start, idx_end) in island_indexes: + island_parents = toolbox.select(pop[idx_start:idx_end], + idx_end-idx_start) + parents.extend(island_parents) + + offspring = [] # Will have the same size as pop + for (idx_start, idx_end) in island_indexes: + for ind1, ind2 in zip(parents[idx_start:idx_end:2], + parents[idx_start+1:idx_end:2] + ): + off1, off2 = None, None + if rnd_flt() < CXPB: # either mutation or crossover + off1, off2 = toolbox.mate(ind1, ind2) + else: + off1 = toolbox.mutate(ind1) + off2 = toolbox.mutate(ind2) + + # Inserting parent if mutation failed + offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)]) + offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)]) + + # Evaluate (instead of evaluateValidation) to fit the weights of the offspring + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) + if (use_batch): #calculating objectives based on batch + fitnesses = toolbox.map( + functools.partial(toolbox.evaluateValidation, data=batch), offspring) + + for ind, fit in zip(offspring, fitnesses): + ind.fitness.values = fit + + # Select the next generation population + new_pop = [] + for (idx_start, idx_end) in island_indexes: + island_new_pop = toolbox.survive(pop[idx_start:idx_end] \ + +offspring[idx_start:idx_end], + idx_end-idx_start) + new_pop.extend(island_new_pop) + + # Migration to fill up the islands for the next generation + pop = [] + for (idx_start, idx_end) in island_indexes: + other_islands = list(range(0, idx_start)) + list(range(idx_end, MU)) + for idx_individual in range(idx_start, idx_end): + if rnd_flt() < MIGPX: # replace by someone not from the same island + idx_other_individual = other_islands[ + int(rnd_flt() * len(other_islands))] + pop.append(new_pop[idx_other_individual]) + else: + pop.append(new_pop[idx_individual]) + print(len(pop)) + for p in pop: + print(type(p), p) + record = stats.compile(pop) + logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) + + if verbosity > 0: + print(logbook.stream) + + if verbosity > 0: + print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) + + archive = tools.ParetoFront() + archive.update(pop) + + return archive, logbook \ No newline at end of file diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 5f07eabc..6249d2f9 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -13,7 +13,7 @@ # from tqdm import tqdm from types import NoneType import _brush -from .deap_api import nsga2, ga, DeapIndividual +from .deap_api import nsga2, ga, nsga2island, DeapIndividual # from _brush import Dataset, SearchSpace @@ -38,6 +38,12 @@ class BrushEstimator(BaseEstimator): Maximum depth of GP trees in the GP program. Use 0 for no limit. max_size : int, default 0 Maximum number of nodes in a tree. Use 0 for no limit. + n_islands : int, default 5 + Number of independent islands to use in evolutionary framework. + Ignored if `algorithm!="nsga2island"`. + mig_prob : float, default 0.05 + Probability of occuring a migration between two random islands at the + end of a generation, must be between 0 and 1. cx_prob : float, default 1/7 Probability of applying the crossover variation when generating the offspring, must be between 0 and 1. @@ -59,7 +65,7 @@ class BrushEstimator(BaseEstimator): initialization : {"grow", "full"}, default "grow" Strategy to create the initial population. If `full`, then every expression is created with `max_size` nodes. If `grow`, size will be uniformly distributed. - algorithm : {"nsga2", "ga"}, default "nsga2" + algorithm : {"nsga2island", "nsga2", "ga"}, default "nsga2island" Which Evolutionary Algorithm framework to use to evolve the population. validation_size : float, default 0.0 Percentage of samples to use as a hold-out partition. These samples are used @@ -106,12 +112,14 @@ def __init__( verbosity=0, max_depth=3, max_size=20, + n_islands=5, + mig_prob=0.05, cx_prob= 1/7, mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}, functions: list[str]|dict[str,float] = {}, initialization="grow", - algorithm="nsga2", + algorithm="nsga2island", random_state=None, validation_size: float = 0.0, batch_size: float = 1.0 @@ -123,6 +131,8 @@ def __init__( self.mode=mode self.max_depth=max_depth self.max_size=max_size + self.n_islands=n_islands + self.mig_prob=mig_prob self.cx_prob=cx_prob self.mutation_options=mutation_options self.functions=functions @@ -155,7 +165,7 @@ def _setup_toolbox(self, data_train, data_validation): # When solving multi-objective problems, selection and survival must # support this feature. This means that these selection operators must # accept a tuple of fitnesses as argument) - if self.algorithm=="nsga2": + if self.algorithm=="nsga2" or self.algorithm=="nsga2island": toolbox.register("select", tools.selTournamentDCD) toolbox.register("survive", tools.selNSGA2) elif self.algorithm=="ga": @@ -232,7 +242,12 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) - if self.algorithm=="nsga2": + if self.algorithm=="nsga2island": + self.archive_, self.logbook_ = nsga2island( + self.toolbox_, self.max_gen, self.pop_size, self.n_islands, + self.mig_prob, self.cx_prob, + (0.0 Date: Mon, 9 Oct 2023 15:07:48 -0400 Subject: [PATCH 035/199] Adding necessary functions if the user don't specify them --- src/brush/estimator.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 957938b4..f5a4f2e8 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -222,21 +222,31 @@ def fit(self, X, y): feature_names=self.feature_names_, validation_size=self.validation_size) + if isinstance(self.functions, list): + self.functions_ = {k:1.0 for k in self.functions} + else: + self.functions_ = self.functions + # set n classes if relevant if self.mode=="classification": self.n_classes_ = len(np.unique(y)) + # Including necessary functions for classification programs. We'll insert + # it with zero probability, so it doesn't interfer in the user-defined + # functions, but allows the search space to create the hash and mapping to + # use the functions. + if self.n_classes_ == 2 and "Logistic" not in self.functions_.keys(): + self.functions_["Logistic"] = 0.0 + elif "Softmax" not in self.functions_.keys(): + self.functions_["Softmax"] = 0.0 + + # These have a default behavior to return something meaningfull if # no values are set self.train_ = self.data_.get_training_data() self.train_.set_batch_size(self.batch_size) self.validation_ = self.data_.get_validation_data() - if isinstance(self.functions, list): - self.functions_ = {k:1.0 for k in self.functions} - else: - self.functions_ = self.functions - self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) @@ -264,7 +274,7 @@ def fit(self, X, y): # Reference should be best value each obj. can have (after normalization) reference = np.array([1, 1]) - # closest to the reference + # closest to the reference (smallest distance) final_ind_idx = np.argmin( np.linalg.norm(points - reference, axis=1) ) else: # Best in obj.1 (loss) in validation data final_ind_idx = np.argmax( points[:, 0] ) From cf81e1b24c86f43cd1b2c66b5d6b32ac8c50ce0c Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 9 Oct 2023 15:09:02 -0400 Subject: [PATCH 036/199] Implemented base for `And` and `Or` operators Now, we need to make them work and be used in the programs. Needs to implement Not. --- src/program/functions.h | 40 ++++++++++++++++++++---- src/program/node.cpp | 19 ++++++++++-- src/program/node.h | 18 ++++++++--- src/program/nodetype.cpp | 10 +++--- src/program/nodetype.h | 53 +++++++++++++++++--------------- src/program/signatures.h | 66 +++++++++++++++++++++------------------- src/search_space.cpp | 14 +++++++-- 7 files changed, 143 insertions(+), 77 deletions(-) diff --git a/src/program/functions.h b/src/program/functions.h index b6a959bf..1d3cbd0f 100644 --- a/src/program/functions.h +++ b/src/program/functions.h @@ -402,12 +402,40 @@ namespace Brush return this->softmax(t); } - // template - // inline auto operator()(const Array& first, const Ts& ... inputs) - // { - // auto output = Stack(first, inputs...); - // return this->softmax(output); - // } + // template + // inline auto operator()(const Array& first, const Ts& ... inputs) + // { + // auto output = Stack(first, inputs...); + // return this->softmax(output); + // } + }; + + /* logical and -- mul with boolean inputs */ + template<> + struct Function + { + template + inline auto operator()(const T1& t1, const T2& t2) { + return t1 * t2; + } + }; + + /* logical or -- add with boolean inputs */ + template<> + struct Function + { + template + inline auto operator()(const T1& t1, const T2& t2) { + return t1 + t2; + } + }; + + /* logical not */ + template<> + struct Function + { + template + inline auto operator()(const T& t) { return t; } }; } // Brush diff --git a/src/program/node.cpp b/src/program/node.cpp index e984b730..bcaef619 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -33,7 +33,7 @@ auto Node::get_name(bool include_weight) const noexcept -> std::string } else if (Is(node_type) && include_weight) { - return fmt::format("MeanLabel"); + return fmt::format("MeanLabel({:.2f})", W); } else if (is_weighted && include_weight) return fmt::format("{:.2f}*{}",W,name); @@ -52,7 +52,7 @@ string Node::get_model(const vector& children) const noexcept children.at(1) ); } - else if (Is(node_type)){ + else if (Is(node_type)){ // TODO: have a better print for splitOn, based on child type return fmt::format("If({}>{:.2f},{},{})", children.at(0), W, @@ -143,9 +143,22 @@ void init_node_with_default_signature(Node& node) NT::SplitBest, NT::CustomSplit >(n)) - { + { node.set_signature>(); + } + else if (Is< + NT::And, + NT::Or + >(n)) + { + node.set_signature>(); } + // else if (Is< + // NT::Not + // >(n)) + // { + // node.set_signature>(); + // } else if (Is< NT::Min, NT::Max, diff --git a/src/program/node.h b/src/program/node.h index 3f9aa41d..8092664b 100644 --- a/src/program/node.h +++ b/src/program/node.h @@ -271,7 +271,8 @@ inline auto IsCommutative(NodeType nt) noexcept -> bool { return Is(nt); + NodeType::Max + >(nt); } inline auto IsDifferentiable(NodeType nt) noexcept -> bool { @@ -281,7 +282,10 @@ inline auto IsDifferentiable(NodeType nt) noexcept -> bool { NodeType::Before, NodeType::After, NodeType::During, - NodeType::Count + NodeType::Count, + NodeType::And, + NodeType::Or, + NodeType::Not >(nt); } template @@ -294,7 +298,10 @@ inline auto IsWeighable() noexcept -> bool { NodeType::During, NodeType::Count, NodeType::SplitOn, - NodeType::SplitBest + NodeType::SplitBest, + NodeType::And, + NodeType::Or, + NodeType::Not >(NT); } inline auto IsWeighable(NodeType nt) noexcept -> bool { @@ -306,7 +313,10 @@ inline auto IsWeighable(NodeType nt) noexcept -> bool { NodeType::During, NodeType::Count, NodeType::SplitOn, - NodeType::SplitBest + NodeType::SplitBest, + NodeType::And, + NodeType::Or, + NodeType::Not >(nt); } diff --git a/src/program/nodetype.cpp b/src/program/nodetype.cpp index 6c9c7946..d9224849 100644 --- a/src/program/nodetype.cpp +++ b/src/program/nodetype.cpp @@ -31,10 +31,10 @@ std::map NodeNameType = { {"Pow", NodeType::Pow}, {"Logistic", NodeType::Logistic}, - // logic; not sure these will make it in - // {"And", NodeType::And}, - // {"Or", NodeType::Or}, - // {"Not", NodeType::Not}, + // logic + {"And", NodeType::And}, + {"Or", NodeType::Or}, + {"Not", NodeType::Not}, // {"Xor", NodeType::Xor}, // decision (same) @@ -74,7 +74,7 @@ std::map NodeNameType = { // custom {"CustomUnaryOp", NodeType::CustomUnaryOp}, {"CustomBinaryOp", NodeType::CustomBinaryOp}, - {"CustomSplit", NodeType::CustomSplit}, + {"CustomSplit", NodeType::CustomSplit} }; std::map NodeTypeName = Util::reverse_map(NodeNameType); diff --git a/src/program/nodetype.h b/src/program/nodetype.h index 374cd732..6677350d 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -50,10 +50,12 @@ enum class NodeType : uint64_t { Sqrtabs = 1UL << 17UL, Square = 1UL << 18UL, Logistic = 1UL << 19UL, + // timing masks Before = 1UL << 20UL, After = 1UL << 21UL, During = 1UL << 22UL, + // Reducers Min = 1UL << 23UL, Max = 1UL << 24UL, @@ -61,45 +63,52 @@ enum class NodeType : uint64_t { Median = 1UL << 26UL, Sum = 1UL << 27UL, Prod = 1UL << 28UL, + // Transformers Softmax = 1UL << 29UL, + // Binary Add = 1UL << 30UL, Sub = 1UL << 31UL, Mul = 1UL << 32UL, Div = 1UL << 33UL, Pow = 1UL << 34UL, + //split SplitBest = 1UL << 35UL, SplitOn = 1UL << 36UL, + // these ones change type /* Equals = 1UL << 39UL, */ /* LessThan = 1UL << 40UL, */ /* GreaterThan = 1UL << 41UL, */ /* Leq = 1UL << 42UL, */ /* Geq = 1UL << 43UL, */ - // leaves - MeanLabel = 1UL << 37UL, - Constant = 1UL << 38UL, - Terminal = 1UL << 39UL, - ArgMax = 1UL << 40UL, - Count = 1UL << 41UL, - // custom - CustomUnaryOp = 1UL << 42UL, - CustomBinaryOp = 1UL << 43UL, - CustomSplit = 1UL << 44UL + // boolean - // And = 1UL << 37UL, - // Or = 1UL << 38UL, + And = 1UL << 37UL, + Or = 1UL << 38UL, + Not = 1UL << 39UL, // Xor = 1UL << 39UL, - // Not = 1UL << 19UL, + + // leaves (must be the last ones in this enum) + MeanLabel = 1UL << 40UL, + Constant = 1UL << 41UL, + Terminal = 1UL << 42UL, + ArgMax = 1UL << 43UL, + Count = 1UL << 44UL, // TODO: move before leaves + + // custom + CustomUnaryOp = 1UL << 44UL, + CustomBinaryOp = 1UL << 45UL, + CustomSplit = 1UL << 46UL }; using UnderlyingNodeType = std::underlying_type_t; struct NodeTypes { // magic number keeping track of the number of different node types - static constexpr size_t Count = 40; + static constexpr size_t Count = 43; static constexpr size_t OpCount = Count-3; // subtracting leaves // returns the index of the given type in the NodeType enum @@ -166,10 +175,10 @@ NLOHMANN_JSON_SERIALIZE_ENUM( NodeType, { {NodeType::Pow,"Pow" }, {NodeType::Logistic,"Logistic" }, - // logic; not sure these will make it in - // {NodeType::And,"And" }, - // {NodeType::Or,"Or" }, - // {NodeType::Not,"Not" }, + // logic + {NodeType::And,"And" }, + {NodeType::Or,"Or" }, + {NodeType::Not,"Not" }, // {NodeType::Xor,"Xor" }, // decision (same) @@ -209,7 +218,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM( NodeType, { // custom {NodeType::CustomUnaryOp,"CustomUnaryOp" }, {NodeType::CustomBinaryOp,"CustomBinaryOp" }, - {NodeType::CustomSplit,"CustomSplit" }, + {NodeType::CustomSplit,"CustomSplit" } }) #endif @@ -257,6 +266,7 @@ static constexpr bool UnaryOp = is_in_v; template @@ -285,11 +295,6 @@ static constexpr bool NaryOp = is_in_v; -template -static constexpr bool NullaryOp = is_in_v; - // // TODO: make this work // template // concept Transformer = requires(NT n, size_t ArgCount) diff --git a/src/program/signatures.h b/src/program/signatures.h index 28b36db3..fa818623 100644 --- a/src/program/signatures.h +++ b/src/program/signatures.h @@ -201,6 +201,13 @@ struct Signatures; }; +template<> +struct Signatures{ + using type = std::tuple< + Signature + >; +}; + template struct Signatures; }; -// template -// struct Signatures>>{ -// using type = std::tuple< -// Signature, -// Signature -// >; -// }; - -// template<> -// struct Signatures { -// using type = std::tuple< -// Signature, -// Signature -// >; -// }; +template +struct Signatures>>{ + using type = std::tuple< + Signature + // Signature, + // Signature // TODO: just for testing. delete later + >; + }; + +template<> +struct Signatures{ + using type = std::tuple< + Signature + // Signature, + // Signature // TODO: just for testing. delete later + >; + }; template struct Signatures{ Signature, Signature, Signature, - Signature - /* Signature, */ - /* Signature, */ - /* Signature, */ - /* Signature */ + Signature, + Signature, + Signature, + Signature, + Signature >; }; - template <> - struct Signatures +template <> +struct Signatures { using unaryTuple = std::tuple< Signature >; using naryTuple = NarySignatures_t; @@ -379,11 +387,5 @@ struct Signatures{ using type = decltype(std::tuple_cat(unaryTuple(), naryTuple())); }; -template -struct Signatures>>{ - using type = std::tuple< - Signature - >; - }; } // namespace Brush #endif \ No newline at end of file diff --git a/src/search_space.cpp b/src/search_space.cpp index 10f0eede..3459bd32 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -244,12 +244,17 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const queue.push_back(make_tuple(child_spot, a, d)); } + int max_arity = 3; + Node n; // Now we actually start the PTC2 procedure to create the program tree /* cout << "queue size: " << queue.size() << endl; */ /* cout << "entering first while loop...\n"; */ - while ( 3*(queue.size()-1) + s < max_size && queue.size() > 0) + while ( max_arity*(queue.size()-1) + s < max_size && queue.size() > 0) { + // including the queue size in the max_size, since each element in queue + // can grow up exponentially + // by default, terminals are weighted (counts as 3 nodes in program size). // since every spot in queue has potential to be a terminal, we multiply // its size by 3. Subtracting one due to the fact that this loop will @@ -260,7 +265,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const auto [qspot, t, d] = RandomDequeue(queue); /* cout << "current depth: " << d << endl; */ - if (d == max_d) + if (d >= max_d) { // choose terminal of matching type /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ @@ -309,7 +314,10 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // increment is different based on node weights ++s; - if (n.get_is_weighted()) + if (Is(n.node_type)) + s += 3; + if ( n.get_is_weighted()==true + && Isnt(n.node_type) ) s += 2; /* cout << "current tree size: " << s << endl; */ From 45b8de3e24c67e7b41c79b7e619768f385f3b272 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 9 Oct 2023 15:09:50 -0400 Subject: [PATCH 037/199] Using boolean operators in test (not working yet) --- tests/cpp/test_data.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index 417fc84d..a0bff17d 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -50,8 +50,10 @@ TEST(Data, MixedVariableTypes) unordered_map user_ops = { {"Add", 1.0}, {"Sub", 1.0}, - {"SplitOn", 1.0}, - {"MeanLabel", 1.0} + // a boolean operator + {"And", 1.0}, + // operator that takes boolean as argument + {"SplitOn", 1.0} }; Dataset dt(X, y); From 7f47454c7f073524311ad0f5a9c9c45bff3aaf1f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 11 Oct 2023 12:40:59 -0400 Subject: [PATCH 038/199] Implemented boolean operators. Improved print of meanLabel and SplitOn --- src/evaluator.cpp | 98 ++++++++++++++++++++++++++++++++++++++++ src/evaluator.h | 52 +++++++++++++++++++++ src/program/functions.h | 42 +++++++++++++---- src/program/node.cpp | 15 +++++- src/program/nodetype.h | 4 +- src/program/signatures.h | 6 +-- src/search_space.cpp | 5 +- src/search_space.h | 1 + 8 files changed, 204 insertions(+), 19 deletions(-) create mode 100644 src/evaluator.cpp create mode 100644 src/evaluator.h diff --git a/src/evaluator.cpp b/src/evaluator.cpp new file mode 100644 index 00000000..4e9e6b5f --- /dev/null +++ b/src/evaluator.cpp @@ -0,0 +1,98 @@ +/* Brush +copyright 2020 William La Cava +license: GNU/GPL v3 +*/ + +#include "evaluator.h" +namespace Brush +{ + +// template<> auto OPERON_EXPORT +// MinimumDescriptionLengthEvaluator::operator()(Operon::RandomGenerator& rng, Individual& ind, Operon::Span buf) const -> typename EvaluatorBase::ReturnType { +// auto const& problem = Evaluator::GetProblem(); +// auto const range = problem.TrainingRange(); +// auto const& dataset = problem.GetDataset(); +// auto const& nodes = ind.Genotype.Nodes(); +// auto const& dtable = Evaluator::GetDispatchTable(); + +// auto const* optimizer = Evaluator::GetOptimizer(); +// EXPECT(optimizer != nullptr); + +// // this call will optimize the tree coefficients and compute the SSE +// auto const& tree = ind.Genotype; +// Operon::Interpreter interpreter{dtable, dataset, ind.Genotype}; +// auto summary = optimizer->Optimize(rng, tree); +// auto parameters = summary.Success ? summary.FinalParameters : tree.GetCoefficients(); +// auto const p { static_cast(parameters.size()) }; + +// std::vector buffer; +// if (buf.size() < range.Size()) { +// buffer.resize(range.Size()); +// buf = Operon::Span(buffer); +// } +// interpreter.Evaluate(parameters, range, buf); + +// auto estimatedValues = buf; +// auto targetValues = problem.TargetValues(range); + +// // codelength of the complexity +// // count number of unique functions +// // - count weight * variable as three nodes +// // - compute complexity c of the remaining numerical values +// // (that are not part of the coefficients that are optimized) +// Operon::Set uniqueFunctions; // to count the number of unique functions +// auto k{0.0}; // number of nodes +// auto cComplexity { 0.0 }; + +// // codelength of the parameters +// Eigen::Matrix j = interpreter.JacRev(parameters, range); // jacobian +// auto fm = optimizer->ComputeFisherMatrix(estimatedValues, {j.data(), static_cast(j.size())}, sigma_); +// auto ii = fm.diagonal().array(); +// ENSURE(ii.size() == p); + +// auto cParameters { 0.0 }; +// auto constexpr eps = std::numeric_limits::epsilon(); // machine epsilon for zero comparison + +// for (auto i = 0, j = 0; i < std::ssize(nodes); ++i) { +// auto const& n = nodes[i]; + +// // count the number of nodes and the number of unique operators +// k += n.IsVariable() ? 3 : 1; +// uniqueFunctions.insert(n.HashValue); + +// if (n.Optimize) { +// // this branch computes the description length of the parameters to be optimized +// auto const di = std::sqrt(12 / ii(j)); +// auto const ci = std::abs(parameters[j]); + +// if (!(std::isfinite(ci) && std::isfinite(di)) || ci / di < 1) { +// //ind.Genotype[i].Optimize = false; +// //auto const v = ind.Genotype[i].Value; +// //ind.Genotype[i].Value = 0; +// //auto fit = (*this)(rng, ind, buf); +// //ind.Genotype[i].Optimize = true; +// //ind.Genotype[i].Value = v; +// //return fit; +// } else { +// cParameters += 0.5 * std::log(ii(j)) + std::log(ci); +// } +// ++j; +// } else { +// // this branch computes the description length of the remaining tree structure +// if (std::abs(n.Value) < eps) { continue; } +// cComplexity += std::log(std::abs(n.Value)); +// } +// } + +// auto q { static_cast(uniqueFunctions.size()) }; +// if (q > 0) { cComplexity += static_cast(k) * std::log(q); } + +// cParameters -= p/2 * std::log(3); + +// auto cLikelihood = optimizer->ComputeLikelihood(estimatedValues, targetValues, sigma_); +// auto mdl = cComplexity + cParameters + cLikelihood; +// if (!std::isfinite(mdl)) { mdl = EvaluatorBase::ErrMax; } +// return typename EvaluatorBase::ReturnType { static_cast(mdl) }; +// } + +} // namespace Brush diff --git a/src/evaluator.h b/src/evaluator.h new file mode 100644 index 00000000..cee3b6ae --- /dev/null +++ b/src/evaluator.h @@ -0,0 +1,52 @@ +/* Brush +copyright 2020 William La Cava +license: GNU/GPL v3 +*/ + +#ifndef EVALUATOR_H +#define EVALUATOR_H + +//internal includes +#include "init.h" +#include "program/node.h" +#include "program/nodetype.h" +#include "program/tree_node.h" +// #include "program/program.h" +#include "util/utils.h" +#include "util/rnd.h" +#include "params.h" +#include +#include + +using namespace Brush::Data; +using namespace Brush::Util; +using Brush::Node; +using Brush::DataType; +using std::type_index; + +namespace Brush +{ + +// template +// class OPERON_EXPORT MinimumDescriptionLengthEvaluator final : public Evaluator { +// using Base = Evaluator; + +// public: +// explicit MinimumDescriptionLengthEvaluator(Operon::Problem& problem, DTable const& dtable) +// : Base(problem, dtable, sse_) +// , sigma_(1, 1) // assume unit variance by default +// { +// } + +// auto SetSigma(std::vector sigma) { sigma_ = std::move(sigma); } + +// auto +// operator()(Operon::RandomGenerator& /*random*/, Individual& ind, Operon::Span buf) const -> typename EvaluatorBase::ReturnType override; + +// private: +// Operon::SSE sse_; +// mutable std::vector sigma_; +// }; + +} // namespace Brush +#endif diff --git a/src/program/functions.h b/src/program/functions.h index 1d3cbd0f..e506a307 100644 --- a/src/program/functions.h +++ b/src/program/functions.h @@ -414,8 +414,21 @@ namespace Brush template<> struct Function { - template - inline auto operator()(const T1& t1, const T2& t2) { + template requires (!same_as) + inline auto operator()(const ArrayBase& t1, const ArrayBase& t2) { + return t1 && t2; + } + template requires same_as + inline auto operator()(const ArrayBase& t1, const ArrayBase& t2) { + // ArrayXb t1_bool(t1.size()); + // for (int i = 0; i< t1.size(); ++i) + // t1_bool(i) = t1(i).a; + + // ArrayXb t2_bool(t2.size()); + // for (int i = 0; i< t2.size(); ++i) + // t2_bool(i) = t2(i).a; + + // return (t1_bool || t2_bool).cast(); return t1 * t2; } }; @@ -424,20 +437,33 @@ namespace Brush template<> struct Function { - template - inline auto operator()(const T1& t1, const T2& t2) { + template requires (!same_as) + inline auto operator()(const ArrayBase& t1, const ArrayBase& t2) { + return t1 || t2; + } + template requires same_as + inline auto operator()(const ArrayBase& t1, const ArrayBase& t2) { return t1 + t2; } }; - /* logical not */ + /* logical not -- negate the input */ template<> struct Function { - template - inline auto operator()(const T& t) { return t; } - }; + template requires (!same_as) + inline auto operator()(const ArrayBase& t) { + auto trues = ArrayXb::Constant(t.size(), true); + + return t != trues; + } + template requires same_as + inline auto operator()(const ArrayBase& t) { + auto trues = ArrayXb::Constant(t.size(), true); + return (t - trues); + } + }; } // Brush #endif diff --git a/src/program/node.cpp b/src/program/node.cpp index bcaef619..77465642 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -33,7 +33,8 @@ auto Node::get_name(bool include_weight) const noexcept -> std::string } else if (Is(node_type) && include_weight) { - return fmt::format("MeanLabel({:.2f})", W); + // return fmt::format("MeanLabel({:.2f})", W); + return fmt::format("MeanLabel{:.2f}", W); } else if (is_weighted && include_weight) return fmt::format("{:.2f}*{}",W,name); @@ -52,7 +53,17 @@ string Node::get_model(const vector& children) const noexcept children.at(1) ); } - else if (Is(node_type)){ // TODO: have a better print for splitOn, based on child type + else if (Is(node_type)){ + if (arg_types.at(0) == DataType::ArrayB) + { + // booleans dont use thresholds (they are used directly as mask in split) + return fmt::format("If({},{},{})", + children.at(0), + children.at(1), + children.at(2) + ); + } + // integers or floating points (they have a threshold) return fmt::format("If({}>{:.2f},{},{})", children.at(0), W, diff --git a/src/program/nodetype.h b/src/program/nodetype.h index 6677350d..ac00ccfd 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -95,8 +95,8 @@ enum class NodeType : uint64_t { MeanLabel = 1UL << 40UL, Constant = 1UL << 41UL, Terminal = 1UL << 42UL, - ArgMax = 1UL << 43UL, - Count = 1UL << 44UL, // TODO: move before leaves + ArgMax = 1UL << 43UL, // TODO: move before leaves + Count = 1UL << 44UL, // custom CustomUnaryOp = 1UL << 44UL, diff --git a/src/program/signatures.h b/src/program/signatures.h index fa818623..91daba28 100644 --- a/src/program/signatures.h +++ b/src/program/signatures.h @@ -229,8 +229,6 @@ struct Signatures>>{ using type = std::tuple< Signature - // Signature, - // Signature // TODO: just for testing. delete later >; }; @@ -238,8 +236,6 @@ template<> struct Signatures{ using type = std::tuple< Signature - // Signature, - // Signature // TODO: just for testing. delete later >; }; @@ -369,9 +365,11 @@ struct Signatures{ Signature, Signature, Signature, + Signature, Signature, Signature, + Signature, Signature, Signature diff --git a/src/search_space.cpp b/src/search_space.cpp index 3459bd32..dd0c3b36 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -1,6 +1,5 @@ #include "search_space.h" #include "program/program.h" -#include namespace Brush{ @@ -250,7 +249,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // Now we actually start the PTC2 procedure to create the program tree /* cout << "queue size: " << queue.size() << endl; */ /* cout << "entering first while loop...\n"; */ - while ( max_arity*(queue.size()-1) + s < max_size && queue.size() > 0) + while ( queue.size() + s < max_size && queue.size() > 0) { // including the queue size in the max_size, since each element in queue // can grow up exponentially @@ -295,7 +294,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // qspot = n; if (!opt) - opt = sample_terminal(t, true); + queue.push_back(make_tuple(qspot, t, d)); n = opt.value(); diff --git a/src/search_space.h b/src/search_space.h index 820bc95c..dcf1db73 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -15,6 +15,7 @@ license: GNU/GPL v3 #include "params.h" #include #include +#include /* Defines the search space of Brush. * The search spaces consists of nodes and their accompanying probability From 0b950e0b1d35292da87d4c1e3d750bc5ac0cd324 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 11 Oct 2023 12:41:44 -0400 Subject: [PATCH 039/199] Included boolean operators in tests --- tests/cpp/test_data.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index a0bff17d..0a61e19a 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -48,10 +48,11 @@ TEST(Data, MixedVariableTypes) y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 unordered_map user_ops = { - {"Add", 1.0}, - {"Sub", 1.0}, + {"Add", 0.5}, + {"Sub", 0.5}, // a boolean operator {"And", 1.0}, + {"Or", 1.0}, // operator that takes boolean as argument {"SplitOn", 1.0} }; From e1fd0ca671b73b43241efba0492165291820bbb8 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 11 Oct 2023 12:48:07 -0400 Subject: [PATCH 040/199] Option to use nsga or ga with island divisions --- src/brush/deap_api/__init__.py | 1 - src/brush/deap_api/nsga2island.py | 5 +---- src/brush/estimator.py | 6 +++--- src/variation.h | 1 + tests/cpp/test_program.cpp | 8 ++++++++ tests/python/test_brush.py | 2 ++ 6 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/brush/deap_api/__init__.py b/src/brush/deap_api/__init__.py index 6cbb48db..b011dec5 100644 --- a/src/brush/deap_api/__init__.py +++ b/src/brush/deap_api/__init__.py @@ -1,4 +1,3 @@ from .nsga2 import nsga2 -from .ga import ga from .nsga2island import nsga2island from .utils import DeapIndividual \ No newline at end of file diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py index c976b211..d215d3f7 100644 --- a/src/brush/deap_api/nsga2island.py +++ b/src/brush/deap_api/nsga2island.py @@ -38,7 +38,6 @@ def calculate_statistics(ind): island_indexes = [((i*MU)//N_ISLANDS, ((i+1)*MU)//N_ISLANDS) for i in range(N_ISLANDS)] - print("island_indexes", island_indexes) pop = toolbox.population(n=MU) fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) @@ -125,9 +124,7 @@ def calculate_statistics(ind): pop.append(new_pop[idx_other_individual]) else: pop.append(new_pop[idx_individual]) - print(len(pop)) - for p in pop: - print(type(p), p) + record = stats.compile(pop) logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 5c810d70..a5520067 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -66,7 +66,7 @@ class BrushEstimator(BaseEstimator): initialization : {"grow", "full"}, default "grow" Strategy to create the initial population. If `full`, then every expression is created with `max_size` nodes. If `grow`, size will be uniformly distributed. - algorithm : {"nsga2island", "nsga2", "ga"}, default "nsga2island" + algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2island" Which Evolutionary Algorithm framework to use to evolve the population. validation_size : float, default 0.0 Percentage of samples to use as a hold-out partition. These samples are used @@ -169,7 +169,7 @@ def _setup_toolbox(self, data_train, data_validation): if self.algorithm=="nsga2" or self.algorithm=="nsga2island": toolbox.register("select", tools.selTournamentDCD) toolbox.register("survive", tools.selNSGA2) - elif self.algorithm=="ga": + elif self.algorithm=="ga" or self.algorithm=="gaisland": toolbox.register("select", tools.selTournament, tournsize=3) def offspring(pop, MU): return pop[-MU:] toolbox.register("survive", offspring) @@ -250,7 +250,7 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) - if self.algorithm=="nsga2island": + if self.algorithm=="nsga2island" or self.algorithm=="gaisland": self.archive_, self.logbook_ = nsga2island( self.toolbox_, self.max_gen, self.pop_size, self.n_islands, self.mig_prob, self.cx_prob, diff --git a/src/variation.h b/src/variation.h index 22d42a77..90421011 100644 --- a/src/variation.h +++ b/src/variation.h @@ -680,6 +680,7 @@ std::optional> cross(const Program& root, const Program& other) // fmt::print("other_spot : {}\n",other_spot.node->data); // swap subtrees at child_spot and other_spot + // TODO: do I need to delete the removed node? child.Tree.move_ontop(child_spot, other_spot); return child; } diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index de7084f9..1509ac02 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -31,6 +31,14 @@ TEST(Program, MakeRegressor) ); ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) ); + + // probabilities are the same + vector PRG_weights(PRG.Tree.size()); + std::transform(PRG.Tree.begin(), PRG.Tree.end(), + PRG_weights.begin(), + [](const auto& n){ return n.get_prob_change(); } + ); + ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) ); } } diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py index 1c7f759e..c9b55103 100644 --- a/tests/python/test_brush.py +++ b/tests/python/test_brush.py @@ -46,9 +46,11 @@ def regression_setup(): @pytest.mark.parametrize('setup,algorithm', [('classification_setup', 'nsga2island'), ('classification_setup', 'nsga2' ), + ('classification_setup', 'gaisland' ), ('classification_setup', 'ga' ), ('regression_setup', 'nsga2island'), ('regression_setup', 'nsga2' ), + ('regression_setup', 'gaisland' ), ('regression_setup', 'ga' )]) def test_fit(setup, algorithm, brush_args, request): """Testing common utilities related to fitting and generic brush estimator. From f747d63c1e44590f2205f966dc62c218985886e5 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 11 Oct 2023 13:05:50 -0400 Subject: [PATCH 041/199] Added comparison test of weights between cloned and original program --- tests/cpp/test_program.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index de7084f9..09728462 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -31,6 +31,26 @@ TEST(Program, MakeRegressor) ); ASSERT_TRUE( PRG.get_model("compact", true)==clone.get_model("compact", true) ); + fmt::print("Models have the same representation\n"); + + // weights didnt changed + vector PRG_weights(PRG.Tree.size()); + std::transform(PRG.Tree.begin(), PRG.Tree.end(), PRG_weights.begin(), + [&](const auto& n){ return n.get_prob_change();}); + + vector clone_weights(clone.Tree.size()); + std::transform(clone.Tree.begin(), clone.Tree.end(), clone_weights.begin(), + [&](const auto& n){ return n.get_prob_change();}); + + ASSERT_TRUE( PRG_weights.size()==clone_weights.size() ); + fmt::print("Models have the same number of node weights\n"); + + for (size_t i=0; i Date: Wed, 11 Oct 2023 13:22:05 -0400 Subject: [PATCH 042/199] Fixed error causing tests to fail when inserting Logistic in the searchspace --- src/brush/estimator.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index f5a4f2e8..ce79efe4 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -231,14 +231,13 @@ def fit(self, X, y): if self.mode=="classification": self.n_classes_ = len(np.unique(y)) - # Including necessary functions for classification programs. We'll insert - # it with zero probability, so it doesn't interfer in the user-defined - # functions, but allows the search space to create the hash and mapping to - # use the functions. - if self.n_classes_ == 2 and "Logistic" not in self.functions_.keys(): - self.functions_["Logistic"] = 0.0 - elif "Softmax" not in self.functions_.keys(): - self.functions_["Softmax"] = 0.0 + # Including necessary functions for classification programs. This + # is needed so the search space can create the hash and mapping of + # the functions. + if self.n_classes_ == 2 and "Logistic" not in self.functions_: + self.functions_["Logistic"] = 1.0 + # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. + # self.functions_["Softmax"] = 1.0 # These have a default behavior to return something meaningfull if From d70339ae310e8a366573fc39fa88a8b6f12c264b Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 11 Oct 2023 16:25:52 -0400 Subject: [PATCH 043/199] Implemented complexity. User can specify objectives --- src/bindings/bind_programs.h | 1 + src/brush/estimator.py | 63 ++++++++++++++++----------- src/program/nodetype.h | 3 +- src/program/program.h | 8 ++++ src/program/tree_node.cpp | 83 ++++++++++++++++++++++++++++++++++++ src/program/tree_node.h | 2 + 6 files changed, 135 insertions(+), 25 deletions(-) diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index f18e188e..2211aa90 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -46,6 +46,7 @@ void bind_program(py::module& m, string name) .def("get_dot_model", &T::get_dot_model, py::arg("extras")="") .def("get_weights", &T::get_weights) .def("size", &T::size, py::arg("include_weight")=true) + .def("complexity", &T::complexity) .def("depth", &T::depth) .def("cross", &T::cross, py::return_value_policy::automatic, "Performs one attempt to stochastically swap subtrees between two programs and generate a child") diff --git a/src/brush/estimator.py b/src/brush/estimator.py index ce79efe4..cd99af71 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -57,6 +57,10 @@ class BrushEstimator(BaseEstimator): A dictionary with keys naming the function set and values giving the probability of sampling them, or a list of functions which will be weighted uniformly. If empty, all available functions are included in the search space. + objectives : list[str], default ["error", "size"] + list with one or more objectives to use. Options are `"error", "size", "complexity"`. + If `"error"` is used, then it will be the mean squared error for regression, + and accuracy for classification. initialization : {"grow", "full"}, default "grow" Strategy to create the initial population. If `full`, then every expression is created with `max_size` nodes. If `grow`, size will be uniformly distributed. @@ -111,6 +115,7 @@ def __init__( mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}, functions: list[str]|dict[str,float] = {}, + objectives=["error", "size"], initialization="grow", algorithm="nsga2", random_state=None, @@ -127,6 +132,7 @@ def __init__( self.cx_prob=cx_prob self.mutation_options=mutation_options self.functions=functions + self.objectives=objectives self.initialization=initialization self.random_state=random_state self.batch_size=batch_size @@ -239,6 +245,13 @@ def fit(self, X, y): # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. # self.functions_["Softmax"] = 1.0 + # Weight of each objective (+ for maximization, - for minimization) + obj_weight = { + "error" : +1.0 if self.mode=="classification" else -1.0, + "size" : -1.0, + "complexity" : -1.0 + } + self.weights = [obj_weight[w] for w in self.objectives] # These have a default behavior to return something meaningfull if # no values are set @@ -370,23 +383,24 @@ class BrushClassifier(BrushEstimator,ClassifierMixin): def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) - # Weight of each objective (+ for maximization, - for minimization) - self.weights = (+1.0,-1.0) - + def _error(self, ind, data: _brush.Dataset): + return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0] + def _fitness_validation(self, ind, data: _brush.Dataset): # Fitness without fitting the expression, used with validation data - return ( # (accuracy, size) - (data.y==ind.prg.predict(data)).sum() / data.y.shape[0], - ind.prg.size() - ) + + ind_objectives = { + "error" : self._error(ind, data), + "size" : ind.prg.size(), + "complexity": ind.prg.complexity() + } + return [ ind_objectives[obj] for obj in self.objectives ] def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.fit(data) - return ( # (accuracy, size) - (data.y==ind.prg.predict(data)).sum() / data.y.shape[0], - ind.prg.size() - ) - + + return self._fitness_validation(ind, data) + def _make_individual(self): # C++'s PTC2-based `make_individual` will create a tree of at least # the given size. By uniformly sampling the size, we can instantiate a @@ -461,26 +475,27 @@ class BrushRegressor(BrushEstimator, RegressorMixin): def __init__(self, **kwargs): super().__init__(mode='regressor',**kwargs) - # Weight of each objective (+ for maximization, - for minimization) - self.weights = (-1.0,-1.0) - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - + def _error(self, ind, data: _brush.Dataset): MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf MSE = np.inf - return ( MSE, ind.prg.size() ) + return MSE + + def _fitness_validation(self, ind, data: _brush.Dataset): + # Fitness without fitting the expression, used with validation data + + ind_objectives = { + "error" : self._error(ind, data), + "size" : ind.prg.size(), + "complexity": ind.prg.complexity() + } + return [ ind_objectives[obj] for obj in self.objectives ] def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.fit(data) - MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) - if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf - MSE = np.inf - - return ( MSE, ind.prg.size() ) + return self._fitness_validation(ind, data) def _make_individual(self): if self.initialization not in ["grow", "full"]: diff --git a/src/program/nodetype.h b/src/program/nodetype.h index ac00ccfd..7d153ab7 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -28,7 +28,8 @@ using Brush::Data::TimeSeriesf; namespace Brush { -enum class NodeType : uint64_t { +enum class NodeType : uint64_t { // Each node type must have a complexity + // in operator_complexities@tree_node.cpp // Unary Abs = 1UL << 0UL, Acos = 1UL << 1UL, diff --git a/src/program/program.h b/src/program/program.h index 523a0b90..ec0305b9 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -89,6 +89,14 @@ template struct Program SSref = std::optional>{s}; } + /// @brief count the complexity of the program. + /// @return int complexity. + int complexity() const{ + auto head = Tree.begin(); + + return head.node->get_complexity(); + } + /// @brief count the tree size of the program, including the weights in weighted nodes. /// @param include_weight whether to include the node's weight in the count. /// @return int number of nodes. diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index 0e4dfcd3..59b4088a 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -75,4 +75,87 @@ void from_json(const json &j, tree &t) stack.push_back(subtree); } t = stack.back(); +} + +unordered_map operator_complexities = { + // Unary + {NodeType::Abs , 3}, + {NodeType::Acos , 3}, + {NodeType::Asin , 3}, + {NodeType::Atan , 3}, + {NodeType::Cos , 3}, + {NodeType::Cosh , 3}, + {NodeType::Sin , 3}, + {NodeType::Sinh , 3}, + {NodeType::Tan , 3}, + {NodeType::Tanh , 3}, + {NodeType::Ceil , 3}, + {NodeType::Floor , 3}, + {NodeType::Exp , 3}, + {NodeType::Log , 3}, + {NodeType::Logabs , 3}, + {NodeType::Log1p , 3}, + {NodeType::Sqrt , 3}, + {NodeType::Sqrtabs , 3}, + {NodeType::Square , 3}, + {NodeType::Logistic, 3}, + + // timing masks + {NodeType::Before, 2}, + {NodeType::After , 2}, + {NodeType::During, 2}, + + // Reducers + {NodeType::Min , 4}, + {NodeType::Max , 4}, + {NodeType::Mean , 4}, + {NodeType::Median, 4}, + {NodeType::Sum , 4}, + {NodeType::Prod , 4}, + + // Transformers + {NodeType::Softmax, 4}, + + // Binary + {NodeType::Add, 1}, + {NodeType::Sub, 1}, + {NodeType::Mul, 1}, + {NodeType::Div, 1}, + {NodeType::Pow, 1}, + + //split + {NodeType::SplitBest, 2}, + {NodeType::SplitOn , 2}, + + // boolean + {NodeType::And, 1}, + {NodeType::Or , 1}, + {NodeType::Not, 1}, + + // leaves + {NodeType::MeanLabel, 1}, + {NodeType::Constant , 1}, + {NodeType::Terminal , 2}, + {NodeType::ArgMax , 2}, + {NodeType::Count , 2}, + + // custom + {NodeType::CustomUnaryOp , 5}, + {NodeType::CustomBinaryOp, 5}, + {NodeType::CustomSplit , 5} +}; + +int TreeNode::get_complexity() const +{ + int node_complexity = operator_complexities.at(data.node_type); + int children_complexity = 0; + + auto child = first_child; + for(int i = 0; i < data.get_arg_count(); ++i) + { + children_complexity += child->get_complexity(); + child = child->next_sibling; + } + + return node_complexity*children_complexity; } \ No newline at end of file diff --git a/src/program/tree_node.h b/src/program/tree_node.h index 81836137..64b54149 100644 --- a/src/program/tree_node.h +++ b/src/program/tree_node.h @@ -49,6 +49,8 @@ class tree_node_ { // size: 5*4=20 bytes (on 32 bit arch), can be reduced string get_model(bool pretty=false) const; string get_tree_model(bool pretty=false, string offset="") const; + + int get_complexity() const; }; using TreeNode = class tree_node_; From 7243f06a7c4c79cc27f89db32bf4df6bd14ab7bb Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 12 Oct 2023 12:27:42 -0400 Subject: [PATCH 044/199] Fixed zero complexities --- src/program/tree_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index 59b4088a..9757eae4 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -157,5 +157,5 @@ int TreeNode::get_complexity() const child = child->next_sibling; } - return node_complexity*children_complexity; + return node_complexity*max(children_complexity, 1); } \ No newline at end of file From d079365fee4e21180b77056de371bf916b1361cf Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 12 Oct 2023 12:28:43 -0400 Subject: [PATCH 045/199] Making constant features have zero weight --- src/brush/estimator.py | 5 ++--- src/search_space.cpp | 5 +++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index cd99af71..61c262e0 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -100,7 +100,6 @@ class BrushEstimator(BaseEstimator): Holds the operators and terminals and sampling utilities to update programs. toolbox_ : deap.Toolbox The toolbox used by DEAP for EA algorithm. - """ def __init__( @@ -187,9 +186,9 @@ def _crossover(self, ind1, ind2): for i,j in [(ind1,ind2),(ind2,ind1)]: child = i.prg.cross(j.prg) if child: - offspring.append(creator.Individual(child)) + offspring.extend([creator.Individual(child)]) else: # so we'll always have two elements to unpack in `offspring` - offspring.append(None) + offspring.extend([None]) return offspring[0], offspring[1] diff --git a/src/search_space.cpp b/src/search_space.cpp index dd0c3b36..c3987712 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -28,6 +28,11 @@ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) float prob_change = std::abs(slope(data.col(0).array() , // x=variable data.col(1).array() )); // y=target + // prob_change will evaluate to nan if variance(x)==0. Features with + // zero variance should not be used (as they behave just like a constant). + if (std::isnan(prob_change)) + prob_change = 0.0; + return prob_change; } From 672b9ca834aad3fa6b835187f3840247c048a471 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 12 Oct 2023 12:29:04 -0400 Subject: [PATCH 046/199] New tests --- tests/python/test_brush.py | 58 +++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py index 3cc191ba..d3ef4108 100644 --- a/tests/python/test_brush.py +++ b/tests/python/test_brush.py @@ -16,7 +16,9 @@ def brush_args(): pop_size=20, max_size=50, max_depth=6, - mutation_options = {"point":0.25, "insert": 0.5, "delete": 0.25}, + cx_prob= 1/7, + mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, + "toggle_weight_on":1/6, "toggle_weight_off":1/6}, ) @pytest.fixture @@ -79,7 +81,61 @@ def test_predict_proba(setup, brush_args, request): assert y_prob.shape[1] >= 2, \ "every class should have its own column (even for binary clf)" +@pytest.mark.parametrize('setup,fixed_node', [ + ('classification_setup', 'Logistic'), + # ('multiclass_classification_setup', 'Softmax') + ]) +def test_fixed_nodes(setup, fixed_node, brush_args, request): + # Classification has a fixed root that should not change after mutation or crossover + + Estimator, X, y = request.getfixturevalue(setup) + + est = Estimator(**brush_args) + est.fit(X, y) # Calling fit to make it create the setup toolbox and variation functions + + for i in range(10): + # Initial population + pop = est.toolbox_.population(n=100) + pop_models = [] + for p in pop: + pop_models.append(p.prg.get_model()) + assert p.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was criated without {fixed_node} " + + f"node on root. Model was {p.ind.get_model()}") + + # Clones + clones = [est.toolbox_.Clone(p) for p in pop] + for c in clones: + assert c.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was cloned without {fixed_node} " + + f"node on root. Model was {c.ind.get_model()}") + + # Mutation + xmen = [est.toolbox_.mutate(c) for c in clones] + xmen = [x for x in xmen if x is not None] + assert len(xmen) > 0, "Mutation didn't worked for any individual" + for x in xmen: + assert x.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was mutated without {fixed_node} " + + f"node on root. Model was {x.ind.get_model()}") + # Crossover + cxmen = [] + [cxmen.extend(est.toolbox_.mate(c1, c2)) + for (c1, c2) in zip(clones[::2], clones[1::2])] + cxmen = [x for x in cxmen if x is not None] + assert len(cxmen) > 0, "Crossover didn't worked for any individual" + for cx in cxmen: + assert cx.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was crossovered without {fixed_node} " + + f"node on root. Model was {cx.ind.get_model()}") + + # Originals still the same + for p, p_original_model in zip(pop, pop_models): + assert p.prg.get_model() == p_original_model, \ + "Variation operator changed the original model." + + # def test_random_state(): # TODO: make it work # test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) From 6586934bba22416435484ea52232e40451b421c0 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 17 Oct 2023 12:02:13 -0400 Subject: [PATCH 047/199] Renamed initialization methods --- src/brush/estimator.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 957938b4..579bf96c 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -57,9 +57,10 @@ class BrushEstimator(BaseEstimator): A dictionary with keys naming the function set and values giving the probability of sampling them, or a list of functions which will be weighted uniformly. If empty, all available functions are included in the search space. - initialization : {"grow", "full"}, default "grow" - Strategy to create the initial population. If `full`, then every expression is created - with `max_size` nodes. If `grow`, size will be uniformly distributed. + initialization : {"uniform", "max_size"}, default "uniform" + Distribution of sizes on the initial population. If `max_size`, then every + expression is created with `max_size` nodes. If `uniform`, size will be + uniformly distributed between 1 and `max_size`. algorithm : {"nsga2", "ga"}, default "nsga2" Which Evolutionary Algorithm framework to use to evolve the population. validation_size : float, default 0.0 @@ -111,7 +112,7 @@ def __init__( mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}, functions: list[str]|dict[str,float] = {}, - initialization="grow", + initialization="uniform", algorithm="nsga2", random_state=None, validation_size: float = 0.0, @@ -383,16 +384,16 @@ def _make_individual(self): # the given size. By uniformly sampling the size, we can instantiate a # population with more diversity - if self.initialization not in ["grow", "full"]: + if self.initialization not in ["uniform", "max_size"]: raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'full' or 'grow'. got {self.initialization}") + f"expected 'max_size' or 'uniform'. got {self.initialization}") return creator.Individual( self.search_space_.make_classifier( - self.max_depth,(0 if self.initialization=='grow' else self.max_size)) + self.max_depth,(0 if self.initialization=='uniform' else self.max_size)) if self.n_classes_ == 2 else self.search_space_.make_multiclass_classifier( - self.max_depth, (0 if self.initialization=='grow' else self.max_size)) + self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) ) def predict_proba(self, X): @@ -474,13 +475,15 @@ def _fitness_function(self, ind, data: _brush.Dataset): return ( MSE, ind.prg.size() ) def _make_individual(self): - if self.initialization not in ["grow", "full"]: + if self.initialization not in ["uniform", "max_size"]: raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'full' or 'grow'. got {self.initialization}") + f"expected 'max_size' or 'uniform'. got {self.initialization}") - return creator.Individual( # No arguments (or zero): brush will use PARAMS passed in set_params. max_size is sampled between 1 and params['max_size'] if zero is provided + # No arguments (or zero): brush will use PARAMS passed in set_params. + # max_size is sampled between 1 and params['max_size'] if zero is provided + return creator.Individual( self.search_space_.make_regressor( - self.max_depth, (0 if self.initialization=='grow' else self.max_size)) + self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) ) # Under development From aa9655e441cf1fceb85f50b55c3b1f0720dcd46b Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 17 Oct 2023 12:25:30 -0400 Subject: [PATCH 048/199] New complexity measures --- src/program/tree_node.cpp | 76 +++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index 9757eae4..35f5fe4a 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -80,64 +80,64 @@ void from_json(const json &j, tree &t) unordered_map operator_complexities = { // Unary {NodeType::Abs , 3}, - {NodeType::Acos , 3}, - {NodeType::Asin , 3}, - {NodeType::Atan , 3}, - {NodeType::Cos , 3}, - {NodeType::Cosh , 3}, - {NodeType::Sin , 3}, - {NodeType::Sinh , 3}, - {NodeType::Tan , 3}, - {NodeType::Tanh , 3}, - {NodeType::Ceil , 3}, - {NodeType::Floor , 3}, - {NodeType::Exp , 3}, - {NodeType::Log , 3}, - {NodeType::Logabs , 3}, - {NodeType::Log1p , 3}, - {NodeType::Sqrt , 3}, - {NodeType::Sqrtabs , 3}, + {NodeType::Acos , 5}, + {NodeType::Asin , 5}, + {NodeType::Atan , 5}, + {NodeType::Cos , 5}, + {NodeType::Cosh , 5}, + {NodeType::Sin , 5}, + {NodeType::Sinh , 5}, + {NodeType::Tan , 5}, + {NodeType::Tanh , 5}, + {NodeType::Ceil , 4}, + {NodeType::Floor , 4}, + {NodeType::Exp , 4}, + {NodeType::Log , 4}, + {NodeType::Logabs , 12}, + {NodeType::Log1p , 8}, + {NodeType::Sqrt , 4}, + {NodeType::Sqrtabs , 4}, {NodeType::Square , 3}, {NodeType::Logistic, 3}, // timing masks - {NodeType::Before, 2}, - {NodeType::After , 2}, - {NodeType::During, 2}, + {NodeType::Before, 3}, + {NodeType::After , 3}, + {NodeType::During, 3}, // Reducers - {NodeType::Min , 4}, - {NodeType::Max , 4}, - {NodeType::Mean , 4}, - {NodeType::Median, 4}, - {NodeType::Sum , 4}, - {NodeType::Prod , 4}, + {NodeType::Min , 3}, + {NodeType::Max , 3}, + {NodeType::Mean , 3}, + {NodeType::Median, 3}, + {NodeType::Sum , 2}, + {NodeType::Prod , 3}, // Transformers {NodeType::Softmax, 4}, // Binary - {NodeType::Add, 1}, - {NodeType::Sub, 1}, - {NodeType::Mul, 1}, - {NodeType::Div, 1}, - {NodeType::Pow, 1}, + {NodeType::Add, 2}, + {NodeType::Sub, 2}, + {NodeType::Mul, 3}, + {NodeType::Div, 4}, + {NodeType::Pow, 5}, //split - {NodeType::SplitBest, 2}, - {NodeType::SplitOn , 2}, + {NodeType::SplitBest, 4}, + {NodeType::SplitOn , 4}, // boolean - {NodeType::And, 1}, - {NodeType::Or , 1}, - {NodeType::Not, 1}, + {NodeType::And, 2}, + {NodeType::Or , 2}, + {NodeType::Not, 2}, // leaves {NodeType::MeanLabel, 1}, {NodeType::Constant , 1}, {NodeType::Terminal , 2}, - {NodeType::ArgMax , 2}, - {NodeType::Count , 2}, + {NodeType::ArgMax , 5}, + {NodeType::Count , 3}, // custom {NodeType::CustomUnaryOp , 5}, From 06c642ce726a5cd6b0f4d3230738ef0e00227eb7 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 18 Oct 2023 15:05:03 -0400 Subject: [PATCH 049/199] Non-zero probability for terminals if they are not nan --- src/search_space.cpp | 3 +++ tests/cpp/test_search_space.cpp | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/src/search_space.cpp b/src/search_space.cpp index c3987712..f259a8c7 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -32,6 +32,9 @@ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) // zero variance should not be used (as they behave just like a constant). if (std::isnan(prob_change)) prob_change = 0.0; + else + // having a minimum feature weight if it was not set to zero + prob_change += 1e-5; return prob_change; } diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp index 6b58375b..e0da9eca 100644 --- a/tests/cpp/test_search_space.cpp +++ b/tests/cpp/test_search_space.cpp @@ -42,6 +42,9 @@ TEST(SearchSpace, Initialization) // manually calculated. last value is the avg of prev values ArrayXf expected_weights_Xf(4); // 5 elements (x3, x4, x5, c, meanLabel) expected_weights_Xf << 0.80240685, 0.19270448, 0.5994426, 0.531518, 0.531518; + + // terminals that arent constant will have a minimum value + expected_weights_Xf = expected_weights_Xf + 1e-5; auto actual_weights_f = SS.terminal_weights.at(DataType::ArrayF); Eigen::Map actual_weights_Xf(actual_weights_f.data(), actual_weights_f.size()); @@ -51,6 +54,7 @@ TEST(SearchSpace, Initialization) ArrayXf expected_weights_Xi(2); // 2 elements (x2 and c) expected_weights_Xi << 0.2736814, 0.2736814; + expected_weights_Xi = expected_weights_Xi + 1e-5; auto actual_weights_i = SS.terminal_weights.at(DataType::ArrayI); Eigen::Map actual_weights_Xi(actual_weights_i.data(), actual_weights_i.size()); @@ -60,6 +64,7 @@ TEST(SearchSpace, Initialization) ArrayXf expected_weights_Xb(2); // 2 elements (x0 and c) expected_weights_Xb << 0.8117065, 0.8117065; + expected_weights_Xb = expected_weights_Xb + 1e-5; auto actual_weights_b = SS.terminal_weights.at(DataType::ArrayB); Eigen::Map actual_weights_Xb(actual_weights_b.data(), actual_weights_b.size()); From ae55ad9e2408a22230b76f9eec6035fd5d3e8694 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 18 Oct 2023 15:05:54 -0400 Subject: [PATCH 050/199] Bug fix in crossover attempts --- src/brush/estimator.py | 28 +++++++++++++++++++--------- src/program/functions.h | 17 ++++++++++------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 46cf2c36..7d884586 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -195,21 +195,31 @@ def _crossover(self, ind1, ind2): offspring = [] for i,j in [(ind1,ind2),(ind2,ind1)]: - child = i.prg.cross(j.prg) - if child: - offspring.extend([creator.Individual(child)]) - else: # so we'll always have two elements to unpack in `offspring` - offspring.extend([None]) + attempts = 0 + child = None + while (attempts < 3 and child is None): + child = i.prg.cross(j.prg) + if child is not None: + child = creator.Individual(child) + attempts = attempts + 1 + + offspring.extend([child]) + + # so we always need to have two elements to unpack inside `offspring` return offspring[0], offspring[1] def _mutate(self, ind1): # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),) - offspring = ind1.prg.mutate() - - if offspring: - return creator.Individual(offspring) + attempts = 0 + offspring = None + while (attempts < 3 and offspring is None): + offspring = ind1.prg.mutate() + + if offspring is not None: + return creator.Individual(offspring) + attempts = attempts + 1 return None diff --git a/src/program/functions.h b/src/program/functions.h index e506a307..144c96a1 100644 --- a/src/program/functions.h +++ b/src/program/functions.h @@ -414,7 +414,7 @@ namespace Brush template<> struct Function { - template requires (!same_as) + template inline auto operator()(const ArrayBase& t1, const ArrayBase& t2) { return t1 && t2; } @@ -437,7 +437,7 @@ namespace Brush template<> struct Function { - template requires (!same_as) + template inline auto operator()(const ArrayBase& t1, const ArrayBase& t2) { return t1 || t2; } @@ -451,17 +451,20 @@ namespace Brush template<> struct Function { - template requires (!same_as) + template inline auto operator()(const ArrayBase& t) { - auto trues = ArrayXb::Constant(t.size(), true); - - return t != trues; + return !t; } template requires same_as inline auto operator()(const ArrayBase& t) { auto trues = ArrayXb::Constant(t.size(), true); - return (t - trues); + + // for (size_t i = 0; i < t.size(); ++i) { + // t.at(i).a = !t.at(i).a; + // } + + // return t; } }; } // Brush From 87ce49e048ef5887e163f2ea80e9901e9a38dc2e Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 18 Oct 2023 16:10:26 -0400 Subject: [PATCH 051/199] Added `weights_init` parameter into pybind wrapper --- src/brush/estimator.py | 5 +++ src/search_space.cpp | 79 ++++++++++++++++++++++-------------------- src/search_space.h | 10 +++--- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 7d884586..d8183b7a 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -73,6 +73,11 @@ class BrushEstimator(BaseEstimator): and accuracy for classification. algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2" Which Evolutionary Algorithm framework to use to evolve the population. + weights_init : bool, default True + Whether the search space should initialize the sampling weights of terminal nodes + based on the correlation with the output y. If `False`, then all terminal nodes + will have the same probability of 1.0. + validation_size : float, default 0.0 Percentage of samples to use as a hold-out partition. These samples are used to calculate statistics during evolution, but not used to train the models. diff --git a/src/search_space.cpp b/src/search_space.cpp index f259a8c7..460e69f4 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -42,8 +42,9 @@ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) /// @brief generate terminals from the dataset features and random constants. /// @param d a dataset +/// @param weights_init whether the terminal prob_change should be estimated from correlations with the target value /// @return a vector of nodes -vector generate_terminals(const Dataset& d) +vector generate_terminals(const Dataset& d, const bool weights_init) { vector terminals; int i = 0; @@ -64,43 +65,46 @@ vector generate_terminals(const Dataset& d) float prob_change = 1.0; // default value - // if the value can be casted to float array, we can calculate slope - if (std::holds_alternative(value)) + if (d.y.size()>0 && weights_init) { - prob_change = calc_initial_weight(std::get(value), d.y); - } - else if (std::holds_alternative(value)) - { - // for each variable we create a one-vs-all binary variable, then - // calculate slope. Final value will be the average of slopes - - auto tmp = std::get(value); - - //get number of unique values - std::map uniqueMap; - for(int i = 0; i < tmp.size(); i++) - uniqueMap[(float)tmp(i)] = true; - - ArrayXf slopes = ArrayXf::Ones(uniqueMap.size()); - int slopesIterator = 0; - for (const auto& pair : uniqueMap) + // if the value can be casted to float array, we can calculate slope + if (std::holds_alternative(value) && d.y.size()>0) { - auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast(); - - slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y); + prob_change = calc_initial_weight(std::get(value), d.y); + } + else if (std::holds_alternative(value)) + { + // for each variable we create a one-vs-all binary variable, then + // calculate slope. Final value will be the average of slopes + + auto tmp = std::get(value); + + //get number of unique values + std::map uniqueMap; + for(int i = 0; i < tmp.size(); i++) + uniqueMap[(float)tmp(i)] = true; + + ArrayXf slopes = ArrayXf::Ones(uniqueMap.size()); + int slopesIterator = 0; + for (const auto& pair : uniqueMap) + { + auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast(); + + slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y); + } + + prob_change = slopes.mean(); + } + else if (std::holds_alternative(value)) + { + auto tmp = std::get(value).template cast(); + prob_change = calc_initial_weight(tmp, d.y); + } + else + { + auto msg = fmt::format("Brush coudn't calculate the initial weight of variable {}\n",feature_name); + HANDLE_ERROR_THROW(msg); } - - prob_change = slopes.mean(); - } - else if (std::holds_alternative(value)) - { - auto tmp = std::get(value).template cast(); - prob_change = calc_initial_weight(tmp, d.y); - } - else - { - auto msg = fmt::format("Brush coudn't calculate the initial weight of variable {}\n",feature_name); - HANDLE_ERROR_THROW(msg); } n.set_prob_change( prob_change ); @@ -154,7 +158,8 @@ void SearchSpace::print() const { std::cout << fmt::format("{}\n", *this) << std::flush; } -void SearchSpace::init(const Dataset& d, const unordered_map& user_ops) +void SearchSpace::init(const Dataset& d, const unordered_map& user_ops, + bool weights_init) { // fmt::print("constructing search space...\n"); this->node_map.clear(); @@ -171,7 +176,7 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user // create nodes based on data types terminal_types = d.unique_data_types; - vector terminals = generate_terminals(d); + vector terminals = generate_terminals(d, weights_init); /* fmt::print("generate nodetype\n"); */ GenerateNodeMap(user_ops, d.unique_data_types, diff --git a/src/search_space.h b/src/search_space.h index dcf1db73..d551af21 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -46,7 +46,7 @@ using TreeIter = tree::pre_order_iterator; // enum class ProgramType: uint32_t; // template struct ProgramTypeEnum; -vector generate_terminals(const Dataset& d); +vector generate_terminals(const Dataset& d, const bool weights_init); //////////////////////////////////////////////////////////////////////////////// @@ -176,14 +176,16 @@ struct SearchSpace /// @brief Construct a search space /// @param d A dataset containing terminal definitions /// @param user_ops Optional user-provided dictionary of operators with their probability of being chosen - SearchSpace(const Dataset& d, const unordered_map& user_ops = {}){ - init(d,user_ops); + /// @param weights_init whether the terminal prob_change should be estimated from correlations with the target value + SearchSpace(const Dataset& d, const unordered_map& user_ops = {}, bool weights_init = true){ + init(d,user_ops,weights_init); } /// @brief Called by the constructor to initialize the search space /// @param d A dataset containing terminal definitions /// @param user_ops Optional user-provided dictionary of operators with their probability of being chosen - void init(const Dataset& d, const unordered_map& user_ops = {}); + /// @param weights_init whether the terminal prob_change should be estimated from correlations with the target value + void init(const Dataset& d, const unordered_map& user_ops = {}, bool weights_init = true); /// @brief check if a return type is in the node map /// @param R data type From 06caed00c4539e7bb6837789d88e3c00a34300ab Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 18 Oct 2023 17:06:35 -0400 Subject: [PATCH 052/199] Updated missing bind --- src/bindings/bind_search_space.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/bindings/bind_search_space.cpp b/src/bindings/bind_search_space.cpp index 29dc468c..4f43449a 100644 --- a/src/bindings/bind_search_space.cpp +++ b/src/bindings/bind_search_space.cpp @@ -13,12 +13,20 @@ void bind_search_space(py::module &m) // constructing it with a Dataset object, rather than initializing it as an // empty struct and then calling init() with the Dataset object. py::class_(m, "SearchSpace") - .def(py::init([](br::Data::Dataset data) - { + .def(py::init([](br::Data::Dataset data, bool weights_init=true){ SearchSpace SS; - SS.init(data); - return SS; })) - .def(py::init&>()) + SS.init(data, {}, weights_init); + return SS; + }), + py::arg("data"), + py::arg("weights_init") = true + ) + .def(py::init&, + bool>(), + py::arg("data"), + py::arg("user_ops"), + py::arg("weights_init") = true + ) .def("make_regressor", &br::SearchSpace::make_regressor) .def("make_classifier", &br::SearchSpace::make_classifier) .def("make_multiclass_classifier", &br::SearchSpace::make_multiclass_classifier) From 94daa7e881d3fc23dc1ceef9dad6c2ff2ab727d2 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 18 Oct 2023 17:06:50 -0400 Subject: [PATCH 053/199] Improved how `get_dot_model` prints `MeanLabel` --- src/program/node.cpp | 7 ++++--- src/program/program.h | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/program/node.cpp b/src/program/node.cpp index 77465642..2a60648e 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -31,10 +31,11 @@ auto Node::get_name(bool include_weight) const noexcept -> std::string { return fmt::format("{:.2f}", W); } - else if (Is(node_type) && include_weight) + else if (Is(node_type)) { - // return fmt::format("MeanLabel({:.2f})", W); - return fmt::format("MeanLabel{:.2f}", W); + if (include_weight) //explicitly print as a MeanLabel and include weight on label + return fmt::format("MeanLabel{:.2f}", W); + return fmt::format("{:.2f}", W); // Handle as if it was a constant } else if (is_weighted && include_weight) return fmt::format("{:.2f}*{}",W,name); diff --git a/src/program/program.h b/src/program/program.h index ec0305b9..b7229f61 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -396,7 +396,7 @@ template struct Program out += fmt::format("{}\n", extras); auto get_id = [](const auto& n){ - if (Is(n->data.node_type)) + if (Is(n->data.node_type)) return n->data.get_name(false); return fmt::format("{}",fmt::ptr(n)).substr(2); @@ -451,7 +451,8 @@ template struct Program // string kid_id = fmt::format("{}",fmt::ptr(kid)); // kid_id = kid_id.substr(2); - if (kid->data.get_is_weighted() && Isnt(kid->data.node_type)){ + if (kid->data.get_is_weighted() + && Isnt(kid->data.node_type)){ edge_label = fmt::format("{:.2f}",kid->data.W); } From 30718c3b2448d79908b97775d8dba649acc31913 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 18 Oct 2023 18:23:03 -0400 Subject: [PATCH 054/199] Fixed strategy to pick best solution when there's no validation split --- src/brush/estimator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index d8183b7a..51d731ee 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -316,8 +316,11 @@ def fit(self, X, y): max_vals = np.max(points, axis=0) points = (points - min_vals) / (max_vals - min_vals) + # nan means division by zero --- no solution dominates in that obj. + points = np.nan_to_num(points, nan=1.0) + # Reference should be best value each obj. can have (after normalization) - reference = np.array([1, 1]) + reference = np.array([1.0, 1.0]) # closest to the reference (smallest distance) final_ind_idx = np.argmin( np.linalg.norm(points - reference, axis=1) ) From 02181500f3d95c4a3763134a0df10d2ed744d18e Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 19 Oct 2023 11:50:07 -0400 Subject: [PATCH 055/199] Using weights_init argument properly --- src/brush/estimator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 51d731ee..3f9cd695 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -77,7 +77,6 @@ class BrushEstimator(BaseEstimator): Whether the search space should initialize the sampling weights of terminal nodes based on the correlation with the output y. If `False`, then all terminal nodes will have the same probability of 1.0. - validation_size : float, default 0.0 Percentage of samples to use as a hold-out partition. These samples are used to calculate statistics during evolution, but not used to train the models. @@ -132,6 +131,7 @@ def __init__( algorithm="nsga2", objectives=["error", "size"], random_state=None, + weights_init=True, validation_size: float = 0.0, batch_size: float = 1.0 ): @@ -151,6 +151,7 @@ def __init__( self.initialization=initialization self.random_state=random_state self.batch_size=batch_size + self.weights_init=weights_init self.validation_size=validation_size @@ -284,7 +285,7 @@ def fit(self, X, y): self.train_.set_batch_size(self.batch_size) self.validation_ = self.data_.get_validation_data() - self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) + self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) if self.algorithm=="nsga2island" or self.algorithm=="gaisland": From 050a38a16b7ca456b5446be9bdeaa07fea4cb9f8 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 19 Oct 2023 16:46:29 -0400 Subject: [PATCH 056/199] Fixed complexity not taking into account the node weights --- src/program/tree_node.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index 35f5fe4a..56036d08 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -148,14 +148,23 @@ unordered_map operator_complexities = { int TreeNode::get_complexity() const { int node_complexity = operator_complexities.at(data.node_type); - int children_complexity = 0; + int children_complexity_sum = 0; // acumulator for children complexities auto child = first_child; for(int i = 0; i < data.get_arg_count(); ++i) { - children_complexity += child->get_complexity(); + children_complexity_sum += child->get_complexity(); child = child->next_sibling; } - - return node_complexity*max(children_complexity, 1); + + // avoid multiplication by zero if the node is a terminal + children_complexity_sum = max(children_complexity_sum, 1); + + if (data.get_is_weighted()) // include the `w` and `*` if the node is weighted + return operator_complexities.at(NodeType::Mul)*( + operator_complexities.at(NodeType::Constant) + + node_complexity*(children_complexity_sum) + ); + + return node_complexity*(children_complexity_sum); } \ No newline at end of file From ec627b1d8bb52602cb5e152b3a46e705e001bd51 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 19 Oct 2023 17:42:48 -0400 Subject: [PATCH 057/199] Changed classification default metric (will work on having this as an option later) --- src/brush/estimator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 3f9cd695..441e6abd 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -13,6 +13,7 @@ from deap import algorithms, base, creator, tools # from tqdm import tqdm from types import NoneType +from sklearn.metrics import average_precision_score import _brush from .deap_api import nsga2, nsga2island, DeapIndividual # from _brush import Dataset, SearchSpace @@ -421,7 +422,8 @@ def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) def _error(self, ind, data: _brush.Dataset): - return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0] + #return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0] + return average_precision_score(data.y, ind.prg.predict(data)) def _fitness_validation(self, ind, data: _brush.Dataset): # Fitness without fitting the expression, used with validation data From 1b3bb75303ca7bd06b5ff96c6d090e40bbeef30c Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 19 Oct 2023 17:43:14 -0400 Subject: [PATCH 058/199] Avoid creating clones of programs --- src/brush/deap_api/nsga2.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index 809be792..c4ef86ed 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -67,8 +67,7 @@ def calculate_statistics(ind): # offspring = tools.selTournamentDCD(pop, len(pop)) parents = toolbox.select(pop, len(pop)) # offspring = [toolbox.clone(ind) for ind in offspring] - offspring = [] - + offspring, successfull = [], 0 for ind1, ind2 in zip(parents[::2], parents[1::2]): off1, off2 = None, None if rnd_flt() < CXPB: # either mutation or crossover @@ -77,17 +76,24 @@ def calculate_statistics(ind): off1 = toolbox.mutate(ind1) off2 = toolbox.mutate(ind2) - # Inserting parent if mutation failed - offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)]) - offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)]) - - # Evaluate (instead of evaluateValidation) to fit the weights of the offspring - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) - if (use_batch): #calculating objectives based on batch - fitnesses = toolbox.map(functools.partial(toolbox.evaluateValidation, data=batch), offspring) - - for ind, fit in zip(offspring, fitnesses): - ind.fitness.values = fit + if off1 is not None: # first we fit + successfull = successfull + 1 + # Evaluate (instead of evaluateValidation) to fit the weights of the offspring + off1.fitness.values = toolbox.evaluate(off1) + if use_batch: # Adjust fitness to the same data as parents + off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) + elif off1 is None: # Mutation failed + off1 = ind1 # just reinsert the individual in the population + offspring.extend([off1]) + + if off2 is not None: + successfull = successfull + 1 + off2.fitness.values = toolbox.evaluate(off2) + if use_batch: + off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) + elif off2 is None: + off2 = ind1 + offspring.extend([off2]) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) @@ -96,7 +102,7 @@ def calculate_statistics(ind): pop.sort(key=lambda x: x.fitness, reverse=True) record = stats.compile(pop) - logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) + logbook.record(gen=gen, evals=successfull+(len(pop) if use_batch else 0), **record) if verbosity > 0: print(logbook.stream) From 981726c2126685c3a292e9cf0398b74623370387 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 19 Oct 2023 17:43:38 -0400 Subject: [PATCH 059/199] Increased minimal probability of changing --- src/search_space.cpp | 2 +- tests/cpp/test_search_space.cpp | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/search_space.cpp b/src/search_space.cpp index 460e69f4..d7c40327 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -34,7 +34,7 @@ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) prob_change = 0.0; else // having a minimum feature weight if it was not set to zero - prob_change += 1e-5; + prob_change += 1e-1; return prob_change; } diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp index e0da9eca..c73cd2b0 100644 --- a/tests/cpp/test_search_space.cpp +++ b/tests/cpp/test_search_space.cpp @@ -5,6 +5,8 @@ TEST(SearchSpace, Initialization) { + float minimum_prob = 1e-1f; // minimum probability of changing + ArrayXf y(4); y << 3.00000, 3.59876, 7.18622, 15.19294; @@ -44,7 +46,7 @@ TEST(SearchSpace, Initialization) expected_weights_Xf << 0.80240685, 0.19270448, 0.5994426, 0.531518, 0.531518; // terminals that arent constant will have a minimum value - expected_weights_Xf = expected_weights_Xf + 1e-5; + expected_weights_Xf = expected_weights_Xf + minimum_prob; auto actual_weights_f = SS.terminal_weights.at(DataType::ArrayF); Eigen::Map actual_weights_Xf(actual_weights_f.data(), actual_weights_f.size()); @@ -54,7 +56,7 @@ TEST(SearchSpace, Initialization) ArrayXf expected_weights_Xi(2); // 2 elements (x2 and c) expected_weights_Xi << 0.2736814, 0.2736814; - expected_weights_Xi = expected_weights_Xi + 1e-5; + expected_weights_Xi = expected_weights_Xi + minimum_prob; auto actual_weights_i = SS.terminal_weights.at(DataType::ArrayI); Eigen::Map actual_weights_Xi(actual_weights_i.data(), actual_weights_i.size()); @@ -64,7 +66,7 @@ TEST(SearchSpace, Initialization) ArrayXf expected_weights_Xb(2); // 2 elements (x0 and c) expected_weights_Xb << 0.8117065, 0.8117065; - expected_weights_Xb = expected_weights_Xb + 1e-5; + expected_weights_Xb = expected_weights_Xb + minimum_prob; auto actual_weights_b = SS.terminal_weights.at(DataType::ArrayB); Eigen::Map actual_weights_Xb(actual_weights_b.data(), actual_weights_b.size()); From 825264af6a3b65e5370a7ffe3c9bbd4429dbd338 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 23 Oct 2023 17:05:05 -0400 Subject: [PATCH 060/199] Improved log. Fixed warnings in pytest. --- src/brush/deap_api/nsga2.py | 22 +++++++++++----------- src/brush/estimator.py | 16 +++++++++------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index c4ef86ed..9035c5f2 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -1,5 +1,5 @@ from deap import tools -from deap.benchmarks.tools import diversity, convergence, hypervolume +from deap.benchmarks.tools import hypervolume import numpy as np import functools @@ -18,18 +18,18 @@ def calculate_statistics(ind): stats = tools.Statistics(calculate_statistics) - stats.register("avg", np.mean, axis=0) - stats.register("med", np.median, axis=0) - stats.register("std", np.std, axis=0) - stats.register("min", np.min, axis=0) - stats.register("max", np.max, axis=0) + stats.register("avg", np.nanmean, axis=0) + stats.register("med", np.nanmedian, axis=0) + stats.register("std", np.nanstd, axis=0) + stats.register("min", np.nanmin, axis=0) + stats.register("max", np.nanmax, axis=0) logbook = tools.Logbook() - logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \ - "med (O1 train, O2 train, O1 val, O2 val)", \ - "std (O1 train, O2 train, O1 val, O2 val)", \ - "min (O1 train, O2 train, O1 val, O2 val)", \ - "max (O1 train, O2 train, O1 val, O2 val)" + logbook.header = ['gen', 'evals'] + \ + [f"{stat} {partition} O{objective}" + for stat in ['avg', 'med', 'std', 'min', 'max'] + for partition in ['train', 'val'] + for objective in toolbox.get_objectives()] pop = toolbox.population(n=MU) diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 441e6abd..c60ffcd7 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -14,6 +14,7 @@ # from tqdm import tqdm from types import NoneType from sklearn.metrics import average_precision_score +from sklearn.preprocessing import MinMaxScaler import _brush from .deap_api import nsga2, nsga2island, DeapIndividual # from _brush import Dataset, SearchSpace @@ -163,6 +164,12 @@ def _setup_toolbox(self, data_train, data_validation): # creator.create is used to "create new functions", and takes at least # 2 arguments: the name of the newly created class and a base class + # Cleaning possible previous classes that are model-dependent (clf and reg are differente) + if hasattr(creator, "FitnessMulti"): + del creator.FitnessMulti + if hasattr(creator, "Individual"): + del creator.Individual + # Minimizing/maximizing problem: negative/positive weight, respectively. # Our classification is using the error as a metric # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness @@ -191,6 +198,7 @@ def offspring(pop, MU): return pop[-MU:] toolbox.register("createRandom", self._make_individual) toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) + toolbox.register("get_objectives", lambda: self.objectives) toolbox.register("getBatch", data_train.get_batch) toolbox.register("evaluate", self._fitness_function, data=data_train) toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation) @@ -300,7 +308,6 @@ def fit(self, X, y): self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, (0.0 Date: Mon, 23 Oct 2023 17:05:23 -0400 Subject: [PATCH 061/199] Dont show `MeanLabel` in MeanLabel nodes --- src/program/node.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/program/node.cpp b/src/program/node.cpp index 2a60648e..245ef1dc 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -33,9 +33,10 @@ auto Node::get_name(bool include_weight) const noexcept -> std::string } else if (Is(node_type)) { - if (include_weight) //explicitly print as a MeanLabel and include weight on label - return fmt::format("MeanLabel{:.2f}", W); - return fmt::format("{:.2f}", W); // Handle as if it was a constant + if (include_weight) + return fmt::format("{:.2f}", W); // Handle as if it was a constant + //explicitly print as a MeanLabel and include weight on label + return fmt::format("MeanLabel({:.2f})", W); } else if (is_weighted && include_weight) return fmt::format("{:.2f}*{}",W,name); From 8c0979d58013da39688b163fba417ec77a3243a9 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 24 Oct 2023 12:30:04 -0400 Subject: [PATCH 062/199] Fixed meanlabel. Updated how we initialize weights --- src/program/program.h | 2 +- src/search_space.cpp | 8 +++++--- tests/cpp/test_search_space.cpp | 15 ++++----------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/program/program.h b/src/program/program.h index b7229f61..747e5f71 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -396,7 +396,7 @@ template struct Program out += fmt::format("{}\n", extras); auto get_id = [](const auto& n){ - if (Is(n->data.node_type)) + if (Is(n->data.node_type)) return n->data.get_name(false); return fmt::format("{}",fmt::ptr(n)).substr(2); diff --git a/src/search_space.cpp b/src/search_space.cpp index d7c40327..8131ae76 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -28,13 +28,14 @@ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) float prob_change = std::abs(slope(data.col(0).array() , // x=variable data.col(1).array() )); // y=target + // having a minimum feature weight if it was not set to zero + if (std::abs(prob_change)<1e-4) + prob_change = 1e-1; + // prob_change will evaluate to nan if variance(x)==0. Features with // zero variance should not be used (as they behave just like a constant). if (std::isnan(prob_change)) prob_change = 0.0; - else - // having a minimum feature weight if it was not set to zero - prob_change += 1e-1; return prob_change; } @@ -131,6 +132,7 @@ vector generate_terminals(const Dataset& d, const bool weights_init) return sum / count; }; + // constants for each type auto cXf = Node(NodeType::Constant, Signature{}, true, "Cf"); float floats_avg_weights = signature_avg(cXf.ret_type); cXf.set_prob_change(floats_avg_weights); diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp index c73cd2b0..8cebd9f0 100644 --- a/tests/cpp/test_search_space.cpp +++ b/tests/cpp/test_search_space.cpp @@ -27,10 +27,10 @@ TEST(SearchSpace, Initialization) // different weights to check if searchspace is initialized correctnly unordered_map user_ops = { - {"Add", 1}, - {"Sub", 1}, - {"Div", .5}, - {"Mul", 0.5} + {"Add", 1}, + {"Sub", 1}, + {"Div", .5}, + {"Mul", 0.5} }; SearchSpace SS; @@ -45,28 +45,21 @@ TEST(SearchSpace, Initialization) ArrayXf expected_weights_Xf(4); // 5 elements (x3, x4, x5, c, meanLabel) expected_weights_Xf << 0.80240685, 0.19270448, 0.5994426, 0.531518, 0.531518; - // terminals that arent constant will have a minimum value - expected_weights_Xf = expected_weights_Xf + minimum_prob; - auto actual_weights_f = SS.terminal_weights.at(DataType::ArrayF); Eigen::Map actual_weights_Xf(actual_weights_f.data(), actual_weights_f.size()); ASSERT_TRUE(expected_weights_Xf.isApprox(actual_weights_Xf)); - ArrayXf expected_weights_Xi(2); // 2 elements (x2 and c) expected_weights_Xi << 0.2736814, 0.2736814; - expected_weights_Xi = expected_weights_Xi + minimum_prob; auto actual_weights_i = SS.terminal_weights.at(DataType::ArrayI); Eigen::Map actual_weights_Xi(actual_weights_i.data(), actual_weights_i.size()); ASSERT_TRUE(expected_weights_Xi.isApprox(actual_weights_Xi)); - ArrayXf expected_weights_Xb(2); // 2 elements (x0 and c) expected_weights_Xb << 0.8117065, 0.8117065; - expected_weights_Xb = expected_weights_Xb + minimum_prob; auto actual_weights_b = SS.terminal_weights.at(DataType::ArrayB); Eigen::Map actual_weights_Xb(actual_weights_b.data(), actual_weights_b.size()); From 7c1d83fae5a176cda86dba732c53688b6c158715 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 1 Nov 2023 10:51:50 -0400 Subject: [PATCH 063/199] Stop reinserting the same expression if mutation fails --- src/brush/deap_api/nsga2.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index 9035c5f2..b99262e8 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -76,24 +76,20 @@ def calculate_statistics(ind): off1 = toolbox.mutate(ind1) off2 = toolbox.mutate(ind2) - if off1 is not None: # first we fit + if off1 is not None: # Mutation worked. first we fit, then add to offspring successfull = successfull + 1 # Evaluate (instead of evaluateValidation) to fit the weights of the offspring off1.fitness.values = toolbox.evaluate(off1) if use_batch: # Adjust fitness to the same data as parents off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) - elif off1 is None: # Mutation failed - off1 = ind1 # just reinsert the individual in the population - offspring.extend([off1]) + offspring.extend([off1]) if off2 is not None: successfull = successfull + 1 off2.fitness.values = toolbox.evaluate(off2) if use_batch: off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) - elif off2 is None: - off2 = ind1 - offspring.extend([off2]) + offspring.extend([off2]) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) From f3418967e2fcaf69ce5de602e20f2473b79eb81f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 2 Nov 2023 10:49:40 -0400 Subject: [PATCH 064/199] Stop reinserting parents into offspring. Fixed island indexes for that --- src/brush/deap_api/nsga2.py | 6 ++--- src/brush/deap_api/nsga2island.py | 43 ++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index b99262e8..bbeb9e1e 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -67,7 +67,7 @@ def calculate_statistics(ind): # offspring = tools.selTournamentDCD(pop, len(pop)) parents = toolbox.select(pop, len(pop)) # offspring = [toolbox.clone(ind) for ind in offspring] - offspring, successfull = [], 0 + offspring = [] for ind1, ind2 in zip(parents[::2], parents[1::2]): off1, off2 = None, None if rnd_flt() < CXPB: # either mutation or crossover @@ -77,7 +77,6 @@ def calculate_statistics(ind): off2 = toolbox.mutate(ind2) if off1 is not None: # Mutation worked. first we fit, then add to offspring - successfull = successfull + 1 # Evaluate (instead of evaluateValidation) to fit the weights of the offspring off1.fitness.values = toolbox.evaluate(off1) if use_batch: # Adjust fitness to the same data as parents @@ -85,7 +84,6 @@ def calculate_statistics(ind): offspring.extend([off1]) if off2 is not None: - successfull = successfull + 1 off2.fitness.values = toolbox.evaluate(off2) if use_batch: off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) @@ -98,7 +96,7 @@ def calculate_statistics(ind): pop.sort(key=lambda x: x.fitness, reverse=True) record = stats.compile(pop) - logbook.record(gen=gen, evals=successfull+(len(pop) if use_batch else 0), **record) + logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) if verbosity > 0: print(logbook.stream) diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py index d215d3f7..f43e460c 100644 --- a/src/brush/deap_api/nsga2island.py +++ b/src/brush/deap_api/nsga2island.py @@ -27,11 +27,11 @@ def calculate_statistics(ind): stats.register("max", np.max, axis=0) logbook = tools.Logbook() - logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \ - "med (O1 train, O2 train, O1 val, O2 val)", \ - "std (O1 train, O2 train, O1 val, O2 val)", \ - "min (O1 train, O2 train, O1 val, O2 val)", \ - "max (O1 train, O2 train, O1 val, O2 val)" + logbook.header = ['gen', 'evals'] + \ + [f"{stat} {partition} O{objective}" + for stat in ['avg', 'med', 'std', 'min', 'max'] + for partition in ['train', 'val'] + for objective in toolbox.get_objectives()] # Tuples with start and end indexes for each island. Number of individuals # in each island can slightly differ if N_ISLANDS is not a divisor of MU @@ -81,7 +81,9 @@ def calculate_statistics(ind): parents.extend(island_parents) offspring = [] # Will have the same size as pop + island_failed_variations = [] for (idx_start, idx_end) in island_indexes: + failed_variations = 0 for ind1, ind2 in zip(parents[idx_start:idx_end:2], parents[idx_start+1:idx_end:2] ): @@ -92,9 +94,21 @@ def calculate_statistics(ind): off1 = toolbox.mutate(ind1) off2 = toolbox.mutate(ind2) - # Inserting parent if mutation failed - offspring.extend([off1 if off1 is not None else toolbox.Clone(ind1)]) - offspring.extend([off2 if off2 is not None else toolbox.Clone(ind2)]) + if off1 is not None: + off1.fitness.values = toolbox.evaluate(off1) + if use_batch: + off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) + offspring.extend([off1]) + else: + failed_variations += 1 + + if off2 is not None: + off2.fitness.values = toolbox.evaluate(off2) + if use_batch: + off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) + offspring.extend([off2]) + else: + failed_variations += 1 # Evaluate (instead of evaluateValidation) to fit the weights of the offspring fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) @@ -107,10 +121,15 @@ def calculate_statistics(ind): # Select the next generation population new_pop = [] - for (idx_start, idx_end) in island_indexes: - island_new_pop = toolbox.survive(pop[idx_start:idx_end] \ - +offspring[idx_start:idx_end], - idx_end-idx_start) + for i, (idx_start, idx_end) in enumerate(island_indexes): + # original population combined with offspring, taking into account that variations can fail + island_new_pop = toolbox.survive( + pop[idx_start:idx_end] \ + + offspring[ + idx_start-sum(island_failed_variations[:i]):idx_end+island_failed_variations[i] + ], + idx_end-idx_start # number of selected individuals should still the same + ) new_pop.extend(island_new_pop) # Migration to fill up the islands for the next generation From 1794950b7b93082c1f336f0576ab29b66e6693bb Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 2 Nov 2023 10:52:39 -0400 Subject: [PATCH 065/199] Updated --- docs/guide/search_space.ipynb | 30 +- docs/guide/working_with_programs.ipynb | 520 +------------------------ 2 files changed, 19 insertions(+), 531 deletions(-) diff --git a/docs/guide/search_space.ipynb b/docs/guide/search_space.ipynb index 9d072354..7ab2edf4 100644 --- a/docs/guide/search_space.ipynb +++ b/docs/guide/search_space.ipynb @@ -29,11 +29,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "b667948a", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -61,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "23d6f552", "metadata": {}, "outputs": [], @@ -95,28 +93,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "a2953719", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Search Space\n", - "===\n", - "terminal_map: {ArrayI: [x_5, x_7], ArrayF: [x_0, x_1, x_2, x_3, x_4, x_6]}\n", - "terminal_weights: {ArrayI: [1, 1], ArrayF: [1, 1, 1, 1, 1, 1]}\n", - "node_map[ArrayI][[\"ArrayI\", \"ArrayI\"]][SplitBest] = SplitBest[>0.000], weight = 0.2\n", - "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][SplitBest] = SplitBest[>0.000], weight = 0.2\n", - "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Div] = Div, weight = 0.1\n", - "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Mul] = Mul, weight = 1\n", - "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Sub] = Sub, weight = 0.5\n", - "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Add] = Add, weight = 0.5\n", - "===\n" - ] - } - ], + "outputs": [], "source": [ "search_space.print()" ] diff --git a/docs/guide/working_with_programs.ipynb b/docs/guide/working_with_programs.ipynb index f48be082..a73b7b15 100644 --- a/docs/guide/working_with_programs.ipynb +++ b/docs/guide/working_with_programs.ipynb @@ -64,10 +64,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "102e3fcb", "metadata": { - "scrolled": true, "tags": [ "remove-output" ] @@ -87,32 +86,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "ac39c9ca", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/bill/mambaforge/envs/brush/lib/python3.11/site-packages/deap/tools/emo.py:139: RuntimeWarning: invalid value encountered in scalar divide\n", - " distances[cur[1]] += (next[0][i] - prev[0][i]) / norm\n", - "/home/bill/mambaforge/envs/brush/lib/python3.11/site-packages/deap/tools/emo.py:139: RuntimeWarning: invalid value encountered in scalar subtract\n", - " distances[cur[1]] += (next[0][i] - prev[0][i]) / norm\n", - "/home/bill/projects/brush/src/brush/estimator.py:251: RuntimeWarning: overflow encountered in square\n", - " np.sum((data.y- ind.prg.predict(data))**2),\n", - "/home/bill/mambaforge/envs/brush/lib/python3.11/site-packages/numpy/core/fromnumeric.py:86: RuntimeWarning: overflow encountered in reduce\n", - " return ufunc.reduce(obj, axis, dtype, out, **passkwargs)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "best model: Add(10.57*x6,If(x0>0.75,Add(8.50*x6,If(x0>0.81,26.02,Add(-9.31*x4,127.74*x0))),Add(Add(13.60*x4,0.11*x2),-0.09*x1)))\n" - ] - } - ], + "outputs": [], "source": [ "# import and make a regressor\n", "est = BrushRegressor(\n", @@ -138,18 +115,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "316964d5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Add(10.57*x6,If(x0>0.75,Add(8.50*x6,If(x0>0.81,26.02,Add(-9.31*x4,127.74*x0))),Add(Add(13.60*x4,0.11*x2),-0.09*x1)))\n" - ] - } - ], + "outputs": [], "source": [ "print(est.best_estimator_.get_model())" ] @@ -166,32 +135,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "dad68d01", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Add\n", - "|-10.57*x6\n", - "|-SplitBest\n", - "| |-Add\n", - "| |-8.50*x6\n", - "| |-SplitBest\n", - "| | |-26.02\n", - "| | |-Add\n", - "| | | |--9.31*x4\n", - "| | | |-127.74*x0\n", - "| |-Add\n", - "| | |-Add\n", - "| | |-13.60*x4\n", - "| | |-0.11*x2\n", - "| | |--0.09*x1\n" - ] - } - ], + "outputs": [], "source": [ "print(est.best_estimator_.get_model(\"tree\"))" ] @@ -209,211 +156,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "3ef1a735", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "G\n", - "\n", - "\n", - "\n", - "5625d27dfb10\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "x6\n", - "\n", - "x6\n", - "\n", - "\n", - "\n", - "5625d27dfb10->x6\n", - "\n", - "\n", - "10.57\n", - "\n", - "\n", - "\n", - "5625d1de0610\n", - "\n", - "x0>0.75?\n", - "\n", - "\n", - "\n", - "5625d27dfb10->5625d1de0610\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "5625d3c02820\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d1de0610->5625d3c02820\n", - "\n", - "\n", - "Y\n", - "\n", - "\n", - "\n", - "5625d1ddc200\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d1de0610->5625d1ddc200\n", - "\n", - "\n", - "N\n", - "\n", - "\n", - "\n", - "5625d3c02820->x6\n", - "\n", - "\n", - "8.50\n", - "\n", - "\n", - "\n", - "5625d27cb4a0\n", - "\n", - "x0>0.81?\n", - "\n", - "\n", - "\n", - "5625d3c02820->5625d27cb4a0\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "5625d1d1bc10\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d1ddc200->5625d1d1bc10\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "x1\n", - "\n", - "x1\n", - "\n", - "\n", - "\n", - "5625d1ddc200->x1\n", - "\n", - "\n", - "-0.09\n", - "\n", - "\n", - "\n", - "5625d1c92dc0\n", - "\n", - "26.02\n", - "\n", - "\n", - "\n", - "5625d27cb4a0->5625d1c92dc0\n", - "\n", - "\n", - "Y\n", - "\n", - "\n", - "\n", - "5625d2515750\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d27cb4a0->5625d2515750\n", - "\n", - "\n", - "N\n", - "\n", - "\n", - "\n", - "x4\n", - "\n", - "x4\n", - "\n", - "\n", - "\n", - "5625d2515750->x4\n", - "\n", - "\n", - "-9.31\n", - "\n", - "\n", - "\n", - "x0\n", - "\n", - "x0\n", - "\n", - "\n", - "\n", - "5625d2515750->x0\n", - "\n", - "\n", - "127.74\n", - "\n", - "\n", - "\n", - "5625d1d1bc10->x4\n", - "\n", - "\n", - "13.60\n", - "\n", - "\n", - "\n", - "x2\n", - "\n", - "x2\n", - "\n", - "\n", - "\n", - "5625d1d1bc10->x2\n", - "\n", - "\n", - "0.11\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": {}, + "outputs": [], "source": [ "import graphviz\n", "\n", @@ -431,49 +177,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "1f7e725e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "digraph G {\n", - "\"5625d27dfb10\" [label=\"Add\"];\n", - "\"5625d27dfb10\" -> \"x6\" [label=\"10.57\"];\n", - "\"5625d27dfb10\" -> \"5625d1de0610\" [label=\"\"];\n", - "\"x6\" [label=\"x6\"];\n", - "\"5625d1de0610\" [label=\"x0>0.75?\"];\n", - "\"5625d1de0610\" -> \"5625d3c02820\" [headlabel=\"\",taillabel=\"Y\"];\n", - "\"5625d1de0610\" -> \"5625d1ddc200\" [headlabel=\"\",taillabel=\"N\"];\n", - "\"5625d3c02820\" [label=\"Add\"];\n", - "\"5625d3c02820\" -> \"x6\" [label=\"8.50\"];\n", - "\"5625d3c02820\" -> \"5625d27cb4a0\" [label=\"\"];\n", - "\"x6\" [label=\"x6\"];\n", - "\"5625d27cb4a0\" [label=\"x0>0.81?\"];\n", - "\"5625d27cb4a0\" -> \"5625d1c92dc0\" [headlabel=\"\",taillabel=\"Y\"];\n", - "\"5625d27cb4a0\" -> \"5625d2515750\" [headlabel=\"\",taillabel=\"N\"];\n", - "\"5625d1c92dc0\" [label=\"26.02\"];\n", - "\"5625d2515750\" [label=\"Add\"];\n", - "\"5625d2515750\" -> \"x4\" [label=\"-9.31\"];\n", - "\"5625d2515750\" -> \"x0\" [label=\"127.74\"];\n", - "\"x4\" [label=\"x4\"];\n", - "\"x0\" [label=\"x0\"];\n", - "\"5625d1ddc200\" [label=\"Add\"];\n", - "\"5625d1ddc200\" -> \"5625d1d1bc10\" [label=\"\"];\n", - "\"5625d1ddc200\" -> \"x1\" [label=\"-0.09\"];\n", - "\"5625d1d1bc10\" [label=\"Add\"];\n", - "\"5625d1d1bc10\" -> \"x4\" [label=\"13.60\"];\n", - "\"5625d1d1bc10\" -> \"x2\" [label=\"0.11\"];\n", - "\"x4\" [label=\"x4\"];\n", - "\"x2\" [label=\"x2\"];\n", - "\"x1\" [label=\"x1\"];\n", - "}\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "print(model)" ] @@ -493,209 +200,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "f35b1e05", "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "G\n", - "\n", - "\n", - "\n", - "5625d27dfb10\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "x6\n", - "\n", - "x6\n", - "\n", - "\n", - "\n", - "5625d27dfb10->x6\n", - "\n", - "\n", - "10.57\n", - "\n", - "\n", - "\n", - "5625d1de0610\n", - "\n", - "x0>0.75?\n", - "\n", - "\n", - "\n", - "5625d27dfb10->5625d1de0610\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "5625d3c02820\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d1de0610->5625d3c02820\n", - "\n", - "\n", - "Y\n", - "\n", - "\n", - "\n", - "5625d1ddc200\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d1de0610->5625d1ddc200\n", - "\n", - "\n", - "N\n", - "\n", - "\n", - "\n", - "5625d3c02820->x6\n", - "\n", - "\n", - "8.50\n", - "\n", - "\n", - "\n", - "5625d27cb4a0\n", - "\n", - "x0>0.81?\n", - "\n", - "\n", - "\n", - "5625d3c02820->5625d27cb4a0\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "5625d1d1bc10\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d1ddc200->5625d1d1bc10\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "x1\n", - "\n", - "x1\n", - "\n", - "\n", - "\n", - "5625d1ddc200->x1\n", - "\n", - "\n", - "-0.09\n", - "\n", - "\n", - "\n", - "5625d1c92dc0\n", - "\n", - "26.02\n", - "\n", - "\n", - "\n", - "5625d27cb4a0->5625d1c92dc0\n", - "\n", - "\n", - "Y\n", - "\n", - "\n", - "\n", - "5625d2515750\n", - "\n", - "Add\n", - "\n", - "\n", - "\n", - "5625d27cb4a0->5625d2515750\n", - "\n", - "\n", - "N\n", - "\n", - "\n", - "\n", - "x4\n", - "\n", - "x4\n", - "\n", - "\n", - "\n", - "5625d2515750->x4\n", - "\n", - "\n", - "-9.31\n", - "\n", - "\n", - "\n", - "x0\n", - "\n", - "x0\n", - "\n", - "\n", - "\n", - "5625d2515750->x0\n", - "\n", - "\n", - "127.74\n", - "\n", - "\n", - "\n", - "5625d1d1bc10->x4\n", - "\n", - "\n", - "13.60\n", - "\n", - "\n", - "\n", - "x2\n", - "\n", - "x2\n", - "\n", - "\n", - "\n", - "5625d1d1bc10->x2\n", - "\n", - "\n", - "0.11\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model = est.best_estimator_.get_dot_model(\"rankdir=LR;\")\n", "graphviz.Source(model)" From 69cc07ea154797edf876db22873b70abfe33887f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 2 Nov 2023 10:57:31 -0400 Subject: [PATCH 066/199] Fixed not inserting value into list of failed variations --- src/brush/deap_api/nsga2island.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py index f43e460c..95de8197 100644 --- a/src/brush/deap_api/nsga2island.py +++ b/src/brush/deap_api/nsga2island.py @@ -109,6 +109,7 @@ def calculate_statistics(ind): offspring.extend([off2]) else: failed_variations += 1 + island_failed_variations.append(failed_variations) # Evaluate (instead of evaluateValidation) to fit the weights of the offspring fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) @@ -156,4 +157,4 @@ def calculate_statistics(ind): archive = tools.ParetoFront() archive.update(pop) - return archive, logbook \ No newline at end of file + return archive, logbook From 5510c860d95415ca003d8a40bfa8e7e191eb8536 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 2 Nov 2023 17:07:33 -0400 Subject: [PATCH 067/199] Basic files to implement island model --- src/bindings/bind_cbrush.cpp | 16 ++++ src/brush/pybrush.py | 42 +++++++++ src/brushGA.h | 69 --------------- src/{brushGA.cpp => cbrush.cpp} | 4 +- src/cbrush.h | 139 ++++++++++++++++++++++++++++++ src/params.h | 17 +++- src/population.cpp | 132 ++++++++++++++++++++++++++++ src/population.h | 95 ++++++++++++++++++++ src/selection.h | 17 ---- src/selection/selection.cpp | 83 ++++++++++++++++++ src/selection/selection.h | 65 ++++++++++++++ tests/cpp/test_evolution_step.cpp | 0 tests/cpp/test_population.cpp | 0 tests/cpp/test_selection.cpp | 0 14 files changed, 590 insertions(+), 89 deletions(-) create mode 100644 src/bindings/bind_cbrush.cpp create mode 100644 src/brush/pybrush.py delete mode 100644 src/brushGA.h rename src/{brushGA.cpp => cbrush.cpp} (70%) create mode 100644 src/cbrush.h create mode 100644 src/population.cpp create mode 100644 src/population.h delete mode 100644 src/selection.h create mode 100644 src/selection/selection.cpp create mode 100644 src/selection/selection.h create mode 100644 tests/cpp/test_evolution_step.cpp create mode 100644 tests/cpp/test_population.cpp create mode 100644 tests/cpp/test_selection.cpp diff --git a/src/bindings/bind_cbrush.cpp b/src/bindings/bind_cbrush.cpp new file mode 100644 index 00000000..89185a9c --- /dev/null +++ b/src/bindings/bind_cbrush.cpp @@ -0,0 +1,16 @@ +#include "module.h" +#include "../cbrush.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +using namespace Brush; + +void bind_cbrush(py::module& m) +{ + py::class_(m, "CBrush", py::dynamic_attr()) + .def(py::init([]() + { CBrush est; return est; })) + ; +} \ No newline at end of file diff --git a/src/brush/pybrush.py b/src/brush/pybrush.py new file mode 100644 index 00000000..801d2d1d --- /dev/null +++ b/src/brush/pybrush.py @@ -0,0 +1,42 @@ + +from _brush import CBrush +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin + + +class PybrushEstimator(BaseEstimator): + def __init__(self): + self.cbrush_ = CBrush() + + def fit(self, X, y, Z=None): + pass + + def predict(self,X,Z=None): + pass + + def transform(self,X,Z=None): + pass + + def fit_predict(self,X,y,Z=None): + pass + + def fit_transform(self,X,y,Z=None): + pass + + def score(self,X,y,Z=None): + pass + + +class PybrushRegressor(PybrushEstimator): + def __init__(self,**kwargs): + pass + + +class PybrushClassifier(PybrushEstimator): + def __init__(self,**kwargs): + pass + + def predict(self,X,Z=None): + pass + + def predict_proba(self,X,Z=None): + pass \ No newline at end of file diff --git a/src/brushGA.h b/src/brushGA.h deleted file mode 100644 index 02d0c536..00000000 --- a/src/brushGA.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Brush -copyright 2020 William La Cava -license: GNU/GPL v3 -*/ - -#ifndef BrushGA_H -#define BrushGA_H - -#include "init.h" -#include "taskflow/taskflow.hpp" - -// TODO: improve the includes (why does this lines below does not work?) -// #include "variation.h" -// #include "selection.h" - -// using namespace selection; -// using namespace variation; - -namespace Brush -{ - -class BrushGA{ -public: - - BrushGA(){} - /// destructor - ~BrushGA(){} - - void init(); - - //getters and setters for GA configuration. - // getters and setters for the best solution found after evolution - // predict, transform, predict_proba, etc. - // get statistics - // load and save best individuals - // logger, save to file - // execution archive - // random state control - // score functions - // fit methods (this will run the evolution), run a single generation -private: - // attributes (hyperparameters) - // update best - // calculate/print stats -}; - -int main(){ - - tf::Executor executor; - tf::Taskflow taskflow; - - auto [A, B, C, D] = taskflow.emplace( // create four tasks - [] () { std::cout << "TaskA\n"; }, - [] () { std::cout << "TaskB\n"; }, - [] () { std::cout << "TaskC\n"; }, - [] () { std::cout << "TaskD\n"; } - ); - - A.precede(B, C); // A runs before B and C - D.succeed(B, C); // D runs after B and C - - executor.run(taskflow).wait(); - - return 0; -} - -} // Brush - -#endif diff --git a/src/brushGA.cpp b/src/cbrush.cpp similarity index 70% rename from src/brushGA.cpp rename to src/cbrush.cpp index 1b16a4d7..c634473c 100644 --- a/src/brushGA.cpp +++ b/src/cbrush.cpp @@ -1,11 +1,11 @@ -#include "brushGA.h" +#include "cbrush.h" #include using namespace Brush; /// @brief initialize Feat object for fitting. -void BrushGA::init() +void CBrush::init() { } diff --git a/src/cbrush.h b/src/cbrush.h new file mode 100644 index 00000000..9e190aae --- /dev/null +++ b/src/cbrush.h @@ -0,0 +1,139 @@ +/* Brush +copyright 2020 William La Cava +license: GNU/GPL v3 +*/ + +#ifndef CBrush_H +#define CBrush_H + +#include "init.h" +#include "params.h" +#include "selection/selection.h" +#include "population.h" +#include "taskflow/taskflow.hpp" + +// TODO: improve the includes (why does this lines below does not work?) +// #include "variation.h" +// #include "selection.h" + +// using namespace selection; +// using namespace variation; + +namespace Brush +{ + +class CBrush{ +public: + CBrush(){}; + ~CBrush(){}; + void init(); + + //getters and setters for GA configuration --------------------------------- + /// set flag indicating whether fit has been called + inline void set_is_fitted(bool f){is_fitted=f;} + inline bool get_is_fitted(){return is_fitted;} + + /// set size of population + void set_pop_size(int pop_size); + /// return population size + int get_pop_size(); + + /// set size of max generations + void set_gens(int gens); + ///return size of max generations + int get_gens(); + + /// set EProblemType for shogun + void set_classification(bool classification); + ///return type of classification flag set + bool get_classification(); + + /// set selection method + void set_selection(string sel); + string get_selection(); + + /// set survivability + void set_survival(string surv); + string get_survival(); + + ///return cross rate for variation + float get_cross_rate(); + /// set cross rate in variation + void set_cross_rate(float cross_rate); + + /// sets available functions based on comma-separated list. + // void set_functions(const vector& fns){ params.set_functions(fns); }; + // vector get_functions(){return params.get_functions();}; + + ///return max_depth of programs + int get_max_depth(); + /// set max depth of programs + void set_max_depth(unsigned int max_depth); + + ///return max dimensionality of programs + int get_max_size(); + /// set maximum sizeensionality of programs + void set_max_size(unsigned int max_dim); + + /// set seeds for each core's random number generator + // void set_random_state(int random_state); + // int get_random_state() { return params.random_state; }; + // /// returns the actual seed determined by the input argument. + // int get_random_state_() { return r.get_seed(); }; + + ///return fraction of data to use for training + float get_split(); + /// set train fraction of dataset + void set_split(float sp); + + // int get_batch_size(){return params.bp.batch_size;}; + // void set_batch_size(int bs); + + ///set number of threads + // void set_n_jobs(unsigned t); + // int get_n_jobs(){return omp_get_num_threads();}; + + ///set flag to use batch for training + void set_use_batch(); + + // getters and setters for the best solution found after evolution + // predict, transform, predict_proba, etc. + // get statistics + // load and save best individuals + // logger, save to file + // execution archive + // random state control + // score functions + // fit methods (this will run the evolution), run a single generation + + bool is_fitted; ///< keeps track of whether fit was called. +private: + Parameters params; ///< hyperparameters of Feat + // attributes (hyperparameters) + // update best + // calculate/print stats +}; + +int main(){ + + tf::Executor executor; + tf::Taskflow taskflow; + + auto [A, B, C, D] = taskflow.emplace( // create four tasks + [] () { std::cout << "TaskA\n"; }, + [] () { std::cout << "TaskB\n"; }, + [] () { std::cout << "TaskC\n"; }, + [] () { std::cout << "TaskD\n"; } + ); + + A.precede(B, C); // A runs before B and C + D.succeed(B, C); // D runs after B and C + + executor.run(taskflow).wait(); + + return 0; +} + +} // Brush + +#endif diff --git a/src/params.h b/src/params.h index eeed65f0..d42a26c6 100644 --- a/src/params.h +++ b/src/params.h @@ -5,11 +5,26 @@ license: GNU/GPL v3 #ifndef PARAMS_H #define PARAMS_H + #include "init.h" + namespace ns = nlohmann; namespace Brush { - extern ns::json PARAMS; + +struct Parameters +{ + int pop_size = 100; ///< population size + int gens = 100; ///< max generations + + + Parameters(); + ~Parameters(){}; + + void init(); +}; + + extern ns::json PARAMS; void set_params(const ns::json& j); ns::json get_params(); } // Brush diff --git a/src/population.cpp b/src/population.cpp new file mode 100644 index 00000000..1472e438 --- /dev/null +++ b/src/population.cpp @@ -0,0 +1,132 @@ +/* FEAT +copyright 2017 William La Cava +license: GNU/GPL v3 +*/ + +#include "population.h" + +namespace FT{ +namespace Pop{ + +int last; + +Population::Population(int p) +{ + individuals.resize(p); +} + +Population::~Population(){} + +/// update individual vector size +void Population::resize(int pop_size) +{ + individuals.resize(pop_size); +} + +/// returns population size +int Population::size(){ return individuals.size(); } + +const Program Population::operator [](size_t i) const {return individuals.at(i);} +const Program& Population::operator [](size_t i) {return individuals.at(i);} + +void Population::init(const Program& starting_model, + const Parameters& params, + const SearchSpace& ss + ) +{ + + #pragma omp parallel for + for (int i = 0; i< individuals.size(); ++i) + { + individuals.at(i).initialize(params, random, i); + } +} + +void Population::update(vector survivors) +{ + + /*! + * cull population down to survivor indices. + */ + vector pop_idx(individuals.size()); + std::iota(pop_idx.begin(),pop_idx.end(),0); + std::reverse(pop_idx.begin(),pop_idx.end()); + for (const auto& i : pop_idx) + if (!in(survivors,i)) + individuals.erase(individuals.begin()+i); + +} + + +void Population::add(Program& ind) +{ + individuals.push_back(ind); +} + +string Population::print_eqns(bool just_offspring, string sep) +{ + string output = ""; + int start = 0; + + if (just_offspring) + start = individuals.size()/2; + + for (unsigned int i=start; i< individuals.size(); ++i) + output += individuals.at(i).get_eqn() + sep; + + return output; +} + +vector Population::sorted_front(unsigned rank=1) +{ + /* Returns individuals on the Pareto front, sorted by increasign complexity. */ + vector pf; + for (unsigned int i =0; i> line; + + json j = json::parse(line); + from_json(j, *this); + + logger.log("Loaded population from " + filename + " of size = " + + to_string(this->size()),1); + + indata.close(); +} + + +} // Pop +} // FT diff --git a/src/population.h b/src/population.h new file mode 100644 index 00000000..d7b3c717 --- /dev/null +++ b/src/population.h @@ -0,0 +1,95 @@ +#ifndef POPULATION_H +#define POPULATION_H + +#include "program/program.h" +#include "search_space.h" + +using std::vector; +using std::string; +using Eigen::Map; + +namespace Brush +{ +namespace Pop{ + +////////////////////////////////////////////////////////////////// Declarations +extern int last; +/*! + * @class Population + * @brief Defines a population of programs and functions for constructing them. + */ +template +struct Population +{ + vector*> individuals; ///< individual programs + + Population(int p = 0); + + ~Population(); + + /// initialize population of programs with a starting model and/or from file + void init(const Program& starting_model, + const Parameters& params, + const SearchSpace& ss + ); + + /// update individual vector size + void resize(int pop_size); + + /// reduce programs to the indices in survivors. + void update(vector survivors); + + /// returns population size + int size(); + + /// adds a program to the population. + + void add(Program&); + + /// setting and getting from individuals vector + const Program operator [](size_t i) const; + const Program& operator [](size_t i); + + /// return population equations. + string print_eqns(bool just_offspring=false, string sep="\n"); + + /// return complexity-sorted Pareto front indices. + vector sorted_front(unsigned); + + /// Sort population in increasing complexity. + struct SortComplexity + { + Population& pop; + SortComplexity(Population& p): pop(p){} + bool operator()(size_t i, size_t j) + { + return pop.individuals[i].set_complexity() < pop.individuals[j].set_complexity(); + } + }; + + /// check for same fitness and complexity to filter uniqueness. + struct SameFitComplexity + { + Population & pop; + SameFitComplexity(Population& p): pop(p){} + bool operator()(size_t i, size_t j) + { + return (pop.individuals[i].fitness == pop.individuals[j].fitness && + pop.individuals[i].set_complexity() == pop.individuals[j].set_complexity()); + } + }; + + // save serialized population + void save(string filename); + // load serialized population + void load(string filename); +}; + +// //TODO +// /* void from_json(const json& j, Population& p); */ +// /* void to_json(json& j, const Population& p); */ +// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population, individuals); +}//Pop + +}//FT +#endif diff --git a/src/selection.h b/src/selection.h deleted file mode 100644 index 3ce1f849..00000000 --- a/src/selection.h +++ /dev/null @@ -1,17 +0,0 @@ -/* Brush -copyright 2020 William La Cava -license: GNU/GPL v3 -*/ - -#ifndef SELECTION_H -#define SELECTION_H - -namespace selection { - -class SelectorBase { -public: -private: -}; - -} // selection -#endif \ No newline at end of file diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp new file mode 100644 index 00000000..44f36ae5 --- /dev/null +++ b/src/selection/selection.cpp @@ -0,0 +1,83 @@ +#include "selection.h" + +namespace selection { + +Selection::Selection() +{ + /*! + * set type of selection operator. + */ + this->type = "lexicase"; + this->survival = false; + this->set_operator(); +} + +Selection::Selection(string type, bool survival) +{ + /*! + * set type of selection operator. + */ + this->type = type; + this->survival = survival; + this->set_operator(); +} + +void Selection::set_operator() +{ + // if (this->type == "lexicase") + // pselector = std::make_shared(survival); + // else if (this->type == "fair_lexicase") + // pselector = std::make_shared(survival); + // else if (this->type == "pareto_lexicase") + // pselector = std::make_shared(survival); + // else if (this->type == "nsga2") + // pselector = std::make_shared(survival); + // else if (this->type == "tournament") + // pselector = std::make_shared(survival); + // else if (this->type == "offspring") // offspring survival + // pselector = std::make_shared(survival); + // else if (this->type == "random") // offspring survival + // pselector = std::make_shared(survival); + // else if (this->type == "simanneal") // offspring survival + // pselector = std::make_shared(survival); + // else + // WARN("Undefined Selection Operator " + this->type + "\n"); + +} + +Selection::~Selection(){} + +/// return type of selectionoperator +string Selection::get_type(){ return pselector->name; } + +/// set type of selectionoperator +void Selection::set_type(string in){ type = in; set_operator();} + +/// perform selection +vector Selection::select() +{ + return pselector->select(pop, params, d); +} + +/// perform survival +vector Selection::survive( +{ + return pselector->survive(pop, params, d); +} + + +SelectionOperator::~SelectionOperator(){} + +vector SelectionOperator::select() +{ + THROW_INVALID_ARGUMENT("Undefined select() operation"); + return vector(); +} + +vector SelectionOperator::survive() +{ + THROW_INVALID_ARGUMENT("Undefined select() operation"); + return vector(); +} + +} // selection \ No newline at end of file diff --git a/src/selection/selection.h b/src/selection/selection.h new file mode 100644 index 00000000..18bd0506 --- /dev/null +++ b/src/selection/selection.h @@ -0,0 +1,65 @@ +/* Brush +copyright 2020 William La Cava +license: GNU/GPL v3 +*/ + +#ifndef SELECTION_H +#define SELECTION_H + +#include "../init.h" +#include "../params.h" +#include "../population.h" + +namespace selection { + +/*! + * @class SelectionOperator + * @brief base class for selection operators. + */ +struct SelectionOperator +{ + bool survival; + string name; + + //SelectionOperator(){} + + virtual ~SelectionOperator(); + + virtual vector select(); + + virtual vector survive(); +}; + +struct Parameters; // forward declaration of Parameters + +/*! +* @class Selection +* @brief interfaces with selection operators. +*/ +struct Selection +{ + shared_ptr pselector; + string type; + bool survival; + + Selection(); + ~Selection(); + Selection(string type, bool survival); + + void set_operator(); + + /// return type of selectionoperator + string get_type(); + void set_type(string); + + /// perform selection + vector select(); + + /// perform survival + vector survive(); +}; + +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Selection, type, survival); + +} // selection +#endif \ No newline at end of file diff --git a/tests/cpp/test_evolution_step.cpp b/tests/cpp/test_evolution_step.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/cpp/test_selection.cpp b/tests/cpp/test_selection.cpp new file mode 100644 index 00000000..e69de29b From 6556d767a27fc58bb88db4b62667f8b04ed7a3d1 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 3 Nov 2023 14:40:00 -0400 Subject: [PATCH 068/199] Individuals. Now program compiles again --- src/bindings/bind_individuals.cpp | 0 src/individual.cpp | 33 ++++++++ src/individual.h | 24 ++++++ src/params.cpp | 11 ++- src/params.h | 48 ++++++++++-- src/population.cpp | 123 ++---------------------------- src/population.h | 91 ++++------------------ src/selection/selection.cpp | 55 +------------ src/selection/selection.h | 13 ---- src/util/error.h | 1 + 10 files changed, 128 insertions(+), 271 deletions(-) create mode 100644 src/bindings/bind_individuals.cpp create mode 100644 src/individual.cpp create mode 100644 src/individual.h diff --git a/src/bindings/bind_individuals.cpp b/src/bindings/bind_individuals.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/individual.cpp b/src/individual.cpp new file mode 100644 index 00000000..68d3709a --- /dev/null +++ b/src/individual.cpp @@ -0,0 +1,33 @@ +/* FEAT +copyright 2017 William La Cava +license: GNU/GPL v3 +*/ + +#include "individual.h" + +namespace Brush{ +namespace Pop{ + +template +Individual::Individual(Program Prog) +{ + program = Prog; + + // TODO: calculate this stuff + complexity = -1; + fitness = -1; + fitness_v = -1; + fairness = -1; + fairness_v = -1; + dcounter=-1; + crowd_dist = -1; +} + + +// void Individual::initialize(const Parameters& params, bool random, int id) +// { + +// } + +} // Pop +} // FT diff --git a/src/individual.h b/src/individual.h new file mode 100644 index 00000000..b8f95036 --- /dev/null +++ b/src/individual.h @@ -0,0 +1,24 @@ +#ifndef INDIVIDUAL_H +#define INDIVIDUAL_H + +#include "program/program.h" + +namespace Brush{ +namespace Pop{ + +template +class Individual{ +public: + Program program; ///< executable data structure + + Individual(Program Prog); + + // fitness, objetives, complexity, etc + // setters and getters + // wrappers (fit, predict). This class should also have its own cpp wrapper +}; + +} // Pop +} // Brush + +#endif diff --git a/src/params.cpp b/src/params.cpp index c785b6c3..03415ae1 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -5,7 +5,10 @@ license: GNU/GPL v3 #include "params.h" namespace Brush { - nlohmann::json PARAMS; - void set_params(const ns::json& j) { PARAMS = j; } - ns::json get_params(){ return PARAMS;} -} +void Parameters::init(const MatrixXf& X, const VectorXf& y) {};// TODO: implement this + +nlohmann::json PARAMS; +void set_params(const ns::json& j) { PARAMS = j; } +ns::json get_params(){ return PARAMS;} + +} // Brush diff --git a/src/params.h b/src/params.h index d42a26c6..e1d718ef 100644 --- a/src/params.h +++ b/src/params.h @@ -14,19 +14,53 @@ namespace Brush struct Parameters { - int pop_size = 100; ///< population size - int gens = 100; ///< max generations +private: + // settings + int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) + int verbosity = 0; + // Evolutionary stuff + string mode="regression"; + int pop_size = 100; + int gens = 100; + unsigned int max_depth = 10; + unsigned int max_size=100; + vector objectives{"error","complexity"}; // error should be generic and deducted based on mode + float cx_prob; ///< cross rate for variation + float mutation_probs; + int num_islands=5; + float mig_prob = 0.05; + vector functions; + string scorer_; ///< actual loss function used, determined by error - Parameters(); + // for classification + unsigned int n_classes; ///< number of classes for classification + vector classes; ///< class labels + vector class_weights; ///< weights for each class + vector sample_weights; ///< weights for each sample + + // from dataset + bool shuffle = true; ///< option to shuffle the data + float split = 0.75; ///< fraction of data to use for training + vector feature_names; ///< names of features + float batch_size = 0.0; + bool use_batch = false; ///< whether to use mini batch for training + + int n_jobs = 1; ///< number of parallel jobs +public: + Parameters() {}; ~Parameters(){}; - void init(); + // TODO: getters and setters + + void init(const MatrixXf& X, const VectorXf& y); }; - extern ns::json PARAMS; - void set_params(const ns::json& j); - ns::json get_params(); +// Global (deprecated) params +extern ns::json PARAMS; +void set_params(const ns::json& j); +ns::json get_params(); + } // Brush #endif diff --git a/src/population.cpp b/src/population.cpp index 1472e438..23ad584b 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -5,128 +5,19 @@ license: GNU/GPL v3 #include "population.h" -namespace FT{ +namespace Brush{ namespace Pop{ int last; -Population::Population(int p) +template +Population::Population(int p) { - individuals.resize(p); -} - -Population::~Population(){} - -/// update individual vector size -void Population::resize(int pop_size) -{ - individuals.resize(pop_size); -} - -/// returns population size -int Population::size(){ return individuals.size(); } - -const Program Population::operator [](size_t i) const {return individuals.at(i);} -const Program& Population::operator [](size_t i) {return individuals.at(i);} - -void Population::init(const Program& starting_model, - const Parameters& params, - const SearchSpace& ss - ) -{ - - #pragma omp parallel for - for (int i = 0; i< individuals.size(); ++i) - { - individuals.at(i).initialize(params, random, i); - } -} - -void Population::update(vector survivors) -{ - - /*! - * cull population down to survivor indices. - */ - vector pop_idx(individuals.size()); - std::iota(pop_idx.begin(),pop_idx.end(),0); - std::reverse(pop_idx.begin(),pop_idx.end()); - for (const auto& i : pop_idx) - if (!in(survivors,i)) - individuals.erase(individuals.begin()+i); - -} - - -void Population::add(Program& ind) -{ - individuals.push_back(ind); -} - -string Population::print_eqns(bool just_offspring, string sep) -{ - string output = ""; - int start = 0; - - if (just_offspring) - start = individuals.size()/2; - - for (unsigned int i=start; i< individuals.size(); ++i) - output += individuals.at(i).get_eqn() + sep; - - return output; -} - -vector Population::sorted_front(unsigned rank=1) -{ - /* Returns individuals on the Pareto front, sorted by increasign complexity. */ - vector pf; - for (unsigned int i =0; i> line; - - json j = json::parse(line); - from_json(j, *this); - - logger.log("Loaded population from " + filename + " of size = " - + to_string(this->size()),1); - - indata.close(); + individuals.resize(p); } +template +Population::~Population(){} } // Pop -} // FT +} // Brush diff --git a/src/population.h b/src/population.h index d7b3c717..befe6aec 100644 --- a/src/population.h +++ b/src/population.h @@ -3,93 +3,28 @@ #include "program/program.h" #include "search_space.h" +#include "individual.h" using std::vector; using std::string; using Eigen::Map; -namespace Brush -{ -namespace Pop{ +namespace Brush { +namespace Pop { -////////////////////////////////////////////////////////////////// Declarations -extern int last; -/*! - * @class Population - * @brief Defines a population of programs and functions for constructing them. - */ template -struct Population -{ - vector*> individuals; ///< individual programs - - Population(int p = 0); - +class Population{ +public: + vector*> individuals; + Population(int p=0); ~Population(); - - /// initialize population of programs with a starting model and/or from file - void init(const Program& starting_model, - const Parameters& params, - const SearchSpace& ss - ); - - /// update individual vector size - void resize(int pop_size); - - /// reduce programs to the indices in survivors. - void update(vector survivors); - - /// returns population size - int size(); - - /// adds a program to the population. - - void add(Program&); - - /// setting and getting from individuals vector - const Program operator [](size_t i) const; - const Program& operator [](size_t i); - - /// return population equations. - string print_eqns(bool just_offspring=false, string sep="\n"); - - /// return complexity-sorted Pareto front indices. - vector sorted_front(unsigned); - - /// Sort population in increasing complexity. - struct SortComplexity - { - Population& pop; - SortComplexity(Population& p): pop(p){} - bool operator()(size_t i, size_t j) - { - return pop.individuals[i].set_complexity() < pop.individuals[j].set_complexity(); - } - }; - - /// check for same fitness and complexity to filter uniqueness. - struct SameFitComplexity - { - Population & pop; - SameFitComplexity(Population& p): pop(p){} - bool operator()(size_t i, size_t j) - { - return (pop.individuals[i].fitness == pop.individuals[j].fitness && - pop.individuals[i].set_complexity() == pop.individuals[j].set_complexity()); - } - }; - // save serialized population - void save(string filename); - // load serialized population - void load(string filename); -}; + // fitness, objetives, complexity, etc + // setters and getters + // wrappers (fit, predict). This class should also have its own cpp wrapper +}; -// //TODO -// /* void from_json(const json& j, Population& p); */ -// /* void to_json(json& j, const Population& p); */ -// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population, individuals); -}//Pop +}// Pop +}// Brush -}//FT #endif diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 44f36ae5..d8e33a7a 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -9,74 +9,23 @@ Selection::Selection() */ this->type = "lexicase"; this->survival = false; - this->set_operator(); -} - -Selection::Selection(string type, bool survival) -{ - /*! - * set type of selection operator. - */ - this->type = type; - this->survival = survival; - this->set_operator(); -} - -void Selection::set_operator() -{ - // if (this->type == "lexicase") - // pselector = std::make_shared(survival); - // else if (this->type == "fair_lexicase") - // pselector = std::make_shared(survival); - // else if (this->type == "pareto_lexicase") - // pselector = std::make_shared(survival); - // else if (this->type == "nsga2") - // pselector = std::make_shared(survival); - // else if (this->type == "tournament") - // pselector = std::make_shared(survival); - // else if (this->type == "offspring") // offspring survival - // pselector = std::make_shared(survival); - // else if (this->type == "random") // offspring survival - // pselector = std::make_shared(survival); - // else if (this->type == "simanneal") // offspring survival - // pselector = std::make_shared(survival); - // else - // WARN("Undefined Selection Operator " + this->type + "\n"); - } Selection::~Selection(){} -/// return type of selectionoperator -string Selection::get_type(){ return pselector->name; } - -/// set type of selectionoperator -void Selection::set_type(string in){ type = in; set_operator();} - -/// perform selection -vector Selection::select() -{ - return pselector->select(pop, params, d); -} - -/// perform survival -vector Selection::survive( -{ - return pselector->survive(pop, params, d); -} SelectionOperator::~SelectionOperator(){} vector SelectionOperator::select() { - THROW_INVALID_ARGUMENT("Undefined select() operation"); + // THROW_INVALID_ARGUMENT("Undefined select() operation"); return vector(); } vector SelectionOperator::survive() { - THROW_INVALID_ARGUMENT("Undefined select() operation"); + // THROW_INVALID_ARGUMENT("Undefined select() operation"); return vector(); } diff --git a/src/selection/selection.h b/src/selection/selection.h index 18bd0506..c4bc6d3c 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -44,19 +44,6 @@ struct Selection Selection(); ~Selection(); - Selection(string type, bool survival); - - void set_operator(); - - /// return type of selectionoperator - string get_type(); - void set_type(string); - - /// perform selection - vector select(); - - /// perform survival - vector survive(); }; NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Selection, type, survival); diff --git a/src/util/error.h b/src/util/error.h index 5fe29d36..4ae607c1 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -24,6 +24,7 @@ namespace Brush{ namespace Util { #define HANDLE_ERROR_THROW( err ) (Brush::Util::HandleErrorThrow( err, __FILE__, __LINE__ )) #define HANDLE_WARNING( err ) (Brush::Util::HandleErrorNoThrow( err, __FILE__, __LINE__ )) + // TODO: have more errors }} #endif From 738377603d2a42fe5f56373bf7c34452fd587799 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 3 Nov 2023 15:20:03 -0400 Subject: [PATCH 069/199] Starting to implement Individual --- src/individual.cpp | 21 +++++++++------------ src/individual.h | 25 +++++++++++++++++++++++-- src/search_space.h | 1 + 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/individual.cpp b/src/individual.cpp index 68d3709a..57a75eb2 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -9,25 +9,22 @@ namespace Brush{ namespace Pop{ template -Individual::Individual(Program Prog) +Individual::Individual() { - program = Prog; - // TODO: calculate this stuff - complexity = -1; fitness = -1; fitness_v = -1; - fairness = -1; - fairness_v = -1; + dcounter=-1; crowd_dist = -1; } - -// void Individual::initialize(const Parameters& params, bool random, int id) -// { - -// } +template +void Individual::initialize(const SearchSpace& ss, const Parameters& params) +{ + // TODO: make searchspace use params, so it will generate something valid + program = SS.make_program(params.max_depth, params.max_size); +} } // Pop -} // FT +} // Brush diff --git a/src/individual.h b/src/individual.h index b8f95036..439d33e2 100644 --- a/src/individual.h +++ b/src/individual.h @@ -8,14 +8,35 @@ namespace Pop{ template class Individual{ -public: +private: Program program; ///< executable data structure - Individual(Program Prog); + // store just info that we dont have a getter. size, depth, complexity: they can all be obtained with program. + + VectorXf error; ///< training error (used in lexicase selectors) + + float fitness; ///< aggregate fitness score + float fitness_v; ///< aggregate validation fitness score + + unsigned int dcounter; ///< number of individuals this dominates + + vector dominated; ///< individual indices this dominates + unsigned int rank; ///< pareto front rank + float crowd_dist; ///< crowding distance on the Pareto front + +public: + Individual(); // fitness, objetives, complexity, etc // setters and getters // wrappers (fit, predict). This class should also have its own cpp wrapper + + void initialize(const SearchSpace& ss, const Parameters& params); + + // getters + string get_model() { return program.get_model(); }; + size_t get_size() { return program.size(); }; + size_t get_depth() { return program.depth(); }; }; } // Pop diff --git a/src/search_space.h b/src/search_space.h index dcf1db73..231737ef 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -682,6 +682,7 @@ P SearchSpace::make_program(int max_d, int max_size) max_d = PARAMS["max_depth"].get(); if (max_size == 0) max_size = r.rnd_int(1, PARAMS["max_size"].get()); + // TODO: searchspace should infer max_size from parameters class DataType root_type = DataTypeEnum::value; ProgramType program_type = P::program_type; From a9b469dbb88209d8f8857b0e7a29c07ac153d314 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 3 Nov 2023 16:37:55 -0400 Subject: [PATCH 070/199] Population and Selection templated for ProgramType --- src/cbrush.h | 2 +- src/individual.cpp | 2 +- src/individual.h | 11 ++++-- src/population.cpp | 13 +++++-- src/population.h | 7 ++-- src/selection/selection.cpp | 68 +++++++++++++++++++++++++++++++------ src/selection/selection.h | 38 +++++++++++++++++++-- 7 files changed, 117 insertions(+), 24 deletions(-) diff --git a/src/cbrush.h b/src/cbrush.h index 9e190aae..2eec929f 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -24,7 +24,7 @@ namespace Brush class CBrush{ public: - CBrush(){}; + CBrush(){}; // TODO: constructor should create a new parameters and use it in every other stuff ~CBrush(){}; void init(); diff --git a/src/individual.cpp b/src/individual.cpp index 57a75eb2..2427b0b0 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -20,7 +20,7 @@ Individual::Individual() } template -void Individual::initialize(const SearchSpace& ss, const Parameters& params) +void Individual::init(const SearchSpace& ss, const Parameters& params) { // TODO: make searchspace use params, so it will generate something valid program = SS.make_program(params.max_depth, params.max_size); diff --git a/src/individual.h b/src/individual.h index 439d33e2..a088ee80 100644 --- a/src/individual.h +++ b/src/individual.h @@ -28,15 +28,22 @@ class Individual{ Individual(); // fitness, objetives, complexity, etc + void fit(Dataset& data) { program.fit(data); }; + auto predict(Dataset& data) { return program.predict(data); }; + + // TODO: predict proba and classification related methods. + // setters and getters - // wrappers (fit, predict). This class should also have its own cpp wrapper - void initialize(const SearchSpace& ss, const Parameters& params); + // TODO: This class should also have its own cpp wrapper. Update it into the deap api (the idea is that the user is still able to prototype with brush, I dont think we should disable that feature) + + void init(const SearchSpace& ss, const Parameters& params); // getters string get_model() { return program.get_model(); }; size_t get_size() { return program.size(); }; size_t get_depth() { return program.depth(); }; + size_t get_complexity() { return program.complexity(); }; }; } // Pop diff --git a/src/population.cpp b/src/population.cpp index 23ad584b..781dc05c 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -8,8 +8,6 @@ license: GNU/GPL v3 namespace Brush{ namespace Pop{ -int last; - template Population::Population(int p) { @@ -17,7 +15,16 @@ Population::Population(int p) } template -Population::~Population(){} +void Population::init(const SearchSpace& ss, const Parameters& params) +{ + // TODO: load file (like feat) + + #pragma omp parallel for + for (int i = 0; i< individuals.size(); ++i) + { + individuals.at(i).init(ss, params, i); + } +} } // Pop } // Brush diff --git a/src/population.h b/src/population.h index befe6aec..86fdc7ed 100644 --- a/src/population.h +++ b/src/population.h @@ -16,12 +16,11 @@ template class Population{ public: vector*> individuals; + Population(int p=0); - ~Population(); + ~Population(){}; - // fitness, objetives, complexity, etc - // setters and getters - // wrappers (fit, predict). This class should also have its own cpp wrapper + void init(const SearchSpace& ss, const Parameters& params); }; }// Pop diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index d8e33a7a..e09f24b0 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -1,7 +1,12 @@ #include "selection.h" +// TODO: organize all namespaces +namespace Brush { namespace selection { +using namespace Brush; +using namespace Pop; + Selection::Selection() { /*! @@ -9,24 +14,65 @@ Selection::Selection() */ this->type = "lexicase"; this->survival = false; + this->set_operator(); } -Selection::~Selection(){} +Selection::Selection(string type, bool survival) +{ + /*! + * set type of selection operator. + */ + this->type = type; + this->survival = survival; + this->set_operator(); +} + +void Selection::set_operator() +{ + // if (this->type == "lexicase") + // pselector = std::make_shared(survival); + // else if (this->type == "fair_lexicase") + // pselector = std::make_shared(survival); + // else if (this->type == "pareto_lexicase") + // pselector = std::make_shared(survival); + // else if (this->type == "nsga2") + // pselector = std::make_shared(survival); + // else if (this->type == "tournament") + // pselector = std::make_shared(survival); + // else if (this->type == "offspring") // offspring survival + // pselector = std::make_shared(survival); + // else if (this->type == "random") // offspring survival + // pselector = std::make_shared(survival); + // else if (this->type == "simanneal") // offspring survival + // pselector = std::make_shared(survival); + // else + // WARN("Undefined Selection Operator " + this->type + "\n"); + +} +Selection::~Selection(){} +/// return type of selectionoperator +string Selection::get_type(){ return pselector->name; } -SelectionOperator::~SelectionOperator(){} +/// set type of selectionoperator +void Selection::set_type(string in){ type = in; set_operator();} -vector SelectionOperator::select() -{ - // THROW_INVALID_ARGUMENT("Undefined select() operation"); - return vector(); +/// perform selection +template +vector Selection::select(Population& pop, + const Parameters& params, const Dataset& data) +{ + return pselector->select(pop, params, data); } -vector SelectionOperator::survive() -{ - // THROW_INVALID_ARGUMENT("Undefined select() operation"); - return vector(); +/// perform survival +template +vector Selection::survive(Population& pop, + const Parameters& params, const Dataset& data) +{ + return pselector->survive(pop, params, data); } -} // selection \ No newline at end of file +} // selection +} // Brush \ No newline at end of file diff --git a/src/selection/selection.h b/src/selection/selection.h index c4bc6d3c..f5336a1f 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -10,8 +10,12 @@ license: GNU/GPL v3 #include "../params.h" #include "../population.h" +namespace Brush { namespace selection { +using namespace Brush; +using namespace Pop; + /*! * @class SelectionOperator * @brief base class for selection operators. @@ -25,9 +29,21 @@ struct SelectionOperator virtual ~SelectionOperator(); - virtual vector select(); + template // TODO: HOW TO STOP TEMPLATING EVERYTHING??? + vector select(Population& pop, + const Parameters& p, const Dataset& data) + { + // THROW_INVALID_ARGUMENT("Undefined select() operation"); + return vector(); + } - virtual vector survive(); + template + vector survive(Population& pop, + const Parameters& p, const Dataset& data) + { + // THROW_INVALID_ARGUMENT("Undefined select() operation"); + return vector(); + } }; struct Parameters; // forward declaration of Parameters @@ -44,9 +60,27 @@ struct Selection Selection(); ~Selection(); + Selection(string type, bool survival); + + void set_operator(); + + /// return type of selectionoperator + string get_type(); + void set_type(string); + + /// perform selection + template + vector select(Population& pop, + const Parameters& params, const Dataset& data); + + /// perform survival + template + vector survive(Population& pop, + const Parameters& params, const Dataset& data); }; NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Selection, type, survival); } // selection +} // Brush #endif \ No newline at end of file From bfa4d1cc6de68f04a0c539bc3e633b1593db8848 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 6 Nov 2023 16:16:22 -0500 Subject: [PATCH 071/199] Implementation of individual class --- src/individual.cpp | 30 ------------------------------ src/individual.h | 40 +++++++++++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 39 deletions(-) delete mode 100644 src/individual.cpp diff --git a/src/individual.cpp b/src/individual.cpp deleted file mode 100644 index 2427b0b0..00000000 --- a/src/individual.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* FEAT -copyright 2017 William La Cava -license: GNU/GPL v3 -*/ - -#include "individual.h" - -namespace Brush{ -namespace Pop{ - -template -Individual::Individual() -{ - // TODO: calculate this stuff - fitness = -1; - fitness_v = -1; - - dcounter=-1; - crowd_dist = -1; -} - -template -void Individual::init(const SearchSpace& ss, const Parameters& params) -{ - // TODO: make searchspace use params, so it will generate something valid - program = SS.make_program(params.max_depth, params.max_size); -} - -} // Pop -} // Brush diff --git a/src/individual.h b/src/individual.h index a088ee80..72ec4a0e 100644 --- a/src/individual.h +++ b/src/individual.h @@ -18,32 +18,54 @@ class Individual{ float fitness; ///< aggregate fitness score float fitness_v; ///< aggregate validation fitness score + size_t complexity; unsigned int dcounter; ///< number of individuals this dominates - vector dominated; ///< individual indices this dominates unsigned int rank; ///< pareto front rank float crowd_dist; ///< crowding distance on the Pareto front public: - Individual(); + Individual() + { // TODO: calculate this stuff + fitness = -1; + fitness_v = -1; + + complexity=-1; + dcounter=-1; + rank=-1; + crowd_dist = -1; + }; + + void init(const SearchSpace& ss, const Parameters& params) + { + // TODO: make searchspace use params, so it will generate something valid + program = SS.make_program(params.max_depth, params.max_size); + }; // fitness, objetives, complexity, etc void fit(Dataset& data) { program.fit(data); }; auto predict(Dataset& data) { return program.predict(data); }; // TODO: predict proba and classification related methods. - - // setters and getters - // TODO: This class should also have its own cpp wrapper. Update it into the deap api (the idea is that the user is still able to prototype with brush, I dont think we should disable that feature) - void init(const SearchSpace& ss, const Parameters& params); - - // getters + // just getters string get_model() { return program.get_model(); }; size_t get_size() { return program.size(); }; size_t get_depth() { return program.depth(); }; - size_t get_complexity() { return program.complexity(); }; + + // setters and getters + size_t set_complexity() { + complexity = program.complexity(); + return complexity; + }; // sets and returns it + size_t get_complexity() const { return complexity; }; + + void set_rank(unsigned r){ rank=r; }; + size_t get_rank() const { return rank; }; + + void set_crowd_dist(unsigned cd){ crowd_dist=cd; }; + size_t get_crow_dist() const { return crowd_dist; }; }; } // Pop From 7de001a96c249430ee28bdc88c1e13aac627f2fb Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 6 Nov 2023 16:16:44 -0500 Subject: [PATCH 072/199] Population implemented with island indexes control --- src/population.cpp | 142 +++++++++++++++++++++++++++++++++++++++++++-- src/population.h | 62 ++++++++++++++++++-- 2 files changed, 195 insertions(+), 9 deletions(-) diff --git a/src/population.cpp b/src/population.cpp index 781dc05c..cc719e9c 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -8,23 +8,157 @@ license: GNU/GPL v3 namespace Brush{ namespace Pop{ + template -Population::Population(int p) +void Population::set_island_ranges() { - individuals.resize(p); + // everytime we change popsize, this function must be called + + // Tuples with start and end indexes for each island. Number of individuals + // in each island can slightly differ if N_ISLANDS is not a divisor of p (popsize) + island_ranges.resize(n_islands); + + size_t p = size(); // population size + + for (int i=0; i +Population::Population(int p, int n_islands) +{ + individuals.resize(p); + + this->n_islands=n_islands; + set_island_ranges(); + + island_skip.resize(n_islands); + iota(island_skip.begin(), island_skip.end(), 0); + + offspring_ready = false; } template void Population::init(const SearchSpace& ss, const Parameters& params) { // TODO: load file (like feat) - #pragma omp parallel for for (int i = 0; i< individuals.size(); ++i) { - individuals.at(i).init(ss, params, i); + individuals.at(i).init(ss, params); } } +/// update individual vector size and island indexes +template +void Population::prep_offspring_slots() +{ + if (offspring_ready) + HANDLE_ERROR_THROW("Allocating space in population that already has active offspring slots"); + + vector*> expanded_pop; + expanded_pop.resize(2*individuals.size()); + + for (int i=0; iindividuals = &expanded_pop; + offspring_ready = true; +} + +template +void Population::update(vector survivors) +{ + if (!offspring_ready) + HANDLE_ERROR_THROW("Shrinking a population that has no active offspring"); + + assert(survivors.size() == individuals.size()/2 + && "Cant shrink a population to a size different from the original initial size"); + + vector pop_idx(individuals.size()); + std::iota(pop_idx.begin(),pop_idx.end(),0); + std::reverse(pop_idx.begin(),pop_idx.end()); + for (const auto& i : pop_idx) + if (!in(survivors,i)) + individuals.erase(individuals.begin()+i); + + set_island_ranges(); + offspring_ready = false; +} + +template +string Population::print_models(bool just_offspring, string sep) +{ + // not printing the island each individual belongs to + string output = ""; + + for (int i=0; i +vector> Population::sorted_front(unsigned rank) +{ + // this is used to update archive at the end of a generation. Supose islands without offspring + + /* Returns individuals on the Pareto front, sorted by increasign complexity. */ + vector> pf_islands; + pf_islands.resize(n_islands); + + for (int i=0; i pf; + + for (unsigned int i =idx_start; i class Population{ -public: - vector*> individuals; - - Population(int p=0); - ~Population(){}; +private: + void set_island_ranges(); +public: + bool offspring_ready; + vector*> individuals; + vector> island_ranges; + vector island_skip; // number of indexes to skip for each island (when variation fails) + unsigned int n_islands; + Population(int p = 0, int n_islands=1); + + ~Population(); + + /// initialize population of programs with a starting model and/or from file void init(const SearchSpace& ss, const Parameters& params); + + /// returns population size + int size() { return individuals.size(); }; + + /// update individual vector size, distributing the expressions in n_islands + void prep_offspring_slots(); + + // TODO: WORK WITH ISLANDS + /// reduce programs to the indices in survivors. + void update(vector survivors); + + /// setting and getting from individuals vector (will ignore islands) + const Individual operator [](size_t i) const {return individuals.at(i);} + const Individual & operator [](size_t i) {return individuals.at(i);} + + /// return population equations. + string print_models(bool just_offspring=false, string sep="\n"); + + // TODO: WORK WITH ISLANDS (vector of vectors, one for each island) + /// return complexity-sorted Pareto front indices. + vector> sorted_front(unsigned rank=1); + + /// Sort each island in increasing complexity. + struct SortComplexity + { + Population& pop; + SortComplexity(Population& p): pop(p){} + bool operator()(size_t i, size_t j) + { + return pop.individuals[i].set_complexity() < pop.individuals[j].set_complexity(); + } + }; + + /// check for same fitness and complexity to filter uniqueness. + struct SameFitComplexity + { + Population & pop; + SameFitComplexity(Population& p): pop(p){} + bool operator()(size_t i, size_t j) + { + return (pop.individuals[i].fitness == pop.individuals[j].fitness && + pop.individuals[i].set_complexity() == pop.individuals[j].set_complexity()); + } + }; }; }// Pop From cd73a62897a10455b7fd79b6b3c9db4723cf8ba9 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 6 Nov 2023 16:25:25 -0500 Subject: [PATCH 073/199] Files for implementing nsga2 --- src/selection/nsga2.cpp | 11 +++++++++++ src/selection/nsga2.h | 14 ++++++++++++++ src/selection/selection.h | 5 ++++- 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 src/selection/nsga2.cpp create mode 100644 src/selection/nsga2.h diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp new file mode 100644 index 00000000..815b4e5d --- /dev/null +++ b/src/selection/nsga2.cpp @@ -0,0 +1,11 @@ +#include "nsga2.h" + +namespace Brush { +namespace selection { + +using namespace Brush; +using namespace Pop; + + +} // selection +} // Brush \ No newline at end of file diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h new file mode 100644 index 00000000..8d7a5a18 --- /dev/null +++ b/src/selection/nsga2.h @@ -0,0 +1,14 @@ +#ifndef NSGA2_H +#define NSGA2_H + +#include "selection.h" + +namespace Brush { +namespace selection { + +using namespace Brush; +using namespace Pop; + +} // selection +} // Brush +#endif \ No newline at end of file diff --git a/src/selection/selection.h b/src/selection/selection.h index f5336a1f..082f7b5c 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -10,6 +10,9 @@ license: GNU/GPL v3 #include "../params.h" #include "../population.h" +// including other selection file headers +#include "nsga2.h" + namespace Brush { namespace selection { @@ -29,7 +32,7 @@ struct SelectionOperator virtual ~SelectionOperator(); - template // TODO: HOW TO STOP TEMPLATING EVERYTHING??? + template vector select(Population& pop, const Parameters& p, const Dataset& data) { From b5104720ec7d83ba95c0d6037bc3f7114f20de45 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 6 Nov 2023 21:36:58 -0500 Subject: [PATCH 074/199] Draft of nsga2 selection implementation --- src/selection/nsga2.cpp | 2 +- src/selection/nsga2.h | 65 ++++++++++++++++++++++++++++++++++++- src/selection/selection.cpp | 2 +- src/selection/selection.h | 11 +++---- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 815b4e5d..ee6fffd6 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -1,7 +1,7 @@ #include "nsga2.h" namespace Brush { -namespace selection { +namespace Sel { using namespace Brush; using namespace Pop; diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 8d7a5a18..0f0b6c9d 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -2,12 +2,75 @@ #define NSGA2_H #include "selection.h" +#include "../init.h" +#include "../program/program.h" +#include "../population.h" +#include "../individual.h" +#include "../data/data.h" namespace Brush { -namespace selection { +namespace Sel { using namespace Brush; using namespace Pop; +using namespace Data; +using namespace Sel; + +template +class NSGA2 : public SelectionOperator +{ + /** NSGA-II based selection and survival methods. */ + + NSGA2(bool surv); + ~NSGA2(); + + /// selection according to the survival scheme of NSGA-II + vector select(Population& pop, + const Parameters& p, const Dataset& d); + + /// survival according to the survival scheme of NSGA-II + vector survive(Population& pop, + const Parameters& p, const Dataset& d); + + //< the Pareto fronts + vector> front; + + //< Fast non-dominated sorting + void fast_nds(vector>&); + + //< crowding distance of a front i + void crowding_distance(Population&, int); + + private: + /// sort based on rank, breaking ties with crowding distance + struct sort_n + { + const Population& pop; ///< population address + sort_n(const Population& population) : pop(population) {}; + bool operator() (int i, int j) { + const Individual& ind1 = pop.individuals[i]; + const Individual& ind2 = pop.individuals[j]; + if (ind1.rank < ind2.rank) + return true; + else if (ind1.rank == ind2.rank && + ind1.crowd_dist > ind2.crowd_dist) + return true; + return false; + }; + }; + + /// sort based on objective m + struct comparator_obj + { + const Population& pop; ///< population address + int m; ///< objective index + comparator_obj(const Population& population, int index) + : pop(population), m(index) {}; + bool operator() (int i, int j) { return pop[i].obj[m] < pop[j].obj[m]; }; + }; + + size_t tournament(vector>& pop, size_t i, size_t j) const; +}; } // selection } // Brush diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index e09f24b0..1195c973 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -2,7 +2,7 @@ // TODO: organize all namespaces namespace Brush { -namespace selection { +namespace Sel { using namespace Brush; using namespace Pop; diff --git a/src/selection/selection.h b/src/selection/selection.h index 082f7b5c..e1a699df 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -10,11 +10,8 @@ license: GNU/GPL v3 #include "../params.h" #include "../population.h" -// including other selection file headers -#include "nsga2.h" - namespace Brush { -namespace selection { +namespace Sel { using namespace Brush; using namespace Pop; @@ -23,12 +20,13 @@ using namespace Pop; * @class SelectionOperator * @brief base class for selection operators. */ -struct SelectionOperator +class SelectionOperator { +public: bool survival; string name; - //SelectionOperator(){} + SelectionOperator(){} virtual ~SelectionOperator(); @@ -57,6 +55,7 @@ struct Parameters; // forward declaration of Parameters */ struct Selection { +public: shared_ptr pselector; string type; bool survival; From f7469eeb409a004ad3b3ee3e64d6990405b9b0f1 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 7 Nov 2023 10:59:49 -0500 Subject: [PATCH 075/199] Added TODOs --- src/individual.h | 3 ++- src/params.h | 1 + src/variation.h | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/individual.h b/src/individual.h index 72ec4a0e..3cc9e653 100644 --- a/src/individual.h +++ b/src/individual.h @@ -20,7 +20,8 @@ class Individual{ size_t complexity; unsigned int dcounter; ///< number of individuals this dominates - + vector dominated; ///< individual indices this dominates + unsigned int rank; ///< pareto front rank float crowd_dist; ///< crowding distance on the Pareto front diff --git a/src/params.h b/src/params.h index e1d718ef..c46222d1 100644 --- a/src/params.h +++ b/src/params.h @@ -19,6 +19,7 @@ struct Parameters int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) int verbosity = 0; + // TODO: python wrapper should have getters and setters for all this stuff // Evolutionary stuff string mode="regression"; int pop_size = 100; diff --git a/src/variation.h b/src/variation.h index b789a081..9d8e782c 100644 --- a/src/variation.h +++ b/src/variation.h @@ -688,5 +688,7 @@ std::optional> cross(const Program& root, const Program& other) return std::nullopt; }; + +// TODO: implement migration as a variation method? } //namespace variation #endif \ No newline at end of file From 22b9fe9017dffce6ee94dfdb7967f53f9351da06 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 7 Nov 2023 11:00:21 -0500 Subject: [PATCH 076/199] Added TODOs --- src/population.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/population.cpp b/src/population.cpp index cc719e9c..9c4f152f 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -114,7 +114,7 @@ string Population::print_models(bool just_offspring, string sep) { auto [idx_start, idx_end] = island_ranges.at(i); size_t skip = island_skip.at(i); // number of individuals to ignore because variation failed - + //TODO: use taskflow and pragma once correctly (search and fix code) if (just_offspring) { size_t delta = idx_end - idx_start; // starting from the middle of the island (where the offspring lives) idx_start = idx_start + delta/2; From 877fa0f7a98e76c50a2d8607d791513192623ff1 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 7 Nov 2023 11:00:46 -0500 Subject: [PATCH 077/199] Select and Survive now takes the island range as argument --- src/selection/selection.cpp | 4 ++-- src/selection/selection.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 1195c973..848c6d5b 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -60,7 +60,7 @@ void Selection::set_type(string in){ type = in; set_operator();} /// perform selection template -vector Selection::select(Population& pop, +vector Selection::select(Population& pop, tuple island_range, const Parameters& params, const Dataset& data) { return pselector->select(pop, params, data); @@ -68,7 +68,7 @@ vector Selection::select(Population& pop, /// perform survival template -vector Selection::survive(Population& pop, +vector Selection::survive(Population& pop, tuple island_range, const Parameters& params, const Dataset& data) { return pselector->survive(pop, params, data); diff --git a/src/selection/selection.h b/src/selection/selection.h index e1a699df..829f8444 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -31,7 +31,7 @@ class SelectionOperator virtual ~SelectionOperator(); template - vector select(Population& pop, + vector select(Population& pop, tuple island_range, const Parameters& p, const Dataset& data) { // THROW_INVALID_ARGUMENT("Undefined select() operation"); @@ -39,7 +39,7 @@ class SelectionOperator } template - vector survive(Population& pop, + vector survive(Population& pop, tuple island_range, const Parameters& p, const Dataset& data) { // THROW_INVALID_ARGUMENT("Undefined select() operation"); @@ -70,14 +70,14 @@ struct Selection string get_type(); void set_type(string); - /// perform selection + /// perform selection. selection uses a pop that has no offspring space template - vector select(Population& pop, + vector select(Population& pop, tuple island_range, const Parameters& params, const Dataset& data); - /// perform survival + /// perform survival. uses a pop with offspring space template - vector survive(Population& pop, + vector survive(Population& pop, tuple island_range, const Parameters& params, const Dataset& data); }; From 3c314996d3b3d9ba309f5c7dabc181e996f66035 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 7 Nov 2023 11:01:22 -0500 Subject: [PATCH 078/199] nsga2 with islands --- src/selection/nsga2.cpp | 240 ++++++++++++++++++++++++++++++++++++++++ src/selection/nsga2.h | 24 ++-- 2 files changed, 255 insertions(+), 9 deletions(-) diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index ee6fffd6..dd6c4463 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -5,7 +5,247 @@ namespace Sel { using namespace Brush; using namespace Pop; +using namespace Data; +using namespace Sel; +template +size_t NSGA2::tournament(vector>& pop, size_t i, size_t j) const +{ + // gets two individuals and compares them. i and j bhould be within island range + Individual& ind1 = pop.at(i); + Individual& ind2 = pop.at(j); + + int flag = ind1.check_dominance(ind2); + + if (flag == 1) // ind1 dominates ind2 + return i; + else if (flag == -1) // ind2 dominates ind1 + return j; + else if (ind1.crowd_dist > ind2.crowd_dist) + return i; + else if (ind2.crowd_dist > ind1.crowd_dist) + return j; + else + return i; +} + +template +vector NSGA2::select(Population& pop, tuple island_range, + const Parameters& params, const Dataset& d) +{ + /* Selection using Pareto tournaments. + * + * Input: + * + * pop: population of programs. + * params: parameters. + * r: random number generator + * + * Output: + * + * selected: vector of indices corresponding to pop that are selected. + * modifies individual ranks, objectives and dominations. + */ + + auto [idx_start, idx_end] = island_range; + + if (pop.offspring_ready) // dont look at offspring to select + idx_end = idx_end/2; + + size_t delta = idx_end - idx_start; + + vector island_pool(delta); + std::iota(island_pool.begin(), island_pool.end(), idx_start); + + // if this is first generation, just return indices to pop + if (params.current_gen==0) + return island_pool; + + vector selected(0); + + for (int i = 0; i < delta; ++i) // selecting based on island_pool size + { + size_t winner = tournament(pop.individuals, + r.select_randomly(island_pool.begin(), island_pool.end()), + r.select_randomly(island_pool.begin(), island_pool.end())); + + selected.push_back(winner); + } + return selected; +} + +template +vector NSGA2::survive(Population& pop, tuple island_range, + const Parameters& params, const Dataset& d) +{ + /* Selection using the survival scheme of NSGA-II. + * + * Input: + * + * pop: population of programs. + * params: parameters. + * r: random number generator + * + * Output: + * + * selected: vector of indices corresponding to pop that are selected. + * modifies individual ranks, objectives and dominations. + */ + + auto [idx_start, idx_end] = island_range; + + assert(pop.offspring_ready + && "survival was called in an island with no offspring"); + + size_t delta = idx_end - idx_start; + + vector island_pool(delta); // array with indexes for the specific island_pool + std::iota(island_pool.begin(), island_pool.end(), idx_start); + + // set objectives + #pragma omp parallel for + for (unsigned int i=0; i selected(0); + int i = 0; + while ( selected.size() + front.at(i).size() < delta/2 ) // (delta/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) + { + std::vector& Fi = front.at(i); // indices in front i + crowding_distance(pop, front, i); // calculate crowding in Fi + + for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi + selected.push_back(Fi.at(j)); + + ++i; + } + + crowding_distance(pop, front, i); // calculate crowding in final front to include + std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); + + const int extra = params.pop_size - selected.size(); + for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] + selected.push_back(front.at(i).at(j)); + + return selected; +} + +template +vector> NSGA2::fast_nds(vector>& individuals, vector& island_pool) +{ + //< the Pareto fronts + vector> front; + + front.resize(1); + front.at(0).clear(); + + #pragma omp parallel for + for (int i = 0; i < island_pool.size(); ++i) { + + std::vector dom; + int dcount = 0; + + Individual& p = individuals.at(island_pool[i]); + + for (int j = 0; j < island_pool.size(); ++j) { + + Individual& q = individuals.at(island_pool[j]); + + int compare = p.check_dominance(q); + if (compare == 1) { // p dominates q + //p.dominated.push_back(j); + dom.push_back(island_pool[j]); + } else if (compare == -1) { // q dominates p + //p.dcounter += 1; + dcount += 1; + } + } + + #pragma omp critical + { + p.dcounter = dcount; + p.dominated.clear(); + p.dominated = dom; // dom will have values already referring to island indexes + + if (p.dcounter == 0) { + p.set_rank(1); + // front will have values already referring to island indexes + front.at(0).push_back(island_pool[i]); + } + } + } + + // using OpenMP can have different orders in the front.at(0) + // so let's sort it so that the algorithm is deterministic + // given a seed + std::sort(front.at(0).begin(), front.at(0).end()); + + int fi = 1; + while (front.at(fi-1).size() > 0) { + + std::vector& fronti = front.at(fi-1); + std::vector Q; + for (int i = 0; i < fronti.size(); ++i) { + + Individual& p = individuals.at(fronti.at(i)); + + // iterating over dominated individuals + for (int j = 0; j < p.dominated.size() ; ++j) { + + Individual& q = individuals.at(p.dominated.at(j)); + q.dcounter -= 1; + + if (q.dcounter == 0) { + q.set_rank(fi+1); + Q.push_back(p.dominated.at(j)); + } + } + } + + fi += 1; + front.push_back(Q); + } + + return front; +} + +template +void NSGA2::crowding_distance(Population& pop, vector>& front, int fronti) +{ + std::vector F = front.at(fronti); + if (F.size() == 0 ) return; + + const int fsize = F.size(); + + for (int i = 0; i < fsize; ++i) + pop.individuals.at(F.at(i)).crowd_dist = 0; + + const int limit = pop.individuals.at(0).obj.size(); + for (int m = 0; m < limit; ++m) { + + std::sort(F.begin(), F.end(), comparator_obj(pop,m)); + + // in the paper dist=INF for the first and last, in the code + // this is only done to the first one or to the two first when size=2 + pop.individuals.at(F.at(0)).crowd_dist = std::numeric_limits::max(); + if (fsize > 1) + pop.individuals.at(F.at(fsize-1)).crowd_dist = std::numeric_limits::max(); + + for (int i = 1; i < fsize-1; ++i) + { + if (pop.individuals.at(F.at(i)).crowd_dist != std::numeric_limits::max()) + { // crowd over obj + pop.individuals.at(F.at(i)).crowd_dist += + (pop.individuals.at(F.at(i+1)).obj.at(m) - pop.individuals.at(F.at(i-1)).obj.at(m)) + / (pop.individuals.at(F.at(fsize-1)).obj.at(m) - pop.individuals.at(F.at(0)).obj.at(m)); + } + } + } +} } // selection } // Brush \ No newline at end of file diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 0f0b6c9d..ceb9f3ce 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -19,34 +19,38 @@ using namespace Sel; template class NSGA2 : public SelectionOperator { + // should operate only on a given island index /** NSGA-II based selection and survival methods. */ - NSGA2(bool surv); - ~NSGA2(); + // if any of the islands have overlapping indexes, parallel access and modification should be ok (because i dont increase or decrease pop size, not change island ranges inside selection) + + NSGA2(bool surv){ name = "nsga2"; survival = surv; }; + ~NSGA2(){}; /// selection according to the survival scheme of NSGA-II - vector select(Population& pop, + vector select(Population& pop, tuple island_range, const Parameters& p, const Dataset& d); /// survival according to the survival scheme of NSGA-II - vector survive(Population& pop, + vector survive(Population& pop, tuple island_range, const Parameters& p, const Dataset& d); - //< the Pareto fronts - vector> front; - //< Fast non-dominated sorting - void fast_nds(vector>&); + vector> fast_nds(vector>&, vector&); + + // front cannot be an attribute because selection will be executed in different threads for different islands (this is a modificationf rom original FEAT code that I got inspiration) //< crowding distance of a front i - void crowding_distance(Population&, int); + void crowding_distance(Population&, vector>&, int); private: /// sort based on rank, breaking ties with crowding distance struct sort_n { const Population& pop; ///< population address + sort_n(const Population& population) : pop(population) {}; + bool operator() (int i, int j) { const Individual& ind1 = pop.individuals[i]; const Individual& ind2 = pop.individuals[j]; @@ -64,8 +68,10 @@ class NSGA2 : public SelectionOperator { const Population& pop; ///< population address int m; ///< objective index + comparator_obj(const Population& population, int index) : pop(population), m(index) {}; + bool operator() (int i, int j) { return pop[i].obj[m] < pop[j].obj[m]; }; }; From 8a2d920c10a5b3e7442912d6f66903e556401ee0 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 7 Nov 2023 17:43:47 -0500 Subject: [PATCH 079/199] Parameters implemented (but not effectively used yet) --- src/bindings/bind_cbrush.cpp | 6 +- src/cbrush.cpp | 92 +++++++++++++++++++++++++- src/cbrush.h | 121 ++++++++++++++++++++--------------- src/params.cpp | 4 +- src/params.h | 36 ++++++----- 5 files changed, 184 insertions(+), 75 deletions(-) diff --git a/src/bindings/bind_cbrush.cpp b/src/bindings/bind_cbrush.cpp index 89185a9c..2e3d77f6 100644 --- a/src/bindings/bind_cbrush.cpp +++ b/src/bindings/bind_cbrush.cpp @@ -5,12 +5,10 @@ namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; -using namespace Brush; - void bind_cbrush(py::module& m) { - py::class_(m, "CBrush", py::dynamic_attr()) + py::class_(m, "CBrush") .def(py::init([]() - { CBrush est; return est; })) + { br::CBrush est; return est; })) ; } \ No newline at end of file diff --git a/src/cbrush.cpp b/src/cbrush.cpp index c634473c..137f0238 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -2,10 +2,100 @@ #include -using namespace Brush; +namespace Brush{ /// @brief initialize Feat object for fitting. void CBrush::init() { + if (params.n_jobs!=0) // TODO: change this to set taskflow jobs + omp_set_num_threads(params.n_jobs); + r.set_seed(params.random_state); + + set_is_fitted(false); + + // TODO: implement stuff below + // // start the clock + // timer.Reset(); + + // // signal handler + // signal(SIGINT, my_handler); + + // // reset statistics + // this->stats = Log_Stats(); + + // params.use_batch = params.bp.batch_size>0; + + // TODO: initialize dataset and search space here or inside fit? +} + +void CBrush::run_generation(unsigned int g, + vector survivors, + Dataset &d, + float fraction, + unsigned& stall_count) +{ + // d.t->set_protected_groups(); + + // params.set_current_gen(g); + + // // select parents + // logger.log("selection..", 2); + // vector parents = selector.select(pop, params, *d.t); + // logger.log("parents:\n"+pop.print_eqns(), 3); + // // variation to produce offspring + // logger.log("variation...", 2); + // variator.vary(pop, parents, params,*d.t); + // logger.log("offspring:\n" + pop.print_eqns(true), 3); + + // // evaluate offspring + // logger.log("evaluating offspring...", 2); + // evaluator.fitness(pop.individuals, *d.t, params, true); + // evaluator.validation(pop.individuals, *d.v, params, true); + + // // select survivors from combined pool of parents and offspring + // logger.log("survival...", 2); + // survivors = survivor.survive(pop, params, *d.t); + + // // reduce population to survivors + // logger.log("shrinking pop to survivors...",2); + // pop.update(survivors); + // logger.log("survivors:\n" + pop.print_eqns(), 3); + + // logger.log("update best...",2); + // bool updated_best = update_best(d); + + // logger.log("calculate stats...",2); + // calculate_stats(d); + + // if (params.max_stall > 0) + // update_stall_count(stall_count, updated_best); + + // logger.log("update archive...",2); + // if (use_arch) + // archive.update(pop,params); + + // if(params.verbosity>1) + // print_stats(log, fraction); + // else if(params.verbosity == 1) + // printProgress(fraction); + + // if (!logfile.empty()) + // log_stats(log); + + // if (save_pop > 1) + // pop.save(this->logfile+".pop.gen" + + // to_string(params.current_gen) + ".json"); + + // // tighten learning rate for grad descent as evolution progresses + // if (params.backprop) + // { + // params.bp.learning_rate = \ + // (1-1/(1+float(params.gens)))*params.bp.learning_rate; + // logger.log("learning rate: " + // + std::to_string(params.bp.learning_rate),3); + // } + // logger.log("finished with generation...",2); } + +} \ No newline at end of file diff --git a/src/cbrush.h b/src/cbrush.h index 2eec929f..1d4ac81d 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -9,6 +9,7 @@ license: GNU/GPL v3 #include "init.h" #include "params.h" #include "selection/selection.h" +#include "./util/rnd.h" #include "population.h" #include "taskflow/taskflow.hpp" @@ -24,7 +25,7 @@ namespace Brush class CBrush{ public: - CBrush(){}; // TODO: constructor should create a new parameters and use it in every other stuff + CBrush(){ params = Parameters(); }; ~CBrush(){}; void init(); @@ -33,82 +34,96 @@ class CBrush{ inline void set_is_fitted(bool f){is_fitted=f;} inline bool get_is_fitted(){return is_fitted;} - /// set size of population - void set_pop_size(int pop_size); - /// return population size - int get_pop_size(); + // TODO: WRAPPER SHOULD SET ALL THESE + + void set_pop_size(int pop_size){ params.pop_size = pop_size; }; + int get_pop_size(){ return params.pop_size; }; - /// set size of max generations - void set_gens(int gens); - ///return size of max generations - int get_gens(); - - /// set EProblemType for shogun - void set_classification(bool classification); - ///return type of classification flag set - bool get_classification(); - - /// set selection method - void set_selection(string sel); - string get_selection(); + void set_gens(int gens){ params.gens = gens; }; + int get_gens(){ return params.gens; }; - /// set survivability - void set_survival(string surv); - string get_survival(); - - ///return cross rate for variation - float get_cross_rate(); - /// set cross rate in variation - void set_cross_rate(float cross_rate); + void set_max_depth(unsigned int max_depth){ params.max_depth = max_depth; }; + int get_max_depth(){ return params.max_depth; }; + + void set_max_size(unsigned int max_size){ params.max_size = max_size; }; + int get_max_size(){ return params.max_size; }; - /// sets available functions based on comma-separated list. - // void set_functions(const vector& fns){ params.set_functions(fns); }; - // vector get_functions(){return params.get_functions();}; - - ///return max_depth of programs - int get_max_depth(); - /// set max depth of programs - void set_max_depth(unsigned int max_depth); - - ///return max dimensionality of programs - int get_max_size(); - /// set maximum sizeensionality of programs - void set_max_size(unsigned int max_dim); + void set_mode(string mode) { params.mode = mode; }; + string get_mode(){ return params.mode; }; + + void set_selection(string sel){ params.sel = sel; }; + string get_selection(){ return params.sel; }; + + void set_survival(string surv){ params.surv = surv; }; + string get_survival(){ return params.surv; }; + + void set_num_islands(int n_islands){ params.num_islands = n_islands; }; + int get_num_islands(){ return params.num_islands; }; + + void set_objectives(const vector& obj){params.objectives = obj; }; + auto get_objectives(){return params.objectives; }; + + void set_random_state(int random_state) { + params.random_state = random_state; + r.set_seed(params.random_state); + }; + int get_random_state() { return params.random_state; }; + + void set_mig_prob(float mig_prob){ params.mig_prob = mig_prob;}; + float get_mig_prob(){ return params.mig_prob; }; - /// set seeds for each core's random number generator - // void set_random_state(int random_state); - // int get_random_state() { return params.random_state; }; - // /// returns the actual seed determined by the input argument. - // int get_random_state_() { return r.get_seed(); }; + void set_cross_prob(float cross_prob){ params.cx_prob = cross_prob;}; + float get_cross_prob(){ return params.cx_prob; }; + + // sets available functions based on comma-separated list. + void set_functions(const vector& fns){ params.functions = fns; }; + vector get_functions(){ return params.functions; }; + void set_mutation_probs(std::map mutation_probs){ params.mutation_probs = mutation_probs;}; + std::map get_mutation_probs(){ return params.mutation_probs; }; + + //TODO ///return fraction of data to use for training - float get_split(); - /// set train fraction of dataset - void set_split(float sp); + // float get_split(); + // /// set train fraction of dataset + // void set_split(float sp); + // TODO // int get_batch_size(){return params.bp.batch_size;}; // void set_batch_size(int bs); - ///set number of threads + // TODO + ///set number of threads (and use them in taskflow) // void set_n_jobs(unsigned t); // int get_n_jobs(){return omp_get_num_threads();}; ///set flag to use batch for training - void set_use_batch(); + // void set_use_batch(); - // getters and setters for the best solution found after evolution + // TODO getters and setters for the best solution found after evolution // predict, transform, predict_proba, etc. // get statistics // load and save best individuals // logger, save to file // execution archive - // random state control // score functions - // fit methods (this will run the evolution), run a single generation + // fit methods (this will run the evolution) + /// train a model. TODO: take arguments needed to build the dataset. once we have it, go through params to set global options and use them + void fit(MatrixXf& X); + void fit(MatrixXf& X, VectorXf& y); + bool is_fitted; ///< keeps track of whether fit was called. + + void run_generation(unsigned int g, + vector survivors, + Dataset &d, + float percentage, + unsigned& stall_count); private: - Parameters params; ///< hyperparameters of Feat + Parameters params; ///< hyperparameters of brush + + // TODO // attributes (hyperparameters) // update best // calculate/print stats diff --git a/src/params.cpp b/src/params.cpp index 03415ae1..a2ddad92 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -3,10 +3,10 @@ copyright 2020 William La Cava license: GNU/GPL v3 */ #include "params.h" + namespace Brush { -void Parameters::init(const MatrixXf& X, const VectorXf& y) {};// TODO: implement this - + nlohmann::json PARAMS; void set_params(const ns::json& j) { PARAMS = j; } ns::json get_params(){ return PARAMS;} diff --git a/src/params.h b/src/params.h index c46222d1..a12bbcd4 100644 --- a/src/params.h +++ b/src/params.h @@ -14,47 +14,53 @@ namespace Brush struct Parameters { -private: +public: + // TODO: setters and getters for all parameters? (and do checks in setters?) + + // TODO: attribute current_gen + // settings int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) - int verbosity = 0; + //int verbosity = 0; // TODO: implement log and verbosity // TODO: python wrapper should have getters and setters for all this stuff // Evolutionary stuff string mode="regression"; + int pop_size = 100; int gens = 100; - unsigned int max_depth = 10; + unsigned int max_depth=10; unsigned int max_size=100; vector objectives{"error","complexity"}; // error should be generic and deducted based on mode - float cx_prob; ///< cross rate for variation - float mutation_probs; + string sel = "nsga2"; //selection method + string surv = "nsga2"; //survival method + vector functions; int num_islands=5; + + // variation + std::map mutation_probs; // TODO: should be an map + float cx_prob; ///< cross rate for variation float mig_prob = 0.05; - vector functions; + string scorer_; ///< actual loss function used, determined by error - // for classification + // for classification (TODO: should I have these, or they could be just dataset arguments (except the ones needed to use in dataset constructor)) unsigned int n_classes; ///< number of classes for classification vector classes; ///< class labels vector class_weights; ///< weights for each class vector sample_weights; ///< weights for each sample - // from dataset + // for dataset bool shuffle = true; ///< option to shuffle the data float split = 0.75; ///< fraction of data to use for training vector feature_names; ///< names of features float batch_size = 0.0; bool use_batch = false; ///< whether to use mini batch for training - int n_jobs = 1; ///< number of parallel jobs -public: - Parameters() {}; - ~Parameters(){}; - - // TODO: getters and setters + int n_jobs = 1; ///< number of parallel jobs (TODO if -1, equals the number of islands?) - void init(const MatrixXf& X, const VectorXf& y); + Parameters(){}; + ~Parameters(){}; }; // Global (deprecated) params From 392aa7c8e1bae5bba134072a194e210f605d288f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 7 Nov 2023 17:45:25 -0500 Subject: [PATCH 080/199] Starting to implement Variation class Old functions still exist to maintain python wrapper functional --- src/bindings/module.cpp | 2 + src/population.cpp | 17 +- src/program/program.h | 10 +- src/search_space.h | 1 + src/util/error.h | 5 +- src/variation.cpp | 775 ++++++++++++++++++++++++++++++++++++++++ src/variation.h | 635 ++------------------------------ 7 files changed, 833 insertions(+), 612 deletions(-) create mode 100644 src/variation.cpp diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 277edac8..1da4692a 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -19,6 +19,7 @@ void bind_dataset(py::module &); void bind_search_space(py::module &); void bind_programs(py::module &); void bind_params(py::module &); +void bind_cbrush(py::module &); PYBIND11_MODULE(_brush, m) { @@ -31,6 +32,7 @@ PYBIND11_MODULE(_brush, m) { bind_params(m); bind_dataset(m); bind_search_space(m); + bind_cbrush(m); py::module_ m2 = m.def_submodule("program", "Contains Program classes."); bind_programs(m2); diff --git a/src/population.cpp b/src/population.cpp index 9c4f152f..85a0dda1 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -9,7 +9,7 @@ namespace Brush{ namespace Pop{ -template +template void Population::set_island_ranges() { // everytime we change popsize, this function must be called @@ -29,7 +29,7 @@ void Population::set_island_ranges() }; } -template +template Population::Population(int p, int n_islands) { individuals.resize(p); @@ -43,7 +43,7 @@ Population::Population(int p, int n_islands) offspring_ready = false; } -template +template void Population::init(const SearchSpace& ss, const Parameters& params) { // TODO: load file (like feat) @@ -55,7 +55,7 @@ void Population::init(const SearchSpace& ss, const Parameters& params) } /// update individual vector size and island indexes -template +template void Population::prep_offspring_slots() { if (offspring_ready) @@ -84,7 +84,7 @@ void Population::prep_offspring_slots() offspring_ready = true; } -template +template void Population::update(vector survivors) { if (!offspring_ready) @@ -104,7 +104,7 @@ void Population::update(vector survivors) offspring_ready = false; } -template +template string Population::print_models(bool just_offspring, string sep) { // not printing the island each individual belongs to @@ -129,10 +129,10 @@ string Population::print_models(bool just_offspring, string sep) return output; } -template +template vector> Population::sorted_front(unsigned rank) { - // this is used to update archive at the end of a generation. Supose islands without offspring + // this is used to update archive at the end of a generation. expect islands without offspring /* Returns individuals on the Pareto front, sorted by increasign complexity. */ vector> pf_islands; @@ -159,6 +159,5 @@ vector> Population::sorted_front(unsigned rank) return pf_islands; } - } // Pop } // Brush diff --git a/src/program/program.h b/src/program/program.h index 747e5f71..b567f490 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -22,6 +22,7 @@ license: GNU/GPL v3 #include "../params.h" #include "../util/utils.h" #include "functions.h" +// #include "../variation.h" // #include "weight_optimizer.h" @@ -530,6 +531,7 @@ template struct Program //////////////////////////////////////////////////////////////////////////////// // weight optimization #include "optimizer/weight_optimizer.h" +#include "../variation.h" namespace Brush{ template @@ -542,20 +544,20 @@ void Program::update_weights(const Dataset& d) WO.update((*this), d); }; + //////////////////////////////////////////////////////////////////////////////// // mutation and crossover -#include "../variation.h" template std::optional> Program::mutate() const { - return variation::mutate(*this, this->SSref.value().get()); + return Brush::Var::mutate(*this, this->SSref.value().get()); }; /// swaps subtrees between this and other (note the pass by copy) template std::optional> Program::cross(Program other) const { - return variation::cross(*this, other); + return Brush::Var::cross(*this, other); }; @@ -577,4 +579,6 @@ void from_json(const json &j, Program& p) }//namespace Brush + + #endif diff --git a/src/search_space.h b/src/search_space.h index f003d8b3..10cadd4d 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -10,6 +10,7 @@ license: GNU/GPL v3 #include "program/nodetype.h" #include "program/tree_node.h" // #include "program/program.h" +#include "util/error.h" #include "util/utils.h" #include "util/rnd.h" #include "params.h" diff --git a/src/util/error.h b/src/util/error.h index 4ae607c1..96911acf 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -21,10 +21,9 @@ namespace Brush{ namespace Util { ///prints error to stderr and returns void HandleErrorNoThrow(string err, const char *file, int line ); - #define HANDLE_ERROR_THROW( err ) (Brush::Util::HandleErrorThrow( err, __FILE__, __LINE__ )) - #define HANDLE_WARNING( err ) (Brush::Util::HandleErrorNoThrow( err, __FILE__, __LINE__ )) - // TODO: have more errors }} +#define HANDLE_ERROR_THROW( err ) (Util::HandleErrorThrow( err, __FILE__, __LINE__ )) +#define HANDLE_WARNING( err ) (Util::HandleErrorNoThrow( err, __FILE__, __LINE__ )) #endif diff --git a/src/variation.cpp b/src/variation.cpp new file mode 100644 index 00000000..0e960948 --- /dev/null +++ b/src/variation.cpp @@ -0,0 +1,775 @@ +#include "variation.h" + +namespace Brush { +namespace Var { + +/// @brief replace node with same typed node +/// @param prog the program +/// @param Tree the program tree +/// @param spot an iterator to the node that is being mutated +/// @param SS the search space to sample a node like `spot` +/// @return boolean indicating the success (true) or fail (false) of the operation +class PointMutation : public MutationBase +{ +public: + explicit PointMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } + + auto operator()(tree& Tree, Iter spot) const -> bool override + { + // cout << "point mutation\n"; + + // get_node_like will sample a similar node based on node_map_weights or + // terminal_weights, and maybe will return a Node. + optional newNode = SS().get_node_like(spot.node->data); + + if (!newNode) // overload to check if newNode == nullopt + return false; + + // if optional contains a Node, we access its contained value + Tree.replace(spot, *newNode); + + return true; + } +}; + +/// @brief insert a node with spot as a child +/// @param prog the program +/// @param Tree the program tree +/// @param spot an iterator to the node that is being mutated +/// @param SS the search space to sample a node like `spot` +/// @return boolean indicating the success (true) or fail (false) of the operation +class InsertMutation : public MutationBase +{ +public: + explicit InsertMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } + + auto find_spots(tree& Tree) const -> vector override + { + vector weights; + + if (size_with_weights(Tree) < max_size()) { + Iter iter = Tree.begin(); + std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), + [&](const auto& n){ + size_t d = 1+Tree.depth(iter); + std::advance(iter, 1); + + // check if SS holds an operator to avoid failing `check` in sample_op_with_arg + if ((d >= max_depth()) + || (SS().node_map.find(n.ret_type) == SS().node_map.end())) { + return 0.0f; + } + else { + return n.get_prob_change(); + } + }); + } + else { + // fill the vector with zeros, since we're already at max_size + weights.resize(Tree.size()); + std::fill(weights.begin(), weights.end(), 0.0f); + } + + return weights; + } + + auto operator()(tree& Tree, Iter spot) const -> bool override + { + // cout << "insert mutation\n"; + auto spot_type = spot.node->data.ret_type; + + // pick a random compatible node to insert (with probabilities given by + // node_map_weights). The `-1` represents the node being inserted. + // Ideally, it should always find at least one match (the same node + // used as a reference when calling the function). However, we have a + // size restriction, which will be relaxed here (just as it is in the PTC2 + // algorithm). This mutation can create a new expression that exceeds the + // maximum size by the highest arity among the operators. + std::optional n = SS().sample_op_with_arg(spot_type, spot_type, true, + max_size()-Tree.size()-1); + + if (!n) // there is no operator with compatible arguments + return false; + + // make node n wrap the subtree at the chosen spot + auto parent_node = Tree.wrap(spot, *n); + + // now fill the arguments of n appropriately + bool spot_filled = false; + for (auto a: (*n).arg_types) + { + if (spot_filled) + { + // if spot is in its child position, append children. + // TODO: reminding that sample_terminal may fail as well + auto opt = SS().sample_terminal(a); + + if (!opt) + return false; + + Tree.append_child(parent_node, opt.value()); + } + // if types match, treat this spot as filled by the spot node + else if (a == spot_type) + spot_filled = true; + // otherwise, add siblings before spot node + else { + auto opt = SS().sample_terminal(a); + + if (!opt) + return false; + + Tree.insert(spot, opt.value()); + } + } + + return true; + } +}; + +/// @brief delete subtree and replace it with a terminal of the same return type +/// @param prog the program +/// @param Tree the program tree +/// @param spot an iterator to the node that is being mutated +/// @param SS the search space to sample a node like `spot` +/// @return boolean indicating the success (true) or fail (false) of the operation +class DeleteMutation : public MutationBase +{ +public: + explicit DeleteMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } + + auto operator()(tree& Tree, Iter spot) const -> bool override + { + // cout << "delete mutation\n"; + + // sample_terminal will sample based on terminal_weights. If it succeeds, + // then the new terminal will be in `opt.value()` + auto opt = SS().sample_terminal(spot.node->data.ret_type); + + if (!opt) // there is no terminal with compatible arguments + return false; + + Tree.erase_children(spot); + + Tree.replace(spot, opt.value()); + + return true; + } +}; + +/// @brief toggle the node's weight ON +/// @param prog the program +/// @param Tree the program tree +/// @param spot an iterator to the node that is being mutated +/// @param SS the search space (unused) +/// @return boolean indicating the success (true) or fail (false) of the operation +class ToggleWeightOnMutation : public MutationBase +{ +public: + explicit ToggleWeightOnMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } + + auto find_spots(tree& Tree) const -> vector override + { + vector weights(Tree.size()); + + if (size_with_weights(Tree) < max_size()) { + std::transform(Tree.begin(), Tree.end(), weights.begin(), + [&](const auto& n){ + // only weighted nodes can be toggled off + if (!n.get_is_weighted() + && IsWeighable(n.ret_type)) + return n.get_prob_change(); + else + return 0.0f; + }); + } + else { + // fill the vector with zeros, since we're already at max_size + std::fill(weights.begin(), weights.end(), 0.0f); + } + + return weights; + } + + auto operator()(tree& Tree, Iter spot) const -> bool override + { + // cout << "toggle_weight_on mutation\n"; + + if (spot.node->data.get_is_weighted()==true // cant turn on whats already on + || !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean) + return false; // false indicates that mutation failed and should return std::nullopt + + spot.node->data.set_is_weighted(true); + return true; + } +}; + +/// @brief toggle the node's weight OFF +/// @param prog the program +/// @param Tree the program tree +/// @param spot an iterator to the node that is being mutated +/// @param SS the search space (unused) +/// @return boolean indicating the success (true) or fail (false) of the operation +class ToggleWeightOffMutation : public MutationBase +{ +public: + explicit ToggleWeightOffMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) + { + } + + auto find_spots(tree& Tree) const -> vector override + { + vector weights(Tree.size()); + + std::transform(Tree.begin(), Tree.end(), weights.begin(), + [&](const auto& n){ + if (n.get_is_weighted() + && IsWeighable(n.ret_type)) + return n.get_prob_change(); + else + return 0.0f; + }); + + return weights; + } + + auto operator()(tree& Tree, Iter spot) const -> bool override + { + // cout << "toggle_weight_off mutation\n"; + + if (spot.node->data.get_is_weighted()==false) + return false; + + spot.node->data.set_is_weighted(false); + return true; + } +}; + +/// @brief replaces the subtree rooted in `spot` +/// @param prog the program +/// @param Tree the program tree +/// @param spot an iterator to the node that is being mutated +/// @param SS the search space to generate a compatible subtree +/// @return boolean indicating the success (true) or fail (false) of the operation +class SubtreeMutation : public MutationBase +{ +public: + explicit SubtreeMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) + : MutationBase(SS, max_size, max_depth) // TODO: change order size and depth + { + } + + // TODO: make different private functions to find spots and use them. theres too much copy and paste here + auto find_spots(tree& Tree) const -> vector override + { + vector weights; + + auto node_map = SS().node_map; + + if (size_with_weights(Tree) < max_size()) { + Iter iter = Tree.begin(); + std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), + [&](const auto& n){ + size_t d = 1+Tree.depth(iter); + std::advance(iter, 1); + + // we need to make sure there's some node to start the subtree + if ((d >= max_depth()) + || (SS().node_map.find(n.ret_type) == SS().node_map.end()) + || (SS().node_map.find(n.ret_type) == SS().node_map.end()) ) + return 0.0f; + else + return n.get_prob_change(); + }); + } + else { + weights.resize(Tree.size()); + std::fill(weights.begin(), weights.end(), 0.0f); + } + + return weights; + } + + auto operator()(tree& Tree, Iter spot) const -> bool override + { + // cout << "subtree mutation\n"; + + // check if we exceeded the size/depth constrains (without subtracting, + // to avoid overflow cases if the user sets max_size smaller than arity + // of smallest operator. The overflow would happen when calculating d and + // s in the following lines, to choose the PTC2 limits) + if ( max_size() <= (Tree.size() - Tree.size(spot)) + || max_depth() <= Tree.depth(spot) ) + return false; + + auto spot_type = spot.node->data.ret_type; + + // d and s must be compatible with PTC2 --- they should be based on + // tree structure, not program structure + size_t d = max_depth() - Tree.depth(spot); + size_t s = max_size() - (Tree.size() - Tree.size(spot)); + + s = r.rnd_int(1, s); + + // sample subtree uses PTC2, which operates on depth and size of the tree + // (and not on the program!). we shoudn't care for weights here + auto subtree = SS().sample_subtree(spot.node->data, d, s); + + if (!subtree) // there is no terminal with compatible arguments + return false; + + // if optional contains a Node, we access its contained value + Tree.erase_children(spot); + Tree.replace(spot, subtree.value().begin()); + + return true; + } +}; + + +/** + * @brief Stochastically mutate a program. + * + * Types of mutation: + * + * - point mutation changes a single node. + * - insertion mutation inserts a node as the parent of an existing node, and fills in the other arguments. + * - deletion mutation deletes a node. + * - subtree mutation inserts a new subtree into the program. + * - toggle_weight_on mutation turns a node's weight ON. + * - toggle_weight_off mutation turns a node's weight OFF. + * + * Every mutation has a probability (weight) based on global parameters. The + * spot where the mutation will take place is sampled based on attribute + * `get_prob_change` of each node in the tree. Inside each type of mutation, + * when a new node is inserted, it is sampled based on `terminal_weights`. + * + * Due to the stochastic behavior, and the several sampling steps, it may come to + * a case where the search space does not hold any possible modification to do in + * the program. In this case, the method returns `std::nullopt` (and has overloads + * so it can be used in a boolean context). + * + * If the mutation succeeds, the mutated program can be accessed through the + * `.value()` attribute of the `std::optional`. + * + * This means that, if you use the mutation as `auto opt = mutate(parent, SS)`, + * either `opt==false` or `opt.value()` contains the child program. + * + * @tparam T program type + * @param parent the program to be mutated + * @param SS a search space + * @return `std::optional` that may contain the child program of type `T` + */ +template +std::optional> mutate(const Program& parent, const SearchSpace& SS) +{ + auto options = PARAMS["mutation_options"].get>(); + + // whether we should write everything that happened inside the method + if (PARAMS.value("write_mutation_trace", false)==true) { + // Default fields of the trace. Initialize with default values, which are + // gradually changed throughout the execution of the method. + PARAMS["mutation_trace"] = json({ + {"parent", parent.get_model("compact", true)}, + {"mutation_weights", options}, + // default values, to be changed in case mutation works + {"mutation", "not selected"}, + {"spot_weights", "not calculated"}, + {"spot", "not selected"}, + {"child", "failed to generate"}, + {"status", "initialized weight vectors"}, + {"success", "false"} + }); + } + if (std::all_of(options.begin(), options.end(), + [](const auto& kv) { return kv.second<=0.0; }) + ) + { // No mutation can be successfully applied to this solution + return std::nullopt; + } + + // choose a valid mutation option + string choice = r.random_choice(options); + + // TODO: this could be improved (specially with the Variation class) + std::unique_ptr mutation; + if (choice == "point") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "insert") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "delete") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "toggle_weight_on") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "toggle_weight_off") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else if (choice == "subtree") + mutation = std::make_unique( + SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); + else { + std::string msg = fmt::format("{} not a valid mutation choice", choice); + HANDLE_ERROR_THROW(msg); + } + + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["mutation"] = choice; + } + + Program child(parent); + + // choose location by weighted sampling of program + auto weights = mutation->find_spots(child.Tree); + + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["spot_weights"] = weights; + } + + if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { + return w<=0.0; + })) + { // There is no spot that has a probability to be selected + return std::nullopt; + } + + // apply the mutation and check if it succeeded + auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), + weights.begin(), weights.end()); + + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["spot"] = spot.node->get_model(false); + PARAMS["mutation_trace"]["status"] = "sampled the spot"; + } + + // Every mutation here works inplace, so they return bool instead of + // std::optional to indicare the result of their manipulation over the + // program tree. Here we call the mutation function and return the result + bool success = (*mutation)(child.Tree, spot); + + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["status"] = "aplied the mutation"; + if (success) + PARAMS["mutation_trace"]["child"] = child.get_model("compact", true); + } + + if (success + && ( (child.size() <= PARAMS["max_size"].get() ) + && (child.depth() <= PARAMS["max_depth"].get()) )){ + + // success is true only if mutation returned a valid program + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["success"] = true; + } + + return child; + } else { + + // here we have a string in PARAMS["mutation_trace"]["child"], + // but success is false since it didnt return an valid program + if (PARAMS.value("write_mutation_trace", false)==true) { + PARAMS["mutation_trace"]["status"] = "mutation returned child, but it exceeds max_size or max_depth"; + //fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); + } + return std::nullopt; + } +} + +/** + * @brief Stochastically swaps subtrees between root and other, returning a new program. + * + * The spot where the cross will take place in the `root` parent is sampled + * based on attribute `get_prob_change` of each node in the tree. After selecting + * the cross spot, the program will iterate through the `other` parent searching + * for all compatible sub-trees to replace. + * + * Due to the stochastic behavior, it may come to a case where there is no + * candidate to replace the spot node. In this case, the method returns + * `std::nullopt` (and has overloads so it can be used in a boolean context). + * + * If the cross succeeds, the child program can be accessed through the + * `.value()` attribute of the `std::optional`. + * + * This means that, if you use the cross as `auto opt = mutate(parent, SS)`, + * either `opt==false` or `opt.value()` contains the child. + * + * @tparam T the program type + * @param root the root parent + * @param other the donating parent + * @return `std::optional` that may contain the child program of type `T` + */ +template +std::optional> cross(const Program& root, const Program& other) +{ + /* subtree crossover between this and other, producing new Program */ + // choose location by weighted sampling of program + // TODO: why doesn't this copy the search space reference to child? + Program child(root); + + // pick a subtree to replace + vector child_weights(child.Tree.size()); + std::transform(child.Tree.begin(), child.Tree.end(), + child_weights.begin(), + [](const auto& n){ return n.get_prob_change(); } + ); + + if (std::all_of(child_weights.begin(), child_weights.end(), [](const auto& w) { + return w<=0.0; + })) + { // There is no spot that has a probability to be selected + return std::nullopt; + } + + auto child_spot = r.select_randomly(child.Tree.begin(), + child.Tree.end(), + child_weights.begin(), + child_weights.end() + ); + + auto child_ret_type = child_spot.node->data.ret_type; + + auto allowed_size = PARAMS["max_size"].get() - + ( child.size() - child.size_at(child_spot) ); + auto allowed_depth = PARAMS["max_depth"].get() - + ( child.depth_to_reach(child_spot) ); + + // pick a subtree to insert. Selection is based on other_weights + vector other_weights(other.Tree.size()); + + // iterator to get the size of subtrees inside transform + auto other_iter = other.Tree.begin(); + + // lambda function to check feasibility of solution and increment the iterator + const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { + int s = other.size_at( other_iter ); + int d = other.depth_at( other_iter ); + + std::advance(other_iter, 1); + return (s <= allowed_size) && (d <= allowed_depth); + }; + + // TODO: something like `is_valid_program` in FEAT + std::transform(other.Tree.begin(), other.Tree.end(), + other_weights.begin(), + [child_ret_type, check_and_incrm](const auto& n){ + // need to pick a node that has a matching output type to the child_spot. + // also need to check if swaping this node wouldn't exceed max_size + if (check_and_incrm() && (n.ret_type == child_ret_type)) + return n.get_prob_change(); + else + // setting the weight to zero to indicate a non-feasible crossover point + return 0.0f; + } + ); + + bool matching_spots_found = false; + for (const auto& w: other_weights) + { + matching_spots_found = w > 0.0; + + if (matching_spots_found) { + auto other_spot = r.select_randomly( + other.Tree.begin(), + other.Tree.end(), + other_weights.begin(), + other_weights.end() + ); + + // fmt::print("other_spot : {}\n",other_spot.node->data); + // swap subtrees at child_spot and other_spot + // TODO: do I need to delete the removed node? + child.Tree.move_ontop(child_spot, other_spot); + return child; + } + } + + return std::nullopt; +} + + +// TODO: make crossover and mutation private functions of a variation class +// variation class should get params as argument +// TODO: make sure every method doesnt store information, instead they retrieve it from parameters (so there's no side effect) +// TODO: implement migration as a variation method? +// TODO: delete previous mutation and crossover, and use just the variation class (implement the log for having the mutation trace) +// A BANDIT WOULD GO HERE INSIDE VARIATION (or population?) + +template +std::optional> Variation::cross( + const Program& root, const Program& other) +{ + /* subtree crossover between this and other, producing new Program */ + // choose location by weighted sampling of program + // TODO: why doesn't this copy the search space reference to child? + Program child(root); + + // pick a subtree to replace + vector child_weights(child.Tree.size()); + std::transform(child.Tree.begin(), child.Tree.end(), + child_weights.begin(), + [](const auto& n){ return n.get_prob_change(); } + ); + + if (std::all_of(child_weights.begin(), child_weights.end(), [](const auto& w) { + return w<=0.0; + })) + { // There is no spot that has a probability to be selected + return std::nullopt; + } + + auto child_spot = r.select_randomly(child.Tree.begin(), + child.Tree.end(), + child_weights.begin(), + child_weights.end() + ); + + auto child_ret_type = child_spot.node->data.ret_type; + + auto allowed_size = parameters.max_size - + ( child.size() - child.size_at(child_spot) ); + auto allowed_depth = parameters.max_depth - + ( child.depth_to_reach(child_spot) ); + + // pick a subtree to insert. Selection is based on other_weights + vector other_weights(other.Tree.size()); + + // iterator to get the size of subtrees inside transform + auto other_iter = other.Tree.begin(); + + // lambda function to check feasibility of solution and increment the iterator + const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { + int s = other.size_at( other_iter ); + int d = other.depth_at( other_iter ); + + std::advance(other_iter, 1); + return (s <= allowed_size) && (d <= allowed_depth); + }; + + // TODO: something like `is_valid_program` in FEAT + std::transform(other.Tree.begin(), other.Tree.end(), + other_weights.begin(), + [child_ret_type, check_and_incrm](const auto& n){ + // need to pick a node that has a matching output type to the child_spot. + // also need to check if swaping this node wouldn't exceed max_size + if (check_and_incrm() && (n.ret_type == child_ret_type)) + return n.get_prob_change(); + else + // setting the weight to zero to indicate a non-feasible crossover point + return 0.0f; + } + ); + + bool matching_spots_found = false; + for (const auto& w: other_weights) + { + matching_spots_found = w > 0.0; + + if (matching_spots_found) { + auto other_spot = r.select_randomly( + other.Tree.begin(), + other.Tree.end(), + other_weights.begin(), + other_weights.end() + ); + + // fmt::print("other_spot : {}\n",other_spot.node->data); + // swap subtrees at child_spot and other_spot + // TODO: do I need to delete the removed node? + child.Tree.move_ontop(child_spot, other_spot); + return child; + } + } + return std::nullopt; +} + +template +std::optional> Variation::mutate(const Program& parent) +{ + auto options = parameters.mutation_probs; + + if (std::all_of(options.begin(), options.end(), + [](const auto& kv) { return kv.second<=0.0; }) + ) + { // No mutation can be successfully applied to this solution + return std::nullopt; + } + + // choose a valid mutation option + string choice = r.random_choice(options); + + // TODO: this could be improved (specially with the Variation class) + std::unique_ptr mutation; + if (choice == "point") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "insert") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "delete") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "toggle_weight_on") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "toggle_weight_off") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "subtree") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else { + std::string msg = fmt::format("{} not a valid mutation choice", choice); + HANDLE_ERROR_THROW(msg); + } + + Program child(parent); + + // choose location by weighted sampling of program + auto weights = mutation->find_spots(child.Tree); + + if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { + return w<=0.0; + })) + { // There is no spot that has a probability to be selected + return std::nullopt; + } + + // apply the mutation and check if it succeeded + auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), + weights.begin(), weights.end()); + + // Every mutation here works inplace, so they return bool instead of + // std::optional to indicare the result of their manipulation over the + // program tree. Here we call the mutation function and return the result + bool success = (*mutation)(child.Tree, spot); + + if (success + && ( (child.size() <= parameters.max_size) + && (child.depth() <= parameters.max_depth) )){ + + return child; + } else { + + return std::nullopt; + } +} + +} //namespace Var +} //namespace Brush + diff --git a/src/variation.h b/src/variation.h index 9d8e782c..5b853b35 100644 --- a/src/variation.h +++ b/src/variation.h @@ -6,12 +6,14 @@ license: GNU/GPL v3 #ifndef VARIATION_H #define VARIATION_H -// #include "search_space.h" -// #include "program/program.h" -// #include "program/tree_node.h" -// #include "node.h" +// #include "util/error.h" +// #include "util/utils.h" + +#include "search_space.h" +// #include "population.h" #include +#include // namespace Brush{ @@ -20,13 +22,13 @@ license: GNU/GPL v3 //////////////////////////////////////////////////////////////////////////// // Mutation & Crossover - /** * @brief Namespace for variation functions like crossover and mutation. * */ -namespace variation { - +namespace Brush { +namespace Var { + class MutationBase { public: using Iter = tree::pre_order_iterator; @@ -35,8 +37,7 @@ class MutationBase { : SS_(SS) , max_size_(max_size) , max_depth_(max_depth) - { - } + {} virtual auto find_spots(tree& Tree) const -> vector { @@ -87,608 +88,48 @@ class MutationBase { private: SearchSpace SS_; // where to sample nodes to change the program - // constrains + // constrains TODO: use params to get this values, stop storing it size_t max_size_; size_t max_depth_; }; -/// @brief replace node with same typed node -/// @param prog the program -/// @param Tree the program tree -/// @param spot an iterator to the node that is being mutated -/// @param SS the search space to sample a node like `spot` -/// @return boolean indicating the success (true) or fail (false) of the operation -class PointMutation : public MutationBase -{ -public: - explicit PointMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "point mutation\n"; - - // get_node_like will sample a similar node based on node_map_weights or - // terminal_weights, and maybe will return a Node. - optional newNode = SS().get_node_like(spot.node->data); - - if (!newNode) // overload to check if newNode == nullopt - return false; - - // if optional contains a Node, we access its contained value - Tree.replace(spot, *newNode); - - return true; - } -}; - -/// @brief insert a node with spot as a child -/// @param prog the program -/// @param Tree the program tree -/// @param spot an iterator to the node that is being mutated -/// @param SS the search space to sample a node like `spot` -/// @return boolean indicating the success (true) or fail (false) of the operation -class InsertMutation : public MutationBase -{ -public: - explicit InsertMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto find_spots(tree& Tree) const -> vector override - { - vector weights; - - if (size_with_weights(Tree) < max_size()) { - Iter iter = Tree.begin(); - std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), - [&](const auto& n){ - size_t d = 1+Tree.depth(iter); - std::advance(iter, 1); - - // check if SS holds an operator to avoid failing `check` in sample_op_with_arg - if ((d >= max_depth()) - || (SS().node_map.find(n.ret_type) == SS().node_map.end())) { - return 0.0f; - } - else { - return n.get_prob_change(); - } - }); - } - else { - // fill the vector with zeros, since we're already at max_size - weights.resize(Tree.size()); - std::fill(weights.begin(), weights.end(), 0.0f); - } - - return weights; - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "insert mutation\n"; - auto spot_type = spot.node->data.ret_type; - - // pick a random compatible node to insert (with probabilities given by - // node_map_weights). The `-1` represents the node being inserted. - // Ideally, it should always find at least one match (the same node - // used as a reference when calling the function). However, we have a - // size restriction, which will be relaxed here (just as it is in the PTC2 - // algorithm). This mutation can create a new expression that exceeds the - // maximum size by the highest arity among the operators. - std::optional n = SS().sample_op_with_arg(spot_type, spot_type, true, - max_size()-Tree.size()-1); - - if (!n) // there is no operator with compatible arguments - return false; - - // make node n wrap the subtree at the chosen spot - auto parent_node = Tree.wrap(spot, *n); - - // now fill the arguments of n appropriately - bool spot_filled = false; - for (auto a: (*n).arg_types) - { - if (spot_filled) - { - // if spot is in its child position, append children. - // TODO: reminding that sample_terminal may fail as well - auto opt = SS().sample_terminal(a); - - if (!opt) - return false; - - Tree.append_child(parent_node, opt.value()); - } - // if types match, treat this spot as filled by the spot node - else if (a == spot_type) - spot_filled = true; - // otherwise, add siblings before spot node - else { - auto opt = SS().sample_terminal(a); - - if (!opt) - return false; - - Tree.insert(spot, opt.value()); - } - } - - return true; - } -}; - -/// @brief delete subtree and replace it with a terminal of the same return type -/// @param prog the program -/// @param Tree the program tree -/// @param spot an iterator to the node that is being mutated -/// @param SS the search space to sample a node like `spot` -/// @return boolean indicating the success (true) or fail (false) of the operation -class DeleteMutation : public MutationBase -{ -public: - explicit DeleteMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "delete mutation\n"; - - // sample_terminal will sample based on terminal_weights. If it succeeds, - // then the new terminal will be in `opt.value()` - auto opt = SS().sample_terminal(spot.node->data.ret_type); - - if (!opt) // there is no terminal with compatible arguments - return false; - - Tree.erase_children(spot); - - Tree.replace(spot, opt.value()); - - return true; - } -}; - -/// @brief toggle the node's weight ON -/// @param prog the program -/// @param Tree the program tree -/// @param spot an iterator to the node that is being mutated -/// @param SS the search space (unused) -/// @return boolean indicating the success (true) or fail (false) of the operation -class ToggleWeightOnMutation : public MutationBase -{ -public: - explicit ToggleWeightOnMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto find_spots(tree& Tree) const -> vector override - { - vector weights(Tree.size()); - - if (size_with_weights(Tree) < max_size()) { - std::transform(Tree.begin(), Tree.end(), weights.begin(), - [&](const auto& n){ - // only weighted nodes can be toggled off - if (!n.get_is_weighted() - && IsWeighable(n.ret_type)) - return n.get_prob_change(); - else - return 0.0f; - }); - } - else { - // fill the vector with zeros, since we're already at max_size - std::fill(weights.begin(), weights.end(), 0.0f); - } - - return weights; - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "toggle_weight_on mutation\n"; - - if (spot.node->data.get_is_weighted()==true // cant turn on whats already on - || !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean) - return false; // false indicates that mutation failed and should return std::nullopt - - spot.node->data.set_is_weighted(true); - return true; - } -}; - -/// @brief toggle the node's weight OFF -/// @param prog the program -/// @param Tree the program tree -/// @param spot an iterator to the node that is being mutated -/// @param SS the search space (unused) -/// @return boolean indicating the success (true) or fail (false) of the operation -class ToggleWeightOffMutation : public MutationBase -{ -public: - explicit ToggleWeightOffMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto find_spots(tree& Tree) const -> vector override - { - vector weights(Tree.size()); - - std::transform(Tree.begin(), Tree.end(), weights.begin(), - [&](const auto& n){ - if (n.get_is_weighted() - && IsWeighable(n.ret_type)) - return n.get_prob_change(); - else - return 0.0f; - }); - - return weights; - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "toggle_weight_off mutation\n"; - - if (spot.node->data.get_is_weighted()==false) - return false; - - spot.node->data.set_is_weighted(false); - return true; - } -}; - -/// @brief replaces the subtree rooted in `spot` -/// @param prog the program -/// @param Tree the program tree -/// @param spot an iterator to the node that is being mutated -/// @param SS the search space to generate a compatible subtree -/// @return boolean indicating the success (true) or fail (false) of the operation -class SubtreeMutation : public MutationBase -{ -public: - explicit SubtreeMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) // TODO: change order size and depth - { - } - - // TODO: make different private functions to find spots and use them. theres too much copy and paste here - auto find_spots(tree& Tree) const -> vector override - { - vector weights; - - auto node_map = SS().node_map; - - if (size_with_weights(Tree) < max_size()) { - Iter iter = Tree.begin(); - std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), - [&](const auto& n){ - size_t d = 1+Tree.depth(iter); - std::advance(iter, 1); - - // we need to make sure there's some node to start the subtree - if ((d >= max_depth()) - || (SS().node_map.find(n.ret_type) == SS().node_map.end()) - || (SS().node_map.find(n.ret_type) == SS().node_map.end()) ) - return 0.0f; - else - return n.get_prob_change(); - }); - } - else { - weights.resize(Tree.size()); - std::fill(weights.begin(), weights.end(), 0.0f); - } - - return weights; - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "subtree mutation\n"; - - // check if we exceeded the size/depth constrains (without subtracting, - // to avoid overflow cases if the user sets max_size smaller than arity - // of smallest operator. The overflow would happen when calculating d and - // s in the following lines, to choose the PTC2 limits) - if ( max_size() <= (Tree.size() - Tree.size(spot)) - || max_depth() <= Tree.depth(spot) ) - return false; - - auto spot_type = spot.node->data.ret_type; - - // d and s must be compatible with PTC2 --- they should be based on - // tree structure, not program structure - size_t d = max_depth() - Tree.depth(spot); - size_t s = max_size() - (Tree.size() - Tree.size(spot)); - - s = r.rnd_int(1, s); - - // sample subtree uses PTC2, which operates on depth and size of the tree - // (and not on the program!). we shoudn't care for weights here - auto subtree = SS().sample_subtree(spot.node->data, d, s); - - if (!subtree) // there is no terminal with compatible arguments - return false; - - // if optional contains a Node, we access its contained value - Tree.erase_children(spot); - Tree.replace(spot, subtree.value().begin()); - - return true; - } -}; - -/** - * @brief Stochastically mutate a program. - * - * Types of mutation: - * - * - point mutation changes a single node. - * - insertion mutation inserts a node as the parent of an existing node, and fills in the other arguments. - * - deletion mutation deletes a node. - * - subtree mutation inserts a new subtree into the program. - * - toggle_weight_on mutation turns a node's weight ON. - * - toggle_weight_off mutation turns a node's weight OFF. - * - * Every mutation has a probability (weight) based on global parameters. The - * spot where the mutation will take place is sampled based on attribute - * `get_prob_change` of each node in the tree. Inside each type of mutation, - * when a new node is inserted, it is sampled based on `terminal_weights`. - * - * Due to the stochastic behavior, and the several sampling steps, it may come to - * a case where the search space does not hold any possible modification to do in - * the program. In this case, the method returns `std::nullopt` (and has overloads - * so it can be used in a boolean context). - * - * If the mutation succeeds, the mutated program can be accessed through the - * `.value()` attribute of the `std::optional`. - * - * This means that, if you use the mutation as `auto opt = mutate(parent, SS)`, - * either `opt==false` or `opt.value()` contains the child program. - * - * @tparam T program type - * @param parent the program to be mutated - * @param SS a search space - * @return `std::optional` that may contain the child program of type `T` - */ template -std::optional> mutate(const Program& parent, const SearchSpace& SS) -{ - auto options = PARAMS["mutation_options"].get>(); - - // whether we should write everything that happened inside the method - if (PARAMS.value("write_mutation_trace", false)==true) { - // Default fields of the trace. Initialize with default values, which are - // gradually changed throughout the execution of the method. - PARAMS["mutation_trace"] = json({ - {"parent", parent.get_model("compact", true)}, - {"mutation_weights", options}, - // default values, to be changed in case mutation works - {"mutation", "not selected"}, - {"spot_weights", "not calculated"}, - {"spot", "not selected"}, - {"child", "failed to generate"}, - {"status", "initialized weight vectors"}, - {"success", "false"} - }); - } - if (std::all_of(options.begin(), options.end(), - [](const auto& kv) { return kv.second<=0.0; }) - ) - { // No mutation can be successfully applied to this solution - return std::nullopt; - } - - // choose a valid mutation option - string choice = r.random_choice(options); +std::optional> cross(const Program& root, const Program& other); - // TODO: this could be improved - std::unique_ptr mutation; - if (choice == "point") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "insert") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "delete") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "toggle_weight_on") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "toggle_weight_off") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "subtree") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else { - std::string msg = fmt::format("{} not a valid mutation choice", choice); - HANDLE_ERROR_THROW(msg); - } - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["mutation"] = choice; - } - - Program child(parent); - - // choose location by weighted sampling of program - auto weights = mutation->find_spots(child.Tree); - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["spot_weights"] = weights; - } - - if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { - return w<=0.0; - })) - { // There is no spot that has a probability to be selected - return std::nullopt; - } - - // apply the mutation and check if it succeeded - auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), - weights.begin(), weights.end()); - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["spot"] = spot.node->get_model(false); - PARAMS["mutation_trace"]["status"] = "sampled the spot"; - } - - // Every mutation here works inplace, so they return bool instead of - // std::optional to indicare the result of their manipulation over the - // program tree. Here we call the mutation function and return the result - bool success = (*mutation)(child.Tree, spot); - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["status"] = "aplied the mutation"; - if (success) - PARAMS["mutation_trace"]["child"] = child.get_model("compact", true); - } - - if (success - && ( (child.size() <= PARAMS["max_size"].get() ) - && (child.depth() <= PARAMS["max_depth"].get()) )){ - - // success is true only if mutation returned a valid program - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["success"] = true; - } +template +std::optional> mutate(const Program& parent, const SearchSpace& SS); - return child; - } else { - - // here we have a string in PARAMS["mutation_trace"]["child"], - // but success is false since it didnt return an valid program - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["status"] = "mutation returned child, but it exceeds max_size or max_depth"; - //fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); - } - return std::nullopt; - } -}; +// TODO: make crossover and mutation private functions of a variation class +// variation class should get params as argument +// TODO: make sure every method doesnt store information, instead they retrieve it from parameters (so there's no side effect) +// TODO: implement migration as a variation method? +// TODO: delete previous mutation and crossover, and use just the variation class (implement the log for having the mutation trace) + // A BANDIT WOULD GO HERE INSIDE VARIATION (or population?) -/** - * @brief Stochastically swaps subtrees between root and other, returning a new program. - * - * The spot where the cross will take place in the `root` parent is sampled - * based on attribute `get_prob_change` of each node in the tree. After selecting - * the cross spot, the program will iterate through the `other` parent searching - * for all compatible sub-trees to replace. - * - * Due to the stochastic behavior, it may come to a case where there is no - * candidate to replace the spot node. In this case, the method returns - * `std::nullopt` (and has overloads so it can be used in a boolean context). - * - * If the cross succeeds, the child program can be accessed through the - * `.value()` attribute of the `std::optional`. - * - * This means that, if you use the cross as `auto opt = mutate(parent, SS)`, - * either `opt==false` or `opt.value()` contains the child. - * - * @tparam T the program type - * @param root the root parent - * @param other the donating parent - * @return `std::optional` that may contain the child program of type `T` - */ template -std::optional> cross(const Program& root, const Program& other) +class Variation { - /* subtree crossover between this and other, producing new Program */ - // choose location by weighted sampling of program - // TODO: why doesn't this copy the search space reference to child? - Program child(root); - - // pick a subtree to replace - vector child_weights(child.Tree.size()); - std::transform(child.Tree.begin(), child.Tree.end(), - child_weights.begin(), - [](const auto& n){ return n.get_prob_change(); } - ); - - if (std::all_of(child_weights.begin(), child_weights.end(), [](const auto& w) { - return w<=0.0; - })) - { // There is no spot that has a probability to be selected - return std::nullopt; - } - - auto child_spot = r.select_randomly(child.Tree.begin(), - child.Tree.end(), - child_weights.begin(), - child_weights.end() - ); - - auto child_ret_type = child_spot.node->data.ret_type; - - auto allowed_size = PARAMS["max_size"].get() - - ( child.size() - child.size_at(child_spot) ); - auto allowed_depth = PARAMS["max_depth"].get() - - ( child.depth_to_reach(child_spot) ); - - // pick a subtree to insert. Selection is based on other_weights - vector other_weights(other.Tree.size()); - - // iterator to get the size of subtrees inside transform - auto other_iter = other.Tree.begin(); - - // lambda function to check feasibility of solution and increment the iterator - const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { - int s = other.size_at( other_iter ); - int d = other.depth_at( other_iter ); +private: + SearchSpace& search_space; + Parameters& parameters; - std::advance(other_iter, 1); - return (s <= allowed_size) && (d <= allowed_depth); - }; + std::optional> cross(const Program& root, const Program& other); - std::transform(other.Tree.begin(), other.Tree.end(), - other_weights.begin(), - [child_ret_type, check_and_incrm](const auto& n){ - // need to pick a node that has a matching output type to the child_spot. - // also need to check if swaping this node wouldn't exceed max_size - if (check_and_incrm() && (n.ret_type == child_ret_type)) - return n.get_prob_change(); - else - // setting the weight to zero to indicate a non-feasible crossover point - return 0.0f; - } - ); + std::optional> mutate(const Program& parent); +public: + Variation(const Parameters& params, const SearchSpace& ss) + : parameters(params) + , search_space(ss) + {}; - bool matching_spots_found = false; - for (const auto& w: other_weights) - { - matching_spots_found = w > 0.0; + ~Variation(); + + /// method to handle variation of population + // void vary(Population& pop, const vector& parents); +}; - if (matching_spots_found) { - auto other_spot = r.select_randomly( - other.Tree.begin(), - other.Tree.end(), - other_weights.begin(), - other_weights.end() - ); - - // fmt::print("other_spot : {}\n",other_spot.node->data); - // swap subtrees at child_spot and other_spot - // TODO: do I need to delete the removed node? - child.Tree.move_ontop(child_spot, other_spot); - return child; - } - } - return std::nullopt; -}; -// TODO: implement migration as a variation method? -} //namespace variation +} //namespace Var +} //namespace Brush #endif \ No newline at end of file From 47820feb8a7853f5fa351882466571afadd86c38 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 12:37:28 -0500 Subject: [PATCH 081/199] Commented broken tests --- tests/cpp/test_data.cpp | 250 ++++---- tests/cpp/test_variation.cpp | 1058 +++++++++++++++++----------------- tests/cpp/testsHeader.h | 2 + 3 files changed, 656 insertions(+), 654 deletions(-) diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index 0a61e19a..af0678f9 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -1,126 +1,126 @@ -#include "testsHeader.h" -#include "../../src/search_space.h" -#include "../../src/program/program.h" -#include "../../src/program/dispatch_table.h" - -TEST(Data, ErrorHandling) -{ - // Creating an empty dataset throws error - EXPECT_THROW({ - MatrixXf X(0,0); - ArrayXf y(0); - - try - { - Dataset dt(X, y); - } - catch( const std::runtime_error& err ) - { - const string msg = err.what(); - ASSERT_NE( - msg.find("Error during the initialization of the dataset"), - std::string::npos); - throw; - } - }, std::runtime_error); -} - -TEST(Data, MixedVariableTypes) -{ - // We need to set at least the mutation options (and respective - // probabilities) in order to call PRG.predict() - PARAMS["write_mutation_trace"] = true; - PARAMS["mutation_options"] = { - {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} - }; - - MatrixXf X(5,3); - X << 0 , 1, 0 , // binary with integer values - 0.0, 1.0, 1.0, // binary with float values - 2 , 1.0, -3.0, // integer with float and negative values - 2 , 1 , 3 , // integer with integer values - 2.1, 3.7, -5.2; // float values - - X.transposeInPlace(); - - ArrayXf y(3); - - y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 +// #include "testsHeader.h" +// #include "../../src/program/program.h" +// #include "../../src/search_space.h" +// #include "../../src/program/dispatch_table.h" + +// TEST(Data, ErrorHandling) +// { +// // Creating an empty dataset throws error +// EXPECT_THROW({ +// MatrixXf X(0,0); +// ArrayXf y(0); + +// try +// { +// Dataset dt(X, y); +// } +// catch( const std::runtime_error& err ) +// { +// const string msg = err.what(); +// ASSERT_NE( +// msg.find("Error during the initialization of the dataset"), +// std::string::npos); +// throw; +// } +// }, std::runtime_error); +// } + +// TEST(Data, MixedVariableTypes) +// { +// // We need to set at least the mutation options (and respective +// // probabilities) in order to call PRG.predict() +// PARAMS["write_mutation_trace"] = true; +// PARAMS["mutation_options"] = { +// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} +// }; + +// MatrixXf X(5,3); +// X << 0 , 1, 0 , // binary with integer values +// 0.0, 1.0, 1.0, // binary with float values +// 2 , 1.0, -3.0, // integer with float and negative values +// 2 , 1 , 3 , // integer with integer values +// 2.1, 3.7, -5.2; // float values + +// X.transposeInPlace(); + +// ArrayXf y(3); + +// y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 - unordered_map user_ops = { - {"Add", 0.5}, - {"Sub", 0.5}, - // a boolean operator - {"And", 1.0}, - {"Or", 1.0}, - // operator that takes boolean as argument - {"SplitOn", 1.0} - }; - - Dataset dt(X, y); - SearchSpace SS; - SS.init(dt, user_ops); - - dt.print(); - SS.print(); - - for (size_t d = 5; d < 10; ++d) - for (size_t s = 5; s < 20; ++s) - { - fmt::print( - "=================================================\n" - "depth={}, size={}. ", d, s - ); - - PARAMS["max_size"] = s; - PARAMS["max_depth"] = d; - - RegressorProgram PRG = SS.make_regressor(s-4, d-4); - - fmt::print( - "Tree model: {}\n", PRG.get_model("compact", true) - ); - - // visualizing detailed information for the model - std::for_each(PRG.Tree.begin(), PRG.Tree.end(), - [](const auto& n) { - fmt::print("Name {}, node {}, feature {}\n" - " sig_hash {}\n ret_type {}\n ret_type type {}\n", - n.name, n.node_type, n.get_feature(), - n.sig_hash, n.ret_type, typeid(n.ret_type).name()); - }); - - std::cout << std::endl; - - fmt::print( "PRG fit\n"); - PRG.fit(dt); - fmt::print( "PRG predict\n"); - ArrayXf y_pred = PRG.predict(dt); - fmt::print( "y_pred: {}\n", y_pred); - - // creating and fitting a child - auto opt = PRG.mutate(); - - if (!opt){ - fmt::print("Mutation failed to create a child\n"); - fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); - } - else { - auto Child = opt.value(); - - fmt::print("Child model: {}\n", Child.get_model("compact", true)); - - fmt::print( "Child fit\n"); - Child.fit(dt); - fmt::print( "Child predict\n"); - ArrayXf y_pred_child = Child.predict(dt); - fmt::print( "y_pred: {}\n", y_pred); - } - } - - // Brush exports two DispatchTable structs named dtable_fit and dtable_predict. - // These structures holds the mapping between nodes and its corresponding - // operations, and are used to resolve the evaluation of an expression. - // dtable_fit.print(); - // dtable_predict.print(); -} \ No newline at end of file +// unordered_map user_ops = { +// {"Add", 0.5}, +// {"Sub", 0.5}, +// // a boolean operator +// {"And", 1.0}, +// {"Or", 1.0}, +// // operator that takes boolean as argument +// {"SplitOn", 1.0} +// }; + +// Dataset dt(X, y); +// SearchSpace SS; +// SS.init(dt, user_ops); + +// dt.print(); +// SS.print(); + +// for (size_t d = 5; d < 10; ++d) +// for (size_t s = 5; s < 20; ++s) +// { +// fmt::print( +// "=================================================\n" +// "depth={}, size={}. ", d, s +// ); + +// PARAMS["max_size"] = s; +// PARAMS["max_depth"] = d; + +// RegressorProgram PRG = SS.make_regressor(s-4, d-4); + +// fmt::print( +// "Tree model: {}\n", PRG.get_model("compact", true) +// ); + +// // visualizing detailed information for the model +// std::for_each(PRG.Tree.begin(), PRG.Tree.end(), +// [](const auto& n) { +// fmt::print("Name {}, node {}, feature {}\n" +// " sig_hash {}\n ret_type {}\n ret_type type {}\n", +// n.name, n.node_type, n.get_feature(), +// n.sig_hash, n.ret_type, typeid(n.ret_type).name()); +// }); + +// std::cout << std::endl; + +// fmt::print( "PRG fit\n"); +// PRG.fit(dt); +// fmt::print( "PRG predict\n"); +// ArrayXf y_pred = PRG.predict(dt); +// fmt::print( "y_pred: {}\n", y_pred); + +// // creating and fitting a child +// auto opt = PRG.mutate(); + +// if (!opt){ +// fmt::print("Mutation failed to create a child\n"); +// fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); +// } +// else { +// auto Child = opt.value(); + +// fmt::print("Child model: {}\n", Child.get_model("compact", true)); + +// fmt::print( "Child fit\n"); +// Child.fit(dt); +// fmt::print( "Child predict\n"); +// ArrayXf y_pred_child = Child.predict(dt); +// fmt::print( "y_pred: {}\n", y_pred); +// } +// } + +// // Brush exports two DispatchTable structs named dtable_fit and dtable_predict. +// // These structures holds the mapping between nodes and its corresponding +// // operations, and are used to resolve the evaluation of an expression. +// // dtable_fit.print(); +// // dtable_predict.print(); +// } \ No newline at end of file diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index eae5e67d..44ff612c 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -1,545 +1,545 @@ -#include "testsHeader.h" -#include "../../src/search_space.h" -#include "../../src/program/program.h" -#include "../../src/program/dispatch_table.h" -#include "../../src/data/io.h" - -TEST(Variation, FixedRootDoesntChange) -{ - PARAMS["mutation_options"] = { - {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} - }; - PARAMS["max_size"] = 20; - PARAMS["max_depth"] = 10; - - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - - y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0; - - Dataset data(X,y); - - SearchSpace SS; - SS.init(data); - - auto logistic_hash = Signature().hash(); - - for (int d = 1; d < 10; ++d) - { - for (int s = 1; s < 10; ++s) - { - int successes = 0; - for (int attempt = 0; attempt < 10; ++attempt) - { - // different program types changes how predict works (and the rettype of predict) - ClassifierProgram PRG = SS.make_classifier(d, s); - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model 1: {}\n", - d, s, - PRG.get_model("compact", true) - ); - - Node root = *(PRG.Tree.begin()); - ASSERT_TRUE(root.node_type == NodeType::Logistic); - ASSERT_TRUE(root.ret_type == DataType::ArrayF); - ASSERT_TRUE(root.sig_hash == logistic_hash); - ASSERT_TRUE(root.get_prob_change()==0.0); - ASSERT_TRUE(root.fixed==true); - - auto opt_mutation = PRG.mutate(); - if (opt_mutation) - { - successes += 1; - auto Mut_Child = opt_mutation.value(); - fmt::print("After mutation : {}\n", - Mut_Child.get_model("compact", true)); - - Node mut_child_root = *(Mut_Child.Tree.begin()); - ASSERT_TRUE(mut_child_root.node_type == NodeType::Logistic); - ASSERT_TRUE(mut_child_root.ret_type == DataType::ArrayF); - ASSERT_TRUE(mut_child_root.sig_hash == logistic_hash); - ASSERT_TRUE(mut_child_root.get_prob_change()==0.0); - ASSERT_TRUE(mut_child_root.fixed==true); - } - - ClassifierProgram PRG2 = SS.make_classifier(d, s); - auto opt_cx = PRG.cross(PRG2); - if (opt_cx) - { - successes += 1; - auto CX_Child = opt_cx.value(); - fmt::print("After crossover: {}\n", - CX_Child.get_model("compact", true)); - - Node cx_child_root = *(CX_Child.Tree.begin()); - ASSERT_TRUE(cx_child_root.node_type == NodeType::Logistic); - ASSERT_TRUE(cx_child_root.ret_type == DataType::ArrayF); - ASSERT_TRUE(cx_child_root.sig_hash == logistic_hash); - ASSERT_TRUE(cx_child_root.get_prob_change()==0.0); - ASSERT_TRUE(cx_child_root.fixed==true); - } - - // root remained unchanged - ASSERT_TRUE(root.node_type == NodeType::Logistic); - ASSERT_TRUE(root.ret_type == DataType::ArrayF); - ASSERT_TRUE(root.sig_hash == logistic_hash); - ASSERT_TRUE(root.get_prob_change()==0.0); - ASSERT_TRUE(root.fixed==true); - } - ASSERT_TRUE(successes > 0); - } - } -} - -TEST(Variation, InsertMutationWorks) -{ - // TODO: this tests could be parameterized. - // To understand design implementation of this test, check Mutation test - - PARAMS["mutation_options"] = { - {"point", 0.0}, {"insert", 1.0}, {"delete", 0.0}, {"subtree", 0.0}, {"toggle_weight_on", 0.0}, {"toggle_weight_off", 0.0} - }; - - // retrieving the options to check if everything was set right - std::cout << "Initial mutation configuration" << std::endl; - auto options = PARAMS["mutation_options"].get>(); - for (const auto& [k, v] : options) - std::cout << k << " : " << v << std::endl; - - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - - y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, - 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - - Dataset data(X,y); - - SearchSpace SS; - SS.init(data); - - int successes = 0; - for (int attempt = 0; attempt < 100; ++attempt) - { - // we need to have big values here so the mutation will work - // (when the xmen child exceeds the maximum limits, mutation returns - // std::nullopt) - PARAMS["max_size"] = 20; - PARAMS["max_depth"] = 10; +// #include "testsHeader.h" +// #include "../../src/search_space.h" +// #include "../../src/program/program.h" +// #include "../../src/program/dispatch_table.h" +// #include "../../src/data/io.h" + +// TEST(Variation, FixedRootDoesntChange) +// { +// PARAMS["mutation_options"] = { +// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} +// }; +// PARAMS["max_size"] = 20; +// PARAMS["max_depth"] = 10; + +// MatrixXf X(10,2); +// ArrayXf y(10); +// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, +// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + +// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, +// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + +// y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0; + +// Dataset data(X,y); + +// SearchSpace SS; +// SS.init(data); + +// auto logistic_hash = Signature().hash(); + +// for (int d = 1; d < 10; ++d) +// { +// for (int s = 1; s < 10; ++s) +// { +// int successes = 0; +// for (int attempt = 0; attempt < 10; ++attempt) +// { +// // different program types changes how predict works (and the rettype of predict) +// ClassifierProgram PRG = SS.make_classifier(d, s); +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model 1: {}\n", +// d, s, +// PRG.get_model("compact", true) +// ); + +// Node root = *(PRG.Tree.begin()); +// ASSERT_TRUE(root.node_type == NodeType::Logistic); +// ASSERT_TRUE(root.ret_type == DataType::ArrayF); +// ASSERT_TRUE(root.sig_hash == logistic_hash); +// ASSERT_TRUE(root.get_prob_change()==0.0); +// ASSERT_TRUE(root.fixed==true); + +// auto opt_mutation = PRG.mutate(); +// if (opt_mutation) +// { +// successes += 1; +// auto Mut_Child = opt_mutation.value(); +// fmt::print("After mutation : {}\n", +// Mut_Child.get_model("compact", true)); + +// Node mut_child_root = *(Mut_Child.Tree.begin()); +// ASSERT_TRUE(mut_child_root.node_type == NodeType::Logistic); +// ASSERT_TRUE(mut_child_root.ret_type == DataType::ArrayF); +// ASSERT_TRUE(mut_child_root.sig_hash == logistic_hash); +// ASSERT_TRUE(mut_child_root.get_prob_change()==0.0); +// ASSERT_TRUE(mut_child_root.fixed==true); +// } + +// ClassifierProgram PRG2 = SS.make_classifier(d, s); +// auto opt_cx = PRG.cross(PRG2); +// if (opt_cx) +// { +// successes += 1; +// auto CX_Child = opt_cx.value(); +// fmt::print("After crossover: {}\n", +// CX_Child.get_model("compact", true)); + +// Node cx_child_root = *(CX_Child.Tree.begin()); +// ASSERT_TRUE(cx_child_root.node_type == NodeType::Logistic); +// ASSERT_TRUE(cx_child_root.ret_type == DataType::ArrayF); +// ASSERT_TRUE(cx_child_root.sig_hash == logistic_hash); +// ASSERT_TRUE(cx_child_root.get_prob_change()==0.0); +// ASSERT_TRUE(cx_child_root.fixed==true); +// } + +// // root remained unchanged +// ASSERT_TRUE(root.node_type == NodeType::Logistic); +// ASSERT_TRUE(root.ret_type == DataType::ArrayF); +// ASSERT_TRUE(root.sig_hash == logistic_hash); +// ASSERT_TRUE(root.get_prob_change()==0.0); +// ASSERT_TRUE(root.fixed==true); +// } +// ASSERT_TRUE(successes > 0); +// } +// } +// } + +// TEST(Variation, InsertMutationWorks) +// { +// // TODO: this tests could be parameterized. +// // To understand design implementation of this test, check Mutation test + +// PARAMS["mutation_options"] = { +// {"point", 0.0}, {"insert", 1.0}, {"delete", 0.0}, {"subtree", 0.0}, {"toggle_weight_on", 0.0}, {"toggle_weight_off", 0.0} +// }; + +// // retrieving the options to check if everything was set right +// std::cout << "Initial mutation configuration" << std::endl; +// auto options = PARAMS["mutation_options"].get>(); +// for (const auto& [k, v] : options) +// std::cout << k << " : " << v << std::endl; + +// MatrixXf X(10,2); +// ArrayXf y(10); +// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, +// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + +// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, +// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + +// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, +// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + +// Dataset data(X,y); + +// SearchSpace SS; +// SS.init(data); + +// int successes = 0; +// for (int attempt = 0; attempt < 100; ++attempt) +// { +// // we need to have big values here so the mutation will work +// // (when the xmen child exceeds the maximum limits, mutation returns +// // std::nullopt) +// PARAMS["max_size"] = 20; +// PARAMS["max_depth"] = 10; - fmt::print("d={},s={}\n", PARAMS["max_depth"].get(), PARAMS["max_size"].get()); - fmt::print("make_regressor\n"); +// fmt::print("d={},s={}\n", PARAMS["max_depth"].get(), PARAMS["max_size"].get()); +// fmt::print("make_regressor\n"); - // creating a "small" program (with a plenty amount of space to insert stuff) - RegressorProgram PRG = SS.make_regressor(5, 5); +// // creating a "small" program (with a plenty amount of space to insert stuff) +// RegressorProgram PRG = SS.make_regressor(5, 5); - fmt::print("PRG.fit(data);\n"); - PRG.fit(data); - ArrayXf y_pred = PRG.predict(data); +// fmt::print("PRG.fit(data);\n"); +// PRG.fit(data); +// ArrayXf y_pred = PRG.predict(data); - // applying mutation and checking if the optional result is non-empty - fmt::print("auto Child = PRG.mutate();\n"); - auto opt = PRG.mutate(); // We should assume that it will be always the insert mutation - - if (opt){ - successes += 1; - auto Child = opt.value(); - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model: {}\n" - "Mutated Model: {}\n", - PARAMS["max_depth"].get(), PARAMS["max_size"].get(), - PRG.get_model("compact", true), - Child.get_model("compact", true) - ); - - fmt::print("child fit\n"); - Child.fit(data); - y_pred = Child.predict(data); - - // since we successfully inserted a node, this should be always true - ASSERT_TRUE(Child.size() > PRG.size()); - - // maybe the insertion spot was a shorter branch than the maximum - // depth. At least, xmen depth should be equal to its parent - ASSERT_TRUE(Child.depth() >= PRG.depth()); - } - - // lets also see if it always fails when the child exceeds the maximum limits - PARAMS["max_size"] = PRG.size(); - PARAMS["max_depth"] = PRG.depth(); - - auto opt2 = PRG.mutate(); - if (opt2){ // This shoudl't happen. We'll print then error - auto Child2 = opt2.value(); - - std::cout << "Fail failed. Mutation weights:" << std::endl; - auto options2 = PARAMS["mutation_options"].get>(); - for (const auto& [k, v] : options2) - std::cout << k << " : " << v << std::endl; - - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model: {}\n" - "Mutated Model: {}\n", - PARAMS["max_depth"].get(), PARAMS["max_size"].get(), - PRG.get_model("compact", true), - Child2.get_model("compact", true) - ); - ASSERT_TRUE(opt2==std::nullopt); - } - } - ASSERT_TRUE(successes > 0); -} - -TEST(Variation, Mutation) -{ - PARAMS["write_mutation_trace"] = true; - PARAMS["mutation_options"] = { - {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} - }; +// // applying mutation and checking if the optional result is non-empty +// fmt::print("auto Child = PRG.mutate();\n"); +// auto opt = PRG.mutate(); // We should assume that it will be always the insert mutation + +// if (opt){ +// successes += 1; +// auto Child = opt.value(); +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model: {}\n" +// "Mutated Model: {}\n", +// PARAMS["max_depth"].get(), PARAMS["max_size"].get(), +// PRG.get_model("compact", true), +// Child.get_model("compact", true) +// ); + +// fmt::print("child fit\n"); +// Child.fit(data); +// y_pred = Child.predict(data); + +// // since we successfully inserted a node, this should be always true +// ASSERT_TRUE(Child.size() > PRG.size()); + +// // maybe the insertion spot was a shorter branch than the maximum +// // depth. At least, xmen depth should be equal to its parent +// ASSERT_TRUE(Child.depth() >= PRG.depth()); +// } + +// // lets also see if it always fails when the child exceeds the maximum limits +// PARAMS["max_size"] = PRG.size(); +// PARAMS["max_depth"] = PRG.depth(); + +// auto opt2 = PRG.mutate(); +// if (opt2){ // This shoudl't happen. We'll print then error +// auto Child2 = opt2.value(); + +// std::cout << "Fail failed. Mutation weights:" << std::endl; +// auto options2 = PARAMS["mutation_options"].get>(); +// for (const auto& [k, v] : options2) +// std::cout << k << " : " << v << std::endl; + +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model: {}\n" +// "Mutated Model: {}\n", +// PARAMS["max_depth"].get(), PARAMS["max_size"].get(), +// PRG.get_model("compact", true), +// Child2.get_model("compact", true) +// ); +// ASSERT_TRUE(opt2==std::nullopt); +// } +// } +// ASSERT_TRUE(successes > 0); +// } + +// TEST(Variation, Mutation) +// { +// PARAMS["write_mutation_trace"] = true; +// PARAMS["mutation_options"] = { +// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} +// }; - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - - y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, - 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - - Dataset data(X,y); - - SearchSpace SS; - SS.init(data); - - int successes = 0; - for (int d = 1; d < 10; ++d) - { - for (int s = 1; s < 10; ++s) - { - fmt::print("d={},s={}\n",d,s); - fmt::print("make_regressor\n"); - - // if we set max_size and max_depth to zero, it will use the - // values in the global PARAMS. Otherwise, it will respect the - // values passed as argument. - RegressorProgram PRG = SS.make_regressor(d, s); - - fmt::print("PRG.fit(data);\n"); - PRG.fit(data); - ArrayXf y_pred = PRG.predict(data); +// MatrixXf X(10,2); +// ArrayXf y(10); +// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, +// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + +// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, +// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + +// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, +// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + +// Dataset data(X,y); + +// SearchSpace SS; +// SS.init(data); + +// int successes = 0; +// for (int d = 1; d < 10; ++d) +// { +// for (int s = 1; s < 10; ++s) +// { +// fmt::print("d={},s={}\n",d,s); +// fmt::print("make_regressor\n"); + +// // if we set max_size and max_depth to zero, it will use the +// // values in the global PARAMS. Otherwise, it will respect the +// // values passed as argument. +// RegressorProgram PRG = SS.make_regressor(d, s); + +// fmt::print("PRG.fit(data);\n"); +// PRG.fit(data); +// ArrayXf y_pred = PRG.predict(data); - // applying mutation and checking if the optional result is non-empty - fmt::print("auto Child = PRG.mutate();\n"); - auto opt = PRG.mutate(); - - if (!opt){ - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model: {}\n" - "Mutation failed to create a child", - d, s, - PRG.get_model("compact", true) - ); - fmt::print("{}", PARAMS["mutation_trace"].get().dump()); - } - else { - successes += 1; - auto Child = opt.value(); - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model: {}\n" - "Mutated Model: {}\n", - d, s, - PRG.get_model("compact", true), - Child.get_model("compact", true) - ); - - fmt::print("child fit\n"); - Child.fit(data); - y_pred = Child.predict(data); - } - } - } - // since x1 and x2 have same type, we shoudn't get fails - ASSERT_TRUE(successes > 0); -} - -TEST(Variation, MutationSizeAndDepthLimit) -{ - PARAMS["write_mutation_trace"] = true; - PARAMS["mutation_options"] = { - {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} - }; +// // applying mutation and checking if the optional result is non-empty +// fmt::print("auto Child = PRG.mutate();\n"); +// auto opt = PRG.mutate(); + +// if (!opt){ +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model: {}\n" +// "Mutation failed to create a child", +// d, s, +// PRG.get_model("compact", true) +// ); +// fmt::print("{}", PARAMS["mutation_trace"].get().dump()); +// } +// else { +// successes += 1; +// auto Child = opt.value(); +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model: {}\n" +// "Mutated Model: {}\n", +// d, s, +// PRG.get_model("compact", true), +// Child.get_model("compact", true) +// ); + +// fmt::print("child fit\n"); +// Child.fit(data); +// y_pred = Child.predict(data); +// } +// } +// } +// // since x1 and x2 have same type, we shoudn't get fails +// ASSERT_TRUE(successes > 0); +// } + +// TEST(Variation, MutationSizeAndDepthLimit) +// { +// PARAMS["write_mutation_trace"] = true; +// PARAMS["mutation_options"] = { +// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} +// }; - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, +// MatrixXf X(10,2); +// ArrayXf y(10); +// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, +// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; +// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, +// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, - 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; +// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, +// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - Dataset data(X,y); +// Dataset data(X,y); - SearchSpace SS; - SS.init(data); +// SearchSpace SS; +// SS.init(data); - // prod operator --> arity 4: prod(T1, T2, T3) - // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) - int max_arity = 6; - - int successes = 0; - for (int d = 5; d < 15; ++d) - { - for (int s = 5; s < 15; ++s) - { - PARAMS["max_size"] = s; - PARAMS["max_depth"] = d; - - fmt::print("d={},s={}\n",d,s); - fmt::print("make_regressor\n"); - - // Enforcing that the parents does not exceed max_size by - // taking into account the highest arity of the function nodes; - // and the max_depth+1 that PTC2 can generate - RegressorProgram PRG = SS.make_regressor(d-1, s - max_arity); +// // prod operator --> arity 4: prod(T1, T2, T3) +// // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) +// int max_arity = 6; + +// int successes = 0; +// for (int d = 5; d < 15; ++d) +// { +// for (int s = 5; s < 15; ++s) +// { +// PARAMS["max_size"] = s; +// PARAMS["max_depth"] = d; + +// fmt::print("d={},s={}\n",d,s); +// fmt::print("make_regressor\n"); + +// // Enforcing that the parents does not exceed max_size by +// // taking into account the highest arity of the function nodes; +// // and the max_depth+1 that PTC2 can generate +// RegressorProgram PRG = SS.make_regressor(d-1, s - max_arity); - auto PRG_model = PRG.get_model("compact", true); - - auto opt = PRG.mutate(); - - if (!opt){ - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model: {}\n" - "Mutation failed to create a child", - d, s, - PRG.get_model("compact", true) - ); - fmt::print("{}", PARAMS["mutation_trace"].get().dump()); - } - else { - successes += 1; +// auto PRG_model = PRG.get_model("compact", true); + +// auto opt = PRG.mutate(); + +// if (!opt){ +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model: {}\n" +// "Mutation failed to create a child", +// d, s, +// PRG.get_model("compact", true) +// ); +// fmt::print("{}", PARAMS["mutation_trace"].get().dump()); +// } +// else { +// successes += 1; - // Extracting the child from the std::optional and checking - // if it is within size and depth restrictions. There is no - // margin for having slightly bigger expressions. - auto Child = opt.value(); +// // Extracting the child from the std::optional and checking +// // if it is within size and depth restrictions. There is no +// // margin for having slightly bigger expressions. +// auto Child = opt.value(); - fmt::print("print\n"); - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model: {}\n" - "Mutated Model: {}\n" - "Mutated depth: {}\n" - "Mutated size : {}\n", - d, s, - PRG.get_model("compact", true), - Child.get_model("compact", true), - Child.depth(), - Child.size() - ); - - // Original didn't change - ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); +// fmt::print("print\n"); +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model: {}\n" +// "Mutated Model: {}\n" +// "Mutated depth: {}\n" +// "Mutated size : {}\n", +// d, s, +// PRG.get_model("compact", true), +// Child.get_model("compact", true), +// Child.depth(), +// Child.size() +// ); + +// // Original didn't change +// ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s); - - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s); - - ASSERT_TRUE(Child.depth() >= 0); - ASSERT_TRUE(Child.depth() <= d); - } - } - } - ASSERT_TRUE(successes > 0); -} - -TEST(Variation, Crossover) -{ - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - - y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, - 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - - Dataset data(X,y); - - SearchSpace SS; - SS.init(data); - - int successes = 0; - for (int d = 1; d < 10; ++d) - { - for (int s = 1; s < 10; ++s) - { - RegressorProgram PRG1 = SS.make_regressor(d, s); - RegressorProgram PRG2 = SS.make_regressor(d, s); - PRG1.fit(data); - PRG2.fit(data); - - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model 1: {}\n" - "Initial Model 2: {}\n", - d, s, - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) - ); - - ArrayXf y_pred = PRG1.predict(data); - fmt::print("cross one\n"); - - auto opt = PRG1.cross(PRG2); - if (!opt){ - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Original model 1: {}\n" - "Original model 2: {}\n", - "Crossover failed to create a child", - d, s, - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) - ); - } - else { - successes += 1; - auto Child = opt.value(); - fmt::print( - "Original model 1 after cross: {}\n" - "Original model 2 after cross: {}\n", - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) - ); - fmt::print( - "Crossed Model: {}\n" - "=================================================\n", - Child.get_model("compact", true) - ); - Child.fit(data); - auto child_pred1 = Child.predict(data); - } - } - } - ASSERT_TRUE(successes > 0); -} - -TEST(Variation, CrossoverSizeAndDepthLimit) -{ - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - - y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, - 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - - Dataset data(X,y); - - SearchSpace SS; - SS.init(data); - - // prod operator --> arity 4: prod(T1, T2, T3) - // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) - int max_arity = 6; - - int successes = 0; - for (int d = 5; d < 15; ++d) - { - for (int s = 5; s < 15; ++s) - { - PARAMS["max_size"] = s; - PARAMS["max_depth"] = d; - - // Enforcing that the parents does not exceed max_size by - // taking into account the highest arity of the function nodes - RegressorProgram PRG1 = SS.make_regressor(d-1, s-max_arity); - RegressorProgram PRG2 = SS.make_regressor(d-1, s-max_arity); - - auto PRG1_model = PRG1.get_model("compact", true); - auto PRG2_model = PRG2.get_model("compact", true); - - fmt::print( - "=================================================\n" - "settings: depth = {}, size= {}\n" - "Original model 1: {}\n" - "depth = {}, size= {}\n" - "Original model 2: {}\n" - "depth = {}, size= {}\n", - d, s, - PRG1.get_model("compact", true), - PRG1.depth(), PRG1.size(), - PRG2.get_model("compact", true), - PRG2.depth(), PRG2.size() - ); - - fmt::print("cross\n"); - auto opt = PRG1.cross(PRG2); - - if (!opt){ - fmt::print("Crossover failed to create a child" - "=================================================\n"); - } - else { - successes += 1; - auto Child = opt.value(); - fmt::print( - "Child Model : {}\n" - "Child Model depth: {}\n" - "Child Model size : {}\n" - "=================================================\n", - Child.get_model("compact", true), - Child.depth(), Child.size() - ); - - // Original didn't change - ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); - ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); - - // Child is within restrictions - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s); - - ASSERT_TRUE(Child.depth() >= 0); - ASSERT_TRUE(Child.depth() <= d); - } - } - } - ASSERT_TRUE(successes > 0); -} \ No newline at end of file +// ASSERT_TRUE(Child.size() > 0); +// ASSERT_TRUE(Child.size() <= s); + +// ASSERT_TRUE(Child.size() > 0); +// ASSERT_TRUE(Child.size() <= s); + +// ASSERT_TRUE(Child.depth() >= 0); +// ASSERT_TRUE(Child.depth() <= d); +// } +// } +// } +// ASSERT_TRUE(successes > 0); +// } + +// TEST(Variation, Crossover) +// { +// MatrixXf X(10,2); +// ArrayXf y(10); +// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, +// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + +// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, +// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + +// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, +// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + +// Dataset data(X,y); + +// SearchSpace SS; +// SS.init(data); + +// int successes = 0; +// for (int d = 1; d < 10; ++d) +// { +// for (int s = 1; s < 10; ++s) +// { +// RegressorProgram PRG1 = SS.make_regressor(d, s); +// RegressorProgram PRG2 = SS.make_regressor(d, s); +// PRG1.fit(data); +// PRG2.fit(data); + +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Initial Model 1: {}\n" +// "Initial Model 2: {}\n", +// d, s, +// PRG1.get_model("compact", true), +// PRG2.get_model("compact", true) +// ); + +// ArrayXf y_pred = PRG1.predict(data); +// fmt::print("cross one\n"); + +// auto opt = PRG1.cross(PRG2); +// if (!opt){ +// fmt::print( +// "=================================================\n" +// "depth = {}, size= {}\n" +// "Original model 1: {}\n" +// "Original model 2: {}\n", +// "Crossover failed to create a child", +// d, s, +// PRG1.get_model("compact", true), +// PRG2.get_model("compact", true) +// ); +// } +// else { +// successes += 1; +// auto Child = opt.value(); +// fmt::print( +// "Original model 1 after cross: {}\n" +// "Original model 2 after cross: {}\n", +// PRG1.get_model("compact", true), +// PRG2.get_model("compact", true) +// ); +// fmt::print( +// "Crossed Model: {}\n" +// "=================================================\n", +// Child.get_model("compact", true) +// ); +// Child.fit(data); +// auto child_pred1 = Child.predict(data); +// } +// } +// } +// ASSERT_TRUE(successes > 0); +// } + +// TEST(Variation, CrossoverSizeAndDepthLimit) +// { +// MatrixXf X(10,2); +// ArrayXf y(10); +// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, +// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + +// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, +// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + +// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, +// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + +// Dataset data(X,y); + +// SearchSpace SS; +// SS.init(data); + +// // prod operator --> arity 4: prod(T1, T2, T3) +// // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) +// int max_arity = 6; + +// int successes = 0; +// for (int d = 5; d < 15; ++d) +// { +// for (int s = 5; s < 15; ++s) +// { +// PARAMS["max_size"] = s; +// PARAMS["max_depth"] = d; + +// // Enforcing that the parents does not exceed max_size by +// // taking into account the highest arity of the function nodes +// RegressorProgram PRG1 = SS.make_regressor(d-1, s-max_arity); +// RegressorProgram PRG2 = SS.make_regressor(d-1, s-max_arity); + +// auto PRG1_model = PRG1.get_model("compact", true); +// auto PRG2_model = PRG2.get_model("compact", true); + +// fmt::print( +// "=================================================\n" +// "settings: depth = {}, size= {}\n" +// "Original model 1: {}\n" +// "depth = {}, size= {}\n" +// "Original model 2: {}\n" +// "depth = {}, size= {}\n", +// d, s, +// PRG1.get_model("compact", true), +// PRG1.depth(), PRG1.size(), +// PRG2.get_model("compact", true), +// PRG2.depth(), PRG2.size() +// ); + +// fmt::print("cross\n"); +// auto opt = PRG1.cross(PRG2); + +// if (!opt){ +// fmt::print("Crossover failed to create a child" +// "=================================================\n"); +// } +// else { +// successes += 1; +// auto Child = opt.value(); +// fmt::print( +// "Child Model : {}\n" +// "Child Model depth: {}\n" +// "Child Model size : {}\n" +// "=================================================\n", +// Child.get_model("compact", true), +// Child.depth(), Child.size() +// ); + +// // Original didn't change +// ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); +// ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); + +// // Child is within restrictions +// ASSERT_TRUE(Child.size() > 0); +// ASSERT_TRUE(Child.size() <= s); + +// ASSERT_TRUE(Child.depth() >= 0); +// ASSERT_TRUE(Child.depth() <= d); +// } +// } +// } +// ASSERT_TRUE(successes > 0); +// } \ No newline at end of file diff --git a/tests/cpp/testsHeader.h b/tests/cpp/testsHeader.h index 093f867a..24797088 100644 --- a/tests/cpp/testsHeader.h +++ b/tests/cpp/testsHeader.h @@ -28,7 +28,9 @@ using std::stof; #include "../../src/init.h" #include "../../src/data/data.h" #include "../../src/program/operator.h" +#include "../../src/variation.h" using namespace Brush; using namespace Brush::Data; +using namespace Brush::Var; #endif From e6da3e32f360422f60e74a6af74b5e24a3c157cf Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 12:38:24 -0500 Subject: [PATCH 082/199] Removed previous mutation and crossover interfaces with python --- src/bindings/bind_programs.h | 8 +- src/program/program.h | 48 ++--- src/variation.cpp | 340 ++++++++++------------------------- src/variation.h | 17 +- 4 files changed, 127 insertions(+), 286 deletions(-) diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index 2211aa90..7ba5d585 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -48,10 +48,10 @@ void bind_program(py::module& m, string name) .def("size", &T::size, py::arg("include_weight")=true) .def("complexity", &T::complexity) .def("depth", &T::depth) - .def("cross", &T::cross, py::return_value_policy::automatic, - "Performs one attempt to stochastically swap subtrees between two programs and generate a child") - .def("mutate", &T::mutate, py::return_value_policy::automatic, - "Performs one attempt to stochastically mutate the program and generate a child") + // .def("cross", &T::cross, py::return_value_policy::automatic, + // "Performs one attempt to stochastically swap subtrees between two programs and generate a child") + // .def("mutate", &T::mutate, py::return_value_policy::automatic, + // "Performs one attempt to stochastically mutate the program and generate a child") .def("set_search_space", &T::set_search_space) //.def("copy", &T::copy<>, py::return_value_policy::copy) .def("copy", [](const T& self){ T clone(self); return clone; }) diff --git a/src/program/program.h b/src/program/program.h index b567f490..44629aa4 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -505,17 +505,17 @@ template struct Program //////////////////////////////////////////////////////////////////////////// // Mutation & Crossover - /// @brief convenience wrapper for :cpp:func:`variation:mutate()` in variation.h - /// @return a mutated version of this program - std::optional> mutate() const; - - /** - * @brief convenience wrapper for :cpp:func:`variation:cross` in variation.h - * - * @param other another program to cross with this one. - * @return a new version of this and the other program - */ - std::optional> cross(Program other) const; + // /// @brief convenience wrapper for :cpp:func:`variation:mutate()` in variation.h + // /// @return a mutated version of this program + // std::optional> mutate() const; + + // /** + // * @brief convenience wrapper for :cpp:func:`variation:cross` in variation.h + // * + // * @param other another program to cross with this one. + // * @return a new version of this and the other program + // */ + // std::optional> cross(Program other) const; /// @brief turns program tree into a linear program. /// @return a vector of nodes encoding the program in reverse polish notation @@ -531,7 +531,7 @@ template struct Program //////////////////////////////////////////////////////////////////////////////// // weight optimization #include "optimizer/weight_optimizer.h" -#include "../variation.h" +// #include "../variation.h" namespace Brush{ template @@ -547,18 +547,18 @@ void Program::update_weights(const Dataset& d) //////////////////////////////////////////////////////////////////////////////// // mutation and crossover -template -std::optional> Program::mutate() const -{ - return Brush::Var::mutate(*this, this->SSref.value().get()); -}; - -/// swaps subtrees between this and other (note the pass by copy) -template -std::optional> Program::cross(Program other) const -{ - return Brush::Var::cross(*this, other); -}; +// template +// std::optional> Program::mutate() const +// { +// return Brush::Var::mutate(*this, this->SSref.value().get()); +// }; + +// /// swaps subtrees between this and other (note the pass by copy) +// template +// std::optional> Program::cross(Program other) const +// { +// return Brush::Var::cross(*this, other); +// }; //////////////////////////////////////////////////////////////////////////////// diff --git a/src/variation.cpp b/src/variation.cpp index 0e960948..ff6b69ba 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -339,158 +339,6 @@ class SubtreeMutation : public MutationBase } }; - -/** - * @brief Stochastically mutate a program. - * - * Types of mutation: - * - * - point mutation changes a single node. - * - insertion mutation inserts a node as the parent of an existing node, and fills in the other arguments. - * - deletion mutation deletes a node. - * - subtree mutation inserts a new subtree into the program. - * - toggle_weight_on mutation turns a node's weight ON. - * - toggle_weight_off mutation turns a node's weight OFF. - * - * Every mutation has a probability (weight) based on global parameters. The - * spot where the mutation will take place is sampled based on attribute - * `get_prob_change` of each node in the tree. Inside each type of mutation, - * when a new node is inserted, it is sampled based on `terminal_weights`. - * - * Due to the stochastic behavior, and the several sampling steps, it may come to - * a case where the search space does not hold any possible modification to do in - * the program. In this case, the method returns `std::nullopt` (and has overloads - * so it can be used in a boolean context). - * - * If the mutation succeeds, the mutated program can be accessed through the - * `.value()` attribute of the `std::optional`. - * - * This means that, if you use the mutation as `auto opt = mutate(parent, SS)`, - * either `opt==false` or `opt.value()` contains the child program. - * - * @tparam T program type - * @param parent the program to be mutated - * @param SS a search space - * @return `std::optional` that may contain the child program of type `T` - */ -template -std::optional> mutate(const Program& parent, const SearchSpace& SS) -{ - auto options = PARAMS["mutation_options"].get>(); - - // whether we should write everything that happened inside the method - if (PARAMS.value("write_mutation_trace", false)==true) { - // Default fields of the trace. Initialize with default values, which are - // gradually changed throughout the execution of the method. - PARAMS["mutation_trace"] = json({ - {"parent", parent.get_model("compact", true)}, - {"mutation_weights", options}, - // default values, to be changed in case mutation works - {"mutation", "not selected"}, - {"spot_weights", "not calculated"}, - {"spot", "not selected"}, - {"child", "failed to generate"}, - {"status", "initialized weight vectors"}, - {"success", "false"} - }); - } - if (std::all_of(options.begin(), options.end(), - [](const auto& kv) { return kv.second<=0.0; }) - ) - { // No mutation can be successfully applied to this solution - return std::nullopt; - } - - // choose a valid mutation option - string choice = r.random_choice(options); - - // TODO: this could be improved (specially with the Variation class) - std::unique_ptr mutation; - if (choice == "point") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "insert") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "delete") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "toggle_weight_on") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "toggle_weight_off") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else if (choice == "subtree") - mutation = std::make_unique( - SS, PARAMS["max_size"].get(), PARAMS["max_depth"].get()); - else { - std::string msg = fmt::format("{} not a valid mutation choice", choice); - HANDLE_ERROR_THROW(msg); - } - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["mutation"] = choice; - } - - Program child(parent); - - // choose location by weighted sampling of program - auto weights = mutation->find_spots(child.Tree); - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["spot_weights"] = weights; - } - - if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { - return w<=0.0; - })) - { // There is no spot that has a probability to be selected - return std::nullopt; - } - - // apply the mutation and check if it succeeded - auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), - weights.begin(), weights.end()); - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["spot"] = spot.node->get_model(false); - PARAMS["mutation_trace"]["status"] = "sampled the spot"; - } - - // Every mutation here works inplace, so they return bool instead of - // std::optional to indicare the result of their manipulation over the - // program tree. Here we call the mutation function and return the result - bool success = (*mutation)(child.Tree, spot); - - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["status"] = "aplied the mutation"; - if (success) - PARAMS["mutation_trace"]["child"] = child.get_model("compact", true); - } - - if (success - && ( (child.size() <= PARAMS["max_size"].get() ) - && (child.depth() <= PARAMS["max_depth"].get()) )){ - - // success is true only if mutation returned a valid program - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["success"] = true; - } - - return child; - } else { - - // here we have a string in PARAMS["mutation_trace"]["child"], - // but success is false since it didnt return an valid program - if (PARAMS.value("write_mutation_trace", false)==true) { - PARAMS["mutation_trace"]["status"] = "mutation returned child, but it exceeds max_size or max_depth"; - //fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); - } - return std::nullopt; - } -} - /** * @brief Stochastically swaps subtrees between root and other, returning a new program. * @@ -514,102 +362,6 @@ std::optional> mutate(const Program& parent, const SearchSpace& SS * @param other the donating parent * @return `std::optional` that may contain the child program of type `T` */ -template -std::optional> cross(const Program& root, const Program& other) -{ - /* subtree crossover between this and other, producing new Program */ - // choose location by weighted sampling of program - // TODO: why doesn't this copy the search space reference to child? - Program child(root); - - // pick a subtree to replace - vector child_weights(child.Tree.size()); - std::transform(child.Tree.begin(), child.Tree.end(), - child_weights.begin(), - [](const auto& n){ return n.get_prob_change(); } - ); - - if (std::all_of(child_weights.begin(), child_weights.end(), [](const auto& w) { - return w<=0.0; - })) - { // There is no spot that has a probability to be selected - return std::nullopt; - } - - auto child_spot = r.select_randomly(child.Tree.begin(), - child.Tree.end(), - child_weights.begin(), - child_weights.end() - ); - - auto child_ret_type = child_spot.node->data.ret_type; - - auto allowed_size = PARAMS["max_size"].get() - - ( child.size() - child.size_at(child_spot) ); - auto allowed_depth = PARAMS["max_depth"].get() - - ( child.depth_to_reach(child_spot) ); - - // pick a subtree to insert. Selection is based on other_weights - vector other_weights(other.Tree.size()); - - // iterator to get the size of subtrees inside transform - auto other_iter = other.Tree.begin(); - - // lambda function to check feasibility of solution and increment the iterator - const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { - int s = other.size_at( other_iter ); - int d = other.depth_at( other_iter ); - - std::advance(other_iter, 1); - return (s <= allowed_size) && (d <= allowed_depth); - }; - - // TODO: something like `is_valid_program` in FEAT - std::transform(other.Tree.begin(), other.Tree.end(), - other_weights.begin(), - [child_ret_type, check_and_incrm](const auto& n){ - // need to pick a node that has a matching output type to the child_spot. - // also need to check if swaping this node wouldn't exceed max_size - if (check_and_incrm() && (n.ret_type == child_ret_type)) - return n.get_prob_change(); - else - // setting the weight to zero to indicate a non-feasible crossover point - return 0.0f; - } - ); - - bool matching_spots_found = false; - for (const auto& w: other_weights) - { - matching_spots_found = w > 0.0; - - if (matching_spots_found) { - auto other_spot = r.select_randomly( - other.Tree.begin(), - other.Tree.end(), - other_weights.begin(), - other_weights.end() - ); - - // fmt::print("other_spot : {}\n",other_spot.node->data); - // swap subtrees at child_spot and other_spot - // TODO: do I need to delete the removed node? - child.Tree.move_ontop(child_spot, other_spot); - return child; - } - } - - return std::nullopt; -} - - -// TODO: make crossover and mutation private functions of a variation class -// variation class should get params as argument -// TODO: make sure every method doesnt store information, instead they retrieve it from parameters (so there's no side effect) -// TODO: implement migration as a variation method? -// TODO: delete previous mutation and crossover, and use just the variation class (implement the log for having the mutation trace) -// A BANDIT WOULD GO HERE INSIDE VARIATION (or population?) - template std::optional> Variation::cross( const Program& root, const Program& other) @@ -698,6 +450,39 @@ std::optional> Variation::cross( return std::nullopt; } +/** + * @brief Stochastically mutate a program. + * + * Types of mutation: + * + * - point mutation changes a single node. + * - insertion mutation inserts a node as the parent of an existing node, and fills in the other arguments. + * - deletion mutation deletes a node. + * - subtree mutation inserts a new subtree into the program. + * - toggle_weight_on mutation turns a node's weight ON. + * - toggle_weight_off mutation turns a node's weight OFF. + * + * Every mutation has a probability (weight) based on global parameters. The + * spot where the mutation will take place is sampled based on attribute + * `get_prob_change` of each node in the tree. Inside each type of mutation, + * when a new node is inserted, it is sampled based on `terminal_weights`. + * + * Due to the stochastic behavior, and the several sampling steps, it may come to + * a case where the search space does not hold any possible modification to do in + * the program. In this case, the method returns `std::nullopt` (and has overloads + * so it can be used in a boolean context). + * + * If the mutation succeeds, the mutated program can be accessed through the + * `.value()` attribute of the `std::optional`. + * + * This means that, if you use the mutation as `auto opt = mutate(parent, SS)`, + * either `opt==false` or `opt.value()` contains the child program. + * + * @tparam T program type + * @param parent the program to be mutated + * @param SS a search space + * @return `std::optional` that may contain the child program of type `T` + */ template std::optional> Variation::mutate(const Program& parent) { @@ -770,6 +555,65 @@ std::optional> Variation::mutate(const Program& parent) } } +template +void Variation::vary(Population& pop, tuple island_range, + const vector& parents) +{ + /*! + * performs variation on the current population. + * + * @param pop: current population + * @param parents: indices of population to use for variation + * @param params: feat parameters + * + * @return appends params.pop_size offspring derived from parent variation + */ + + assert(pop.offspring_ready + && ("Population does not have slots for generating the offspring. " + +"You should `prep_offspring_slots`. `vary` will add new xmen individuals " + +"starting from the middle of the island")); + + // parents should be within island range. TODO: assert that they are + + auto [idx_start, idx_end] = island_range; + size_t delta = idx_end - idx_start; + size_t vary_start = delta/2; + + // TODO: fix pragma omp usage + //#pragma omp parallel for + for (unsigned i = vary_start; i> opt=std::nullopt; // new individual + while (!opt) + { + Individual& mom = pop.individuals.at( + r.select_randomly(parents.begin(), parents.end())); + + if ( r() < parameters.cx_prob) // crossover + { + // get random mom and dad, make copies + Individual& dad = pop.individuals.at( + r.select_randomly(parents.begin(), parents.end())); + + opt = cross(mom, dad); + } + else // mutation + { + opt = mutate(mom); + } + + if (opt) // no optional value was returned + { + auto child = opt.value(); + assert(child.size()>0); + pop.individuals.at(i) = child; + } + } + } +} + } //namespace Var } //namespace Brush diff --git a/src/variation.h b/src/variation.h index 5b853b35..302386d2 100644 --- a/src/variation.h +++ b/src/variation.h @@ -9,8 +9,8 @@ license: GNU/GPL v3 // #include "util/error.h" // #include "util/utils.h" -#include "search_space.h" -// #include "population.h" +//#include "search_space.h" +#include "population.h" #include #include @@ -22,6 +22,8 @@ license: GNU/GPL v3 //////////////////////////////////////////////////////////////////////////// // Mutation & Crossover +using namespace Brush::Pop; + /** * @brief Namespace for variation functions like crossover and mutation. * @@ -93,12 +95,6 @@ class MutationBase { size_t max_depth_; }; -template -std::optional> cross(const Program& root, const Program& other); - -template -std::optional> mutate(const Program& parent, const SearchSpace& SS); - // TODO: make crossover and mutation private functions of a variation class // variation class should get params as argument // TODO: make sure every method doesnt store information, instead they retrieve it from parameters (so there's no side effect) @@ -123,9 +119,10 @@ class Variation {}; ~Variation(); - + /// method to handle variation of population - // void vary(Population& pop, const vector& parents); + void vary(Population& pop, tuple island_range, + const vector& parents); }; From 5beec865c167366f1d7e1bec72fc016866643188 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 12:39:58 -0500 Subject: [PATCH 083/199] Individual uses search space to create programs --- src/individual.h | 7 +++++-- src/search_space.cpp | 32 +++++++++++++++++++++++--------- src/search_space.h | 30 ++++++++++++++++-------------- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/src/individual.h b/src/individual.h index 3cc9e653..e13bf1ec 100644 --- a/src/individual.h +++ b/src/individual.h @@ -39,8 +39,11 @@ class Individual{ void init(const SearchSpace& ss, const Parameters& params) { - // TODO: make searchspace use params, so it will generate something valid - program = SS.make_program(params.max_depth, params.max_size); + program = SS.make_program(params, 0, 0); + + // If different from zero, then the program is created with a fixed depth and size. + // If zero, it samples the value + // program = SS.make_program(params, params.max_depth, params.max_size); }; // fitness, objetives, complexity, etc diff --git a/src/search_space.cpp b/src/search_space.cpp index 8131ae76..dd076162 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -250,6 +250,14 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const int d = 1; // current tree size int s = 1; + + // updating size accordingly to root node + if (Is(root.node_type)) + s += 3; + if ( root.get_is_weighted()==true + && Isnt(root.node_type) ) + s += 2; + //For each argument position a of n, Enqueue(a; g) for (auto a : root.arg_types) { @@ -308,8 +316,10 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // TreeIter new_spot = Tree.append_child(qspot, n); // qspot = n; - if (!opt) + if (!opt) { queue.push_back(make_tuple(qspot, t, d)); + continue; + } n = opt.value(); @@ -369,25 +379,29 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const return Tree; }; -RegressorProgram SearchSpace::make_regressor(int max_d, int max_size) +// TODO: stop using params as a default argument and actually pass it (also update tests) +RegressorProgram SearchSpace::make_regressor(int max_d, int max_size, const Parameters& params) { - return make_program(max_d, max_size); + return make_program(params, max_d, max_size); }; -ClassifierProgram SearchSpace::make_classifier(int max_d, int max_size) +// TODO: stop using params as a default argument and actually pass it (also update tests) +ClassifierProgram SearchSpace::make_classifier(int max_d, int max_size, const Parameters& params) { - return make_program(max_d, max_size); + return make_program(params, max_d, max_size); }; +// TODO: stop using params as a default argument and actually pass it (also update tests) MulticlassClassifierProgram SearchSpace::make_multiclass_classifier( - int max_d, int max_size) + int max_d, int max_size, const Parameters& params) { - return make_program(max_d, max_size); + return make_program(params, max_d, max_size); }; -RepresenterProgram SearchSpace::make_representer(int max_d, int max_size) +// TODO: stop using params as a default argument and actually pass it (also update tests) +RepresenterProgram SearchSpace::make_representer(int max_d, int max_size, const Parameters& params) { - return make_program(max_d, max_size); + return make_program(params, max_d, max_size); }; } //Brush diff --git a/src/search_space.h b/src/search_space.h index 10cadd4d..84b6d052 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -146,31 +146,31 @@ struct SearchSpace * */ template - PT make_program(int max_d=0, int max_size=0); + PT make_program(const Parameters& params, int max_d=0, int max_size=0); /// @brief Makes a random regressor program. Convenience wrapper for @ref make_program /// @param max_d max depth of the program /// @param max_size max size of the program /// @return a regressor program - RegressorProgram make_regressor(int max_d = 0, int max_size = 0); + RegressorProgram make_regressor(int max_d = 0, int max_size = 0, const Parameters& params=Parameters()); /// @brief Makes a random classifier program. Convenience wrapper for @ref make_program /// @param max_d max depth of the program /// @param max_size max size of the program /// @return a classifier program - ClassifierProgram make_classifier(int max_d = 0, int max_size = 0); + ClassifierProgram make_classifier(int max_d = 0, int max_size = 0, const Parameters& params=Parameters()); /// @brief Makes a random multiclass classifier program. Convenience wrapper for @ref make_program /// @param max_d max depth of the program /// @param max_size max size of the program /// @return a multiclass classifier program - MulticlassClassifierProgram make_multiclass_classifier(int max_d = 0, int max_size = 0); + MulticlassClassifierProgram make_multiclass_classifier(int max_d = 0, int max_size = 0, const Parameters& params=Parameters()); /// @brief Makes a random representer program. Convenience wrapper for @ref make_program /// @param max_d max depth of the program /// @param max_size max size of the program /// @return a representer program - RepresenterProgram make_representer(int max_d = 0, int max_size = 0); + RepresenterProgram make_representer(int max_d = 0, int max_size = 0, const Parameters& params=Parameters()); SearchSpace() = default; @@ -677,15 +677,14 @@ T RandomDequeue(std::vector& Q) }; template -P SearchSpace::make_program(int max_d, int max_size) +P SearchSpace::make_program(const Parameters& params, int max_d, int max_size) { // this is what makes `make_program` create uniformly distributed // individuals to feed initial population - if (max_d == 0) - max_d = PARAMS["max_depth"].get(); - if (max_size == 0) - max_size = r.rnd_int(1, PARAMS["max_size"].get()); - // TODO: searchspace should infer max_size from parameters class + if (max_d < 1) + max_d = r.rnd_int(1, params.max_depth); + if (max_size < 1) + max_size = r.rnd_int(1, params.max_size); DataType root_type = DataTypeEnum::value; ProgramType program_type = P::program_type; @@ -709,9 +708,12 @@ P SearchSpace::make_program(int max_d, int max_size) root.fixed=true; } else { - // we start with a non-terminal (can be replaced inside PTC2 though, if max_size==1) - auto opt = sample_op(root_type); - if (!opt) + std::optional opt=std::nullopt; + + if (max_size>1 && max_d>1) + opt = sample_op(root_type); + + if (!opt) // if failed, then we dont have any operator to use as root... opt = sample_terminal(root_type, true); root = opt.value(); } From 232ee982a9504dcc247b94449cf7c44a2e6de051 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 12:40:30 -0500 Subject: [PATCH 084/199] Migration methods --- src/population.cpp | 106 ++++++++++++++++++++++++++++++++++++++++++++- src/population.h | 13 ++++-- 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/src/population.cpp b/src/population.cpp index 85a0dda1..f77e118d 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -46,6 +46,8 @@ Population::Population(int p, int n_islands) template void Population::init(const SearchSpace& ss, const Parameters& params) { + this->mig_prob = params.mig_prob; + // TODO: load file (like feat) #pragma omp parallel for for (int i = 0; i< individuals.size(); ++i) @@ -58,6 +60,12 @@ void Population::init(const SearchSpace& ss, const Parameters& params) template void Population::prep_offspring_slots() { + // reading and writing is thread-safe, as long as there's no overlap on island ranges. + // manipulating a vector IS NOT thread-safe (inserting and erasing elements). + // So, prep_offspring_slots and update should be the synchronization points, not + // operations performed concurrently + + // TODO: add _SingleThreaded in funcname if (offspring_ready) HANDLE_ERROR_THROW("Allocating space in population that already has active offspring slots"); @@ -132,12 +140,43 @@ string Population::print_models(bool just_offspring, string sep) template vector> Population::sorted_front(unsigned rank) { - // this is used to update archive at the end of a generation. expect islands without offspring + // this is used to migration and update archive at the end of a generation. expect islands without offspring /* Returns individuals on the Pareto front, sorted by increasign complexity. */ vector> pf_islands; pf_islands.resize(n_islands); + for (int i=0; i pf; + + for (unsigned int i =idx_start; i +vector Population::hall_of_fame(unsigned rank) +{ + // this is used to migration and update archive at the end of a generation. expect islands without offspring + + /* Returns individuals on the Pareto front, sorted by increasign complexity. */ + vector pf_islands; + pf_islands.resize(n_islands); + for (int i=0; i> Population::sorted_front(unsigned rank) } return pf_islands; + + vector pf(0); + for (unsigned int i =0; i +void Population::migrate() +{ + assert(!offspring_ready + && "pop with offspring dont migrate (run update before calling this)"); + + auto island_fronts = sorted_front(); + auto global_hall_of_fame = hall_of_fame(); + + // This is not thread safe (as it is now) + for (int island=0; island1) { // from global hall of fame + migrating_idx = r.select_randomly( + global_hall_of_fame.begin(), + global_hall_of_fame.end()); + } + else { // from any other local hall of fame + // finding other island indexes + vector other_islands(n_islands-1); + iota(other_islands.begin(), other_islands.end(), 0); + + // skipping current island + auto it = other_islands.begin(); + std::advance(it, island); + for (;it != other_islands.end(); ++it) { + ++(*it); + } + + // picking other island + int other_island = r.select_randomly( + other_islands.begin(), + other_islands.end()); + + migrating_idx = r.select_randomly( + island_fronts.at(other_island).begin(), + island_fronts.at(other_island).end()); + } + + individuals.at(i) = individuals.at(migrating_idx); + } + } + } } + } // Pop } // Brush diff --git a/src/population.h b/src/population.h index 82690e3a..720350ed 100644 --- a/src/population.h +++ b/src/population.h @@ -1,9 +1,9 @@ #ifndef POPULATION_H #define POPULATION_H -#include "program/program.h" #include "search_space.h" #include "individual.h" +#include "program/program.h" using std::vector; using std::string; @@ -22,6 +22,7 @@ class Population{ vector> island_ranges; vector island_skip; // number of indexes to skip for each island (when variation fails) unsigned int n_islands; + float mig_prob; Population(int p = 0, int n_islands=1); @@ -47,10 +48,16 @@ class Population{ /// return population equations. string print_models(bool just_offspring=false, string sep="\n"); - // TODO: WORK WITH ISLANDS (vector of vectors, one for each island) - /// return complexity-sorted Pareto front indices. + /// return complexity-sorted Pareto front indices for each island vector> sorted_front(unsigned rank=1); + + // pareto front ignoring island divisions + vector hall_of_fame(unsigned rank=1); + // perform a migration in the population. Individuals from sorted front or hall of fame will replace others by the + // probability set in parameters. Expects a population without offspring + void migrate(); + /// Sort each island in increasing complexity. struct SortComplexity { From e5a6d7a2727b571b7c976e8df5412cf4288576e2 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 12:40:44 -0500 Subject: [PATCH 085/199] Updating python wrapper (almost finishing it!) --- src/bindings/bind_cbrush.cpp | 6 ++-- src/cbrush.cpp | 54 +++++------------------------------- src/cbrush.h | 13 +++++++-- src/params.h | 9 +++--- 4 files changed, 26 insertions(+), 56 deletions(-) diff --git a/src/bindings/bind_cbrush.cpp b/src/bindings/bind_cbrush.cpp index 2e3d77f6..90f917f3 100644 --- a/src/bindings/bind_cbrush.cpp +++ b/src/bindings/bind_cbrush.cpp @@ -1,14 +1,16 @@ #include "module.h" #include "../cbrush.h" +#include "../types.h" namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; +// TODO: copy bind_programs.h to make the cbrush void bind_cbrush(py::module& m) { - py::class_(m, "CBrush") + py::class_>(m, "BrushRegressor") .def(py::init([]() - { br::CBrush est; return est; })) + { br::CBrush est; return est; })) ; } \ No newline at end of file diff --git a/src/cbrush.cpp b/src/cbrush.cpp index 137f0238..d6f20c72 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -5,12 +5,14 @@ namespace Brush{ /// @brief initialize Feat object for fitting. -void CBrush::init() +template +void CBrush::init() { if (params.n_jobs!=0) // TODO: change this to set taskflow jobs omp_set_num_threads(params.n_jobs); r.set_seed(params.random_state); + // set up the pop, variator, etc set_is_fitted(false); // TODO: implement stuff below @@ -28,74 +30,32 @@ void CBrush::init() // TODO: initialize dataset and search space here or inside fit? } -void CBrush::run_generation(unsigned int g, +template +void CBrush::run_generation(unsigned int g, vector survivors, Dataset &d, float fraction, unsigned& stall_count) { - // d.t->set_protected_groups(); + params.current_gen = g; - // params.set_current_gen(g); - - // // select parents - // logger.log("selection..", 2); + // select parents // vector parents = selector.select(pop, params, *d.t); - // logger.log("parents:\n"+pop.print_eqns(), 3); // // variation to produce offspring - // logger.log("variation...", 2); // variator.vary(pop, parents, params,*d.t); - // logger.log("offspring:\n" + pop.print_eqns(true), 3); // // evaluate offspring - // logger.log("evaluating offspring...", 2); // evaluator.fitness(pop.individuals, *d.t, params, true); // evaluator.validation(pop.individuals, *d.v, params, true); // // select survivors from combined pool of parents and offspring - // logger.log("survival...", 2); // survivors = survivor.survive(pop, params, *d.t); // // reduce population to survivors - // logger.log("shrinking pop to survivors...",2); // pop.update(survivors); - // logger.log("survivors:\n" + pop.print_eqns(), 3); - // logger.log("update best...",2); // bool updated_best = update_best(d); - - // logger.log("calculate stats...",2); - // calculate_stats(d); - - // if (params.max_stall > 0) - // update_stall_count(stall_count, updated_best); - - // logger.log("update archive...",2); - // if (use_arch) - // archive.update(pop,params); - - // if(params.verbosity>1) - // print_stats(log, fraction); - // else if(params.verbosity == 1) - // printProgress(fraction); - - // if (!logfile.empty()) - // log_stats(log); - - // if (save_pop > 1) - // pop.save(this->logfile+".pop.gen" + - // to_string(params.current_gen) + ".json"); - - // // tighten learning rate for grad descent as evolution progresses - // if (params.backprop) - // { - // params.bp.learning_rate = \ - // (1-1/(1+float(params.gens)))*params.bp.learning_rate; - // logger.log("learning rate: " - // + std::to_string(params.bp.learning_rate),3); - // } - // logger.log("finished with generation...",2); } } \ No newline at end of file diff --git a/src/cbrush.h b/src/cbrush.h index 1d4ac81d..d46b68ea 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -7,22 +7,24 @@ license: GNU/GPL v3 #define CBrush_H #include "init.h" +#include "population.h" #include "params.h" #include "selection/selection.h" #include "./util/rnd.h" -#include "population.h" #include "taskflow/taskflow.hpp" // TODO: improve the includes (why does this lines below does not work?) // #include "variation.h" // #include "selection.h" -// using namespace selection; -// using namespace variation; namespace Brush { +using namespace Pop; +// using namespace variation; + +template class CBrush{ public: CBrush(){ params = Parameters(); }; @@ -123,6 +125,11 @@ class CBrush{ private: Parameters params; ///< hyperparameters of brush + Population pop; ///< population of programs + // Selection selector; ///< selection algorithm + // Variation variator; ///< variation operators + // Selection survivor; ///< survival algorithm + // TODO // attributes (hyperparameters) // update best diff --git a/src/params.h b/src/params.h index a12bbcd4..07734dac 100644 --- a/src/params.h +++ b/src/params.h @@ -17,16 +17,17 @@ struct Parameters public: // TODO: setters and getters for all parameters? (and do checks in setters?) - // TODO: attribute current_gen - // settings int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) //int verbosity = 0; // TODO: implement log and verbosity + // TODO: every parameter should have a default value // TODO: python wrapper should have getters and setters for all this stuff // Evolutionary stuff string mode="regression"; + unsigned int current_gen = 1; + int pop_size = 100; int gens = 100; unsigned int max_depth=10; @@ -38,8 +39,8 @@ struct Parameters int num_islands=5; // variation - std::map mutation_probs; // TODO: should be an map - float cx_prob; ///< cross rate for variation + std::map mutation_probs; + float cx_prob=0.2; ///< cross rate for variation float mig_prob = 0.05; string scorer_; ///< actual loss function used, determined by error From 9ea4bb4515cdeb69f6a08fd9166152db9ee9872d Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 13:51:50 -0500 Subject: [PATCH 086/199] Generation basic operations almost done (evaluator is missing) --- src/cbrush.cpp | 16 ++++++++-------- src/cbrush.h | 16 ++++++++++++---- src/population.h | 3 +++ src/selection/nsga2.cpp | 1 + src/selection/selection.h | 4 +++- src/variation.h | 18 +++++++++--------- 6 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/cbrush.cpp b/src/cbrush.cpp index d6f20c72..0afac76e 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -15,6 +15,8 @@ void CBrush::init() // set up the pop, variator, etc set_is_fitted(false); + // TODO: INIT SEARCH SPACE AND VARIATION HERE + // TODO: implement stuff below // // start the clock // timer.Reset(); @@ -33,27 +35,25 @@ void CBrush::init() template void CBrush::run_generation(unsigned int g, vector survivors, - Dataset &d, + Dataset &data, float fraction, unsigned& stall_count) { params.current_gen = g; // select parents - // vector parents = selector.select(pop, params, *d.t); + vector parents = selector.select(pop, pop.get_island_range(0), data); // // variation to produce offspring - // variator.vary(pop, parents, params,*d.t); + variator.vary(pop, pop.get_island_range(0), parents); - // // evaluate offspring - // evaluator.fitness(pop.individuals, *d.t, params, true); - // evaluator.validation(pop.individuals, *d.v, params, true); + // TODO: needs to create the evaluator // // select survivors from combined pool of parents and offspring - // survivors = survivor.survive(pop, params, *d.t); + survivors = survivor.survive(pop, pop.get_island_range(0), data); // // reduce population to survivors - // pop.update(survivors); + pop.update(survivors); // bool updated_best = update_best(d); } diff --git a/src/cbrush.h b/src/cbrush.h index d46b68ea..8fa91407 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -22,12 +22,19 @@ namespace Brush { using namespace Pop; +using namespace Sel; + // using namespace variation; template class CBrush{ public: - CBrush(){ params = Parameters(); }; + CBrush() + : params(Parameters()) + , ss(SearchSpace()) + , variator(Variation(params, ss)) + {}; + ~CBrush(){}; void init(); @@ -124,11 +131,12 @@ class CBrush{ unsigned& stall_count); private: Parameters params; ///< hyperparameters of brush + SearchSpace ss; Population pop; ///< population of programs - // Selection selector; ///< selection algorithm - // Variation variator; ///< variation operators - // Selection survivor; ///< survival algorithm + Selection selector; ///< selection algorithm + Variation variator; ///< variation operators + Selection survivor; ///< survival algorithm // TODO // attributes (hyperparameters) diff --git a/src/population.h b/src/population.h index 720350ed..174d42a4 100644 --- a/src/population.h +++ b/src/population.h @@ -34,6 +34,9 @@ class Population{ /// returns population size int size() { return individuals.size(); }; + tuple get_island_range(int island) { + return island_ranges.at(island); }; + /// update individual vector size, distributing the expressions in n_islands void prep_offspring_slots(); diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index dd6c4463..cbaa56ca 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -15,6 +15,7 @@ size_t NSGA2::tournament(vector>& pop, size_t i, size_t j) cons Individual& ind1 = pop.at(i); Individual& ind2 = pop.at(j); + // TODO: implement this int flag = ind1.check_dominance(ind2); if (flag == 1) // ind1 dominates ind2 diff --git a/src/selection/selection.h b/src/selection/selection.h index 829f8444..13c9a5c3 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -9,12 +9,14 @@ license: GNU/GPL v3 #include "../init.h" #include "../params.h" #include "../population.h" +#include "../variation.h" namespace Brush { namespace Sel { using namespace Brush; using namespace Pop; +using namespace Var; /*! * @class SelectionOperator @@ -47,7 +49,7 @@ class SelectionOperator } }; -struct Parameters; // forward declaration of Parameters +// struct Parameters; // forward declaration of Parameters /*! * @class Selection diff --git a/src/variation.h b/src/variation.h index 302386d2..13cda4b6 100644 --- a/src/variation.h +++ b/src/variation.h @@ -95,12 +95,7 @@ class MutationBase { size_t max_depth_; }; -// TODO: make crossover and mutation private functions of a variation class -// variation class should get params as argument // TODO: make sure every method doesnt store information, instead they retrieve it from parameters (so there's no side effect) -// TODO: implement migration as a variation method? -// TODO: delete previous mutation and crossover, and use just the variation class (implement the log for having the mutation trace) - // A BANDIT WOULD GO HERE INSIDE VARIATION (or population?) template class Variation @@ -113,20 +108,25 @@ class Variation std::optional> mutate(const Program& parent); public: - Variation(const Parameters& params, const SearchSpace& ss) + Variation() = default; + + Variation(Parameters& params, SearchSpace& ss) : parameters(params) , search_space(ss) {}; - ~Variation(); + ~Variation() {}; + + void init(Parameters& params, SearchSpace& ss){ + parameters = params; + search_space = ss; + }; /// method to handle variation of population void vary(Population& pop, tuple island_range, const vector& parents); }; - - } //namespace Var } //namespace Brush #endif \ No newline at end of file From 5e862b388f28778d0c3043bd52db391f9d7b3db6 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 8 Nov 2023 14:14:23 -0500 Subject: [PATCH 087/199] Starting to implement evaluation --- src/cbrush.cpp | 5 +- src/cbrush.h | 7 +- src/eval/evaluation.cpp | 241 ++++++++++++++++++++++++++++++++++++++++ src/eval/evaluation.h | 55 +++++++++ src/eval/metrics.h | 0 src/eval/scorer.h | 0 src/evaluator.cpp | 98 ---------------- src/evaluator.h | 52 --------- 8 files changed, 306 insertions(+), 152 deletions(-) create mode 100644 src/eval/evaluation.cpp create mode 100644 src/eval/evaluation.h create mode 100644 src/eval/metrics.h create mode 100644 src/eval/scorer.h delete mode 100644 src/evaluator.cpp delete mode 100644 src/evaluator.h diff --git a/src/cbrush.cpp b/src/cbrush.cpp index 0afac76e..75b77fea 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -39,6 +39,7 @@ void CBrush::run_generation(unsigned int g, float fraction, unsigned& stall_count) { + // TODO: implement custom behavior for first generation (specially regarding evaluator) params.current_gen = g; // select parents @@ -47,7 +48,9 @@ void CBrush::run_generation(unsigned int g, // // variation to produce offspring variator.vary(pop, pop.get_island_range(0), parents); - // TODO: needs to create the evaluator + // TODO: needs to create the evaluator (and calculate the information on train and validation partition) + evaluator.fitness(pop.individuals, data, params, true); + evaluator.validation(pop.individuals, data, params, true); // // select survivors from combined pool of parents and offspring survivors = survivor.survive(pop, pop.get_island_range(0), data); diff --git a/src/cbrush.h b/src/cbrush.h index 8fa91407..7d567e10 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -9,6 +9,7 @@ license: GNU/GPL v3 #include "init.h" #include "population.h" #include "params.h" +#include "./eval/evaluation.h" #include "selection/selection.h" #include "./util/rnd.h" #include "taskflow/taskflow.hpp" @@ -23,6 +24,7 @@ namespace Brush using namespace Pop; using namespace Sel; +using namespace Eval; // using namespace variation; @@ -135,17 +137,20 @@ class CBrush{ Population pop; ///< population of programs Selection selector; ///< selection algorithm + Evaluation evaluator; ///< evaluation code Variation variator; ///< variation operators Selection survivor; ///< survival algorithm + // TODO: MISSING CLASSES: timer, archive, logger + // TODO - // attributes (hyperparameters) // update best // calculate/print stats }; int main(){ + // TODO: USE TASKFLOW TO DO THE ISLAND STUFF tf::Executor executor; tf::Taskflow taskflow; diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp new file mode 100644 index 00000000..1fad9fdf --- /dev/null +++ b/src/eval/evaluation.cpp @@ -0,0 +1,241 @@ +// /* FEAT +// copyright 2017 William La Cava +// license: GNU/GPL v3 +// */ + +// #include "evaluation.h" + +// // code to evaluate GP programs. +// namespace FT{ + +// using namespace Opt; + +// namespace Eval{ + +// Evaluation::Evaluation(string scorer): S(scorer) +// { +// this->S.set_scorer(scorer); +// } + +// Evaluation::~Evaluation(){} + +// void Evaluation::validation(vector& individuals, +// const Data& d, +// const Parameters& params, +// bool offspring +// ) +// { +// unsigned start =0; +// if (offspring) +// start = individuals.size()/2; + +// // loop through individuals +// /* #pragma omp parallel for */ +// for (unsigned i = start; i yhat = ind.predict(d); +// // assign aggregate fitness +// logger.log("Assigning fitness to ind " + to_string(i) +// + ", eqn: " + ind.get_eqn(), 3); + +// if (!pass) +// { + +// ind.fitness_v = MAX_FLT; +// } +// else +// { +// // assign fitness to individual +// VectorXf loss; +// ind.fitness_v = this->S.score(d.y, yhat, loss, +// params.class_weights); +// } +// } +// } +// // fitness of population +// void Evaluation::fitness(vector& individuals, +// const Data& d, +// const Parameters& params, +// bool offspring) +// { +// /*! +// * @param individuals: population +// * @param d: Data structure +// * @param params: algorithm parameters +// * @param offspring: if true, only evaluate last half of population + +// * Output + +// * individuals.fitness, yhat, error is modified +// */ + +// unsigned start =0; +// if (offspring) start = individuals.size()/2; + +// /* for (unsigned i = start; i yhat = ind.fit(d,params,pass); +// // assign F and aggregate fitness +// logger.log("Assigning fitness to ind " + to_string(i) +// + ", eqn: " + ind.get_eqn(), 3); + +// if (!pass) +// { + +// ind.fitness = MAX_FLT; +// ind.error = MAX_FLT*VectorXf::Ones(d.y.size()); +// } +// else +// { +// // assign weights to individual +// assign_fit(ind,yhat,d,params,false); + + +// if (params.hillclimb) +// { +// HillClimb hc(params.scorer_, params.hc.iters, +// params.hc.step); +// bool updated = false; +// shared_ptr yhat2 = hc.run(ind, d, params, +// updated); +// // update the fitness of this individual +// if (updated) +// { +// assign_fit(ind, yhat2, d, params); +// } + +// } +// } +// } +// } + +// // assign fitness to program +// void Evaluation::assign_fit(Individual& ind, +// const shared_ptr& yhat, const Data& d, +// const Parameters& params, bool val) +// { +// /*! +// * assign raw errors and aggregate fitnesses to individuals. +// * +// * Input: +// * +// * ind: individual +// * yhat: predicted output of ind +// * d: data +// * params: feat parameters +// * +// * Output: +// * +// * modifies individual metrics +// */ +// VectorXf loss; +// float f = S.score(d.y, yhat, loss, params.class_weights); +// //TODO: add if condition for this +// float fairness = marginal_fairness(loss, d, f); + +// if (fairness <0 ) +// { +// cout << "fairness is " << fairness << "...\n"; +// } +// if (val) +// { +// ind.fitness_v = f; +// ind.fairness_v = fairness; +// } +// else +// { +// ind.fitness = f; +// ind.fairness = fairness; +// ind.error = loss; +// } + +// logger.log("ind " + std::to_string(ind.id) + " fitness: " +// + std::to_string(ind.fitness),3); +// } + +// float Evaluation::marginal_fairness(VectorXf& loss, const Data& d, +// float base_score, bool use_alpha) +// { +// // averages the deviation of the loss function from average loss +// // over k +// float avg_score = 0; +// float count = 0; +// float alpha = 1; + +// ArrayXb x_idx; + +// for (const auto& pl : d.protect_levels) +// { +// for (const auto& lvl : pl.second) +// { +// x_idx = (d.X.row(pl.first).array() == lvl); +// float len_g = x_idx.count(); +// if (use_alpha) +// alpha = len_g/d.X.cols(); +// /* cout << "alpha = " << len_g << "/" +// * << d.X.cols() << endl; */ +// float Beta = fabs(base_score - +// x_idx.select(loss,0).sum()/len_g); +// /* cout << "Beta = |" << base_score << " - " */ +// /* << x_idx.select(loss,0).sum() << "/" */ +// /* << len_g << "|" << endl; */ +// avg_score += alpha * Beta; +// ++count; +// } + +// } +// avg_score /= count; +// if (std::isinf(avg_score) +// || std::isnan(avg_score) +// || avg_score < 0) +// return MAX_FLT; + +// return avg_score; + +// } +// } +// } diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h new file mode 100644 index 00000000..b9dc1e1a --- /dev/null +++ b/src/eval/evaluation.h @@ -0,0 +1,55 @@ + +#ifndef EVALUATION_H +#define EVALUATION_H + +#include + +#include "../search_space.h" +#include "../individual.h" +#include "../program/program.h" +#include "../data/data.h" + +using std::string; + +namespace Brush { + +using namespace Pop; + +namespace Eval { + +template +class Evaluation { +public: + Evaluation(string scorer=""); + ~Evaluation(); + + // TODO: IMPLEMENT THIS + /// validation of population. + void validation(vector>& individuals, + const Dataset& data, + const Parameters& params, + bool offspring = false + ); + + + // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING + + /// fitness of population. + void fitness(vector>& individuals, + const Dataset& data, + const Parameters& params, + bool offspring = false + ); + + // TODO: implement other eval methods + /// assign fitness to an individual. + // void assign_fit(Individual& ind, + // const Dataset& data, + // const Parameters& params,bool val=false); + + // Scorer S; +}; + +} //selection +} //brush +#endif diff --git a/src/eval/metrics.h b/src/eval/metrics.h new file mode 100644 index 00000000..e69de29b diff --git a/src/eval/scorer.h b/src/eval/scorer.h new file mode 100644 index 00000000..e69de29b diff --git a/src/evaluator.cpp b/src/evaluator.cpp deleted file mode 100644 index 4e9e6b5f..00000000 --- a/src/evaluator.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Brush -copyright 2020 William La Cava -license: GNU/GPL v3 -*/ - -#include "evaluator.h" -namespace Brush -{ - -// template<> auto OPERON_EXPORT -// MinimumDescriptionLengthEvaluator::operator()(Operon::RandomGenerator& rng, Individual& ind, Operon::Span buf) const -> typename EvaluatorBase::ReturnType { -// auto const& problem = Evaluator::GetProblem(); -// auto const range = problem.TrainingRange(); -// auto const& dataset = problem.GetDataset(); -// auto const& nodes = ind.Genotype.Nodes(); -// auto const& dtable = Evaluator::GetDispatchTable(); - -// auto const* optimizer = Evaluator::GetOptimizer(); -// EXPECT(optimizer != nullptr); - -// // this call will optimize the tree coefficients and compute the SSE -// auto const& tree = ind.Genotype; -// Operon::Interpreter interpreter{dtable, dataset, ind.Genotype}; -// auto summary = optimizer->Optimize(rng, tree); -// auto parameters = summary.Success ? summary.FinalParameters : tree.GetCoefficients(); -// auto const p { static_cast(parameters.size()) }; - -// std::vector buffer; -// if (buf.size() < range.Size()) { -// buffer.resize(range.Size()); -// buf = Operon::Span(buffer); -// } -// interpreter.Evaluate(parameters, range, buf); - -// auto estimatedValues = buf; -// auto targetValues = problem.TargetValues(range); - -// // codelength of the complexity -// // count number of unique functions -// // - count weight * variable as three nodes -// // - compute complexity c of the remaining numerical values -// // (that are not part of the coefficients that are optimized) -// Operon::Set uniqueFunctions; // to count the number of unique functions -// auto k{0.0}; // number of nodes -// auto cComplexity { 0.0 }; - -// // codelength of the parameters -// Eigen::Matrix j = interpreter.JacRev(parameters, range); // jacobian -// auto fm = optimizer->ComputeFisherMatrix(estimatedValues, {j.data(), static_cast(j.size())}, sigma_); -// auto ii = fm.diagonal().array(); -// ENSURE(ii.size() == p); - -// auto cParameters { 0.0 }; -// auto constexpr eps = std::numeric_limits::epsilon(); // machine epsilon for zero comparison - -// for (auto i = 0, j = 0; i < std::ssize(nodes); ++i) { -// auto const& n = nodes[i]; - -// // count the number of nodes and the number of unique operators -// k += n.IsVariable() ? 3 : 1; -// uniqueFunctions.insert(n.HashValue); - -// if (n.Optimize) { -// // this branch computes the description length of the parameters to be optimized -// auto const di = std::sqrt(12 / ii(j)); -// auto const ci = std::abs(parameters[j]); - -// if (!(std::isfinite(ci) && std::isfinite(di)) || ci / di < 1) { -// //ind.Genotype[i].Optimize = false; -// //auto const v = ind.Genotype[i].Value; -// //ind.Genotype[i].Value = 0; -// //auto fit = (*this)(rng, ind, buf); -// //ind.Genotype[i].Optimize = true; -// //ind.Genotype[i].Value = v; -// //return fit; -// } else { -// cParameters += 0.5 * std::log(ii(j)) + std::log(ci); -// } -// ++j; -// } else { -// // this branch computes the description length of the remaining tree structure -// if (std::abs(n.Value) < eps) { continue; } -// cComplexity += std::log(std::abs(n.Value)); -// } -// } - -// auto q { static_cast(uniqueFunctions.size()) }; -// if (q > 0) { cComplexity += static_cast(k) * std::log(q); } - -// cParameters -= p/2 * std::log(3); - -// auto cLikelihood = optimizer->ComputeLikelihood(estimatedValues, targetValues, sigma_); -// auto mdl = cComplexity + cParameters + cLikelihood; -// if (!std::isfinite(mdl)) { mdl = EvaluatorBase::ErrMax; } -// return typename EvaluatorBase::ReturnType { static_cast(mdl) }; -// } - -} // namespace Brush diff --git a/src/evaluator.h b/src/evaluator.h deleted file mode 100644 index cee3b6ae..00000000 --- a/src/evaluator.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Brush -copyright 2020 William La Cava -license: GNU/GPL v3 -*/ - -#ifndef EVALUATOR_H -#define EVALUATOR_H - -//internal includes -#include "init.h" -#include "program/node.h" -#include "program/nodetype.h" -#include "program/tree_node.h" -// #include "program/program.h" -#include "util/utils.h" -#include "util/rnd.h" -#include "params.h" -#include -#include - -using namespace Brush::Data; -using namespace Brush::Util; -using Brush::Node; -using Brush::DataType; -using std::type_index; - -namespace Brush -{ - -// template -// class OPERON_EXPORT MinimumDescriptionLengthEvaluator final : public Evaluator { -// using Base = Evaluator; - -// public: -// explicit MinimumDescriptionLengthEvaluator(Operon::Problem& problem, DTable const& dtable) -// : Base(problem, dtable, sse_) -// , sigma_(1, 1) // assume unit variance by default -// { -// } - -// auto SetSigma(std::vector sigma) { sigma_ = std::move(sigma); } - -// auto -// operator()(Operon::RandomGenerator& /*random*/, Individual& ind, Operon::Span buf) const -> typename EvaluatorBase::ReturnType override; - -// private: -// Operon::SSE sse_; -// mutable std::vector sigma_; -// }; - -} // namespace Brush -#endif From 9f7f11cead366a669e5a97bbe8a6e762720ce5fe Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 9 Nov 2023 14:38:49 -0500 Subject: [PATCH 088/199] Evaluation implemented --- src/eval/evaluation.cpp | 367 ++++++++++++++-------------------------- src/eval/evaluation.h | 32 ++-- src/eval/metrics.cpp | 20 +++ src/eval/metrics.h | 20 +++ src/eval/scorer.h | 65 +++++++ src/individual.h | 3 +- src/population.cpp | 5 + src/variation.cpp | 11 +- 8 files changed, 266 insertions(+), 257 deletions(-) create mode 100644 src/eval/metrics.cpp diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index 1fad9fdf..ffeec6c1 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -1,241 +1,130 @@ -// /* FEAT -// copyright 2017 William La Cava -// license: GNU/GPL v3 -// */ - -// #include "evaluation.h" - -// // code to evaluate GP programs. -// namespace FT{ - -// using namespace Opt; - -// namespace Eval{ - -// Evaluation::Evaluation(string scorer): S(scorer) -// { -// this->S.set_scorer(scorer); -// } - -// Evaluation::~Evaluation(){} - -// void Evaluation::validation(vector& individuals, -// const Data& d, -// const Parameters& params, -// bool offspring -// ) -// { -// unsigned start =0; -// if (offspring) -// start = individuals.size()/2; - -// // loop through individuals -// /* #pragma omp parallel for */ -// for (unsigned i = start; i yhat = ind.predict(d); -// // assign aggregate fitness -// logger.log("Assigning fitness to ind " + to_string(i) -// + ", eqn: " + ind.get_eqn(), 3); - -// if (!pass) -// { - -// ind.fitness_v = MAX_FLT; -// } -// else -// { -// // assign fitness to individual -// VectorXf loss; -// ind.fitness_v = this->S.score(d.y, yhat, loss, -// params.class_weights); -// } -// } -// } -// // fitness of population -// void Evaluation::fitness(vector& individuals, -// const Data& d, -// const Parameters& params, -// bool offspring) -// { -// /*! -// * @param individuals: population -// * @param d: Data structure -// * @param params: algorithm parameters -// * @param offspring: if true, only evaluate last half of population - -// * Output - -// * individuals.fitness, yhat, error is modified -// */ - -// unsigned start =0; -// if (offspring) start = individuals.size()/2; - -// /* for (unsigned i = start; i yhat = ind.fit(d,params,pass); -// // assign F and aggregate fitness -// logger.log("Assigning fitness to ind " + to_string(i) -// + ", eqn: " + ind.get_eqn(), 3); - -// if (!pass) -// { - -// ind.fitness = MAX_FLT; -// ind.error = MAX_FLT*VectorXf::Ones(d.y.size()); -// } -// else -// { -// // assign weights to individual -// assign_fit(ind,yhat,d,params,false); - - -// if (params.hillclimb) -// { -// HillClimb hc(params.scorer_, params.hc.iters, -// params.hc.step); -// bool updated = false; -// shared_ptr yhat2 = hc.run(ind, d, params, -// updated); -// // update the fitness of this individual -// if (updated) -// { -// assign_fit(ind, yhat2, d, params); -// } - -// } -// } -// } -// } +#include "evaluation.h" + +namespace Brush{ +namespace Eval{ + + +template +void Evaluation::validation(Population& pop, + tuple island_range, + const Dataset& data, + const Parameters& params, + bool offspring + ) +{ + // if offspring false --> if has offspring, do it on first half. else, do on entire island + // offspring true --> assert that has offspring, do it on the second half of the island + + auto [idx_start, idx_end] = island_range; + size_t delta = idx_end - idx_start; + if (offspring) + { + assert(pop.offspring_ready + && ("Population does not have offspring to calculate validation fitness")); + + idx_start = idx_start + (delta/2); + } + else if (pop.offspring_ready) // offspring is false. We need to see where we sould stop + { + idx_end = idx_end - (delta/2); + } + + for (unsigned i = idx_start; i& ind = pop[i]; -// // assign fitness to program -// void Evaluation::assign_fit(Individual& ind, -// const shared_ptr& yhat, const Data& d, -// const Parameters& params, bool val) -// { -// /*! -// * assign raw errors and aggregate fitnesses to individuals. -// * -// * Input: -// * -// * ind: individual -// * yhat: predicted output of ind -// * d: data -// * params: feat parameters -// * -// * Output: -// * -// * modifies individual metrics -// */ -// VectorXf loss; -// float f = S.score(d.y, yhat, loss, params.class_weights); -// //TODO: add if condition for this -// float fairness = marginal_fairness(loss, d, f); + // if there is no validation data, + // set fitness_v to fitness and return ( this assumes that fitness on train was calculated previously.) + if (!data.use_validation) + { + ind.fitness_v = ind.fitness; + continue; + } + + bool pass = true; + + if (!pass) + { + // TODO: stop doing this hardcoded? + ind.fitness_v = MAX_FLT; + } + else + { + // TODO: implement the class weights and use it here (and on fitness) + auto y_pred = ind.predict(data.get_validation_data); + assign_fit(ind, y_pred, data, params, true); + } + } +} + +// fitness of population +template +void Evaluation::fitness(Population& pop, + tuple island_range, + const Dataset& data, + const Parameters& params, + bool offspring + ) +{ + // if offspring false --> if has offspring, do it on first half. else, do on entire island + // offspring true --> assert that has offspring, do it on the second half of the island + + auto [idx_start, idx_end] = island_range; + size_t delta = idx_end - idx_start; + if (offspring) + { + assert(pop.offspring_ready + && ("Population does not have offspring to calculate validation fitness")); + + idx_start = idx_start + (delta/2); + } + else if (pop.offspring_ready) // offspring is false. We need to see where we sould stop + { + idx_end = idx_end - (delta/2); + } + + for (unsigned i = idx_start; i& ind = pop[i]; + + bool pass = true; + + if (!pass) + { + ind.fitness = MAX_FLT; + ind.error = MAX_FLT*VectorXf::Ones(data.y.size()); + } + else + { + // assign weights to individual + ind.fit(data); -// if (fairness <0 ) -// { -// cout << "fairness is " << fairness << "...\n"; -// } -// if (val) -// { -// ind.fitness_v = f; -// ind.fairness_v = fairness; -// } -// else -// { -// ind.fitness = f; -// ind.fairness = fairness; -// ind.error = loss; -// } - -// logger.log("ind " + std::to_string(ind.id) + " fitness: " -// + std::to_string(ind.fitness),3); -// } - -// float Evaluation::marginal_fairness(VectorXf& loss, const Data& d, -// float base_score, bool use_alpha) -// { -// // averages the deviation of the loss function from average loss -// // over k -// float avg_score = 0; -// float count = 0; -// float alpha = 1; - -// ArrayXb x_idx; - -// for (const auto& pl : d.protect_levels) -// { -// for (const auto& lvl : pl.second) -// { -// x_idx = (d.X.row(pl.first).array() == lvl); -// float len_g = x_idx.count(); -// if (use_alpha) -// alpha = len_g/d.X.cols(); -// /* cout << "alpha = " << len_g << "/" -// * << d.X.cols() << endl; */ -// float Beta = fabs(base_score - -// x_idx.select(loss,0).sum()/len_g); -// /* cout << "Beta = |" << base_score << " - " */ -// /* << x_idx.select(loss,0).sum() << "/" */ -// /* << len_g << "|" << endl; */ -// avg_score += alpha * Beta; -// ++count; -// } - -// } -// avg_score /= count; -// if (std::isinf(avg_score) -// || std::isnan(avg_score) -// || avg_score < 0) -// return MAX_FLT; - -// return avg_score; - -// } -// } -// } + auto y_pred = ind.predict(data.get_training_data); + assign_fit(ind, y_pred, data, params, false); + } + } +} + +// assign fitness to program +template +void Evaluation::assign_fit(Individual& ind, + VectorXf& y_pred, const Dataset& data, + const Parameters& params, bool val) +{ + VectorXf loss; + + float f = S.score(data.y, y_pred, loss, params.class_weights); + + if (val) + { + ind.fitness_v = f; + } + else + { + ind.fitness = f; + ind.error = loss; + } +} + +} // Pop +} // Brush \ No newline at end of file diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index b9dc1e1a..02885ba8 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -8,6 +8,8 @@ #include "../individual.h" #include "../program/program.h" #include "../data/data.h" +#include "scorer.h" +#include "../population.h" using std::string; @@ -20,34 +22,36 @@ namespace Eval { template class Evaluation { public: - Evaluation(string scorer=""); - ~Evaluation(); + Scorer S; + + Evaluation(string scorer="mse"): S(scorer) { this->S.set_scorer(scorer); }; + ~Evaluation(){}; // TODO: IMPLEMENT THIS /// validation of population. - void validation(vector>& individuals, + void validation(Population& pop, + tuple island_range, const Dataset& data, const Parameters& params, bool offspring = false ); - - // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING + // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps) /// fitness of population. - void fitness(vector>& individuals, - const Dataset& data, - const Parameters& params, - bool offspring = false - ); + void fitness(Population& pop, + tuple island_range, + const Dataset& data, + const Parameters& params, + bool offspring = false + ); // TODO: implement other eval methods + /// assign fitness to an individual. - // void assign_fit(Individual& ind, - // const Dataset& data, - // const Parameters& params,bool val=false); + void assign_fit(Individual& ind, VectorXf& y_pred, + const Dataset& data, const Parameters& params, bool val=false); - // Scorer S; }; } //selection diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp new file mode 100644 index 00000000..8375dc26 --- /dev/null +++ b/src/eval/metrics.cpp @@ -0,0 +1,20 @@ +#include "metrics.h" + +namespace Brush { +namespace Eval { + +/* Scoring functions */ + +/// mean squared error +float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, + const vector& weights) +{ + loss = (yhat - y).array().pow(2); + return loss.mean(); +} + + +// TODO: implement other metrics. Right know I have just the MSE + +} // metrics +} // Brush \ No newline at end of file diff --git a/src/eval/metrics.h b/src/eval/metrics.h index e69de29b..e640d19c 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -0,0 +1,20 @@ +#ifndef METRICS_H +#define METRICS_H + +#include "../data/data.h" + +namespace Brush { +namespace Eval { + +/* Scoring functions */ + +/// mean squared error +float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, + const vector& weights=vector() ); + +// TODO: implement other metrics. Right know I have just the MSE + +} // metrics +} // Brush + +#endif \ No newline at end of file diff --git a/src/eval/scorer.h b/src/eval/scorer.h index e69de29b..eb3f2298 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -0,0 +1,65 @@ +#ifndef SCORER_H +#define SCORER_H + +#include "metrics.h" +#include "../util/error.h" + +// code to evaluate GP programs. +namespace Brush{ +namespace Eval{ + +typedef float (*funcPointer)(const VectorXf&, + const VectorXf&, + VectorXf&, + const vector&); + +class Scorer +{ +public: + // map the string into a function to be called when calculating the score + std::map score_hash; + string scorer; + + // TODO: add more scores, include them here, add to score_hash + Scorer(string scorer="mse") { + score_hash["mse"] = &mse; + + this->set_scorer(scorer); + }; + + void set_scorer(string scorer){ this->scorer = scorer; }; + + /* void set_scorer(string scorer); */ + float score(const VectorXf& y_true, VectorXf& y_pred, + VectorXf& loss, const vector& w) + { + // loss is an array passed by reference to store each prediction (used in lexicase) + // weights are used to give more or less importance for a given sample. + // Every scorer must have the same function signature, but arent required to use all info + + if ( score_hash.find(this->scorer) == score_hash.end() ) + { + // not found + HANDLE_ERROR_THROW("Scoring function '" + this->scorer + + "' not defined"); + return 0.0; + } + else + { + // found + return score_hash.at(this->scorer)(y_true, y_pred, loss, w); + } + }; + + // overloaded score with no loss + float score(const VectorXf& y_true, VectorXf& y_pred, + vector w=vector()) + { + VectorXf dummy; + return this->score(y_true, y_pred, dummy, w); + }; +}; + +} +} +#endif diff --git a/src/individual.h b/src/individual.h index e13bf1ec..9a72e5d8 100644 --- a/src/individual.h +++ b/src/individual.h @@ -27,11 +27,12 @@ class Individual{ public: Individual() - { // TODO: calculate this stuff + { fitness = -1; fitness_v = -1; complexity=-1; + dcounter=-1; rank=-1; crowd_dist = -1; diff --git a/src/population.cpp b/src/population.cpp index f77e118d..7f60a617 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -90,6 +90,11 @@ void Population::prep_offspring_slots() this->individuals = &expanded_pop; offspring_ready = true; + + // Im keeping the offspring and parents in the same population object, because we + // have operations that require them together (archive, hall of fame.) + // The downside is having to be aware that islands will create offsprings + // intercalated with other islands } template diff --git a/src/variation.cpp b/src/variation.cpp index ff6b69ba..fc9ac2d1 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -578,14 +578,19 @@ void Variation::vary(Population& pop, tuple island_range, auto [idx_start, idx_end] = island_range; size_t delta = idx_end - idx_start; - size_t vary_start = delta/2; + + idx_start = idx_start + (delta/2); // TODO: fix pragma omp usage //#pragma omp parallel for - for (unsigned i = vary_start; i> opt=std::nullopt; // new individual + std::optional> opt=std::nullopt; // new individual + // TODO: do it a certain number of times. after that, assume that variation cant + // change individual and add it to the island failures + // TODO: use island failures everytime that I'm iterating on the offspring of an + // island (with island range) while (!opt) { Individual& mom = pop.individuals.at( From 37952eb0da2f03c4f2094816eeb2d7423d7fca91 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 13 Nov 2023 15:03:32 -0500 Subject: [PATCH 089/199] Setting and using individual objective values --- src/individual.cpp | 54 +++++++++++++++++++++++++++++++++++++++++ src/individual.h | 5 ++++ src/selection/nsga2.cpp | 6 ++++- 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 src/individual.cpp diff --git a/src/individual.cpp b/src/individual.cpp new file mode 100644 index 00000000..becfa42c --- /dev/null +++ b/src/individual.cpp @@ -0,0 +1,54 @@ +#include "individual.h" + +namespace Brush{ +namespace Pop{ + +template +int Individual::check_dominance(const Individual& b) const +{ + int flag1 = 0, // to check if this has a smaller objective + flag2 = 0; // to check if b has a smaller objective + + for (int i=0; i b.obj.at(i)) + flag2 = 1; + } + + if (flag1==1 && flag2==0) + // there is at least one smaller objective for this and none + // for b + return 1; + else if (flag1==0 && flag2==1) + // there is at least one smaller objective for b and none + // for this + return -1; + else + // no smaller objective or both have one smaller + return 0; +} + +template +void Individual::set_obj(const vector& objectives) +{ + obj.clear(); + + for (const auto& n : objectives) + { + // TODO: implement other objectives? + if (n.compare("fitness")==0) + obj.push_back(fitness); // fitness on training data, not validation. + // if you use batch, this value will change every generation + else if (n.compare("complexity")==0) + obj.push_back(set_complexity()); + else if (n.compare("size")==0) + obj.push_back(program.size()); + else + HANDLE_ERROR_THROW(n+" is not a known objective"); + } + +} + +} // Pop +} // Brush \ No newline at end of file diff --git a/src/individual.h b/src/individual.h index 9a72e5d8..f24a93c3 100644 --- a/src/individual.h +++ b/src/individual.h @@ -24,6 +24,7 @@ class Individual{ unsigned int rank; ///< pareto front rank float crowd_dist; ///< crowding distance on the Pareto front + vector obj; ///< objectives for use with Pareto selection public: Individual() @@ -71,6 +72,10 @@ class Individual{ void set_crowd_dist(unsigned cd){ crowd_dist=cd; }; size_t get_crow_dist() const { return crowd_dist; }; + + /// set obj vector given a string of objective names + void set_obj(const vector&); + int check_dominance(const Individual& b) const; }; } // Pop diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index cbaa56ca..66dc14ee 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -62,6 +62,10 @@ vector NSGA2::select(Population& pop, tuple island if (params.current_gen==0) return island_pool; + // setting the objectives + for (unsigned int i=0; i selected(0); for (int i = 0; i < delta; ++i) // selecting based on island_pool size @@ -103,7 +107,7 @@ vector NSGA2::survive(Population& pop, tuple islan vector island_pool(delta); // array with indexes for the specific island_pool std::iota(island_pool.begin(), island_pool.end(), idx_start); - // set objectives + // set objectives (this is when the obj vector is updated.) #pragma omp parallel for for (unsigned int i=0; i Date: Mon, 13 Nov 2023 15:04:29 -0500 Subject: [PATCH 090/199] Evaluation now has the option to not re-fit the individual In case we want to use batch learning --- src/eval/evaluation.cpp | 4 +++- src/eval/evaluation.h | 3 ++- src/params.h | 14 +++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index ffeec6c1..ea221953 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -63,6 +63,7 @@ void Evaluation::fitness(Population& pop, tuple island_range, const Dataset& data, const Parameters& params, + bool fit, bool offspring ) { @@ -97,7 +98,8 @@ void Evaluation::fitness(Population& pop, else { // assign weights to individual - ind.fit(data); + if (fit) + ind.fit(data); auto y_pred = ind.predict(data.get_training_data); assign_fit(ind, y_pred, data, params, false); diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 02885ba8..5ddb141f 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -37,12 +37,13 @@ class Evaluation { ); // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps) - + // TODO: MAKE it work for classification (do I need to have a way to set accuracy as a minimization problem?) /// fitness of population. void fitness(Population& pop, tuple island_range, const Dataset& data, const Parameters& params, + bool fit=true, bool offspring = false ); diff --git a/src/params.h b/src/params.h index 07734dac..11e04c48 100644 --- a/src/params.h +++ b/src/params.h @@ -28,12 +28,12 @@ struct Parameters unsigned int current_gen = 1; - int pop_size = 100; - int gens = 100; - unsigned int max_depth=10; - unsigned int max_size=100; + int pop_size = 100; + int gens = 100; + unsigned int max_depth = 10; + unsigned int max_size = 100; vector objectives{"error","complexity"}; // error should be generic and deducted based on mode - string sel = "nsga2"; //selection method + string sel = "nsga2"; //selection method string surv = "nsga2"; //survival method vector functions; int num_islands=5; @@ -43,7 +43,7 @@ struct Parameters float cx_prob=0.2; ///< cross rate for variation float mig_prob = 0.05; - string scorer_; ///< actual loss function used, determined by error + string scorer_="mse"; ///< actual loss function used, determined by error // for classification (TODO: should I have these, or they could be just dataset arguments (except the ones needed to use in dataset constructor)) unsigned int n_classes; ///< number of classes for classification @@ -51,7 +51,7 @@ struct Parameters vector class_weights; ///< weights for each class vector sample_weights; ///< weights for each sample - // for dataset + // for dataset. TODO: make it work bool shuffle = true; ///< option to shuffle the data float split = 0.75; ///< fraction of data to use for training vector feature_names; ///< names of features From d76cfd00ae04a1189d445f7a8f39e9fb6de183cd Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 13 Nov 2023 15:05:16 -0500 Subject: [PATCH 091/199] Generation and fit methods --- src/cbrush.cpp | 133 +++++++++++++++++++++++++++++++++++++++++++------ src/cbrush.h | 21 ++++---- 2 files changed, 129 insertions(+), 25 deletions(-) diff --git a/src/cbrush.cpp b/src/cbrush.cpp index 75b77fea..18bf123b 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -30,35 +30,136 @@ void CBrush::init() // params.use_batch = params.bp.batch_size>0; // TODO: initialize dataset and search space here or inside fit? + +} + +template +bool CBrush::update_best(const Dataset& data, bool val) +{ + float bs; + bs = this->best_loss; + + float f; + vector>& pop_ref =this->pop.individuals; // TODO: archive here? + + bool updated = false; + + for (const auto& ind: pop_ref) + { + if (ind.rank == 1) + { + if (val) + f = ind.fitness_v; + else + f = ind.fitness; + + if (f < bs + || (f == bs && ind.get_complexity() < this->best_complexity) + ) + { + bs = f; + this->best_ind = ind; + this->best_complexity = ind.get_complexity(); + + updated = true; + } + } + } + + this->best_loss = bs; + + return updated; } + template -void CBrush::run_generation(unsigned int g, - vector survivors, - Dataset &data, - float fraction, - unsigned& stall_count) +void CBrush::run_generation(unsigned int g, Dataset &data) { + // https://taskflow.github.io/taskflow/ParallelIterations.html + tf::Executor executor; + tf::Taskflow taskflow; + // TODO: implement custom behavior for first generation (specially regarding evaluator) params.current_gen = g; - // select parents - vector parents = selector.select(pop, pop.get_island_range(0), data); + auto batch = data.get_batch(); // will return the original dataset if it is set to dont use batch + + vector> island_parents; + island_parents.resize(pop.n_islands); + taskflow.for_each_index(0, pop.n_islands, 1, [&](int island) { + tuple island_range = pop.get_island_range(island); + + // fit the weights with all training data + evaluator.fitness(pop.individuals, island_range, data, params, true, false); + evaluator.validation(pop.individuals, island_range, data, params, false); - // // variation to produce offspring - variator.vary(pop, pop.get_island_range(0), parents); + // TODO: if using batch, fitness should be called before selection to set the batch + if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) + evaluator.fitness(pop.individuals, island_range, batch, params, false, false); - // TODO: needs to create the evaluator (and calculate the information on train and validation partition) - evaluator.fitness(pop.individuals, data, params, true); - evaluator.validation(pop.individuals, data, params, true); + // select parents + vector parents = selector.select(pop, island_range, data); + island_parents.at(island) = parents; + }); + + vector survivors(pop.size()); + pop.prep_offspring_slots(); + + taskflow.for_each_index(0, pop.n_islands, 1, [&](int island) { + tuple island_range = pop.get_island_range(island); + + // // variation to produce offspring + variator.vary(pop, island_range, island_parents.at(island)); + + // TODO: needs to create the evaluator (and calculate the information on train and validation partition) + evaluator.fitness(pop.individuals, island_range, data, params, true, true); + evaluator.validation(pop.individuals, island_range, data, params, true); + + if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) + evaluator.fitness(pop.individuals, island_range, batch, params, false, true); + + // // select survivors from combined pool of parents and offspring + auto island_survivors = survivor.survive(pop, island_range, data); + + auto [idx_start, idx_end] = island_range; + size_t delta = idx_end - idx_start; + for (unsigned i = 0; i +void CBrush::fit(MatrixXf& X, VectorXf& y) +{ + this->init(); + + // TODO: fit method that takes different arguments? + Dataset data(X, y); + + this->ss = SearchSpace(data, params.functions); + this->pop = Population(params.pop_size, params.num_islands); + this->evaluator = Evaluation(params.scorer_); + this->selector = Selection(params.sel, false); + this->survivor = Selection(params.surv, true); + + // initialize population with initial model and/or starting pop + pop.init(this->ss, this->params); + + unsigned g = 0; + // continue until max gens is reached or max_time is up (if it is set) - // bool updated_best = update_best(d); + while(g survivors, - Dataset &d, - float percentage, - unsigned& stall_count); + void run_generation(unsigned int g, Dataset &data); private: Parameters params; ///< hyperparameters of brush SearchSpace ss; @@ -144,16 +143,20 @@ class CBrush{ // TODO: MISSING CLASSES: timer, archive, logger // TODO + // results so far + float best_loss; + int best_complexity; + Individual best_ind; + // update best // calculate/print stats }; int main(){ - - // TODO: USE TASKFLOW TO DO THE ISLAND STUFF - tf::Executor executor; - tf::Taskflow taskflow; + tf::Executor executor; + tf::Taskflow taskflow; + auto [A, B, C, D] = taskflow.emplace( // create four tasks [] () { std::cout << "TaskA\n"; }, [] () { std::cout << "TaskB\n"; }, From 4216e90d3bf300ed04eb812ec0abeb30740df666 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 13 Nov 2023 15:05:45 -0500 Subject: [PATCH 092/199] Moving the python implementation out of cpp src. Starting to rewrite it --- {src/brush => brush}/__init__.py | 0 {src/brush => brush}/pybrush.py | 4 +- brush/versionstr.py | 1 + src/brush/deap_api/__init__.py | 3 - src/brush/deap_api/nsga2.py | 110 ------ src/brush/deap_api/nsga2island.py | 160 -------- src/brush/deap_api/utils.py | 4 - src/brush/estimator.py | 588 ------------------------------ 8 files changed, 4 insertions(+), 866 deletions(-) rename {src/brush => brush}/__init__.py (100%) rename {src/brush => brush}/pybrush.py (87%) create mode 100644 brush/versionstr.py delete mode 100644 src/brush/deap_api/__init__.py delete mode 100644 src/brush/deap_api/nsga2.py delete mode 100644 src/brush/deap_api/nsga2island.py delete mode 100644 src/brush/deap_api/utils.py delete mode 100644 src/brush/estimator.py diff --git a/src/brush/__init__.py b/brush/__init__.py similarity index 100% rename from src/brush/__init__.py rename to brush/__init__.py diff --git a/src/brush/pybrush.py b/brush/pybrush.py similarity index 87% rename from src/brush/pybrush.py rename to brush/pybrush.py index 801d2d1d..cac154c6 100644 --- a/src/brush/pybrush.py +++ b/brush/pybrush.py @@ -1,8 +1,10 @@ -from _brush import CBrush +from _brush import CBrush # TODO: stop calling cbrush from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +# TODO? LOGGER AND ARCHIVE +# TODO: GET DOCUMENTATION BACK class PybrushEstimator(BaseEstimator): def __init__(self): self.cbrush_ = CBrush() diff --git a/brush/versionstr.py b/brush/versionstr.py new file mode 100644 index 00000000..4d9b0682 --- /dev/null +++ b/brush/versionstr.py @@ -0,0 +1 @@ +__version__="i-never-tested-that-thing" \ No newline at end of file diff --git a/src/brush/deap_api/__init__.py b/src/brush/deap_api/__init__.py deleted file mode 100644 index b011dec5..00000000 --- a/src/brush/deap_api/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .nsga2 import nsga2 -from .nsga2island import nsga2island -from .utils import DeapIndividual \ No newline at end of file diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py deleted file mode 100644 index bbeb9e1e..00000000 --- a/src/brush/deap_api/nsga2.py +++ /dev/null @@ -1,110 +0,0 @@ -from deap import tools -from deap.benchmarks.tools import hypervolume -import numpy as np -import functools - - -def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): - # NGEN = 250 - # MU = 100 - # CXPB = 0.9 - # rnd_flt: random number generator to sample crossover prob - - def calculate_statistics(ind): - on_train = ind.fitness.values - on_val = toolbox.evaluateValidation(ind) - - return (*on_train, *on_val) - - stats = tools.Statistics(calculate_statistics) - - stats.register("avg", np.nanmean, axis=0) - stats.register("med", np.nanmedian, axis=0) - stats.register("std", np.nanstd, axis=0) - stats.register("min", np.nanmin, axis=0) - stats.register("max", np.nanmax, axis=0) - - logbook = tools.Logbook() - logbook.header = ['gen', 'evals'] + \ - [f"{stat} {partition} O{objective}" - for stat in ['avg', 'med', 'std', 'min', 'max'] - for partition in ['train', 'val'] - for objective in toolbox.get_objectives()] - - pop = toolbox.population(n=MU) - - # OBS: evaluate calls fit in the individual. It is different from using it to predict. The - # function evaluateValidation don't call the fit - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # This is just to assign the crowding distance to the individuals - # no actual selection is done - pop = toolbox.survive(pop, len(pop)) - - record = stats.compile(pop) - logbook.record(gen=0, evals=len(pop), **record) - - if verbosity > 0: - print(logbook.stream) - - # Begin the generational process - for gen in range(1, NGEN): - batch = toolbox.getBatch() # batch will be a random subset only if it was not defined as the size of the train set. - # everytime this function is called, a new random batch is generated. - if (use_batch): # recalculate the fitness for the parents - # use_batch is false if batch_size is different from train set size. - # If we're using batch, we need to re-evaluate every model (without changing its weights). - # evaluateValidation doesnt fit the weights - fitnesses = toolbox.map( - functools.partial(toolbox.evaluateValidation, data=batch), pop) - - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # Vary the population - # offspring = tools.selTournamentDCD(pop, len(pop)) - parents = toolbox.select(pop, len(pop)) - # offspring = [toolbox.clone(ind) for ind in offspring] - offspring = [] - for ind1, ind2 in zip(parents[::2], parents[1::2]): - off1, off2 = None, None - if rnd_flt() < CXPB: # either mutation or crossover - off1, off2 = toolbox.mate(ind1, ind2) - else: - off1 = toolbox.mutate(ind1) - off2 = toolbox.mutate(ind2) - - if off1 is not None: # Mutation worked. first we fit, then add to offspring - # Evaluate (instead of evaluateValidation) to fit the weights of the offspring - off1.fitness.values = toolbox.evaluate(off1) - if use_batch: # Adjust fitness to the same data as parents - off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) - offspring.extend([off1]) - - if off2 is not None: - off2.fitness.values = toolbox.evaluate(off2) - if use_batch: - off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) - offspring.extend([off2]) - - # Select the next generation population (no sorting before this step, as - # survive==offspring will cut it in half) - pop = toolbox.survive(pop + offspring, MU) - - pop.sort(key=lambda x: x.fitness, reverse=True) - - record = stats.compile(pop) - logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) - - if verbosity > 0: - print(logbook.stream) - - if verbosity > 0: - print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) - - archive = tools.ParetoFront() - archive.update(pop) - - return archive, logbook \ No newline at end of file diff --git a/src/brush/deap_api/nsga2island.py b/src/brush/deap_api/nsga2island.py deleted file mode 100644 index 95de8197..00000000 --- a/src/brush/deap_api/nsga2island.py +++ /dev/null @@ -1,160 +0,0 @@ -from deap import tools -from deap.benchmarks.tools import diversity, convergence, hypervolume -import numpy as np -import functools - - -def nsga2island(toolbox, NGEN, MU, N_ISLANDS, MIGPX, CXPB, use_batch, verbosity, rnd_flt): - # NGEN = 250 - # MU = 100 - # CXPB = 0.9 - # N_ISLANDS: number of independent islands. Islands are controled by indexes. - # setting N_ISLANDS=1 would be the same as the original nsga2 - # rnd_flt: random number generator to sample crossover prob - - def calculate_statistics(ind): - on_train = ind.fitness.values - on_val = toolbox.evaluateValidation(ind) - - return (*on_train, *on_val) - - stats = tools.Statistics(calculate_statistics) - - stats.register("avg", np.mean, axis=0) - stats.register("med", np.median, axis=0) - stats.register("std", np.std, axis=0) - stats.register("min", np.min, axis=0) - stats.register("max", np.max, axis=0) - - logbook = tools.Logbook() - logbook.header = ['gen', 'evals'] + \ - [f"{stat} {partition} O{objective}" - for stat in ['avg', 'med', 'std', 'min', 'max'] - for partition in ['train', 'val'] - for objective in toolbox.get_objectives()] - - # Tuples with start and end indexes for each island. Number of individuals - # in each island can slightly differ if N_ISLANDS is not a divisor of MU - island_indexes = [((i*MU)//N_ISLANDS, ((i+1)*MU)//N_ISLANDS) - for i in range(N_ISLANDS)] - - pop = toolbox.population(n=MU) - - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - survived = [] - for (idx_start, idx_end) in island_indexes: - survived_parents = toolbox.survive(pop[idx_start:idx_end], - idx_end-idx_start) - survived.extend(survived_parents) - pop = survived - - record = stats.compile(pop) - logbook.record(gen=0, evals=len(pop), **record) - - if verbosity > 0: - print(logbook.stream) - - # Begin the generational process - for gen in range(1, NGEN): - batch = toolbox.getBatch() # batch will be a random subset only if it was not - # defined as the size of the train set. everytime - # this function is called, a new random batch is generated. - - if (use_batch): # recalculate the fitness for the parents - # use_batch is false if batch_size is different from train set size. - # If we're using batch, we need to re-evaluate every model (without - # changing its weights). evaluateValidation doesnt fit the weights - fitnesses = toolbox.map( - functools.partial(toolbox.evaluateValidation, data=batch), pop) - - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # Vary the population inside each island - parents = [] - for (idx_start, idx_end) in island_indexes: - island_parents = toolbox.select(pop[idx_start:idx_end], - idx_end-idx_start) - parents.extend(island_parents) - - offspring = [] # Will have the same size as pop - island_failed_variations = [] - for (idx_start, idx_end) in island_indexes: - failed_variations = 0 - for ind1, ind2 in zip(parents[idx_start:idx_end:2], - parents[idx_start+1:idx_end:2] - ): - off1, off2 = None, None - if rnd_flt() < CXPB: # either mutation or crossover - off1, off2 = toolbox.mate(ind1, ind2) - else: - off1 = toolbox.mutate(ind1) - off2 = toolbox.mutate(ind2) - - if off1 is not None: - off1.fitness.values = toolbox.evaluate(off1) - if use_batch: - off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) - offspring.extend([off1]) - else: - failed_variations += 1 - - if off2 is not None: - off2.fitness.values = toolbox.evaluate(off2) - if use_batch: - off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) - offspring.extend([off2]) - else: - failed_variations += 1 - island_failed_variations.append(failed_variations) - - # Evaluate (instead of evaluateValidation) to fit the weights of the offspring - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), offspring) - if (use_batch): #calculating objectives based on batch - fitnesses = toolbox.map( - functools.partial(toolbox.evaluateValidation, data=batch), offspring) - - for ind, fit in zip(offspring, fitnesses): - ind.fitness.values = fit - - # Select the next generation population - new_pop = [] - for i, (idx_start, idx_end) in enumerate(island_indexes): - # original population combined with offspring, taking into account that variations can fail - island_new_pop = toolbox.survive( - pop[idx_start:idx_end] \ - + offspring[ - idx_start-sum(island_failed_variations[:i]):idx_end+island_failed_variations[i] - ], - idx_end-idx_start # number of selected individuals should still the same - ) - new_pop.extend(island_new_pop) - - # Migration to fill up the islands for the next generation - pop = [] - for (idx_start, idx_end) in island_indexes: - other_islands = list(range(0, idx_start)) + list(range(idx_end, MU)) - for idx_individual in range(idx_start, idx_end): - if rnd_flt() < MIGPX: # replace by someone not from the same island - idx_other_individual = other_islands[ - int(rnd_flt() * len(other_islands))] - pop.append(new_pop[idx_other_individual]) - else: - pop.append(new_pop[idx_individual]) - - record = stats.compile(pop) - logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) - - if verbosity > 0: - print(logbook.stream) - - if verbosity > 0: - print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) - - archive = tools.ParetoFront() - archive.update(pop) - - return archive, logbook diff --git a/src/brush/deap_api/utils.py b/src/brush/deap_api/utils.py deleted file mode 100644 index 9a9bdcb3..00000000 --- a/src/brush/deap_api/utils.py +++ /dev/null @@ -1,4 +0,0 @@ -class DeapIndividual(): - """Class that wraps brush program for creator.Individual class from DEAP.""" - def __init__(self, prg): - self.prg = prg \ No newline at end of file diff --git a/src/brush/estimator.py b/src/brush/estimator.py deleted file mode 100644 index c60ffcd7..00000000 --- a/src/brush/estimator.py +++ /dev/null @@ -1,588 +0,0 @@ -""" -sklearn-compatible wrapper for GP analyses. - -See brushgp.cpp for Python (via pybind11) modules that give more fine-grained -control of the underlying GP objects. -""" -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin -from sklearn.utils.validation import check_is_fitted -# from sklearn.metrics import mean_squared_error -import numpy as np -import pandas as pd -# import deap as dp -from deap import algorithms, base, creator, tools -# from tqdm import tqdm -from types import NoneType -from sklearn.metrics import average_precision_score -from sklearn.preprocessing import MinMaxScaler -import _brush -from .deap_api import nsga2, nsga2island, DeapIndividual -# from _brush import Dataset, SearchSpace - - -class BrushEstimator(BaseEstimator): - """ - This is the base class for Brush estimators. - This class shouldn't be called directly; instead, call a child class like - :py:class:`BrushRegressor ` or :py:class:`BrushClassifier `. - All of the shared parameters are documented here. - - Parameters - ---------- - mode : str, default 'classification' - The mode of the estimator. Used by subclasses - pop_size : int, default 100 - Population size. - max_gen : int, default 100 - Maximum iterations of the algorithm. - verbosity : int, default 0 - Controls level of printouts. - max_depth : int, default 0 - Maximum depth of GP trees in the GP program. Use 0 for no limit. - max_size : int, default 0 - Maximum number of nodes in a tree. Use 0 for no limit. - n_islands : int, default 5 - Number of independent islands to use in evolutionary framework. - Ignored if `algorithm!="nsga2island"`. - mig_prob : float, default 0.05 - Probability of occuring a migration between two random islands at the - end of a generation, must be between 0 and 1. - cx_prob : float, default 1/7 - Probability of applying the crossover variation when generating the offspring, - must be between 0 and 1. - Given that there are `n` mutations, and either crossover or mutation is - used to generate each individual in the offspring (but not both at the - same time), we want to have by default an uniform probability between - crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and - `1/n` for each mutation, we can achieve an uniform distribution. - mutation_options : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} - A dictionary with keys naming the types of mutation and floating point - values specifying the fraction of total mutations to do with that method. - The probability of having a mutation is `(1-cx_prob)` and, in case the mutation - is applied, then each mutation option is sampled based on the probabilities - defined in `mutation_options`. The set of probabilities should add up to 1.0. - functions: dict[str,float] or list[str], default {} - A dictionary with keys naming the function set and values giving the probability - of sampling them, or a list of functions which will be weighted uniformly. - If empty, all available functions are included in the search space. - initialization : {"uniform", "max_size"}, default "uniform" - Distribution of sizes on the initial population. If `max_size`, then every - expression is created with `max_size` nodes. If `uniform`, size will be - uniformly distributed between 1 and `max_size`. - objectives : list[str], default ["error", "size"] - list with one or more objectives to use. Options are `"error", "size", "complexity"`. - If `"error"` is used, then it will be the mean squared error for regression, - and accuracy for classification. - algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2" - Which Evolutionary Algorithm framework to use to evolve the population. - weights_init : bool, default True - Whether the search space should initialize the sampling weights of terminal nodes - based on the correlation with the output y. If `False`, then all terminal nodes - will have the same probability of 1.0. - validation_size : float, default 0.0 - Percentage of samples to use as a hold-out partition. These samples are used - to calculate statistics during evolution, but not used to train the models. - The `best_estimator_` will be selected using this partition. If zero, then - the same data used for training is used for validation. - batch_size : float, default 1.0 - Percentage of training data to sample every generation. If `1.0`, then - all data is used. Very small values can improve execution time, but - also lead to underfit. - random_state: int or None, default None - If int, then the value is used to seed the c++ random generator; if None, - then a seed will be generated using a non-deterministic generator. It is - important to notice that, even if the random state is fixed, it is - unlikely that running brush using multiple threads will have the same - results. This happens because the Operating System's scheduler is - responsible to choose which thread will run at any given time, thus - reproductibility is not guaranteed. - - Attributes - ---------- - best_estimator_ : _brush.Program - The final model picked from training. Used in subsequent calls to :func:`predict`. - archive_ : list[deap_api.DeapIndividual] - The final population from training. - data_ : _brush.Dataset - The complete data in Brush format. - train_ : _brush.Dataset - Partition of `data_` containing `(1-validation_size)`% of the data, in Brush format. - validation_ : _brush.Dataset - Partition of `data_` containing `(validation_size)`% of the data, in Brush format. - search_space_ : a Brush `SearchSpace` object. - Holds the operators and terminals and sampling utilities to update programs. - toolbox_ : deap.Toolbox - The toolbox used by DEAP for EA algorithm. - """ - - def __init__( - self, - mode='classification', - pop_size=100, - max_gen=100, - verbosity=0, - max_depth=3, - max_size=20, - n_islands=5, - mig_prob=0.05, - cx_prob= 1/7, - mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, - "toggle_weight_on":1/6, "toggle_weight_off":1/6}, - functions: list[str]|dict[str,float] = {}, - initialization="uniform", - algorithm="nsga2", - objectives=["error", "size"], - random_state=None, - weights_init=True, - validation_size: float = 0.0, - batch_size: float = 1.0 - ): - self.pop_size=pop_size - self.max_gen=max_gen - self.verbosity=verbosity - self.algorithm=algorithm - self.mode=mode - self.max_depth=max_depth - self.max_size=max_size - self.n_islands=n_islands - self.mig_prob=mig_prob - self.cx_prob=cx_prob - self.mutation_options=mutation_options - self.functions=functions - self.objectives=objectives - self.initialization=initialization - self.random_state=random_state - self.batch_size=batch_size - self.weights_init=weights_init - self.validation_size=validation_size - - - def _setup_toolbox(self, data_train, data_validation): - """Setup the deap toolbox""" - toolbox: base.Toolbox = base.Toolbox() - - # creator.create is used to "create new functions", and takes at least - # 2 arguments: the name of the newly created class and a base class - - # Cleaning possible previous classes that are model-dependent (clf and reg are differente) - if hasattr(creator, "FitnessMulti"): - del creator.FitnessMulti - if hasattr(creator, "Individual"): - del creator.Individual - - # Minimizing/maximizing problem: negative/positive weight, respectively. - # Our classification is using the error as a metric - # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness - creator.create("FitnessMulti", base.Fitness, weights=self.weights) - - # create Individual class, inheriting from self.Individual with a fitness attribute - creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) - - toolbox.register("Clone", lambda ind: creator.Individual(ind.prg.copy())) - - toolbox.register("mate", self._crossover) - toolbox.register("mutate", self._mutate) - - # When solving multi-objective problems, selection and survival must - # support this feature. This means that these selection operators must - # accept a tuple of fitnesses as argument) - if self.algorithm=="nsga2" or self.algorithm=="nsga2island": - toolbox.register("select", tools.selTournamentDCD) - toolbox.register("survive", tools.selNSGA2) - elif self.algorithm=="ga" or self.algorithm=="gaisland": - toolbox.register("select", tools.selTournament, tournsize=3) - def offspring(pop, MU): return pop[-MU:] - toolbox.register("survive", offspring) - - # toolbox.population will return a list of elements by calling toolbox.individual - toolbox.register("createRandom", self._make_individual) - toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) - - toolbox.register("get_objectives", lambda: self.objectives) - toolbox.register("getBatch", data_train.get_batch) - toolbox.register("evaluate", self._fitness_function, data=data_train) - toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation) - - return toolbox - - - def _crossover(self, ind1, ind2): - offspring = [] - - for i,j in [(ind1,ind2),(ind2,ind1)]: - attempts = 0 - child = None - while (attempts < 3 and child is None): - child = i.prg.cross(j.prg) - - if child is not None: - child = creator.Individual(child) - attempts = attempts + 1 - - offspring.extend([child]) - - # so we always need to have two elements to unpack inside `offspring` - return offspring[0], offspring[1] - - - def _mutate(self, ind1): - # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),) - attempts = 0 - offspring = None - while (attempts < 3 and offspring is None): - offspring = ind1.prg.mutate() - - if offspring is not None: - return creator.Individual(offspring) - attempts = attempts + 1 - - return None - - - def fit(self, X, y): - """ - Fit an estimator to X,y. - - Parameters - ---------- - X : np.ndarray - 2-d array of input data. - y : np.ndarray - 1-d array of (boolean) target values. - """ - _brush.set_params(self.get_params()) - - if self.random_state is not None: - _brush.set_random_state(self.random_state) - - self.feature_names_ = [] - if isinstance(X, pd.DataFrame): - self.feature_names_ = X.columns.to_list() - - self.data_ = self._make_data(X, y, - feature_names=self.feature_names_, - validation_size=self.validation_size) - - if isinstance(self.functions, list): - self.functions_ = {k:1.0 for k in self.functions} - else: - self.functions_ = self.functions - - # set n classes if relevant - if self.mode=="classification": - self.n_classes_ = len(np.unique(y)) - - # Including necessary functions for classification programs. This - # is needed so the search space can create the hash and mapping of - # the functions. - if self.n_classes_ == 2 and "Logistic" not in self.functions_: - self.functions_["Logistic"] = 1.0 - # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. - # self.functions_["Softmax"] = 1.0 - - # Weight of each objective (+ for maximization, - for minimization) - obj_weight = { - "error" : +1.0 if self.mode=="classification" else -1.0, - "size" : -1.0, - "complexity" : -1.0 - } - self.weights = [obj_weight[w] for w in self.objectives] - - # These have a default behavior to return something meaningfull if - # no values are set - self.train_ = self.data_.get_training_data() - self.train_.set_batch_size(self.batch_size) - self.validation_ = self.data_.get_validation_data() - - self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) - self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) - - if self.algorithm=="nsga2island" or self.algorithm=="gaisland": - self.archive_, self.logbook_ = nsga2island( - self.toolbox_, self.max_gen, self.pop_size, self.n_islands, - self.mig_prob, self.cx_prob, - (0.0 0: - print(f'best model {self.best_estimator_.get_model()}'+ - f' with size {self.best_estimator_.size()}, ' + - f' depth {self.best_estimator_.depth()}, ' + - f' and fitness {self.archive_[0].fitness}' ) - - return self - - def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): - # This function should not partition data (since it may be used in `predict`). - # partitioning is done by `fit`. Feature names should be inferred - # before calling _make_data (so predict can be made with np arrays or - # pd dataframes). - - if isinstance(y, pd.Series): - y = y.values - if isinstance(X, pd.DataFrame): - X = X.values - - assert isinstance(X, np.ndarray) - - if isinstance(y, NoneType): - return _brush.Dataset(X=X, - feature_names=feature_names, validation_size=validation_size) - - return _brush.Dataset(X=X, y=y, - feature_names=feature_names, validation_size=validation_size) - - - def predict(self, X): - """Predict using the best estimator in the archive. """ - - check_is_fitted(self) - - if isinstance(X, pd.DataFrame): - X = X.values - - assert isinstance(X, np.ndarray) - - data = _brush.Dataset(X=X, ref_dataset=self.data_, - feature_names=self.feature_names_) - - # data = self._make_data(X, feature_names=self.feature_names_) - - return self.best_estimator_.predict(data) - - # def _setup_population(self): - # """initialize programs""" - # if self.mode == 'classification': - # generate = self.search_space_.make_classifier - # else: - # generate = self.search_space_.make_regressor - - # programs = [ - # DeapIndividual(generate(self.max_depth, self.max_size)) - # for i in range(self.pop_size) - # ] - # # return [self._create_deap_individual_(p) for p in programs] - # return programs - - def get_params(self, deep=True): - out = dict() - for (key, value) in self.__dict__.items(): - if not key.endswith('_'): - if deep and hasattr(value, "get_params") and not isinstance(value, type): - deep_items = value.get_params().items() - out.update((key + "__" + k, val) for k, val in deep_items) - out[key] = value - return out - - -class BrushClassifier(BrushEstimator,ClassifierMixin): - """Brush for classification. - - For options, see :py:class:`BrushEstimator `. - - Examples - -------- - >>> import pandas as pd - >>> df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') - >>> X = df.drop(columns='target') - >>> y = df['target'] - >>> from brush import BrushClassifier - >>> est = BrushClassifier() - >>> est.fit(X,y) - >>> print('score:', est.score(X,y)) - """ - def __init__( self, **kwargs): - super().__init__(mode='classification',**kwargs) - - def _error(self, ind, data: _brush.Dataset): - #return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0] - return average_precision_score(data.y, ind.prg.predict(data)) - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - - ind_objectives = { - "error" : self._error(ind, data), - "size" : ind.prg.size(), - "complexity": ind.prg.complexity() - } - return [ ind_objectives[obj] for obj in self.objectives ] - - def _fitness_function(self, ind, data: _brush.Dataset): - ind.prg.fit(data) - - return self._fitness_validation(ind, data) - - def _make_individual(self): - # C++'s PTC2-based `make_individual` will create a tree of at least - # the given size. By uniformly sampling the size, we can instantiate a - # population with more diversity - - if self.initialization not in ["uniform", "max_size"]: - raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'max_size' or 'uniform'. got {self.initialization}") - - return creator.Individual( - self.search_space_.make_classifier( - self.max_depth,(0 if self.initialization=='uniform' else self.max_size)) - if self.n_classes_ == 2 else - self.search_space_.make_multiclass_classifier( - self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) - ) - - def predict_proba(self, X): - """Predict class probabilities for X. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, it will be converted to - ``dtype=np.float32``. - - Returns - ------- - p : ndarray of shape (n_samples, n_classes) - The class probabilities of the input samples. The order of the - classes corresponds to that in the attribute :term:`classes_`. - - """ - - check_is_fitted(self) - - if isinstance(X, pd.DataFrame): - X = X.values - - assert isinstance(X, np.ndarray) - - data = _brush.Dataset(X=X, ref_dataset=self.data_, - feature_names=self.feature_names_) - - # data = self._make_data(X, feature_names=self.feature_names_) - - prob = self.best_estimator_.predict_proba(data) - - if self.n_classes_ <= 2: - prob = np.hstack( (np.ones(X.shape[0]).reshape(-1,1), prob.reshape(-1,1)) ) - prob[:, 0] -= prob[:, 1] - - return prob - - -class BrushRegressor(BrushEstimator, RegressorMixin): - """Brush for regression. - - For options, see :py:class:`BrushEstimator `. - - Examples - -------- - >>> import pandas as pd - >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') - >>> X = df.drop(columns='label') - >>> y = df['label'] - >>> from brush import BrushRegressor - >>> est = BrushRegressor() - >>> est.fit(X,y) - >>> print('score:', est.score(X,y)) - """ - def __init__(self, **kwargs): - super().__init__(mode='regressor',**kwargs) - - def _error(self, ind, data: _brush.Dataset): - MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) - if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf - MSE = np.inf - - return MSE - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - - ind_objectives = { - "error" : self._error(ind, data), - "size" : ind.prg.size(), - "complexity": ind.prg.complexity() - } - return [ ind_objectives[obj] for obj in self.objectives ] - - def _fitness_function(self, ind, data: _brush.Dataset): - ind.prg.fit(data) - - return self._fitness_validation(ind, data) - - def _make_individual(self): - if self.initialization not in ["uniform", "max_size"]: - raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'max_size' or 'uniform'. got {self.initialization}") - - # No arguments (or zero): brush will use PARAMS passed in set_params. - # max_size is sampled between 1 and params['max_size'] if zero is provided - return creator.Individual( - self.search_space_.make_regressor( - self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) - ) - -# Under development -# class BrushRepresenter(BrushEstimator, TransformerMixin): -# """Brush for representation learning. - -# For options, see :py:class:`BrushEstimator `. - -# Examples -# -------- -# >>> import pandas as pd -# >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') -# >>> X = df.drop(columns='label') -# >>> y = df['label'] -# >>> from brush import BrushRegressor -# >>> est = BrushRegressor() -# >>> est.fit(X,y) -# >>> print('score:', est.score(X,y)) -# """ -# def __init__(self, **kwargs): -# super().__init__(mode='regressor',**kwargs) - -# def _fitness_function(self, ind, data: _brush.Dataset): -# ind.prg.fit(data) -# return ( -# # todo: need to return a matrix from X for this -# np.sum((data.get_X()- ind.prg.predict(data))**2), -# ind.prg.size() -# ) - -# def _make_individual(self): -# return creator.Individual( -# self.search_space_.make_representer(self.max_depth, self.max_size) -# ) - -# def transform(self, X): -# """Transform X using the best estimator in the archive. """ -# return self.predict(X) \ No newline at end of file From fdde3b2f5a9ff431646459940bd44fa3cc9370c2 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 13 Nov 2023 15:06:21 -0500 Subject: [PATCH 093/199] Renamed variable --- tests/cpp/test_optimization.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_optimization.cpp b/tests/cpp/test_optimization.cpp index 857ea47d..2d45ff27 100644 --- a/tests/cpp/test_optimization.cpp +++ b/tests/cpp/test_optimization.cpp @@ -66,7 +66,7 @@ TEST_P(OptimizerTest, OptimizeWeightsWorksCorrectly) { fmt::print( "weights: {}\n", learned_weights); // calculating the MSE - float mse = (data.y - y_pred).square().mean(); + float mse_error = (data.y - y_pred).square().mean(); ASSERT_TRUE(data.y.isApprox(y_pred, 1e-3)) << "Not all predictions " "are close to the correct values. Predictions are\n" << y_pred << @@ -75,7 +75,7 @@ TEST_P(OptimizerTest, OptimizeWeightsWorksCorrectly) { ASSERT_TRUE(check_fit(learned_weights)) << "Check of learned weights " "didn't pass. Learned weights are\n" << learned_weights << std::endl; - ASSERT_TRUE(mse <= 1e-3) << "The MSE " << mse << "obtained after fitting " + ASSERT_TRUE(mse_error <= 1e-3) << "The MSE " << mse_error << "obtained after fitting " "the expression is not smaller than threshold of 1e-3" << std::endl; } From dd8ae26191229fc0dee0ca6381e61bdb94a19e61 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 14 Nov 2023 16:16:27 -0500 Subject: [PATCH 094/199] Bug fix in cx. improved spot selection in cx. cx and mut are now public --- src/variation.cpp | 43 +++++++++++++++++++++++++++---------------- src/variation.h | 9 +++++---- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/variation.cpp b/src/variation.cpp index fc9ac2d1..eb977b9a 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -364,27 +364,37 @@ class SubtreeMutation : public MutationBase */ template std::optional> Variation::cross( - const Program& root, const Program& other) + const Program& mom, const Program& dad) { /* subtree crossover between this and other, producing new Program */ // choose location by weighted sampling of program // TODO: why doesn't this copy the search space reference to child? - Program child(root); + Program child(mom); // pick a subtree to replace vector child_weights(child.Tree.size()); - std::transform(child.Tree.begin(), child.Tree.end(), - child_weights.begin(), - [](const auto& n){ return n.get_prob_change(); } - ); - + auto child_iter = child.Tree.begin(); + std::transform(child.Tree.begin(), child.Tree.end(), child_weights.begin(), + [&](const auto& n){ + auto s_at = child.size_at(child_iter); + auto d_at = child.depth_to_reach(child_iter); + + std::advance(child_iter, 1); + + if (s_at> Variation::cross( ( child.size() - child.size_at(child_spot) ); auto allowed_depth = parameters.max_depth - ( child.depth_to_reach(child_spot) ); - + // pick a subtree to insert. Selection is based on other_weights + Program other(dad); + vector other_weights(other.Tree.size()); // iterator to get the size of subtrees inside transform @@ -426,7 +438,7 @@ std::optional> Variation::cross( return 0.0f; } ); - + bool matching_spots_found = false; for (const auto& w: other_weights) { @@ -596,24 +608,23 @@ void Variation::vary(Population& pop, tuple island_range, Individual& mom = pop.individuals.at( r.select_randomly(parents.begin(), parents.end())); - if ( r() < parameters.cx_prob) // crossover + if ( r() < parameters.cx_prob) // crossover { - // get random mom and dad, make copies Individual& dad = pop.individuals.at( r.select_randomly(parents.begin(), parents.end())); - opt = cross(mom, dad); + opt = cross(mom.program, dad.program); } - else // mutation + else // mutation { - opt = mutate(mom); + opt = mutate(mom.program); } if (opt) // no optional value was returned { auto child = opt.value(); assert(child.size()>0); - pop.individuals.at(i) = child; + pop.individuals.at(i) = Individual(child); } } } diff --git a/src/variation.h b/src/variation.h index 13cda4b6..8fc34c00 100644 --- a/src/variation.h +++ b/src/variation.h @@ -35,6 +35,7 @@ class MutationBase { public: using Iter = tree::pre_order_iterator; + // TODO: static methods, without storing information, and using just SS and params as arguments MutationBase(const SearchSpace& SS, size_t max_size, size_t max_depth) : SS_(SS) , max_size_(max_size) @@ -103,10 +104,6 @@ class Variation private: SearchSpace& search_space; Parameters& parameters; - - std::optional> cross(const Program& root, const Program& other); - - std::optional> mutate(const Program& parent); public: Variation() = default; @@ -122,6 +119,10 @@ class Variation search_space = ss; }; + // individual-level variations + std::optional> cross(const Program& mom, const Program& dad); + std::optional> mutate(const Program& parent); + /// method to handle variation of population void vary(Population& pop, tuple island_range, const vector& parents); From cec5354fcf3c13e3fc5ff345eaa76bf9bb0a48df Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 14 Nov 2023 16:17:29 -0500 Subject: [PATCH 095/199] Default values for mutation probs. New comments with TODOs --- src/params.h | 15 +++++++++++---- src/population.h | 1 + src/search_space.h | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/params.h b/src/params.h index 11e04c48..35ef31ae 100644 --- a/src/params.h +++ b/src/params.h @@ -30,16 +30,23 @@ struct Parameters int pop_size = 100; int gens = 100; - unsigned int max_depth = 10; - unsigned int max_size = 100; + unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size + unsigned int max_size = 50; vector objectives{"error","complexity"}; // error should be generic and deducted based on mode string sel = "nsga2"; //selection method string surv = "nsga2"; //survival method - vector functions; + std::unordered_map functions; int num_islands=5; // variation - std::map mutation_probs; + std::map mutation_probs = { + {"point", 0.167}, + {"insert", 0.167}, + {"delete", 0.167}, + {"subtree", 0.167}, + {"toggle_weight_on", 0.167}, + {"toggle_weight_off", 0.167} + }; float cx_prob=0.2; ///< cross rate for variation float mig_prob = 0.05; diff --git a/src/population.h b/src/population.h index 174d42a4..bf2f54b7 100644 --- a/src/population.h +++ b/src/population.h @@ -24,6 +24,7 @@ class Population{ unsigned int n_islands; float mig_prob; + // TODO: taskflow needs to use n_islands as n_jobs Population(int p = 0, int n_islands=1); ~Population(); diff --git a/src/search_space.h b/src/search_space.h index 84b6d052..7e57f470 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -718,7 +718,8 @@ P SearchSpace::make_program(const Parameters& params, int max_d, int max_size) root = opt.value(); } - auto Tree = PTC2(root, max_d, max_size); + // max_d-1 because we always pick the root before calling ptc2 + auto Tree = PTC2(root, max_d-1, max_size); return P(*this,Tree); }; From 95efb074d75350c06513cf112b254ca3a0fcf609 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 14 Nov 2023 16:18:20 -0500 Subject: [PATCH 096/199] Individual constructor --- src/cbrush.h | 2 +- src/individual.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cbrush.h b/src/cbrush.h index d4342b54..33ea7232 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -88,7 +88,7 @@ class CBrush{ // sets available functions based on comma-separated list. void set_functions(const vector& fns){ params.functions = fns; }; - vector get_functions(){ return params.functions; }; + unordered_map get_functions(){ return params.functions; }; void set_mutation_probs(std::map mutation_probs){ params.mutation_probs = mutation_probs;}; std::map get_mutation_probs(){ return params.mutation_probs; }; diff --git a/src/individual.h b/src/individual.h index f24a93c3..896e8456 100644 --- a/src/individual.h +++ b/src/individual.h @@ -39,6 +39,8 @@ class Individual{ crowd_dist = -1; }; + Individual(Program& prg) : Individual() { program = prg; }; + void init(const SearchSpace& ss, const Parameters& params) { program = SS.make_program(params, 0, 0); From eafb2452af1685763e2869b653725428bbc56613 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 14 Nov 2023 16:18:50 -0500 Subject: [PATCH 097/199] all previous cpp tests are pdated and working! --- tests/cpp/{test_params.cpp => test_brush.cpp} | 0 tests/cpp/test_data.cpp | 245 ++-- tests/cpp/test_population.cpp | 18 + tests/cpp/test_program.cpp | 57 +- tests/cpp/test_variation.cpp | 1096 +++++++++-------- tests/cpp/testsHeader.h | 7 + 6 files changed, 757 insertions(+), 666 deletions(-) rename tests/cpp/{test_params.cpp => test_brush.cpp} (100%) diff --git a/tests/cpp/test_params.cpp b/tests/cpp/test_brush.cpp similarity index 100% rename from tests/cpp/test_params.cpp rename to tests/cpp/test_brush.cpp diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index af0678f9..be8c0a9e 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -1,126 +1,121 @@ -// #include "testsHeader.h" -// #include "../../src/program/program.h" -// #include "../../src/search_space.h" -// #include "../../src/program/dispatch_table.h" - -// TEST(Data, ErrorHandling) -// { -// // Creating an empty dataset throws error -// EXPECT_THROW({ -// MatrixXf X(0,0); -// ArrayXf y(0); - -// try -// { -// Dataset dt(X, y); -// } -// catch( const std::runtime_error& err ) -// { -// const string msg = err.what(); -// ASSERT_NE( -// msg.find("Error during the initialization of the dataset"), -// std::string::npos); -// throw; -// } -// }, std::runtime_error); -// } - -// TEST(Data, MixedVariableTypes) -// { -// // We need to set at least the mutation options (and respective -// // probabilities) in order to call PRG.predict() -// PARAMS["write_mutation_trace"] = true; -// PARAMS["mutation_options"] = { -// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} -// }; - -// MatrixXf X(5,3); -// X << 0 , 1, 0 , // binary with integer values -// 0.0, 1.0, 1.0, // binary with float values -// 2 , 1.0, -3.0, // integer with float and negative values -// 2 , 1 , 3 , // integer with integer values -// 2.1, 3.7, -5.2; // float values - -// X.transposeInPlace(); - -// ArrayXf y(3); - -// y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 +#include "testsHeader.h" + + +TEST(Data, ErrorHandling) +{ + // Creating an empty dataset throws error + EXPECT_THROW({ + MatrixXf X(0,0); + ArrayXf y(0); + + try + { + Dataset dt(X, y); + } + catch( const std::runtime_error& err ) + { + const string msg = err.what(); + ASSERT_NE( + msg.find("Error during the initialization of the dataset"), + std::string::npos); + throw; + } + }, std::runtime_error); +} + +TEST(Data, MixedVariableTypes) +{ + Parameters params; + + MatrixXf X(5,3); + X << 0 , 1, 0 , // binary with integer values + 0.0, 1.0, 1.0, // binary with float values + 2 , 1.0, -3.0, // integer with float and negative values + 2 , 1 , 3 , // integer with integer values + 2.1, 3.7, -5.2; // float values + + X.transposeInPlace(); + + ArrayXf y(3); + + y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 -// unordered_map user_ops = { -// {"Add", 0.5}, -// {"Sub", 0.5}, -// // a boolean operator -// {"And", 1.0}, -// {"Or", 1.0}, -// // operator that takes boolean as argument -// {"SplitOn", 1.0} -// }; - -// Dataset dt(X, y); -// SearchSpace SS; -// SS.init(dt, user_ops); - -// dt.print(); -// SS.print(); - -// for (size_t d = 5; d < 10; ++d) -// for (size_t s = 5; s < 20; ++s) -// { -// fmt::print( -// "=================================================\n" -// "depth={}, size={}. ", d, s -// ); - -// PARAMS["max_size"] = s; -// PARAMS["max_depth"] = d; - -// RegressorProgram PRG = SS.make_regressor(s-4, d-4); - -// fmt::print( -// "Tree model: {}\n", PRG.get_model("compact", true) -// ); - -// // visualizing detailed information for the model -// std::for_each(PRG.Tree.begin(), PRG.Tree.end(), -// [](const auto& n) { -// fmt::print("Name {}, node {}, feature {}\n" -// " sig_hash {}\n ret_type {}\n ret_type type {}\n", -// n.name, n.node_type, n.get_feature(), -// n.sig_hash, n.ret_type, typeid(n.ret_type).name()); -// }); - -// std::cout << std::endl; - -// fmt::print( "PRG fit\n"); -// PRG.fit(dt); -// fmt::print( "PRG predict\n"); -// ArrayXf y_pred = PRG.predict(dt); -// fmt::print( "y_pred: {}\n", y_pred); - -// // creating and fitting a child -// auto opt = PRG.mutate(); - -// if (!opt){ -// fmt::print("Mutation failed to create a child\n"); -// fmt::print("{}\n", PARAMS["mutation_trace"].get().dump()); -// } -// else { -// auto Child = opt.value(); - -// fmt::print("Child model: {}\n", Child.get_model("compact", true)); - -// fmt::print( "Child fit\n"); -// Child.fit(dt); -// fmt::print( "Child predict\n"); -// ArrayXf y_pred_child = Child.predict(dt); -// fmt::print( "y_pred: {}\n", y_pred); -// } -// } - -// // Brush exports two DispatchTable structs named dtable_fit and dtable_predict. -// // These structures holds the mapping between nodes and its corresponding -// // operations, and are used to resolve the evaluation of an expression. -// // dtable_fit.print(); -// // dtable_predict.print(); -// } \ No newline at end of file + params.functions = { + {"Add", 0.5}, + {"Sub", 0.5}, + // a boolean operator + {"And", 1.0}, + {"Or", 1.0}, + // operator that takes boolean as argument + {"SplitOn", 1.0} + }; + + Dataset dt(X, y); + SearchSpace SS; + SS.init(dt, params.functions); + + dt.print(); + SS.print(); + + for (size_t d = 5; d < 10; ++d) + for (size_t s = 5; s < 20; ++s) + { + fmt::print( + "=================================================\n" + "depth={}, size={}. ", d, s + ); + + params.max_size = s; + params.max_depth = d; + + // TODO: update all calls of make_ to use params + RegressorProgram PRG = SS.make_regressor(0, 0, params); + + fmt::print( + "Tree model: {}\n", PRG.get_model("compact", true) + ); + + // visualizing detailed information for the model + std::for_each(PRG.Tree.begin(), PRG.Tree.end(), + [](const auto& n) { + fmt::print("Name {}, node {}, feature {}\n" + " sig_hash {}\n ret_type {}\n ret_type type {}\n", + n.name, n.node_type, n.get_feature(), + n.sig_hash, n.ret_type, typeid(n.ret_type).name()); + }); + std::cout << std::endl; + + fmt::print( "PRG fit\n"); + PRG.fit(dt); + + fmt::print( "PRG predict\n"); + ArrayXf y_pred = PRG.predict(dt); + fmt::print( "y_pred: {}\n", y_pred); + + // creating and fitting a child + Variation variator = Variation(params, SS); + std::optional opt = variator.mutate(PRG); + + if (!opt){ + fmt::print("Mutation failed to create a child\n"); + } + else { + auto Child = opt.value(); + + fmt::print("Child model: {}\n", Child.get_model("compact", true)); + + fmt::print( "Child fit\n"); + Child.fit(dt); + + fmt::print( "Child predict\n"); + ArrayXf y_pred_child = Child.predict(dt); + fmt::print( "y_pred: {}\n", y_pred); + } + } + + // Brush exports two DispatchTable structs named dtable_fit and dtable_predict. + // These structures holds the mapping between nodes and its corresponding + // operations, and are used to resolve the evaluation of an expression. + // dtable_fit.print(); + // dtable_predict.print(); +} \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index e69de29b..68a407ee 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -0,0 +1,18 @@ +#include "testsHeader.h" + +TEST(Population, PopulationTests) +{ + + Population pop; + + // size, + // island sizes growns and comes back to the same, + // update and prep offspring slots. + // no overlap in island indexes. + // works with even and uneven pop sizes. + // initialize population works? + // migrate? + // print models + +} + diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index 2d741f68..69d5819b 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -11,12 +11,16 @@ TEST(Program, MakeRegressor) SearchSpace SS; SS.init(data); + Parameters params; // Program DXtree; for (int d = 1; d < 10; ++d) for (int s = 1; s < 10; ++s) { - RegressorProgram PRG = SS.make_regressor(d, s); + params.max_size = s; + params.max_depth = d; + + RegressorProgram PRG = SS.make_regressor(0, 0, params); fmt::print( "=================================================\n" "Tree model for depth = {}, size= {}: {}\n", @@ -57,17 +61,23 @@ TEST(Program, MakeRegressor) TEST(Program, FitRegressor) { - + Parameters params; + Dataset data = Data::read_csv("docs/examples/datasets/d_enc.csv","label"); SearchSpace SS; SS.init(data); + dtable_fit.print(); dtable_predict.print(); + // for (int t = 0; t < 10; ++t) { for (int d = 1; d < 10; ++d) { for (int s = 1; s < 100; s+=10) { - RegressorProgram PRG = SS.make_regressor(d, s); + params.max_size = s; + params.max_depth = d; + + RegressorProgram PRG = SS.make_regressor(0, 0, params); fmt::print( "=================================================\n" "Tree model for depth = {}, size= {}: {}\n" @@ -83,27 +93,36 @@ TEST(Program, FitRegressor) TEST(Program, PredictWithWeights) { + Parameters params; Dataset data = Data::read_csv("docs/examples/datasets/d_enc.csv","label"); SearchSpace SS; SS.init(data); + dtable_fit.print(); dtable_predict.print(); + // for (int t = 0; t < 10; ++t) { for (int d = 1; d < 10; ++d) { for (int s = 1; s < 10; s+=10) { - RegressorProgram PRG = SS.make_regressor(d, s); + params.max_size = s; + params.max_depth = d; + + RegressorProgram PRG = SS.make_regressor(0, 0, params); fmt::print( "=================================================\n" "Tree model for depth = {}, size= {}: {}\n" "=================================================\n", d, s, PRG.get_model("compact", true) ); + PRG.fit(data); auto y = PRG.predict(data); + auto weights = PRG.get_weights(); auto yweights = PRG.predict_with_weights(data, weights); + for (int i = 0; i < y.size(); ++i){ if (std::isnan(y(i))) ASSERT_TRUE(std::isnan(y(i))); @@ -117,6 +136,7 @@ TEST(Program, PredictWithWeights) TEST(Program, FitClassifier) { + Parameters params; Dataset data = Data::read_csv("docs/examples/datasets/d_analcatdata_aids.csv","target"); SearchSpace SS; @@ -124,7 +144,12 @@ TEST(Program, FitClassifier) for (int d = 1; d < 10; ++d) { for (int s = 1; s < 100; s+=10) { - auto PRG = SS.make_classifier(d, s); + + params.max_size = s; + params.max_depth = d; + + auto PRG = SS.make_classifier(0, 0, params); + fmt::print( "=================================================\n" "Tree model for depth = {}, size= {}: {}\n" @@ -140,6 +165,8 @@ TEST(Program, FitClassifier) TEST(Program, Serialization) { + Parameters params; + // test mutation // TODO: set random seed MatrixXf X(10,2); @@ -159,7 +186,10 @@ TEST(Program, Serialization) { for (int s = 1; s < 10; ++s) { - RegressorProgram PRG = SS.make_regressor(d, s); + params.max_size = s; + params.max_depth = d; + + RegressorProgram PRG = SS.make_regressor(0, 0, params); fmt::print( "=================================================\n" "depth = {}, size= {}\n" @@ -171,12 +201,15 @@ TEST(Program, Serialization) ArrayXf y_pred = PRG.predict(data); json PRGjson = PRG; fmt::print( "json of initial model: {}\n", PRGjson.dump(2)); + // auto newPRG = PRGjson.get(); RegressorProgram newPRG = PRGjson; json newPRGjson = newPRG; + fmt::print( "json of loaded model: {}\n", newPRGjson.dump(2)); fmt::print("Initial Model: {}\n",PRG.get_model("compact", true)); fmt::print("Loaded Model: {}\n",newPRG.get_model("compact", true)); + ASSERT_TRUE( std::equal(PRG.Tree.begin(), PRG.Tree.end(), newPRG.Tree.begin()) ); @@ -203,19 +236,21 @@ TEST(Operators, ProgramSizeAndDepthPARAMS) Dataset data(X,y); + Parameters params; + SearchSpace SS; SS.init(data); - for (int d = 1; d < 10; ++d) + for (int d = 1; d < 6; ++d) { - for (int s = 1; s < 10; ++s) + for (int s = 10; s < 20; ++s) { - PARAMS["max_size"] = s; - PARAMS["max_depth"] = d; + params.max_size = s; + params.max_depth = d; fmt::print("d={},s={}\n",d,s); fmt::print("make_regressor\n"); - RegressorProgram PRG = SS.make_regressor(0, 0); + RegressorProgram PRG = SS.make_regressor(0, 0, params); fmt::print( "depth = {}, size= {}\n" diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index 44ff612c..7cae962b 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -1,545 +1,581 @@ -// #include "testsHeader.h" -// #include "../../src/search_space.h" -// #include "../../src/program/program.h" -// #include "../../src/program/dispatch_table.h" -// #include "../../src/data/io.h" - -// TEST(Variation, FixedRootDoesntChange) -// { -// PARAMS["mutation_options"] = { -// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} -// }; -// PARAMS["max_size"] = 20; -// PARAMS["max_depth"] = 10; - -// MatrixXf X(10,2); -// ArrayXf y(10); -// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, -// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - -// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, -// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - -// y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0; - -// Dataset data(X,y); - -// SearchSpace SS; -// SS.init(data); - -// auto logistic_hash = Signature().hash(); - -// for (int d = 1; d < 10; ++d) -// { -// for (int s = 1; s < 10; ++s) -// { -// int successes = 0; -// for (int attempt = 0; attempt < 10; ++attempt) -// { -// // different program types changes how predict works (and the rettype of predict) -// ClassifierProgram PRG = SS.make_classifier(d, s); -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model 1: {}\n", -// d, s, -// PRG.get_model("compact", true) -// ); - -// Node root = *(PRG.Tree.begin()); -// ASSERT_TRUE(root.node_type == NodeType::Logistic); -// ASSERT_TRUE(root.ret_type == DataType::ArrayF); -// ASSERT_TRUE(root.sig_hash == logistic_hash); -// ASSERT_TRUE(root.get_prob_change()==0.0); -// ASSERT_TRUE(root.fixed==true); - -// auto opt_mutation = PRG.mutate(); -// if (opt_mutation) -// { -// successes += 1; -// auto Mut_Child = opt_mutation.value(); -// fmt::print("After mutation : {}\n", -// Mut_Child.get_model("compact", true)); - -// Node mut_child_root = *(Mut_Child.Tree.begin()); -// ASSERT_TRUE(mut_child_root.node_type == NodeType::Logistic); -// ASSERT_TRUE(mut_child_root.ret_type == DataType::ArrayF); -// ASSERT_TRUE(mut_child_root.sig_hash == logistic_hash); -// ASSERT_TRUE(mut_child_root.get_prob_change()==0.0); -// ASSERT_TRUE(mut_child_root.fixed==true); -// } - -// ClassifierProgram PRG2 = SS.make_classifier(d, s); -// auto opt_cx = PRG.cross(PRG2); -// if (opt_cx) -// { -// successes += 1; -// auto CX_Child = opt_cx.value(); -// fmt::print("After crossover: {}\n", -// CX_Child.get_model("compact", true)); - -// Node cx_child_root = *(CX_Child.Tree.begin()); -// ASSERT_TRUE(cx_child_root.node_type == NodeType::Logistic); -// ASSERT_TRUE(cx_child_root.ret_type == DataType::ArrayF); -// ASSERT_TRUE(cx_child_root.sig_hash == logistic_hash); -// ASSERT_TRUE(cx_child_root.get_prob_change()==0.0); -// ASSERT_TRUE(cx_child_root.fixed==true); -// } - -// // root remained unchanged -// ASSERT_TRUE(root.node_type == NodeType::Logistic); -// ASSERT_TRUE(root.ret_type == DataType::ArrayF); -// ASSERT_TRUE(root.sig_hash == logistic_hash); -// ASSERT_TRUE(root.get_prob_change()==0.0); -// ASSERT_TRUE(root.fixed==true); -// } -// ASSERT_TRUE(successes > 0); -// } -// } -// } - -// TEST(Variation, InsertMutationWorks) -// { -// // TODO: this tests could be parameterized. -// // To understand design implementation of this test, check Mutation test - -// PARAMS["mutation_options"] = { -// {"point", 0.0}, {"insert", 1.0}, {"delete", 0.0}, {"subtree", 0.0}, {"toggle_weight_on", 0.0}, {"toggle_weight_off", 0.0} -// }; - -// // retrieving the options to check if everything was set right -// std::cout << "Initial mutation configuration" << std::endl; -// auto options = PARAMS["mutation_options"].get>(); -// for (const auto& [k, v] : options) -// std::cout << k << " : " << v << std::endl; - -// MatrixXf X(10,2); -// ArrayXf y(10); -// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, -// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - -// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, -// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - -// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, -// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - -// Dataset data(X,y); - -// SearchSpace SS; -// SS.init(data); - -// int successes = 0; -// for (int attempt = 0; attempt < 100; ++attempt) -// { -// // we need to have big values here so the mutation will work -// // (when the xmen child exceeds the maximum limits, mutation returns -// // std::nullopt) -// PARAMS["max_size"] = 20; -// PARAMS["max_depth"] = 10; - -// fmt::print("d={},s={}\n", PARAMS["max_depth"].get(), PARAMS["max_size"].get()); -// fmt::print("make_regressor\n"); +#include "testsHeader.h" + +TEST(Variation, FixedRootDoesntChange) +{ + Parameters params; + + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0; + + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); -// // creating a "small" program (with a plenty amount of space to insert stuff) -// RegressorProgram PRG = SS.make_regressor(5, 5); + auto logistic_hash = Signature().hash(); -// fmt::print("PRG.fit(data);\n"); -// PRG.fit(data); -// ArrayXf y_pred = PRG.predict(data); + // TODO: use these values for d and s in all tests (not 1, 1 for example) + for (int d = 3; d < 6; ++d) + { + for (int s = 10; s < 50; ++s) + { + params.max_size = s; + params.max_depth = d; + + Variation variator = Variation(params, SS); + + int successes = 0; + for (int attempt = 0; attempt < 10; ++attempt) + { + // different program types changes how predict works (and the rettype of predict) + ClassifierProgram PRG = SS.make_classifier(0, 0, params); + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model 1: {}\n", + d, s, + PRG.get_model("compact", true) + ); + + Node root = *(PRG.Tree.begin()); + + ASSERT_TRUE(root.node_type == NodeType::Logistic); + ASSERT_TRUE(root.ret_type == DataType::ArrayF); + ASSERT_TRUE(root.sig_hash == logistic_hash); + ASSERT_TRUE(root.get_prob_change()==0.0); + ASSERT_TRUE(root.fixed==true); + + auto opt_mutation = variator.mutate(PRG); + + if (opt_mutation) + { + successes += 1; + auto Mut_Child = opt_mutation.value(); + fmt::print("After mutation : {}\n", + Mut_Child.get_model("compact", true)); + + Node mut_child_root = *(Mut_Child.Tree.begin()); + + ASSERT_TRUE(mut_child_root.node_type == NodeType::Logistic); + ASSERT_TRUE(mut_child_root.ret_type == DataType::ArrayF); + ASSERT_TRUE(mut_child_root.sig_hash == logistic_hash); + ASSERT_TRUE(mut_child_root.get_prob_change()==0.0); + ASSERT_TRUE(mut_child_root.fixed==true); + } + + ClassifierProgram PRG2 = SS.make_classifier(0, 0, params); + auto opt_cx = variator.cross(PRG, PRG2); + + if (opt_cx) + { + successes += 1; + auto CX_Child = opt_cx.value(); + fmt::print("After crossover: {}\n", + CX_Child.get_model("compact", true)); + + Node cx_child_root = *(CX_Child.Tree.begin()); + + ASSERT_TRUE(cx_child_root.node_type == NodeType::Logistic); + ASSERT_TRUE(cx_child_root.ret_type == DataType::ArrayF); + ASSERT_TRUE(cx_child_root.sig_hash == logistic_hash); + ASSERT_TRUE(cx_child_root.get_prob_change()==0.0); + ASSERT_TRUE(cx_child_root.fixed==true); + } + + // root remained unchanged + ASSERT_TRUE(root.node_type == NodeType::Logistic); + ASSERT_TRUE(root.ret_type == DataType::ArrayF); + ASSERT_TRUE(root.sig_hash == logistic_hash); + ASSERT_TRUE(root.get_prob_change()==0.0); + ASSERT_TRUE(root.fixed==true); + } + ASSERT_TRUE(successes > 0); + } + } +} + +TEST(Variation, InsertMutationWorks) +{ + // TODO: this tests could be parameterized (one type of mutation each). + // To understand design implementation of this test, check Mutation test + + Parameters params; + params.mutation_probs = { + {"point", 0.0}, + {"insert", 1.0}, + {"delete", 0.0}, + {"subtree", 0.0}, + {"toggle_weight_on", 0.0}, + {"toggle_weight_off", 0.0} + }; + + // retrieving the options to check if everything was set right + std::cout << "Initial mutation configuration" << std::endl; + for (const auto& [k, v] : params.mutation_probs) + std::cout << k << " : " << v << std::endl; + + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, + 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); + + Variation variator = Variation(params, SS); + + int successes = 0; + for (int attempt = 0; attempt < 100; ++attempt) + { + params.max_size = 50; + params.max_depth = 6; + + fmt::print("d={},s={}\n", params.max_depth, params.max_size); + fmt::print("make_regressor\n"); + + // creating a "small" program (with a plenty amount of space to insert stuff) + RegressorProgram PRG = SS.make_regressor(5, 5, params); + + fmt::print("PRG.fit(data);\n"); + PRG.fit(data); + ArrayXf y_pred = PRG.predict(data); -// // applying mutation and checking if the optional result is non-empty -// fmt::print("auto Child = PRG.mutate();\n"); -// auto opt = PRG.mutate(); // We should assume that it will be always the insert mutation - -// if (opt){ -// successes += 1; -// auto Child = opt.value(); -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model: {}\n" -// "Mutated Model: {}\n", -// PARAMS["max_depth"].get(), PARAMS["max_size"].get(), -// PRG.get_model("compact", true), -// Child.get_model("compact", true) -// ); - -// fmt::print("child fit\n"); -// Child.fit(data); -// y_pred = Child.predict(data); - -// // since we successfully inserted a node, this should be always true -// ASSERT_TRUE(Child.size() > PRG.size()); - -// // maybe the insertion spot was a shorter branch than the maximum -// // depth. At least, xmen depth should be equal to its parent -// ASSERT_TRUE(Child.depth() >= PRG.depth()); -// } - -// // lets also see if it always fails when the child exceeds the maximum limits -// PARAMS["max_size"] = PRG.size(); -// PARAMS["max_depth"] = PRG.depth(); - -// auto opt2 = PRG.mutate(); -// if (opt2){ // This shoudl't happen. We'll print then error -// auto Child2 = opt2.value(); - -// std::cout << "Fail failed. Mutation weights:" << std::endl; -// auto options2 = PARAMS["mutation_options"].get>(); -// for (const auto& [k, v] : options2) -// std::cout << k << " : " << v << std::endl; - -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model: {}\n" -// "Mutated Model: {}\n", -// PARAMS["max_depth"].get(), PARAMS["max_size"].get(), -// PRG.get_model("compact", true), -// Child2.get_model("compact", true) -// ); -// ASSERT_TRUE(opt2==std::nullopt); -// } -// } -// ASSERT_TRUE(successes > 0); -// } - -// TEST(Variation, Mutation) -// { -// PARAMS["write_mutation_trace"] = true; -// PARAMS["mutation_options"] = { -// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} -// }; + // applying mutation and checking if the optional result is non-empty + fmt::print("auto Child = PRG.mutate();\n"); + + // We should assume that it will be always the insert mutation + auto opt = variator.mutate(PRG); + + if (opt){ + successes += 1; + auto Child = opt.value(); + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutated Model: {}\n", + params.max_depth, params.max_size, + PRG.get_model("compact", true), + Child.get_model("compact", true) + ); + + fmt::print("child fit\n"); + Child.fit(data); + y_pred = Child.predict(data); + + // since we successfully inserted a node, this should be always true + ASSERT_TRUE(Child.size() > PRG.size()); + + // maybe the insertion spot was a shorter branch than the maximum + // depth. At least, xmen depth should be equal to its parent + ASSERT_TRUE(Child.depth() >= PRG.depth()); + } + + // lets also see if it always fails when the child exceeds the maximum limits + params.max_size = PRG.size(); + params.max_depth = PRG.depth(); + + auto opt2 = variator.mutate(PRG); + if (opt2){ // This shoudl't happen. We'll print then error + auto Child2 = opt2.value(); + + std::cout << "Fail failed. Mutation weights:" << std::endl; + for (const auto& [k, v] : params.mutation_probs) + std::cout << k << " : " << v << std::endl; + + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutated Model: {}\n", + params.max_depth, params.max_size, + PRG.get_model("compact", true), + Child2.get_model("compact", true) + ); + ASSERT_TRUE(opt2==std::nullopt); + } + } + ASSERT_TRUE(successes > 0); +} + +TEST(Variation, Mutation) +{ + Parameters params; -// MatrixXf X(10,2); -// ArrayXf y(10); -// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, -// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - -// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, -// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - -// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, -// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - -// Dataset data(X,y); - -// SearchSpace SS; -// SS.init(data); - -// int successes = 0; -// for (int d = 1; d < 10; ++d) -// { -// for (int s = 1; s < 10; ++s) -// { -// fmt::print("d={},s={}\n",d,s); -// fmt::print("make_regressor\n"); - -// // if we set max_size and max_depth to zero, it will use the -// // values in the global PARAMS. Otherwise, it will respect the -// // values passed as argument. -// RegressorProgram PRG = SS.make_regressor(d, s); - -// fmt::print("PRG.fit(data);\n"); -// PRG.fit(data); -// ArrayXf y_pred = PRG.predict(data); + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, + 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); + + int successes = 0; + for (int d = 1; d < 6; ++d) + { + for (int s = 10; s < 20; ++s) + { + params.max_size = s; + params.max_depth = d; + + Variation variator = Variation(params, SS); + + fmt::print("d={},s={}\n",d,s); + fmt::print("make_regressor\n"); + + // if we set max_size and max_depth to zero, it will use the + // values in the global PARAMS. Otherwise, it will respect the + // values passed as argument. + RegressorProgram PRG = SS.make_regressor(0, 0, params); + + fmt::print("PRG.fit(data);\n"); + PRG.fit(data); + + // saving a string representation + auto PRG_model = PRG.get_model("compact", true); + + fmt::print( + "=================================================\n" + "Original model (BEFORE MUTATION) 1: {}\n", + PRG.get_model("compact", true) + ); + ArrayXf y_pred = PRG.predict(data); -// // applying mutation and checking if the optional result is non-empty -// fmt::print("auto Child = PRG.mutate();\n"); -// auto opt = PRG.mutate(); - -// if (!opt){ -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model: {}\n" -// "Mutation failed to create a child", -// d, s, -// PRG.get_model("compact", true) -// ); -// fmt::print("{}", PARAMS["mutation_trace"].get().dump()); -// } -// else { -// successes += 1; -// auto Child = opt.value(); -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model: {}\n" -// "Mutated Model: {}\n", -// d, s, -// PRG.get_model("compact", true), -// Child.get_model("compact", true) -// ); - -// fmt::print("child fit\n"); -// Child.fit(data); -// y_pred = Child.predict(data); -// } -// } -// } -// // since x1 and x2 have same type, we shoudn't get fails -// ASSERT_TRUE(successes > 0); -// } - -// TEST(Variation, MutationSizeAndDepthLimit) -// { -// PARAMS["write_mutation_trace"] = true; -// PARAMS["mutation_options"] = { -// {"point",0.167}, {"insert", 0.167}, {"delete", 0.167}, {"subtree", 0.167}, {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} -// }; + // applying mutation and checking if the optional result is non-empty + fmt::print("auto Child = PRG.mutate();\n"); + auto opt = variator.mutate(PRG); + + if (!opt){ + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutation failed to create a child", + d, s, + PRG.get_model("compact", true) + ); + } + else { + successes += 1; + auto Child = opt.value(); + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutated Model: {}\n", + d, s, + PRG.get_model("compact", true), + Child.get_model("compact", true) + ); + + fmt::print("child fit\n"); + Child.fit(data); + y_pred = Child.predict(data); + + // no collateral effect (parent still the same) + ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); + } + } + } + // since x1 and x2 have same type, we shoudn't get fails + ASSERT_TRUE(successes > 0); +} + +TEST(Variation, MutationSizeAndDepthLimit) +{ + Parameters params; -// MatrixXf X(10,2); -// ArrayXf y(10); -// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, -// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, -// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, -// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; -// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, -// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, + 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; -// Dataset data(X,y); + Dataset data(X,y); -// SearchSpace SS; -// SS.init(data); + SearchSpace SS; + SS.init(data); -// // prod operator --> arity 4: prod(T1, T2, T3) -// // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) -// int max_arity = 6; - -// int successes = 0; -// for (int d = 5; d < 15; ++d) -// { -// for (int s = 5; s < 15; ++s) -// { -// PARAMS["max_size"] = s; -// PARAMS["max_depth"] = d; - -// fmt::print("d={},s={}\n",d,s); -// fmt::print("make_regressor\n"); - -// // Enforcing that the parents does not exceed max_size by -// // taking into account the highest arity of the function nodes; -// // and the max_depth+1 that PTC2 can generate -// RegressorProgram PRG = SS.make_regressor(d-1, s - max_arity); + // prod operator --> arity 4: prod(T1, T2, T3) + // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) + int max_arity = 6; + + int successes = 0; + for (int d = 1; d < 6; ++d) + { + for (int s = 5; s < 15; ++s) + { + params.max_size = s; + params.max_depth = d; -// auto PRG_model = PRG.get_model("compact", true); - -// auto opt = PRG.mutate(); - -// if (!opt){ -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model: {}\n" -// "Mutation failed to create a child", -// d, s, -// PRG.get_model("compact", true) -// ); -// fmt::print("{}", PARAMS["mutation_trace"].get().dump()); -// } -// else { -// successes += 1; + // creating and fitting a child + Variation variator = Variation(params, SS); + + fmt::print("d={},s={}\n",d,s); + fmt::print("make_regressor\n"); + + // Enforcing that the parents does not exceed max_size by + // taking into account the highest arity of the function nodes; + // and the max_depth+1 that PTC2 can generate + RegressorProgram PRG = SS.make_regressor(0, 0, params); + + auto PRG_model = PRG.get_model("compact", true); + auto opt = variator.mutate(PRG); + + if (!opt){ + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutation failed to create a child", + d, s, + PRG.get_model("compact", true) + ); + } + else { + successes += 1; -// // Extracting the child from the std::optional and checking -// // if it is within size and depth restrictions. There is no -// // margin for having slightly bigger expressions. -// auto Child = opt.value(); + // Extracting the child from the std::optional and checking + // if it is within size and depth restrictions. There is no + // margin for having slightly bigger expressions. + auto Child = opt.value(); -// fmt::print("print\n"); -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model: {}\n" -// "Mutated Model: {}\n" -// "Mutated depth: {}\n" -// "Mutated size : {}\n", -// d, s, -// PRG.get_model("compact", true), -// Child.get_model("compact", true), -// Child.depth(), -// Child.size() -// ); - -// // Original didn't change -// ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); + fmt::print("print\n"); + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutated Model: {}\n" + "Mutated depth: {}\n" + "Mutated size : {}\n", + d, s, + PRG.get_model("compact", true), + Child.get_model("compact", true), + Child.depth(), + Child.size() + ); + + // Original didn't change + ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); -// ASSERT_TRUE(Child.size() > 0); -// ASSERT_TRUE(Child.size() <= s); - -// ASSERT_TRUE(Child.size() > 0); -// ASSERT_TRUE(Child.size() <= s); - -// ASSERT_TRUE(Child.depth() >= 0); -// ASSERT_TRUE(Child.depth() <= d); -// } -// } -// } -// ASSERT_TRUE(successes > 0); -// } - -// TEST(Variation, Crossover) -// { -// MatrixXf X(10,2); -// ArrayXf y(10); -// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, -// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - -// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, -// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - -// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, -// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - -// Dataset data(X,y); - -// SearchSpace SS; -// SS.init(data); - -// int successes = 0; -// for (int d = 1; d < 10; ++d) -// { -// for (int s = 1; s < 10; ++s) -// { -// RegressorProgram PRG1 = SS.make_regressor(d, s); -// RegressorProgram PRG2 = SS.make_regressor(d, s); -// PRG1.fit(data); -// PRG2.fit(data); - -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Initial Model 1: {}\n" -// "Initial Model 2: {}\n", -// d, s, -// PRG1.get_model("compact", true), -// PRG2.get_model("compact", true) -// ); - -// ArrayXf y_pred = PRG1.predict(data); -// fmt::print("cross one\n"); - -// auto opt = PRG1.cross(PRG2); -// if (!opt){ -// fmt::print( -// "=================================================\n" -// "depth = {}, size= {}\n" -// "Original model 1: {}\n" -// "Original model 2: {}\n", -// "Crossover failed to create a child", -// d, s, -// PRG1.get_model("compact", true), -// PRG2.get_model("compact", true) -// ); -// } -// else { -// successes += 1; -// auto Child = opt.value(); -// fmt::print( -// "Original model 1 after cross: {}\n" -// "Original model 2 after cross: {}\n", -// PRG1.get_model("compact", true), -// PRG2.get_model("compact", true) -// ); -// fmt::print( -// "Crossed Model: {}\n" -// "=================================================\n", -// Child.get_model("compact", true) -// ); -// Child.fit(data); -// auto child_pred1 = Child.predict(data); -// } -// } -// } -// ASSERT_TRUE(successes > 0); -// } - -// TEST(Variation, CrossoverSizeAndDepthLimit) -// { -// MatrixXf X(10,2); -// ArrayXf y(10); -// X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, -// 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - -// 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, -// 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - -// y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, -// 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - -// Dataset data(X,y); - -// SearchSpace SS; -// SS.init(data); - -// // prod operator --> arity 4: prod(T1, T2, T3) -// // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) -// int max_arity = 6; - -// int successes = 0; -// for (int d = 5; d < 15; ++d) -// { -// for (int s = 5; s < 15; ++s) -// { -// PARAMS["max_size"] = s; -// PARAMS["max_depth"] = d; - -// // Enforcing that the parents does not exceed max_size by -// // taking into account the highest arity of the function nodes -// RegressorProgram PRG1 = SS.make_regressor(d-1, s-max_arity); -// RegressorProgram PRG2 = SS.make_regressor(d-1, s-max_arity); - -// auto PRG1_model = PRG1.get_model("compact", true); -// auto PRG2_model = PRG2.get_model("compact", true); - -// fmt::print( -// "=================================================\n" -// "settings: depth = {}, size= {}\n" -// "Original model 1: {}\n" -// "depth = {}, size= {}\n" -// "Original model 2: {}\n" -// "depth = {}, size= {}\n", -// d, s, -// PRG1.get_model("compact", true), -// PRG1.depth(), PRG1.size(), -// PRG2.get_model("compact", true), -// PRG2.depth(), PRG2.size() -// ); - -// fmt::print("cross\n"); -// auto opt = PRG1.cross(PRG2); - -// if (!opt){ -// fmt::print("Crossover failed to create a child" -// "=================================================\n"); -// } -// else { -// successes += 1; -// auto Child = opt.value(); -// fmt::print( -// "Child Model : {}\n" -// "Child Model depth: {}\n" -// "Child Model size : {}\n" -// "=================================================\n", -// Child.get_model("compact", true), -// Child.depth(), Child.size() -// ); - -// // Original didn't change -// ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); -// ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); - -// // Child is within restrictions -// ASSERT_TRUE(Child.size() > 0); -// ASSERT_TRUE(Child.size() <= s); - -// ASSERT_TRUE(Child.depth() >= 0); -// ASSERT_TRUE(Child.depth() <= d); -// } -// } -// } -// ASSERT_TRUE(successes > 0); -// } \ No newline at end of file + ASSERT_TRUE(Child.size() > 0); + ASSERT_TRUE(Child.size() <= s); + + ASSERT_TRUE(Child.size() > 0); + ASSERT_TRUE(Child.size() <= s); + + ASSERT_TRUE(Child.depth() >= 0); + ASSERT_TRUE(Child.depth() <= d); + } + } + } + ASSERT_TRUE(successes > 0); +} + +TEST(Variation, Crossover) +{ + Parameters params; + + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, + 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); + + int successes = 0; + for (int d = 2; d < 6; ++d) + { + for (int s = 5; s < 15; ++s) + { + params.max_size = s; + params.max_depth = d; + Variation variator = Variation(params, SS); + + RegressorProgram PRG1 = SS.make_regressor(d, 0, params); + PRG1.fit(data); + auto PRG1_model = PRG1.get_model("compact", true); + + RegressorProgram PRG2 = SS.make_regressor(d, 0, params); + PRG2.fit(data); + auto PRG2_model = PRG2.get_model("compact", true); + + + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model 1: {}\n" + "Initial Model 2: {}\n", + d, s, + PRG1.get_model("compact", true), + PRG2.get_model("compact", true) + ); + + ArrayXf y_pred = PRG1.predict(data); + fmt::print("cross one\n"); + + auto opt = variator.cross(PRG1, PRG2); + if (!opt){ + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Original model 1: {}\n" + "Original model 2: {}\n", + "Crossover failed to create a child", + d, s, + PRG1.get_model("compact", true), + PRG2.get_model("compact", true) + ); + } + else { + successes += 1; + auto Child = opt.value(); + fmt::print( + "Original model 1 after cross: {}\n" + "Original model 2 after cross: {}\n", + PRG1.get_model("compact", true), + PRG2.get_model("compact", true) + ); + fmt::print( + "Crossed Model: {}\n" + "=================================================\n", + Child.get_model("compact", true) + ); + Child.fit(data); + auto child_pred1 = Child.predict(data); + + // no collateral effect (parent still the same) + ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); + ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); + } + } + } + ASSERT_TRUE(successes > 0); +} + +TEST(Variation, CrossoverSizeAndDepthLimit) +{ + Parameters params; + + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, + 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); + + // prod operator --> arity 4: prod(T1, T2, T3) + // split best --> arity 6: if(terminal > value, T_case_true, T_case_false) + int max_arity = 6; + + int successes = 0; + for (int d = 1; d < 6; ++d) + { + for (int s = 5; s < 15; ++s) + { + params.max_size = s; + params.max_depth = d; + Variation variator = Variation(params, SS); + + // Enforcing that the parents does not exceed max_size by + // taking into account the highest arity of the function nodes + RegressorProgram PRG1 = SS.make_regressor(0, 0, params); + RegressorProgram PRG2 = SS.make_regressor(0, 0, params); + + auto PRG1_model = PRG1.get_model("compact", true); + auto PRG2_model = PRG2.get_model("compact", true); + + fmt::print( + "=================================================\n" + "settings: depth = {}, size= {}\n" + "Original model 1: {}\n" + "depth = {}, size= {}\n" + "Original model 2: {}\n" + "depth = {}, size= {}\n", + d, s, + PRG1.get_model("compact", true), + PRG1.depth(), PRG1.size(), + PRG2.get_model("compact", true), + PRG2.depth(), PRG2.size() + ); + + fmt::print("cross\n"); + auto opt = variator.cross(PRG1, PRG2); + + if (!opt){ + fmt::print("Crossover failed to create a child" + "=================================================\n"); + } + else { + successes += 1; + auto Child = opt.value(); + fmt::print( + "Child Model : {}\n" + "Child Model depth: {}\n" + "Child Model size : {}\n" + "=================================================\n", + Child.get_model("compact", true), + Child.depth(), Child.size() + ); + + // Original didn't change + ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); + ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); + + // Child is within restrictions + ASSERT_TRUE(Child.size() > 0); + ASSERT_TRUE(Child.size() <= s + 3*max_arity); + + ASSERT_TRUE(Child.depth() >= 0); + ASSERT_TRUE(Child.depth() <= d); + } + } + } + ASSERT_TRUE(successes > 0); +} \ No newline at end of file diff --git a/tests/cpp/testsHeader.h b/tests/cpp/testsHeader.h index 24797088..e846c2e6 100644 --- a/tests/cpp/testsHeader.h +++ b/tests/cpp/testsHeader.h @@ -26,9 +26,16 @@ using std::stof; #include #include "../../src/init.h" +#include "../../src/params.h" #include "../../src/data/data.h" #include "../../src/program/operator.h" +#include "../../src/program/dispatch_table.h" +#include "../../src/program/program.h" +#include "../../src/individual.h" +#include "../../src/search_space.h" #include "../../src/variation.h" +#include "../../src/variation.cpp" // TODO: is this ok? + using namespace Brush; using namespace Brush::Data; using namespace Brush::Var; From 5f7957a147a93514c7dab52f9540d0afff8c768b Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 15 Nov 2023 16:04:11 -0500 Subject: [PATCH 098/199] Templated selection. Bug fixes. --- src/cbrush.cpp | 23 ++++++++-------- src/cbrush.h | 5 ++-- src/eval/evaluation.cpp | 8 +++--- src/individual.h | 5 ++-- src/params.h | 2 +- src/population.cpp | 46 ++++++++----------------------- src/population.h | 6 ++-- src/selection/nsga2.cpp | 29 +++++++++++++------ src/selection/nsga2.h | 5 ++-- src/selection/selection.cpp | 55 ++++++++++++------------------------- src/selection/selection.h | 42 ++++++++++++++++------------ src/variation.cpp | 11 ++++---- 12 files changed, 110 insertions(+), 127 deletions(-) diff --git a/src/cbrush.cpp b/src/cbrush.cpp index 18bf123b..fafed3e4 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -84,21 +84,21 @@ void CBrush::run_generation(unsigned int g, Dataset &data) auto batch = data.get_batch(); // will return the original dataset if it is set to dont use batch - vector> island_parents; + vector> island_parents; island_parents.resize(pop.n_islands); taskflow.for_each_index(0, pop.n_islands, 1, [&](int island) { tuple island_range = pop.get_island_range(island); // fit the weights with all training data - evaluator.fitness(pop.individuals, island_range, data, params, true, false); - evaluator.validation(pop.individuals, island_range, data, params, false); + evaluator.fitness(pop, island_range, data, params, true, false); + evaluator.validation(pop, island_range, data, params, false); // TODO: if using batch, fitness should be called before selection to set the batch if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.fitness(pop.individuals, island_range, batch, params, false, false); + evaluator.fitness(pop, island_range, batch, params, false, false); // select parents - vector parents = selector.select(pop, island_range, data); + vector parents = selector.select(pop, island_range, params, data); island_parents.at(island) = parents; }); @@ -111,15 +111,14 @@ void CBrush::run_generation(unsigned int g, Dataset &data) // // variation to produce offspring variator.vary(pop, island_range, island_parents.at(island)); - // TODO: needs to create the evaluator (and calculate the information on train and validation partition) - evaluator.fitness(pop.individuals, island_range, data, params, true, true); - evaluator.validation(pop.individuals, island_range, data, params, true); + evaluator.fitness(pop, island_range, data, params, true, true); + evaluator.validation(pop, island_range, data, params, true); if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.fitness(pop.individuals, island_range, batch, params, false, true); + evaluator.fitness(pop, island_range, batch, params, false, true); - // // select survivors from combined pool of parents and offspring - auto island_survivors = survivor.survive(pop, island_range, data); + // select survivors from combined pool of parents and offspring + auto island_survivors = survivor.survive(pop, island_range, params, data); auto [idx_start, idx_end] = island_range; size_t delta = idx_end - idx_start; @@ -131,6 +130,7 @@ void CBrush::run_generation(unsigned int g, Dataset &data) // // reduce population to survivors pop.update(survivors); + // pop.migrate(); bool updated_best = update_best(data); } @@ -148,6 +148,7 @@ void CBrush::fit(MatrixXf& X, VectorXf& y) this->selector = Selection(params.sel, false); this->survivor = Selection(params.surv, true); + // TODO: initialize (set operator) for survivor and selector // initialize population with initial model and/or starting pop pop.init(this->ss, this->params); diff --git a/src/cbrush.h b/src/cbrush.h index 33ea7232..8a842bf8 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -134,11 +134,12 @@ class CBrush{ Parameters params; ///< hyperparameters of brush SearchSpace ss; + // TODO: make other classes like selection (no template), or make selection like other classes? Population pop; ///< population of programs - Selection selector; ///< selection algorithm + Selection selector; ///< selection algorithm Evaluation evaluator; ///< evaluation code Variation variator; ///< variation operators - Selection survivor; ///< survival algorithm + Selection survivor; ///< survival algorithm // TODO: MISSING CLASSES: timer, archive, logger diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index ea221953..36d94ed4 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -51,7 +51,7 @@ void Evaluation::validation(Population& pop, else { // TODO: implement the class weights and use it here (and on fitness) - auto y_pred = ind.predict(data.get_validation_data); + VectorXf y_pred = ind.program.predict(data.get_validation_data()); assign_fit(ind, y_pred, data, params, true); } } @@ -86,7 +86,7 @@ void Evaluation::fitness(Population& pop, for (unsigned i = idx_start; i& ind = pop[i]; + Individual& ind = pop.individuals.at(i); bool pass = true; @@ -99,9 +99,9 @@ void Evaluation::fitness(Population& pop, { // assign weights to individual if (fit) - ind.fit(data); + ind.program.fit(data); - auto y_pred = ind.predict(data.get_training_data); + VectorXf y_pred = ind.program.predict(data.get_training_data()); assign_fit(ind, y_pred, data, params, false); } } diff --git a/src/individual.h b/src/individual.h index 896e8456..2e479ef1 100644 --- a/src/individual.h +++ b/src/individual.h @@ -2,6 +2,7 @@ #define INDIVIDUAL_H #include "program/program.h" +#include "search_space.h" namespace Brush{ namespace Pop{ @@ -41,9 +42,9 @@ class Individual{ Individual(Program& prg) : Individual() { program = prg; }; - void init(const SearchSpace& ss, const Parameters& params) + void init(SearchSpace& ss, const Parameters& params) { - program = SS.make_program(params, 0, 0); + program = ss.make_program>(params, 0, 0); // If different from zero, then the program is created with a fixed depth and size. // If zero, it samples the value diff --git a/src/params.h b/src/params.h index 35ef31ae..5d2d9534 100644 --- a/src/params.h +++ b/src/params.h @@ -32,7 +32,7 @@ struct Parameters int gens = 100; unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size unsigned int max_size = 50; - vector objectives{"error","complexity"}; // error should be generic and deducted based on mode + vector objectives{"fitness","complexity"}; // error should be generic and deducted based on mode string sel = "nsga2"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; diff --git a/src/population.cpp b/src/population.cpp index 7f60a617..b7b0deff 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -32,6 +32,7 @@ void Population::set_island_ranges() template Population::Population(int p, int n_islands) { + // this calls the default constructor for the container template class individuals.resize(p); this->n_islands=n_islands; @@ -44,7 +45,7 @@ Population::Population(int p, int n_islands) } template -void Population::init(const SearchSpace& ss, const Parameters& params) +void Population::init(SearchSpace& ss, const Parameters& params) { this->mig_prob = params.mig_prob; @@ -69,7 +70,7 @@ void Population::prep_offspring_slots() if (offspring_ready) HANDLE_ERROR_THROW("Allocating space in population that already has active offspring slots"); - vector*> expanded_pop; + vector> expanded_pop; expanded_pop.resize(2*individuals.size()); for (int i=0; i::prep_offspring_slots() expanded_pop.at(2*idx_start + j) = individuals.at(idx_start+j); } - // setting new island sizes (TODO: i think I can just call set island - // ranges again, but i need to do the math to see if floor operations - // will not accidentally migrate some individuals) - island_ranges.at(i) = {2*idx_start, 2*(idx_end + delta)}; + // // setting new island sizes (TODO: i think I can just call set island + // // ranges again, but i need to do the math to see if floor operations + // // will not accidentally migrate some individuals) + // island_ranges.at(i) = {2*idx_start, 2*(idx_end + delta)}; }; - this->individuals = &expanded_pop; + this->individuals = expanded_pop; + set_island_ranges(); offspring_ready = true; // Im keeping the offspring and parents in the same population object, because we @@ -178,30 +180,6 @@ vector Population::hall_of_fame(unsigned rank) { // this is used to migration and update archive at the end of a generation. expect islands without offspring - /* Returns individuals on the Pareto front, sorted by increasign complexity. */ - vector pf_islands; - pf_islands.resize(n_islands); - - for (int i=0; i pf; - - for (unsigned int i =idx_start; i pf(0); for (unsigned int i =0; i::migrate() size_t migrating_idx; // determine if incoming individual comes from global or local hall of fame if (r() < 0.5 && n_islands>1) { // from global hall of fame - migrating_idx = r.select_randomly( + migrating_idx = *r.select_randomly( global_hall_of_fame.begin(), global_hall_of_fame.end()); } @@ -252,11 +230,11 @@ void Population::migrate() } // picking other island - int other_island = r.select_randomly( + int other_island = *r.select_randomly( other_islands.begin(), other_islands.end()); - migrating_idx = r.select_randomly( + migrating_idx = *r.select_randomly( island_fronts.at(other_island).begin(), island_fronts.at(other_island).end()); } diff --git a/src/population.h b/src/population.h index bf2f54b7..24f255dc 100644 --- a/src/population.h +++ b/src/population.h @@ -18,7 +18,7 @@ class Population{ void set_island_ranges(); public: bool offspring_ready; - vector*> individuals; + vector> individuals; vector> island_ranges; vector island_skip; // number of indexes to skip for each island (when variation fails) unsigned int n_islands; @@ -27,10 +27,10 @@ class Population{ // TODO: taskflow needs to use n_islands as n_jobs Population(int p = 0, int n_islands=1); - ~Population(); + ~Population(){}; /// initialize population of programs with a starting model and/or from file - void init(const SearchSpace& ss, const Parameters& params); + void init(SearchSpace& ss, const Parameters& params); /// returns population size int size() { return individuals.size(); }; diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 66dc14ee..ca330bf5 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -8,6 +8,13 @@ using namespace Pop; using namespace Data; using namespace Sel; +template +NSGA2::NSGA2(bool surv) +{ + this->name = "nsga2"; + this->survival = surv; +} + template size_t NSGA2::tournament(vector>& pop, size_t i, size_t j) const { @@ -51,7 +58,7 @@ vector NSGA2::select(Population& pop, tuple island auto [idx_start, idx_end] = island_range; if (pop.offspring_ready) // dont look at offspring to select - idx_end = idx_end/2; + idx_end = (idx_end - idx_start)/2; size_t delta = idx_end - idx_start; @@ -71,8 +78,8 @@ vector NSGA2::select(Population& pop, tuple island for (int i = 0; i < delta; ++i) // selecting based on island_pool size { size_t winner = tournament(pop.individuals, - r.select_randomly(island_pool.begin(), island_pool.end()), - r.select_randomly(island_pool.begin(), island_pool.end())); + *r.select_randomly(island_pool.begin(), island_pool.end()), + *r.select_randomly(island_pool.begin(), island_pool.end())); selected.push_back(winner); } @@ -102,27 +109,32 @@ vector NSGA2::survive(Population& pop, tuple islan assert(pop.offspring_ready && "survival was called in an island with no offspring"); - size_t delta = idx_end - idx_start; + size_t delta = (idx_end - idx_start); // the whole island (pop + offspring) vector island_pool(delta); // array with indexes for the specific island_pool std::iota(island_pool.begin(), island_pool.end(), idx_start); // set objectives (this is when the obj vector is updated.) - #pragma omp parallel for + + fmt::print("-- first loop\n"); for (unsigned int i=0; i selected(0); int i = 0; while ( selected.size() + front.at(i).size() < delta/2 ) // (delta/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) { + fmt::print("-- crawd dist\n"); std::vector& Fi = front.at(i); // indices in front i crowding_distance(pop, front, i); // calculate crowding in Fi - + + fmt::print("-- select loop\n"); for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi selected.push_back(Fi.at(j)); @@ -131,8 +143,9 @@ vector NSGA2::survive(Population& pop, tuple islan crowding_distance(pop, front, i); // calculate crowding in final front to include std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); - - const int extra = params.pop_size - selected.size(); + + fmt::print("adding last front)\n"); + const int extra = delta/2 - selected.size(); for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] selected.push_back(front.at(i).at(j)); diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index ceb9f3ce..6e9f4eb6 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -17,14 +17,15 @@ using namespace Data; using namespace Sel; template -class NSGA2 : public SelectionOperator +class NSGA2 : public SelectionOperator { +public: // should operate only on a given island index /** NSGA-II based selection and survival methods. */ // if any of the islands have overlapping indexes, parallel access and modification should be ok (because i dont increase or decrease pop size, not change island ranges inside selection) - NSGA2(bool surv){ name = "nsga2"; survival = surv; }; + NSGA2(bool surv); ~NSGA2(){}; /// selection according to the survival scheme of NSGA-II diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 848c6d5b..fb2e01db 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -1,4 +1,5 @@ #include "selection.h" +#include "nsga2.h" // TODO: organize all namespaces namespace Brush { @@ -7,17 +8,8 @@ namespace Sel { using namespace Brush; using namespace Pop; -Selection::Selection() -{ - /*! - * set type of selection operator. - */ - this->type = "lexicase"; - this->survival = false; - this->set_operator(); -} - -Selection::Selection(string type, bool survival) +template +Selection::Selection(string type, bool survival) { /*! * set type of selection operator. @@ -27,51 +19,38 @@ Selection::Selection(string type, bool survival) this->set_operator(); } -void Selection::set_operator() +template +void Selection::set_operator() { - // if (this->type == "lexicase") - // pselector = std::make_shared(survival); - // else if (this->type == "fair_lexicase") - // pselector = std::make_shared(survival); - // else if (this->type == "pareto_lexicase") - // pselector = std::make_shared(survival); - // else if (this->type == "nsga2") - // pselector = std::make_shared(survival); - // else if (this->type == "tournament") - // pselector = std::make_shared(survival); - // else if (this->type == "offspring") // offspring survival - // pselector = std::make_shared(survival); - // else if (this->type == "random") // offspring survival - // pselector = std::make_shared(survival); - // else if (this->type == "simanneal") // offspring survival - // pselector = std::make_shared(survival); - // else - // WARN("Undefined Selection Operator " + this->type + "\n"); + if (this->type == "nsga2") + pselector = new NSGA2(survival); + else + HANDLE_ERROR_THROW("Undefined Selection Operator " + this->type + "\n"); } -Selection::~Selection(){} - /// return type of selectionoperator -string Selection::get_type(){ return pselector->name; } +template +string Selection::get_type(){ return pselector->name; } /// set type of selectionoperator -void Selection::set_type(string in){ type = in; set_operator();} +template +void Selection::set_type(string in){ type = in; set_operator();} /// perform selection template -vector Selection::select(Population& pop, tuple island_range, +vector Selection::select(Population& pop, tuple island_range, const Parameters& params, const Dataset& data) { - return pselector->select(pop, params, data); + return pselector->select(pop, island_range, params, data); } /// perform survival template -vector Selection::survive(Population& pop, tuple island_range, +vector Selection::survive(Population& pop, tuple island_range, const Parameters& params, const Dataset& data) { - return pselector->survive(pop, params, data); + return pselector->survive(pop, island_range, params, data); } } // selection diff --git a/src/selection/selection.h b/src/selection/selection.h index 13c9a5c3..4c0d1c43 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -8,6 +8,7 @@ license: GNU/GPL v3 #include "../init.h" #include "../params.h" +#include "../types.h" #include "../population.h" #include "../variation.h" @@ -22,31 +23,31 @@ using namespace Var; * @class SelectionOperator * @brief base class for selection operators. */ +template class SelectionOperator { public: bool survival; string name; - SelectionOperator(){} + // shoudn't have a constructor + // SelectionOperator(){}; - virtual ~SelectionOperator(); - - template - vector select(Population& pop, tuple island_range, + virtual ~SelectionOperator(){}; + + virtual vector select(Population& pop, tuple island_range, const Parameters& p, const Dataset& data) { - // THROW_INVALID_ARGUMENT("Undefined select() operation"); + HANDLE_ERROR_THROW("Undefined select() operation"); return vector(); - } + }; - template - vector survive(Population& pop, tuple island_range, + virtual vector survive(Population& pop, tuple island_range, const Parameters& p, const Dataset& data) { - // THROW_INVALID_ARGUMENT("Undefined select() operation"); + HANDLE_ERROR_THROW("Undefined select() operation"); return vector(); - } + }; }; // struct Parameters; // forward declaration of Parameters @@ -55,15 +56,23 @@ class SelectionOperator * @class Selection * @brief interfaces with selection operators. */ +template struct Selection { public: - shared_ptr pselector; + SelectionOperator* pselector; // TODO: THIS SHOULD BE A SHARED POINTER string type; bool survival; - Selection(); - ~Selection(); + //TODO: rewrite it as initializing parameters + Selection() + { + this->type = "nsga2"; + this->survival = false; + this->set_operator(); + }; + + ~Selection(){}; Selection(string type, bool survival); void set_operator(); @@ -73,17 +82,16 @@ struct Selection void set_type(string); /// perform selection. selection uses a pop that has no offspring space - template vector select(Population& pop, tuple island_range, const Parameters& params, const Dataset& data); /// perform survival. uses a pop with offspring space - template vector survive(Population& pop, tuple island_range, const Parameters& params, const Dataset& data); }; -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Selection, type, survival); +// TODO: MAKE THIS WORK +// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Selection, type, survival); } // selection } // Brush diff --git a/src/variation.cpp b/src/variation.cpp index eb977b9a..50128194 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -605,13 +605,13 @@ void Variation::vary(Population& pop, tuple island_range, // island (with island range) while (!opt) { - Individual& mom = pop.individuals.at( - r.select_randomly(parents.begin(), parents.end())); + const Individual& mom = pop[ + *r.select_randomly(parents.begin(), parents.end())]; if ( r() < parameters.cx_prob) // crossover { - Individual& dad = pop.individuals.at( - r.select_randomly(parents.begin(), parents.end())); + const Individual& dad = pop[ + *r.select_randomly(parents.begin(), parents.end())]; opt = cross(mom.program, dad.program); } @@ -622,7 +622,8 @@ void Variation::vary(Population& pop, tuple island_range, if (opt) // no optional value was returned { - auto child = opt.value(); + Program child = opt.value(); + assert(child.size()>0); pop.individuals.at(i) = Individual(child); } From c9d763ae67c0eb071e5f08310a7c205b99d89bc6 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 15 Nov 2023 16:04:42 -0500 Subject: [PATCH 099/199] Tests for population (working! I just dont know if they are correct yet) --- brush/pybrush.py | 8 +- ...evolution_step.cpp => test_evaluation.cpp} | 0 tests/cpp/test_params.cpp | 0 tests/cpp/test_population.cpp | 143 ++++++++++++++++-- tests/cpp/testsHeader.h | 3 +- 5 files changed, 139 insertions(+), 15 deletions(-) rename tests/cpp/{test_evolution_step.cpp => test_evaluation.cpp} (100%) create mode 100644 tests/cpp/test_params.cpp diff --git a/brush/pybrush.py b/brush/pybrush.py index cac154c6..d0665a1c 100644 --- a/brush/pybrush.py +++ b/brush/pybrush.py @@ -1,11 +1,11 @@ -from _brush import CBrush # TODO: stop calling cbrush +from _brush import CBrush, Dataset, SearchSpace # TODO: stop calling cbrush, rename it from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin # TODO? LOGGER AND ARCHIVE # TODO: GET DOCUMENTATION BACK -class PybrushEstimator(BaseEstimator): +class BrushEstimator(BaseEstimator): def __init__(self): self.cbrush_ = CBrush() @@ -28,12 +28,12 @@ def score(self,X,y,Z=None): pass -class PybrushRegressor(PybrushEstimator): +class BrushRegressor(BrushEstimator): def __init__(self,**kwargs): pass -class PybrushClassifier(PybrushEstimator): +class BrushClassifier(BrushEstimator): def __init__(self,**kwargs): pass diff --git a/tests/cpp/test_evolution_step.cpp b/tests/cpp/test_evaluation.cpp similarity index 100% rename from tests/cpp/test_evolution_step.cpp rename to tests/cpp/test_evaluation.cpp diff --git a/tests/cpp/test_params.cpp b/tests/cpp/test_params.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 68a407ee..0f7f53d5 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -1,18 +1,143 @@ #include "testsHeader.h" +#include "../../src/individual.cpp" +#include "../../src/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers +#include "../../src/eval/evaluation.cpp" +#include "../../src/selection/nsga2.cpp" +#include "../../src/selection/selection.cpp" + +using namespace Brush::Pop; +using namespace Brush::Sel; +using namespace Brush::Eval; + TEST(Population, PopulationTests) -{ +{ + // works with even and uneven pop sizes. (TODO: PARAMETERIZE this test to do it with even and uneven, and single individual pop) + + MatrixXf X(4,2); + VectorXf y(4); + + X << 0,1, + 0.47942554,0.87758256, + 0.84147098, 0.54030231, + 0.99749499, 0.0707372; + y << 3.0, 3.59159876, 3.30384889, 2.20720158; + + fmt::print("Initializing all classes;\n"); + Dataset data(X,y); + + SearchSpace SS; + SS.init(data); - Population pop; + Parameters params; + Population pop = Population(params.pop_size, params.num_islands); + + // aux classes (they are not tested in-depth in this file) + Evaluation evaluator = Evaluation(params.scorer_); + Selection selector = Selection(params.sel, false); + Selection survivor = Selection(params.surv, true); + Variation variator = Variation(params, SS); + + selector.set_operator(); + survivor.set_operator(); + + // size, all individuals were initialized + ASSERT_TRUE(pop.size() == pop.individuals.size() + && pop.size() == params.pop_size); + + fmt::print("Initializing individuals in the population:\n"); + pop.init(SS, params); + for (auto& ind : pop.individuals) + { + fmt::print("Individual: {}\n", ind.program.get_model("compact", true)); + } - // size, - // island sizes growns and comes back to the same, - // update and prep offspring slots. - // no overlap in island indexes. - // works with even and uneven pop sizes. - // initialize population works? - // migrate? // print models + fmt::print("Printing from population method:\n{}\n", pop.print_models()); + + // no overlap in island indexes + + fmt::print("Testing island ranges\n"); + for (std::size_t i = 0; i < pop.island_ranges.size() - 1; ++i) { + int last = std::get<1>(pop.island_ranges.at(i)); + int next_first = std::get<0>(pop.island_ranges.at(i+1)); + + //(last index from one island is EQUAL than first) (no gaps between island) + // (this assumes that we will never iterate to the last index in for loops. TODO: make sure we dont) + ASSERT_TRUE(last == next_first); + + // difference between island sizes is at most 1 + auto delta = last - std::get<0>(pop.island_ranges.at(i)); + auto next_delta = std::get<1>(pop.island_ranges.at(i+1)) - next_first; + ASSERT_TRUE(delta <= next_delta+1 && next_delta <= delta+1); + } + + // island sizes increases and comes back to the same values after update + fmt::print("Performing all steps of an evolution\n"); + auto original_islands = pop.island_ranges; + for (int i=0; i<10; ++i) // update and prep offspring slots works properly + { // wax on wax off + + fmt::print("Evaluating population\n"); + vector> island_parents; + island_parents.resize(pop.n_islands); + for (int j=0; j(pop.get_island_range(j)), + std::get<1>(pop.get_island_range(j)) ); + + fmt::print("Fitness\n"); + // we can calculate the fitness for each island + evaluator.fitness(pop, pop.get_island_range(j), data, params, true, false); + + fmt::print("Selection\n"); + // just so we can call the update method + vector parents = selector.select(pop, pop.get_island_range(j), params, data); + + ASSERT_TRUE(parents.size() > 0); + fmt::print("Updating parents\n"); + island_parents.at(j) = parents; + } + fmt::print("Preparing offspring\n"); + pop.prep_offspring_slots(); + ASSERT_TRUE(pop.size() == params.pop_size*2); + + fmt::print("Preparing survivors\n"); + vector survivors(params.pop_size); + for (int j=0; j Date: Thu, 16 Nov 2023 16:41:28 -0500 Subject: [PATCH 100/199] Rewriting to store island ind indexes --- instead of (start_idx, end_idx) --- src/population.cpp | 188 +++++++++++++++------------------- src/population.h | 43 ++++---- src/selection/nsga2.cpp | 121 +++++++--------------- src/selection/nsga2.h | 19 ++-- src/selection/selection.cpp | 8 +- src/selection/selection.h | 9 +- tests/cpp/test_population.cpp | 137 ++++++++++++------------- 7 files changed, 227 insertions(+), 298 deletions(-) diff --git a/src/population.cpp b/src/population.cpp index b7b0deff..f07d984a 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -1,97 +1,76 @@ -/* FEAT -copyright 2017 William La Cava -license: GNU/GPL v3 -*/ - #include "population.h" namespace Brush{ namespace Pop{ - template -void Population::set_island_ranges() +Population::Population() { - // everytime we change popsize, this function must be called + individuals.resize(0); + mig_prob = 0.0; + pop_size = 0; + n_islands = 0; +} +template +void Population::init(SearchSpace& ss, const Parameters& params) +{ + this->mig_prob = params.mig_prob; + this->pop_size = params.pop_size; + this->n_islands=params.num_islands; + // Tuples with start and end indexes for each island. Number of individuals // in each island can slightly differ if N_ISLANDS is not a divisor of p (popsize) - island_ranges.resize(n_islands); + island_indexes.resize(n_islands); - size_t p = size(); // population size + size_t p = pop_size; // population size for (int i=0; i -Population::Population(int p, int n_islands) -{ - // this calls the default constructor for the container template class - individuals.resize(p); + size_t idx_start = std::floor(i*p/n_islands); + size_t idx_end = std::floor((i+1)*p/n_islands); - this->n_islands=n_islands; - set_island_ranges(); + auto delta = idx_end - idx_start; - island_skip.resize(n_islands); - iota(island_skip.begin(), island_skip.end(), 0); + island_indexes.at(i).resize(delta); + iota(island_indexes.at(i).begin(), island_indexes.at(i).end(), idx_start); + }; - offspring_ready = false; -} + // TODO: load file (like feat) -template -void Population::init(SearchSpace& ss, const Parameters& params) -{ - this->mig_prob = params.mig_prob; + // this calls the default constructor for the container template class + individuals.resize(2*p); // we will never increase or decrease the size during execution (because is not thread safe). this way, theres no need to sync between selecting and varying the population - // TODO: load file (like feat) #pragma omp parallel for - for (int i = 0; i< individuals.size(); ++i) + for (int i = 0; i< p; ++i) { - individuals.at(i).init(ss, params); + individuals.at(i) = std::make_shared>(); + individuals.at(i)->init(ss, params); } } /// update individual vector size and island indexes template -void Population::prep_offspring_slots() +void Population::prep_offspring_slots(int island) { // reading and writing is thread-safe, as long as there's no overlap on island ranges. // manipulating a vector IS NOT thread-safe (inserting and erasing elements). // So, prep_offspring_slots and update should be the synchronization points, not // operations performed concurrently - // TODO: add _SingleThreaded in funcname - if (offspring_ready) - HANDLE_ERROR_THROW("Allocating space in population that already has active offspring slots"); - - vector> expanded_pop; - expanded_pop.resize(2*individuals.size()); + size_t p = pop_size; // population size. prep_offspring slots will douple the population, adding the new expressions into the islands + + // this is going to be tricky (pay attention to delta and p use) + size_t idx_start = std::floor(island*p/n_islands); + size_t idx_end = std::floor((island+1)*p/n_islands); - for (int i=0; iindividuals = expanded_pop; - set_island_ranges(); - offspring_ready = true; + // inserting indexes of the offspring + island_indexes.at(island).resize(delta*2); + iota( + island_indexes.at(island).begin() + p, island_indexes.at(island).end(), + p+idx_start); // Im keeping the offspring and parents in the same population object, because we // have operations that require them together (archive, hall of fame.) @@ -100,23 +79,33 @@ void Population::prep_offspring_slots() } template -void Population::update(vector survivors) +void Population::update(vector> survivors) { - if (!offspring_ready) - HANDLE_ERROR_THROW("Shrinking a population that has no active offspring"); - - assert(survivors.size() == individuals.size()/2 - && "Cant shrink a population to a size different from the original initial size"); - - vector pop_idx(individuals.size()); - std::iota(pop_idx.begin(),pop_idx.end(),0); - std::reverse(pop_idx.begin(),pop_idx.end()); - for (const auto& i : pop_idx) - if (!in(survivors,i)) - individuals.erase(individuals.begin()+i); + vector>> new_pop; + new_pop.resize(pop_size); + size_t i=0; + for (int j=0; jset_complexity(); - set_island_ranges(); - offspring_ready = false; + ++i; + } + + // need to make island point to original range + size_t idx_start = std::floor(j*size/n_islands); + size_t idx_end = std::floor((j+1)*size/n_islands); + + auto delta = idx_end - idx_start; + + // inserting indexes of the offspring + island_indexes.at(j).resize(delta); + iota(island_indexes.at(j).begin(), island_indexes.at(j).end(), idx_start); + } + individuals = new_pop; } template @@ -125,23 +114,15 @@ string Population::print_models(bool just_offspring, string sep) // not printing the island each individual belongs to string output = ""; - for (int i=0; iget_model() + sep; + + return output; } template @@ -155,13 +136,13 @@ vector> Population::sorted_front(unsigned rank) for (int i=0; i pf; - for (unsigned int i =idx_start; irank == rank) pf.push_back(i); } std::sort(pf.begin(),pf.end(),SortComplexity(*this)); @@ -174,7 +155,6 @@ vector> Population::sorted_front(unsigned rank) return pf_islands; } - template vector Population::hall_of_fame(unsigned rank) { @@ -183,12 +163,13 @@ vector Population::hall_of_fame(unsigned rank) vector pf(0); for (unsigned int i =0; irank == rank) pf.push_back(i); } std::sort(pf.begin(),pf.end(),SortComplexity(*this)); auto it = std::unique(pf.begin(),pf.end(),SameFitComplexity(*this)); pf.resize(std::distance(pf.begin(),it)); + return pf; } @@ -196,8 +177,10 @@ vector Population::hall_of_fame(unsigned rank) template void Population::migrate() { - assert(!offspring_ready - && "pop with offspring dont migrate (run update before calling this)"); + // changes where island points to + + if (n_islands==1) + return; auto island_fronts = sorted_front(); auto global_hall_of_fame = hall_of_fame(); @@ -205,14 +188,14 @@ void Population::migrate() // This is not thread safe (as it is now) for (int island=0; island1) { // from global hall of fame + if (r() < 0.5) { // from global hall of fame migrating_idx = *r.select_randomly( global_hall_of_fame.begin(), global_hall_of_fame.end()); @@ -239,12 +222,11 @@ void Population::migrate() island_fronts.at(other_island).end()); } - individuals.at(i) = individuals.at(migrating_idx); + island_indexes.at(i) = migrating_idx; } } } } - } // Pop } // Brush diff --git a/src/population.h b/src/population.h index 24f255dc..9b0fa035 100644 --- a/src/population.h +++ b/src/population.h @@ -14,18 +14,20 @@ namespace Pop { template class Population{ -private: - void set_island_ranges(); public: - bool offspring_ready; - vector> individuals; - vector> island_ranges; - vector island_skip; // number of indexes to skip for each island (when variation fails) + size_t pop_size; unsigned int n_islands; float mig_prob; + vector>> individuals; + + // TODO: MAKE SURE THIS TWO ITEMS BELOW ARE TAKEN CARE IN THE MAIN LOOP AND IN TEST_POPULATION (I may need to create new methods for taking care of this) + // - fitting, fitness calculation, and setting the objectives are not thread safe because we write in individual attributes. + // - prepare offspring and update are not thread safe because we insert/delete elements from the array. + vector> island_indexes; + // TODO: taskflow needs to use n_islands as n_jobs - Population(int p = 0, int n_islands=1); + Population(); ~Population(){}; @@ -33,21 +35,20 @@ class Population{ void init(SearchSpace& ss, const Parameters& params); /// returns population size - int size() { return individuals.size(); }; + int size() { return pop_size; }; - tuple get_island_range(int island) { - return island_ranges.at(island); }; + vector get_island_indexes(int island){ return island_indexes.at(island); }; /// update individual vector size, distributing the expressions in n_islands - void prep_offspring_slots(); + // TODO: add elements to the end + void prep_offspring_slots(int island); - // TODO: WORK WITH ISLANDS - /// reduce programs to the indices in survivors. - void update(vector survivors); + /// reduce programs to the indices in survivors. Not thread safe,as it removes elements + void update(vector> survivors); /// setting and getting from individuals vector (will ignore islands) - const Individual operator [](size_t i) const {return individuals.at(i);} - const Individual & operator [](size_t i) {return individuals.at(i);} + const Individual& operator [](size_t i) const {return *individuals.at(i);} + const Individual& operator [](size_t i) {return *individuals.at(i);} /// return population equations. string print_models(bool just_offspring=false, string sep="\n"); @@ -60,16 +61,16 @@ class Population{ // perform a migration in the population. Individuals from sorted front or hall of fame will replace others by the // probability set in parameters. Expects a population without offspring - void migrate(); + void migrate(); // TODO: change just the indexes - /// Sort each island in increasing complexity. + /// Sort each island in increasing complexity. This is not thread safe. I should set complexities of the whole population before calling it, and use get_complexity instead struct SortComplexity { Population& pop; SortComplexity(Population& p): pop(p){} bool operator()(size_t i, size_t j) { - return pop.individuals[i].set_complexity() < pop.individuals[j].set_complexity(); + return pop.individuals[i].get_complexity() < pop.individuals[j].get_complexity(); } }; @@ -80,8 +81,8 @@ class Population{ SameFitComplexity(Population& p): pop(p){} bool operator()(size_t i, size_t j) { - return (pop.individuals[i].fitness == pop.individuals[j].fitness && - pop.individuals[i].set_complexity() == pop.individuals[j].set_complexity()); + return (pop.individuals[i].fitness == pop.individuals[j].fitness + && pop.individuals[i].get_complexity() == pop.individuals[j].get_complexity()); } }; }; diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index ca330bf5..6de51a4f 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -16,11 +16,11 @@ NSGA2::NSGA2(bool surv) } template -size_t NSGA2::tournament(vector>& pop, size_t i, size_t j) const +size_t NSGA2::tournament(Population& pop, size_t i, size_t j) const { // gets two individuals and compares them. i and j bhould be within island range - Individual& ind1 = pop.at(i); - Individual& ind2 = pop.at(j); + const Individual& ind1 = pop[i]; + const Individual& ind2 = pop[j]; // TODO: implement this int flag = ind1.check_dominance(ind2); @@ -38,46 +38,23 @@ size_t NSGA2::tournament(vector>& pop, size_t i, size_t j) cons } template -vector NSGA2::select(Population& pop, tuple island_range, +vector NSGA2::select(Population& pop, int island, const Parameters& params, const Dataset& d) { - /* Selection using Pareto tournaments. - * - * Input: - * - * pop: population of programs. - * params: parameters. - * r: random number generator - * - * Output: - * - * selected: vector of indices corresponding to pop that are selected. - * modifies individual ranks, objectives and dominations. - */ - - auto [idx_start, idx_end] = island_range; - - if (pop.offspring_ready) // dont look at offspring to select - idx_end = (idx_end - idx_start)/2; - - size_t delta = idx_end - idx_start; - - vector island_pool(delta); - std::iota(island_pool.begin(), island_pool.end(), idx_start); + auto island_pool = pop.get_island_indexes(island); // if this is first generation, just return indices to pop if (params.current_gen==0) return island_pool; // setting the objectives - for (unsigned int i=0; iset_obj(params.objectives); vector selected(0); - - for (int i = 0; i < delta; ++i) // selecting based on island_pool size + for (int i = 0; i < island_pool.size(); ++i) // selecting based on island_pool size { - size_t winner = tournament(pop.individuals, + size_t winner = tournament(pop, *r.select_randomly(island_pool.begin(), island_pool.end()), *r.select_randomly(island_pool.begin(), island_pool.end())); @@ -87,48 +64,26 @@ vector NSGA2::select(Population& pop, tuple island } template -vector NSGA2::survive(Population& pop, tuple island_range, +vector NSGA2::survive(Population& pop, int island, const Parameters& params, const Dataset& d) { - /* Selection using the survival scheme of NSGA-II. - * - * Input: - * - * pop: population of programs. - * params: parameters. - * r: random number generator - * - * Output: - * - * selected: vector of indices corresponding to pop that are selected. - * modifies individual ranks, objectives and dominations. - */ - - auto [idx_start, idx_end] = island_range; - - assert(pop.offspring_ready - && "survival was called in an island with no offspring"); - - size_t delta = (idx_end - idx_start); // the whole island (pop + offspring) - - vector island_pool(delta); // array with indexes for the specific island_pool - std::iota(island_pool.begin(), island_pool.end(), idx_start); + auto island_pool = pop.get_island_indexes(island); // set objectives (this is when the obj vector is updated.) fmt::print("-- first loop\n"); - for (unsigned int i=0; iset_obj(params.objectives); // fast non-dominated sort fmt::print("-- fast nds\n"); - auto front = fast_nds(pop.individuals, island_pool); + auto front = fast_nds(pop, island_pool); fmt::print("-- while loop\n"); // Push back selected individuals until full vector selected(0); int i = 0; - while ( selected.size() + front.at(i).size() < delta/2 ) // (delta/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) + while ( selected.size() + front.at(i).size() < island_pool.size()/2 ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) { fmt::print("-- crawd dist\n"); std::vector& Fi = front.at(i); // indices in front i @@ -145,7 +100,7 @@ vector NSGA2::survive(Population& pop, tuple islan std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); fmt::print("adding last front)\n"); - const int extra = delta/2 - selected.size(); + const int extra = island_pool.size()/2 - selected.size(); for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] selected.push_back(front.at(i).at(j)); @@ -153,7 +108,7 @@ vector NSGA2::survive(Population& pop, tuple islan } template -vector> NSGA2::fast_nds(vector>& individuals, vector& island_pool) +vector> NSGA2::fast_nds(Population& pop, vector& island_pool) { //< the Pareto fronts vector> front; @@ -167,13 +122,13 @@ vector> NSGA2::fast_nds(vector>& individuals, vecto std::vector dom; int dcount = 0; - Individual& p = individuals.at(island_pool[i]); + auto p = pop.individuals.at(island_pool[i]); for (int j = 0; j < island_pool.size(); ++j) { - Individual& q = individuals.at(island_pool[j]); + const Individual& q = pop[island_pool[j]]; - int compare = p.check_dominance(q); + int compare = p->check_dominance(q); if (compare == 1) { // p dominates q //p.dominated.push_back(j); dom.push_back(island_pool[j]); @@ -185,12 +140,12 @@ vector> NSGA2::fast_nds(vector>& individuals, vecto #pragma omp critical { - p.dcounter = dcount; - p.dominated.clear(); - p.dominated = dom; // dom will have values already referring to island indexes + p->dcounter = dcount; + p->dominated.clear(); + p->dominated = dom; // dom will have values already referring to island indexes - if (p.dcounter == 0) { - p.set_rank(1); + if (p->dcounter == 0) { + p->set_rank(1); // front will have values already referring to island indexes front.at(0).push_back(island_pool[i]); } @@ -209,16 +164,16 @@ vector> NSGA2::fast_nds(vector>& individuals, vecto std::vector Q; for (int i = 0; i < fronti.size(); ++i) { - Individual& p = individuals.at(fronti.at(i)); + const Individual& p = pop[fronti.at(i)]; // iterating over dominated individuals for (int j = 0; j < p.dominated.size() ; ++j) { - Individual& q = individuals.at(p.dominated.at(j)); - q.dcounter -= 1; + auto q = pop.individuals.at(p.dominated.at(j)); + q->dcounter -= 1; - if (q.dcounter == 0) { - q.set_rank(fi+1); + if (q->dcounter == 0) { + q->set_rank(fi+1); Q.push_back(p.dominated.at(j)); } } @@ -240,26 +195,26 @@ void NSGA2::crowding_distance(Population& pop, vector>& front, const int fsize = F.size(); for (int i = 0; i < fsize; ++i) - pop.individuals.at(F.at(i)).crowd_dist = 0; + pop.individuals.at(F.at(i))->crowd_dist = 0; - const int limit = pop.individuals.at(0).obj.size(); + const int limit = pop.individuals.at(0)->obj.size(); for (int m = 0; m < limit; ++m) { std::sort(F.begin(), F.end(), comparator_obj(pop,m)); // in the paper dist=INF for the first and last, in the code // this is only done to the first one or to the two first when size=2 - pop.individuals.at(F.at(0)).crowd_dist = std::numeric_limits::max(); + pop.individuals.at(F.at(0))->crowd_dist = std::numeric_limits::max(); if (fsize > 1) - pop.individuals.at(F.at(fsize-1)).crowd_dist = std::numeric_limits::max(); + pop.individuals.at(F.at(fsize-1))->crowd_dist = std::numeric_limits::max(); for (int i = 1; i < fsize-1; ++i) { - if (pop.individuals.at(F.at(i)).crowd_dist != std::numeric_limits::max()) + if (pop.individuals.at(F.at(i))->crowd_dist != std::numeric_limits::max()) { // crowd over obj - pop.individuals.at(F.at(i)).crowd_dist += - (pop.individuals.at(F.at(i+1)).obj.at(m) - pop.individuals.at(F.at(i-1)).obj.at(m)) - / (pop.individuals.at(F.at(fsize-1)).obj.at(m) - pop.individuals.at(F.at(0)).obj.at(m)); + pop.individuals.at(F.at(i))->crowd_dist += + (pop.individuals.at(F.at(i+1))->obj.at(m) - pop.individuals.at(F.at(i-1))->obj.at(m)) + / (pop.individuals.at(F.at(fsize-1))->obj.at(m) - pop.individuals.at(F.at(0))->obj.at(m)); } } } diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 6e9f4eb6..58c3c51f 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -29,15 +29,15 @@ class NSGA2 : public SelectionOperator ~NSGA2(){}; /// selection according to the survival scheme of NSGA-II - vector select(Population& pop, tuple island_range, + vector select(Population& pop, int island, const Parameters& p, const Dataset& d); /// survival according to the survival scheme of NSGA-II - vector survive(Population& pop, tuple island_range, + vector survive(Population& pop, int island, const Parameters& p, const Dataset& d); //< Fast non-dominated sorting - vector> fast_nds(vector>&, vector&); + vector> fast_nds(Population&, vector&); // front cannot be an attribute because selection will be executed in different threads for different islands (this is a modificationf rom original FEAT code that I got inspiration) @@ -53,12 +53,13 @@ class NSGA2 : public SelectionOperator sort_n(const Population& population) : pop(population) {}; bool operator() (int i, int j) { - const Individual& ind1 = pop.individuals[i]; - const Individual& ind2 = pop.individuals[j]; - if (ind1.rank < ind2.rank) + // TODO: Improve operator[], and decrease use of pop.individuals.at(). Also, decrease number of auto declarations + auto ind1 = pop.individuals[i]; + auto ind2 = pop.individuals[j]; + if (ind1->rank < ind2->rank) return true; - else if (ind1.rank == ind2.rank && - ind1.crowd_dist > ind2.crowd_dist) + else if (ind1->rank == ind2->rank && + ind1->crowd_dist > ind2->crowd_dist) return true; return false; }; @@ -76,7 +77,7 @@ class NSGA2 : public SelectionOperator bool operator() (int i, int j) { return pop[i].obj[m] < pop[j].obj[m]; }; }; - size_t tournament(vector>& pop, size_t i, size_t j) const; + size_t tournament(Population& pop, size_t i, size_t j) const; }; } // selection diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index fb2e01db..8417c968 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -39,18 +39,18 @@ void Selection::set_type(string in){ type = in; set_operator();} /// perform selection template -vector Selection::select(Population& pop, tuple island_range, +vector Selection::select(Population& pop, int island, const Parameters& params, const Dataset& data) { - return pselector->select(pop, island_range, params, data); + return pselector->select(pop, island, params, data); } /// perform survival template -vector Selection::survive(Population& pop, tuple island_range, +vector Selection::survive(Population& pop, int island, const Parameters& params, const Dataset& data) { - return pselector->survive(pop, island_range, params, data); + return pselector->survive(pop, island, params, data); } } // selection diff --git a/src/selection/selection.h b/src/selection/selection.h index 4c0d1c43..1e19392f 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -29,20 +29,21 @@ class SelectionOperator public: bool survival; string name; + // TODO: implement lexicase // shoudn't have a constructor // SelectionOperator(){}; virtual ~SelectionOperator(){}; - virtual vector select(Population& pop, tuple island_range, + virtual vector select(Population& pop, int island, const Parameters& p, const Dataset& data) { HANDLE_ERROR_THROW("Undefined select() operation"); return vector(); }; - virtual vector survive(Population& pop, tuple island_range, + virtual vector survive(Population& pop, int island, const Parameters& p, const Dataset& data) { HANDLE_ERROR_THROW("Undefined select() operation"); @@ -82,11 +83,11 @@ struct Selection void set_type(string); /// perform selection. selection uses a pop that has no offspring space - vector select(Population& pop, tuple island_range, + vector select(Population& pop, int island, const Parameters& params, const Dataset& data); /// perform survival. uses a pop with offspring space - vector survive(Population& pop, tuple island_range, + vector survive(Population& pop, int island, const Parameters& params, const Dataset& data); }; diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 0f7f53d5..83ec1829 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -30,7 +30,7 @@ TEST(Population, PopulationTests) SS.init(data); Parameters params; - Population pop = Population(params.pop_size, params.num_islands); + Population pop = Population(); // aux classes (they are not tested in-depth in this file) Evaluation evaluator = Evaluation(params.scorer_); @@ -49,7 +49,7 @@ TEST(Population, PopulationTests) pop.init(SS, params); for (auto& ind : pop.individuals) { - fmt::print("Individual: {}\n", ind.program.get_model("compact", true)); + fmt::print("Individual: {}\n", ind->program.get_model("compact", true)); } // print models @@ -58,86 +58,75 @@ TEST(Population, PopulationTests) // no overlap in island indexes fmt::print("Testing island ranges\n"); - for (std::size_t i = 0; i < pop.island_ranges.size() - 1; ++i) { - int last = std::get<1>(pop.island_ranges.at(i)); - int next_first = std::get<0>(pop.island_ranges.at(i+1)); - - //(last index from one island is EQUAL than first) (no gaps between island) - // (this assumes that we will never iterate to the last index in for loops. TODO: make sure we dont) - ASSERT_TRUE(last == next_first); - - // difference between island sizes is at most 1 - auto delta = last - std::get<0>(pop.island_ranges.at(i)); - auto next_delta = std::get<1>(pop.island_ranges.at(i+1)) - next_first; - ASSERT_TRUE(delta <= next_delta+1 && next_delta <= delta+1); - } + // for (std::size_t i = 0; i < pop.island_ranges.size() - 1; ++i) { + // int last = std::get<1>(pop.island_ranges.at(i)); + // int next_first = std::get<0>(pop.island_ranges.at(i+1)); + + // //(last index from one island is EQUAL than first) (no gaps between island) + // // (this assumes that we will never iterate to the last index in for loops. TODO: make sure we dont) + // ASSERT_TRUE(last == next_first); + + // // difference between island sizes is at most 1 + // auto delta = last - std::get<0>(pop.island_ranges.at(i)); + // auto next_delta = std::get<1>(pop.island_ranges.at(i+1)) - next_first; + // ASSERT_TRUE(delta <= next_delta+1 && next_delta <= delta+1); + // } // island sizes increases and comes back to the same values after update - fmt::print("Performing all steps of an evolution\n"); - auto original_islands = pop.island_ranges; - for (int i=0; i<10; ++i) // update and prep offspring slots works properly - { // wax on wax off + // fmt::print("Performing all steps of an evolution\n"); + // auto original_islands = pop.island_ranges; + // for (int i=0; i<10; ++i) // update and prep offspring slots works properly + // { // wax on wax off - fmt::print("Evaluating population\n"); - vector> island_parents; - island_parents.resize(pop.n_islands); - for (int j=0; j(pop.get_island_range(j)), - std::get<1>(pop.get_island_range(j)) ); - - fmt::print("Fitness\n"); - // we can calculate the fitness for each island - evaluator.fitness(pop, pop.get_island_range(j), data, params, true, false); - - fmt::print("Selection\n"); - // just so we can call the update method - vector parents = selector.select(pop, pop.get_island_range(j), params, data); + // fmt::print("Evaluating population\n"); + // vector> survivors(pop.n_islands); + // vector> island_parents(pop.n_islands); + + // for (int j=0; j(pop.get_island_range(j)), + // std::get<1>(pop.get_island_range(j)) ); + + // fmt::print("Fitness\n"); + // // we can calculate the fitness for each island + // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, false); + + // fmt::print("Selection\n"); + // // just so we can call the update method + // vector parents = selector.select(pop, pop.get_island_range(j), params, data); - ASSERT_TRUE(parents.size() > 0); - fmt::print("Updating parents\n"); - island_parents.at(j) = parents; - } - - fmt::print("Preparing offspring\n"); - pop.prep_offspring_slots(); - ASSERT_TRUE(pop.size() == params.pop_size*2); - - fmt::print("Preparing survivors\n"); - vector survivors(params.pop_size); - for (int j=0; j 0); + // fmt::print("Updating parents\n"); + // island_parents.at(j) = parents; + + // fmt::print("Preparing offspring\n"); + // pop.prep_offspring_slots(j); + + // fmt::print("Variations for island {}\n", j); + // // variation applied to population + // variator.vary(pop, pop.get_island_range(j), island_parents.at(j)); + + // fmt::print("fitting {}\n", j); + // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, true); - fmt::print("survivors\n", j); - auto island_survivors = survivor.survive(pop, pop.get_island_range(j), params, data); + // fmt::print("survivors\n", j); + // auto island_survivors = survivor.survive(pop, pop.get_island_range(j), params, data); - fmt::print("Updating global array\n"); - auto [idx_start, idx_end] = pop.get_island_range(j); - - for (unsigned k = 0; k Date: Thu, 16 Nov 2023 19:58:07 -0500 Subject: [PATCH 101/199] Fixed printing individuals in population --- src/population.cpp | 22 ++++--- src/population.h | 5 +- tests/cpp/test_population.cpp | 120 ++++++++++++++++------------------ 3 files changed, 72 insertions(+), 75 deletions(-) diff --git a/src/population.cpp b/src/population.cpp index f07d984a..6cb160ff 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -82,7 +82,7 @@ template void Population::update(vector> survivors) { vector>> new_pop; - new_pop.resize(pop_size); + new_pop.resize(2*pop_size); size_t i=0; for (int j=0; j::print_models(bool just_offspring, string sep) // not printing the island each individual belongs to string output = ""; - int start = 0; - - if (just_offspring) - start = individuals.size()/2; + for (int j=0; jget_model() + sep; - - return output; + int start = 0; + + if (just_offspring) + start = island_indexes.at(j).size()/2; + + for (int k=start; kget_model() + sep; + } + return output; } template diff --git a/src/population.h b/src/population.h index 9b0fa035..612f2256 100644 --- a/src/population.h +++ b/src/population.h @@ -4,6 +4,7 @@ #include "search_space.h" #include "individual.h" #include "program/program.h" +#include "util/error.h" using std::vector; using std::string; @@ -34,8 +35,8 @@ class Population{ /// initialize population of programs with a starting model and/or from file void init(SearchSpace& ss, const Parameters& params); - /// returns population size - int size() { return pop_size; }; + /// returns population size (the effective size of the individuals) + int size() { return individuals.size(); }; vector get_island_indexes(int island){ return island_indexes.at(island); }; diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 83ec1829..0da60113 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -43,90 +43,82 @@ TEST(Population, PopulationTests) // size, all individuals were initialized ASSERT_TRUE(pop.size() == pop.individuals.size() - && pop.size() == params.pop_size); + && pop.size() == 0); //before initialization, it should be empty fmt::print("Initializing individuals in the population:\n"); pop.init(SS, params); - for (auto& ind : pop.individuals) + + fmt::print("pop.size() {}, pop.individuals.size() {}, params.pop_size, {}", + pop.size(), pop.individuals.size(), params.pop_size); + ASSERT_TRUE(pop.size() == pop.individuals.size() + && pop.size()/2 == params.pop_size); // now we have a population. + // Its size is actually the double, + // but the real value goes just up to the middle (no offspring was initialized) + + for (int i=0; iprogram.get_model("compact", true)); + fmt::print("{} ", i); + fmt::print("Individual: {}\n", + pop[i].program.get_model("compact", true)); } // print models - fmt::print("Printing from population method:\n{}\n", pop.print_models()); - - // no overlap in island indexes - - fmt::print("Testing island ranges\n"); - // for (std::size_t i = 0; i < pop.island_ranges.size() - 1; ++i) { - // int last = std::get<1>(pop.island_ranges.at(i)); - // int next_first = std::get<0>(pop.island_ranges.at(i+1)); - - // //(last index from one island is EQUAL than first) (no gaps between island) - // // (this assumes that we will never iterate to the last index in for loops. TODO: make sure we dont) - // ASSERT_TRUE(last == next_first); - - // // difference between island sizes is at most 1 - // auto delta = last - std::get<0>(pop.island_ranges.at(i)); - // auto next_delta = std::get<1>(pop.island_ranges.at(i+1)) - next_first; - // ASSERT_TRUE(delta <= next_delta+1 && next_delta <= delta+1); - // } + fmt::print("Printing from population method:\n"); + fmt::print("{}\n",pop.print_models()); // island sizes increases and comes back to the same values after update - // fmt::print("Performing all steps of an evolution\n"); - // auto original_islands = pop.island_ranges; - // for (int i=0; i<10; ++i) // update and prep offspring slots works properly - // { // wax on wax off + fmt::print("Performing all steps of an evolution\n"); + for (int i=0; i<10; ++i) // update and prep offspring slots works properly + { // wax on wax off - // fmt::print("Evaluating population\n"); - // vector> survivors(pop.n_islands); - // vector> island_parents(pop.n_islands); + // fmt::print("Evaluating population\n"); + // vector> survivors(pop.n_islands); + // vector> island_parents(pop.n_islands); - // for (int j=0; j(pop.get_island_range(j)), - // std::get<1>(pop.get_island_range(j)) ); - - // fmt::print("Fitness\n"); - // // we can calculate the fitness for each island - // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, false); - - // fmt::print("Selection\n"); - // // just so we can call the update method - // vector parents = selector.select(pop, pop.get_island_range(j), params, data); + // for (int j=0; j(pop.get_island_range(j)), + // std::get<1>(pop.get_island_range(j)) ); + + // fmt::print("Fitness\n"); + // // we can calculate the fitness for each island + // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, false); + + // fmt::print("Selection\n"); + // // just so we can call the update method + // vector parents = selector.select(pop, pop.get_island_range(j), params, data); - // ASSERT_TRUE(parents.size() > 0); - // fmt::print("Updating parents\n"); - // island_parents.at(j) = parents; + // ASSERT_TRUE(parents.size() > 0); + // fmt::print("Updating parents\n"); + // island_parents.at(j) = parents; - // fmt::print("Preparing offspring\n"); - // pop.prep_offspring_slots(j); + // fmt::print("Preparing offspring\n"); + // pop.prep_offspring_slots(j); - // fmt::print("Variations for island {}\n", j); - // // variation applied to population - // variator.vary(pop, pop.get_island_range(j), island_parents.at(j)); + // fmt::print("Variations for island {}\n", j); + // // variation applied to population + // variator.vary(pop, pop.get_island_range(j), island_parents.at(j)); - // fmt::print("fitting {}\n", j); - // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, true); + // fmt::print("fitting {}\n", j); + // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, true); - // fmt::print("survivors\n", j); - // auto island_survivors = survivor.survive(pop, pop.get_island_range(j), params, data); + // fmt::print("survivors\n", j); + // auto island_survivors = survivor.survive(pop, pop.get_island_range(j), params, data); - // survivors.at(j) = island_survivors; - // } + // survivors.at(j) = island_survivors; + // } - // fmt::print("Updating and migrating\n"); + // fmt::print("Updating and migrating\n"); - // // TODO: UPDATE SHOULD SORT SURVIVOR LIST AND REMOVE REPEATED VALUES - // pop.update(survivors); - // ASSERT_TRUE(pop.size() == params.pop_size); + // // TODO: UPDATE SHOULD SORT SURVIVOR LIST AND REMOVE REPEATED VALUES + // pop.update(survivors); + // ASSERT_TRUE(pop.size() == params.pop_size); - // pop.migrate(); - // ASSERT_TRUE(pop.size() == params.pop_size); + // pop.migrate(); + // ASSERT_TRUE(pop.size() == params.pop_size); - // fmt::print("Printing generation {} population:\n{}\n", i, pop.print_models()); - // } - // ASSERT_TRUE(original_islands == pop.island_ranges); + // fmt::print("Printing generation {} population:\n{}\n", i, pop.print_models()); + } } From 6f2b13c07b993f900df00fa11cd924dc3b043747 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 17 Nov 2023 13:58:03 -0500 Subject: [PATCH 102/199] Update to work with island indexes --- src/eval/evaluation.cpp | 46 +++++----------- src/eval/evaluation.h | 4 +- src/population.cpp | 31 +++++------ src/population.h | 6 +-- src/selection/nsga2.cpp | 10 +++- src/variation.cpp | 33 +++--------- src/variation.h | 3 +- tests/cpp/test_population.cpp | 98 +++++++++++++++++++---------------- 8 files changed, 102 insertions(+), 129 deletions(-) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index 36d94ed4..e43e9ee0 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -6,32 +6,21 @@ namespace Eval{ template void Evaluation::validation(Population& pop, - tuple island_range, + int island, const Dataset& data, const Parameters& params, bool offspring ) { - // if offspring false --> if has offspring, do it on first half. else, do on entire island - // offspring true --> assert that has offspring, do it on the second half of the island + auto idxs = pop.get_island_indexes(island); - auto [idx_start, idx_end] = island_range; - size_t delta = idx_end - idx_start; + int start = 0; if (offspring) - { - assert(pop.offspring_ready - && ("Population does not have offspring to calculate validation fitness")); - - idx_start = idx_start + (delta/2); - } - else if (pop.offspring_ready) // offspring is false. We need to see where we sould stop - { - idx_end = idx_end - (delta/2); - } + start = idxs.size()/2; - for (unsigned i = idx_start; i& ind = pop[i]; + Individual& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work // if there is no validation data, // set fitness_v to fitness and return ( this assumes that fitness on train was calculated previously.) @@ -60,33 +49,22 @@ void Evaluation::validation(Population& pop, // fitness of population template void Evaluation::fitness(Population& pop, - tuple island_range, + int island, const Dataset& data, const Parameters& params, bool fit, bool offspring ) { - // if offspring false --> if has offspring, do it on first half. else, do on entire island - // offspring true --> assert that has offspring, do it on the second half of the island + auto idxs = pop.get_island_indexes(island); - auto [idx_start, idx_end] = island_range; - size_t delta = idx_end - idx_start; + int start = 0; if (offspring) - { - assert(pop.offspring_ready - && ("Population does not have offspring to calculate validation fitness")); - - idx_start = idx_start + (delta/2); - } - else if (pop.offspring_ready) // offspring is false. We need to see where we sould stop - { - idx_end = idx_end - (delta/2); - } + start = idxs.size()/2; - for (unsigned i = idx_start; i& ind = pop.individuals.at(i); + Individual& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work bool pass = true; diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 5ddb141f..99b6d729 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -30,7 +30,7 @@ class Evaluation { // TODO: IMPLEMENT THIS /// validation of population. void validation(Population& pop, - tuple island_range, + int island, const Dataset& data, const Parameters& params, bool offspring = false @@ -40,7 +40,7 @@ class Evaluation { // TODO: MAKE it work for classification (do I need to have a way to set accuracy as a minimization problem?) /// fitness of population. void fitness(Population& pop, - tuple island_range, + int island, const Dataset& data, const Parameters& params, bool fit=true, diff --git a/src/population.cpp b/src/population.cpp index 6cb160ff..d0f3446d 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -41,7 +41,6 @@ void Population::init(SearchSpace& ss, const Parameters& params) // this calls the default constructor for the container template class individuals.resize(2*p); // we will never increase or decrease the size during execution (because is not thread safe). this way, theres no need to sync between selecting and varying the population - #pragma omp parallel for for (int i = 0; i< p; ++i) { individuals.at(i) = std::make_shared>(); @@ -50,7 +49,7 @@ void Population::init(SearchSpace& ss, const Parameters& params) } /// update individual vector size and island indexes -template +template // TODO: rename to include_offspring_indexes or something like this void Population::prep_offspring_slots(int island) { // reading and writing is thread-safe, as long as there's no overlap on island ranges. @@ -64,12 +63,12 @@ void Population::prep_offspring_slots(int island) size_t idx_start = std::floor(island*p/n_islands); size_t idx_end = std::floor((island+1)*p/n_islands); - auto delta = idx_end - idx_start; + auto delta = idx_end - idx_start; // island size // inserting indexes of the offspring island_indexes.at(island).resize(delta*2); iota( - island_indexes.at(island).begin() + p, island_indexes.at(island).end(), + island_indexes.at(island).begin() + delta, island_indexes.at(island).end(), p+idx_start); // Im keeping the offspring and parents in the same population object, because we @@ -92,12 +91,12 @@ void Population::update(vector> survivors) // update will set the complexities (for migration step. we do it here because update handles non-thread safe operations) new_pop.at(i)->set_complexity(); - ++i; + ++i; // this will fill just half of the pop } // need to make island point to original range - size_t idx_start = std::floor(j*size/n_islands); - size_t idx_end = std::floor((j+1)*size/n_islands); + size_t idx_start = std::floor(j*pop_size/n_islands); + size_t idx_end = std::floor((j+1)*pop_size/n_islands); auto delta = idx_end - idx_start; @@ -119,12 +118,14 @@ string Population::print_models(bool just_offspring, string sep) output += "island " + to_string(j) + ":\n"; int start = 0; - if (just_offspring) start = island_indexes.at(j).size()/2; - for (int k=start; kget_model() + sep; + for (int k=start; k& ind = *individuals.at(island_indexes.at(j).at(k)).get(); + output += ind.get_model() + sep; + } } return output; } @@ -138,12 +139,12 @@ vector> Population::sorted_front(unsigned rank) vector> pf_islands; pf_islands.resize(n_islands); - for (int i=0; i pf; - for (unsigned int& i : idxs) + for (unsigned int i : idxs) { // this assumes that rank was previously calculated. It is set in selection (ie nsga2) if the information is useful to select/survive if (individuals.at(i)->rank == rank) @@ -153,7 +154,7 @@ vector> Population::sorted_front(unsigned rank) auto it = std::unique(pf.begin(),pf.end(),SameFitComplexity(*this)); pf.resize(std::distance(pf.begin(),it)); - pf_islands.at(i) = pf; + pf_islands.at(j) = pf; } return pf_islands; @@ -226,7 +227,7 @@ void Population::migrate() island_fronts.at(other_island).end()); } - island_indexes.at(i) = migrating_idx; + island_indexes.at(island).at(i) = migrating_idx; } } } diff --git a/src/population.h b/src/population.h index 612f2256..545a284a 100644 --- a/src/population.h +++ b/src/population.h @@ -71,7 +71,7 @@ class Population{ SortComplexity(Population& p): pop(p){} bool operator()(size_t i, size_t j) { - return pop.individuals[i].get_complexity() < pop.individuals[j].get_complexity(); + return pop[i].get_complexity() < pop[j].get_complexity(); } }; @@ -82,8 +82,8 @@ class Population{ SameFitComplexity(Population& p): pop(p){} bool operator()(size_t i, size_t j) { - return (pop.individuals[i].fitness == pop.individuals[j].fitness - && pop.individuals[i].get_complexity() == pop.individuals[j].get_complexity()); + return (pop[i].fitness == pop[j].fitness + && pop[i].get_complexity() == pop[j].get_complexity()); } }; }; diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 6de51a4f..1c59cf8e 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -67,6 +67,12 @@ template vector NSGA2::survive(Population& pop, int island, const Parameters& params, const Dataset& d) { + + size_t idx_start = std::floor(island*pop.size()/pop.n_islands); + size_t idx_end = std::floor((island+1)*pop.size()/pop.n_islands); + + auto original_size = idx_end - idx_start; // island size + auto island_pool = pop.get_island_indexes(island); // set objectives (this is when the obj vector is updated.) @@ -83,7 +89,7 @@ vector NSGA2::survive(Population& pop, int island, // Push back selected individuals until full vector selected(0); int i = 0; - while ( selected.size() + front.at(i).size() < island_pool.size()/2 ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) + while ( selected.size() + front.at(i).size() < original_size ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) { fmt::print("-- crawd dist\n"); std::vector& Fi = front.at(i); // indices in front i @@ -100,7 +106,7 @@ vector NSGA2::survive(Population& pop, int island, std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); fmt::print("adding last front)\n"); - const int extra = island_pool.size()/2 - selected.size(); + const int extra = original_size - selected.size(); for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] selected.push_back(front.at(i).at(j)); diff --git a/src/variation.cpp b/src/variation.cpp index 50128194..8b0a7c99 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -568,34 +568,17 @@ std::optional> Variation::mutate(const Program& parent) } template -void Variation::vary(Population& pop, tuple island_range, +void Variation::vary(Population& pop, int island, const vector& parents) -{ - /*! - * performs variation on the current population. - * - * @param pop: current population - * @param parents: indices of population to use for variation - * @param params: feat parameters - * - * @return appends params.pop_size offspring derived from parent variation - */ - - assert(pop.offspring_ready - && ("Population does not have slots for generating the offspring. " - +"You should `prep_offspring_slots`. `vary` will add new xmen individuals " - +"starting from the middle of the island")); - - // parents should be within island range. TODO: assert that they are - - auto [idx_start, idx_end] = island_range; - size_t delta = idx_end - idx_start; - - idx_start = idx_start + (delta/2); +{ + auto idxs = pop.get_island_indexes(island); + + // assumes it should save new individuals in second half of the island + int start = idxs.size()/2; // TODO: fix pragma omp usage //#pragma omp parallel for - for (unsigned i = idx_start; i> opt=std::nullopt; // new individual @@ -625,7 +608,7 @@ void Variation::vary(Population& pop, tuple island_range, Program child = opt.value(); assert(child.size()>0); - pop.individuals.at(i) = Individual(child); + pop.individuals.at(idxs.at(i)) = std::make_shared>(child); } } } diff --git a/src/variation.h b/src/variation.h index 8fc34c00..564066fc 100644 --- a/src/variation.h +++ b/src/variation.h @@ -124,8 +124,7 @@ class Variation std::optional> mutate(const Program& parent); /// method to handle variation of population - void vary(Population& pop, tuple island_range, - const vector& parents); + void vary(Population& pop, int island, const vector& parents); }; } //namespace Var diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 0da60113..819bac09 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -30,6 +30,7 @@ TEST(Population, PopulationTests) SS.init(data); Parameters params; + params.pop_size = 20; // small pop just for tests Population pop = Population(); // aux classes (they are not tested in-depth in this file) @@ -64,61 +65,66 @@ TEST(Population, PopulationTests) // print models fmt::print("Printing from population method:\n"); - fmt::print("{}\n",pop.print_models()); + fmt::print("{}\n",pop.print_models()); // may yeld seg fault if string is too large for buffer // island sizes increases and comes back to the same values after update - fmt::print("Performing all steps of an evolution\n"); + fmt::print("Performing all steps of an evolution (sequential, not parallel)\n"); for (int i=0; i<10; ++i) // update and prep offspring slots works properly - { // wax on wax off - - // fmt::print("Evaluating population\n"); - // vector> survivors(pop.n_islands); - // vector> island_parents(pop.n_islands); - - // for (int j=0; j(pop.get_island_range(j)), - // std::get<1>(pop.get_island_range(j)) ); - - // fmt::print("Fitness\n"); - // // we can calculate the fitness for each island - // evaluator.fitness(pop, pop.get_island_range(j), data, params, true, false); - - // fmt::print("Selection\n"); - // // just so we can call the update method - // vector parents = selector.select(pop, pop.get_island_range(j), params, data); - - // ASSERT_TRUE(parents.size() > 0); - // fmt::print("Updating parents\n"); - // island_parents.at(j) = parents; + { + vector> survivors(pop.n_islands); - // fmt::print("Preparing offspring\n"); - // pop.prep_offspring_slots(j); + fmt::print("Evolution step {}\n", i); + for (int j=0; j parents = selector.select(pop, j, params, data); + ASSERT_TRUE(parents.size() > 0); - // survivors.at(j) = island_survivors; - // } - - // fmt::print("Updating and migrating\n"); - - // // TODO: UPDATE SHOULD SORT SURVIVOR LIST AND REMOVE REPEATED VALUES - // pop.update(survivors); - // ASSERT_TRUE(pop.size() == params.pop_size); + fmt::print("Preparing offspring\n"); + pop.prep_offspring_slots(j); - // pop.migrate(); - // ASSERT_TRUE(pop.size() == params.pop_size); + // variation applied to population + fmt::print("Variations for island {}\n", j); + variator.vary(pop, j, parents); - // fmt::print("Printing generation {} population:\n{}\n", i, pop.print_models()); + fmt::print("fitting {}\n", j); + evaluator.fitness(pop, j, data, params, true, true); + + fmt::print("survivors\n", j); + auto island_survivors = survivor.survive(pop, j, params, data); + survivors.at(j) = island_survivors; + } + + fmt::print("Updating and migrating\n"); + pop.update(survivors); + pop.migrate(); + + // TODO: print islands + fmt::print("Printing generation {} population:\n", i); + for (int i=0; i Date: Mon, 20 Nov 2023 14:15:16 -0500 Subject: [PATCH 103/199] Clean up. Moving to finish pending TODOs and fix python wrapper --- src/bindings/bind_dataset.cpp | 69 ---- src/bindings/bind_individuals.cpp | 1 + src/bindings/bind_individuals.h | 0 src/bindings/bind_params.cpp | 21 - src/bindings/bind_search_space.cpp | 39 -- src/bindings/module.cpp | 10 +- src/bindings/module.h | 2 + src/cbrush.cpp | 4 +- src/cbrush.h | 33 +- src/eval/evaluation.cpp | 1 + src/eval/evaluation.h | 1 - src/individual.cpp | 1 - src/params.h | 3 +- src/population.cpp | 31 +- src/population.h | 11 +- src/selection/nsga2.cpp | 15 +- src/selection/nsga2.h | 1 + src/selection/selection.h | 1 - tests/cpp/test_population.cpp | 14 +- tests/python/test_brush.py | 274 ++++++------- tests/python/test_optimization.py | 610 ++++++++++++++--------------- tests/python/test_params.py | 146 +++---- tests/python/test_program.py | 98 ++--- 23 files changed, 619 insertions(+), 767 deletions(-) delete mode 100644 src/bindings/bind_dataset.cpp create mode 100644 src/bindings/bind_individuals.h delete mode 100644 src/bindings/bind_params.cpp delete mode 100644 src/bindings/bind_search_space.cpp diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp deleted file mode 100644 index ade036dc..00000000 --- a/src/bindings/bind_dataset.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "module.h" -#include "../data/data.h" -#include "../types.h" -#include "../data/io.h" -namespace py = pybind11; -namespace br = Brush; -namespace nl = nlohmann; - -void bind_dataset(py::module & m) -{ - py::class_(m, "Dataset") - // construct from X, feature names (and optional validation and batch sizes) with constructor 3. - .def(py::init([](const Ref& X, - const vector& feature_names=vector(), - const float validation_size=0.0, - const float batch_size=1.0){ - return br::Data::Dataset( - X, feature_names, validation_size, batch_size); - }), - py::arg("X"), - py::arg("feature_names") = vector(), - py::arg("validation_size") = 0.0, - py::arg("batch_size") = 1.0 - ) - // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2. - .def(py::init([](const Ref& X, - const Ref& y, - const vector& feature_names=vector(), - const float validation_size=0.0, - const float batch_size=1.0){ - return br::Data::Dataset( - X, y, feature_names, {}, false, validation_size, batch_size); - }), - py::arg("X"), - py::arg("y"), - py::arg("feature_names") = vector(), - py::arg("validation_size") = 0.0, - py::arg("batch_size") = 1.0 - ) - // construct from X, feature names, but copying the feature types from a - // reference dataset with constructor 4. Useful for predicting (specially - // because the user can provide a single element matrix, or an array with - // no feature names). - .def(py::init([](const Ref& X, - const br::Data::Dataset& ref_dataset, - const vector& feature_names){ - return br::Data::Dataset(X, ref_dataset, feature_names); - }), - py::arg("X"), - py::arg("ref_dataset"), - py::arg("feature_names") - ) - - .def_readwrite("y", &br::Data::Dataset::y) - // .def_readwrite("features", &br::Data::Dataset::features) - .def("get_n_samples", &br::Data::Dataset::get_n_samples) - .def("get_n_features", &br::Data::Dataset::get_n_features) - .def("print", &br::Data::Dataset::print) - .def("get_batch", &br::Data::Dataset::get_batch) - .def("get_training_data", &br::Data::Dataset::get_training_data) - .def("get_validation_data", &br::Data::Dataset::get_validation_data) - .def("get_batch_size", &br::Data::Dataset::get_batch_size) - .def("set_batch_size", &br::Data::Dataset::set_batch_size) - .def("split", &br::Data::Dataset::split) - .def("get_X", &br::Data::Dataset::get_X) - ; - - m.def("read_csv", &br::Data::read_csv, py::arg("path"), py::arg("target"), py::arg("sep")=','); -} \ No newline at end of file diff --git a/src/bindings/bind_individuals.cpp b/src/bindings/bind_individuals.cpp index e69de29b..0275ca93 100644 --- a/src/bindings/bind_individuals.cpp +++ b/src/bindings/bind_individuals.cpp @@ -0,0 +1 @@ +#include "module.h" diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h new file mode 100644 index 00000000..e69de29b diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp deleted file mode 100644 index 75521ab3..00000000 --- a/src/bindings/bind_params.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "module.h" -#include "../params.h" -#include "../util/rnd.h" - -namespace br = Brush; - -void bind_params(py::module& m) -{ - // py::object params = Brush::PARAMS; - // m.attr("PARAMS") = params; - - // py::class_(m, "Params", py::dynamic_attr()) - // .def(py::init<>()) - - m.def("set_params", &br::set_params); - m.def("get_params", &br::get_params); - m.def("set_random_state", [](unsigned int seed) - { br::Util::r = *br::Util::Rnd::initRand(); - br::Util::r.set_seed(seed); }); - m.def("rnd_flt", [](){ return br::Util::r.rnd_flt(); }); -} \ No newline at end of file diff --git a/src/bindings/bind_search_space.cpp b/src/bindings/bind_search_space.cpp deleted file mode 100644 index 4f43449a..00000000 --- a/src/bindings/bind_search_space.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "module.h" -#include "../search_space.h" -#include "../program/program.h" -namespace py = pybind11; -namespace br = Brush; -namespace nl = nlohmann; - -using stream_redirect = py::call_guard; - -void bind_search_space(py::module &m) -{ - // Notice: We change the interface for SearchSpace a little bit by - // constructing it with a Dataset object, rather than initializing it as an - // empty struct and then calling init() with the Dataset object. - py::class_(m, "SearchSpace") - .def(py::init([](br::Data::Dataset data, bool weights_init=true){ - SearchSpace SS; - SS.init(data, {}, weights_init); - return SS; - }), - py::arg("data"), - py::arg("weights_init") = true - ) - .def(py::init&, - bool>(), - py::arg("data"), - py::arg("user_ops"), - py::arg("weights_init") = true - ) - .def("make_regressor", &br::SearchSpace::make_regressor) - .def("make_classifier", &br::SearchSpace::make_classifier) - .def("make_multiclass_classifier", &br::SearchSpace::make_multiclass_classifier) - .def("make_representer", &br::SearchSpace::make_representer) - .def("print", - &br::SearchSpace::print, - stream_redirect() - ) - ; -} \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 1da4692a..86694de4 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -15,10 +15,8 @@ license: GNU/GPL v3 namespace py = pybind11; // forward declarations -void bind_dataset(py::module &); -void bind_search_space(py::module &); void bind_programs(py::module &); -void bind_params(py::module &); +void bind_individuals(py::module &); void bind_cbrush(py::module &); PYBIND11_MODULE(_brush, m) { @@ -28,12 +26,8 @@ PYBIND11_MODULE(_brush, m) { #else m.attr("__version__") = "dev"; #endif - - bind_params(m); - bind_dataset(m); - bind_search_space(m); bind_cbrush(m); py::module_ m2 = m.def_submodule("program", "Contains Program classes."); bind_programs(m2); - + bind_individuals(m2); } diff --git a/src/bindings/module.h b/src/bindings/module.h index 2cc2a7ab..53c0a4a3 100644 --- a/src/bindings/module.h +++ b/src/bindings/module.h @@ -6,10 +6,12 @@ copyright 2021 William La Cava authors: William La Cava and Joseph D. Romano license: GNU/GPL v3 */ + #include #include #include #include + // json support #include "pybind11_json/pybind11_json.hpp" #include "nlohmann/json.hpp" \ No newline at end of file diff --git a/src/cbrush.cpp b/src/cbrush.cpp index fafed3e4..539b49e1 100644 --- a/src/cbrush.cpp +++ b/src/cbrush.cpp @@ -77,7 +77,7 @@ void CBrush::run_generation(unsigned int g, Dataset &data) { // https://taskflow.github.io/taskflow/ParallelIterations.html tf::Executor executor; - tf::Taskflow taskflow; + tf::Taskflow taskflow; // TODO: how to set number of threads? // TODO: implement custom behavior for first generation (specially regarding evaluator) params.current_gen = g; @@ -103,7 +103,7 @@ void CBrush::run_generation(unsigned int g, Dataset &data) }); vector survivors(pop.size()); - pop.prep_offspring_slots(); + pop.add_offspring_indexes(); taskflow.for_each_index(0, pop.n_islands, 1, [&](int island) { tuple island_range = pop.get_island_range(island); diff --git a/src/cbrush.h b/src/cbrush.h index 8a842bf8..2cf104ba 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -134,45 +134,20 @@ class CBrush{ Parameters params; ///< hyperparameters of brush SearchSpace ss; - // TODO: make other classes like selection (no template), or make selection like other classes? Population pop; ///< population of programs - Selection selector; ///< selection algorithm - Evaluation evaluator; ///< evaluation code - Variation variator; ///< variation operators - Selection survivor; ///< survival algorithm + Selection selector; ///< selection algorithm + Evaluation evaluator; ///< evaluation code + Variation variator; ///< variation operators + Selection survivor; ///< survival algorithm // TODO: MISSING CLASSES: timer, archive, logger - - // TODO - // results so far float best_loss; int best_complexity; Individual best_ind; - // update best // calculate/print stats }; -int main(){ - - tf::Executor executor; - tf::Taskflow taskflow; - - auto [A, B, C, D] = taskflow.emplace( // create four tasks - [] () { std::cout << "TaskA\n"; }, - [] () { std::cout << "TaskB\n"; }, - [] () { std::cout << "TaskC\n"; }, - [] () { std::cout << "TaskD\n"; } - ); - - A.precede(B, C); // A runs before B and C - D.succeed(B, C); // D runs after B and C - - executor.run(taskflow).wait(); - - return 0; -} - } // Brush #endif diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index e43e9ee0..ebed9669 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -43,6 +43,7 @@ void Evaluation::validation(Population& pop, VectorXf y_pred = ind.program.predict(data.get_validation_data()); assign_fit(ind, y_pred, data, params, true); } + ind.set_obj(params.objectives); } } diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 99b6d729..743deca7 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -27,7 +27,6 @@ class Evaluation { Evaluation(string scorer="mse"): S(scorer) { this->S.set_scorer(scorer); }; ~Evaluation(){}; - // TODO: IMPLEMENT THIS /// validation of population. void validation(Population& pop, int island, diff --git a/src/individual.cpp b/src/individual.cpp index becfa42c..39a94bb4 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -36,7 +36,6 @@ void Individual::set_obj(const vector& objectives) for (const auto& n : objectives) { - // TODO: implement other objectives? if (n.compare("fitness")==0) obj.push_back(fitness); // fitness on training data, not validation. // if you use batch, this value will change every generation diff --git a/src/params.h b/src/params.h index 5d2d9534..68c476cf 100644 --- a/src/params.h +++ b/src/params.h @@ -22,14 +22,13 @@ struct Parameters //int verbosity = 0; // TODO: implement log and verbosity // TODO: every parameter should have a default value - // TODO: python wrapper should have getters and setters for all this stuff // Evolutionary stuff string mode="regression"; unsigned int current_gen = 1; int pop_size = 100; - int gens = 100; + int gens = 1000; unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size unsigned int max_size = 50; vector objectives{"fitness","complexity"}; // error should be generic and deducted based on mode diff --git a/src/population.cpp b/src/population.cpp index d0f3446d..a9e25903 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -49,12 +49,12 @@ void Population::init(SearchSpace& ss, const Parameters& params) } /// update individual vector size and island indexes -template // TODO: rename to include_offspring_indexes or something like this -void Population::prep_offspring_slots(int island) +template +void Population::add_offspring_indexes(int island) { // reading and writing is thread-safe, as long as there's no overlap on island ranges. // manipulating a vector IS NOT thread-safe (inserting and erasing elements). - // So, prep_offspring_slots and update should be the synchronization points, not + // So, add_offspring_indexes and update should be the synchronization points, not // operations performed concurrently size_t p = pop_size; // population size. prep_offspring slots will douple the population, adding the new expressions into the islands @@ -131,7 +131,7 @@ string Population::print_models(bool just_offspring, string sep) } template -vector> Population::sorted_front(unsigned rank) +vector> Population::sorted_front(unsigned rank, bool ignore_offspring) { // this is used to migration and update archive at the end of a generation. expect islands without offspring @@ -144,12 +144,17 @@ vector> Population::sorted_front(unsigned rank) auto idxs = island_indexes.at(j); vector pf; - for (unsigned int i : idxs) + auto end = idxs.size(); + if (ignore_offspring) + end = end/2; + + for (int i=0; irank == rank) + if (individuals.at(idxs.at(i))->rank == rank) pf.push_back(i); } + std::sort(pf.begin(),pf.end(),SortComplexity(*this)); auto it = std::unique(pf.begin(),pf.end(),SameFitComplexity(*this)); @@ -161,12 +166,17 @@ vector> Population::sorted_front(unsigned rank) } template -vector Population::hall_of_fame(unsigned rank) +vector Population::hall_of_fame(unsigned rank, bool ignore_offspring) { // this is used to migration and update archive at the end of a generation. expect islands without offspring vector pf(0); - for (unsigned int i =0; irank == rank) pf.push_back(i); @@ -187,8 +197,9 @@ void Population::migrate() if (n_islands==1) return; - auto island_fronts = sorted_front(); - auto global_hall_of_fame = hall_of_fame(); + // we cant use more than half of population here + auto island_fronts = sorted_front(1, true); + auto global_hall_of_fame = hall_of_fame(1, true); // This is not thread safe (as it is now) for (int island=0; island get_island_indexes(int island){ return island_indexes.at(island); }; /// update individual vector size, distributing the expressions in n_islands - // TODO: add elements to the end - void prep_offspring_slots(int island); + void add_offspring_indexes(int island); /// reduce programs to the indices in survivors. Not thread safe,as it removes elements void update(vector> survivors); @@ -55,14 +56,14 @@ class Population{ string print_models(bool just_offspring=false, string sep="\n"); /// return complexity-sorted Pareto front indices for each island - vector> sorted_front(unsigned rank=1); + vector> sorted_front(unsigned rank=1, bool ignore_offspring=false); // pareto front ignoring island divisions - vector hall_of_fame(unsigned rank=1); + vector hall_of_fame(unsigned rank=1, bool ignore_offspring=false); // perform a migration in the population. Individuals from sorted front or hall of fame will replace others by the // probability set in parameters. Expects a population without offspring - void migrate(); // TODO: change just the indexes + void migrate(); /// Sort each island in increasing complexity. This is not thread safe. I should set complexities of the whole population before calling it, and use get_complexity instead struct SortComplexity diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 1c59cf8e..19ba0b99 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -22,7 +22,6 @@ size_t NSGA2::tournament(Population& pop, size_t i, size_t j) const const Individual& ind1 = pop[i]; const Individual& ind2 = pop[j]; - // TODO: implement this int flag = ind1.check_dominance(ind2); if (flag == 1) // ind1 dominates ind2 @@ -48,8 +47,8 @@ vector NSGA2::select(Population& pop, int island, return island_pool; // setting the objectives - for (unsigned int i=0; iset_obj(params.objectives); + // for (unsigned int i=0; iset_obj(params.objectives); vector selected(0); for (int i = 0; i < island_pool.size(); ++i) // selecting based on island_pool size @@ -77,25 +76,21 @@ vector NSGA2::survive(Population& pop, int island, // set objectives (this is when the obj vector is updated.) - fmt::print("-- first loop\n"); - for (unsigned int i=0; iset_obj(params.objectives); + // for loop below (originally performed in selection in FEAT) was moved to evaluation --- multiple islands may have the same individual + // for (unsigned int i=0; iset_obj(params.objectives); // fast non-dominated sort - fmt::print("-- fast nds\n"); auto front = fast_nds(pop, island_pool); - fmt::print("-- while loop\n"); // Push back selected individuals until full vector selected(0); int i = 0; while ( selected.size() + front.at(i).size() < original_size ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) { - fmt::print("-- crawd dist\n"); std::vector& Fi = front.at(i); // indices in front i crowding_distance(pop, front, i); // calculate crowding in Fi - fmt::print("-- select loop\n"); for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi selected.push_back(Fi.at(j)); diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 58c3c51f..966402db 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -56,6 +56,7 @@ class NSGA2 : public SelectionOperator // TODO: Improve operator[], and decrease use of pop.individuals.at(). Also, decrease number of auto declarations auto ind1 = pop.individuals[i]; auto ind2 = pop.individuals[j]; + if (ind1->rank < ind2->rank) return true; else if (ind1->rank == ind2->rank && diff --git a/src/selection/selection.h b/src/selection/selection.h index 1e19392f..6b94cec3 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -65,7 +65,6 @@ struct Selection string type; bool survival; - //TODO: rewrite it as initializing parameters Selection() { this->type = "nsga2"; diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 819bac09..64e897ad 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -69,11 +69,11 @@ TEST(Population, PopulationTests) // island sizes increases and comes back to the same values after update fmt::print("Performing all steps of an evolution (sequential, not parallel)\n"); - for (int i=0; i<10; ++i) // update and prep offspring slots works properly + for (int i=0; i<100; ++i) // update and prep offspring slots works properly { vector> survivors(pop.n_islands); - fmt::print("Evolution step {}\n", i); + fmt::print("Fitting individuals\n"); // this must be done in one thread (or implement mutex), because we can have multiple islands pointing to same individuals for (int j=0; j parents = selector.select(pop, j, params, data); ASSERT_TRUE(parents.size() > 0); fmt::print("Preparing offspring\n"); - pop.prep_offspring_slots(j); + pop.add_offspring_indexes(j); // variation applied to population fmt::print("Variations for island {}\n", j); variator.vary(pop, j, parents); - fmt::print("fitting {}\n", j); + fmt::print("fitting {}\n", j); // at this step, we know that theres only one pointer to each individual being fitted, so we can perform it in parallel evaluator.fitness(pop, j, data, params, true, true); fmt::print("survivors\n", j); @@ -106,7 +111,6 @@ TEST(Population, PopulationTests) pop.update(survivors); pop.migrate(); - // TODO: print islands fmt::print("Printing generation {} population:\n", i); for (int i=0; i= 2, \ - "every class should have its own column (even for binary clf)" +# y_prob = est.predict_proba(X) +# assert len(y_prob.shape) == 2, "predict_proba should be 2-dimensional" +# assert y_prob.shape[1] >= 2, \ +# "every class should have its own column (even for binary clf)" -@pytest.mark.parametrize('setup,fixed_node', [ - ('classification_setup', 'Logistic'), - # ('multiclass_classification_setup', 'Softmax') - ]) -def test_fixed_nodes(setup, fixed_node, brush_args, request): - # Classification has a fixed root that should not change after mutation or crossover - - Estimator, X, y = request.getfixturevalue(setup) - - est = Estimator(**brush_args) - est.fit(X, y) # Calling fit to make it create the setup toolbox and variation functions - - for i in range(10): - # Initial population - pop = est.toolbox_.population(n=100) - pop_models = [] - for p in pop: - pop_models.append(p.prg.get_model()) - assert p.prg.get_model().startswith(fixed_node), \ - (f"An individual for {setup} was criated without {fixed_node} " + - f"node on root. Model was {p.ind.get_model()}") - - # Clones - clones = [est.toolbox_.Clone(p) for p in pop] - for c in clones: - assert c.prg.get_model().startswith(fixed_node), \ - (f"An individual for {setup} was cloned without {fixed_node} " + - f"node on root. Model was {c.ind.get_model()}") - - # Mutation - xmen = [est.toolbox_.mutate(c) for c in clones] - xmen = [x for x in xmen if x is not None] - assert len(xmen) > 0, "Mutation didn't worked for any individual" - for x in xmen: - assert x.prg.get_model().startswith(fixed_node), \ - (f"An individual for {setup} was mutated without {fixed_node} " + - f"node on root. Model was {x.ind.get_model()}") +# @pytest.mark.parametrize('setup,fixed_node', [ +# ('classification_setup', 'Logistic'), +# # ('multiclass_classification_setup', 'Softmax') +# ]) +# def test_fixed_nodes(setup, fixed_node, brush_args, request): +# # Classification has a fixed root that should not change after mutation or crossover + +# Estimator, X, y = request.getfixturevalue(setup) + +# est = Estimator(**brush_args) +# est.fit(X, y) # Calling fit to make it create the setup toolbox and variation functions + +# for i in range(10): +# # Initial population +# pop = est.toolbox_.population(n=100) +# pop_models = [] +# for p in pop: +# pop_models.append(p.prg.get_model()) +# assert p.prg.get_model().startswith(fixed_node), \ +# (f"An individual for {setup} was criated without {fixed_node} " + +# f"node on root. Model was {p.ind.get_model()}") + +# # Clones +# clones = [est.toolbox_.Clone(p) for p in pop] +# for c in clones: +# assert c.prg.get_model().startswith(fixed_node), \ +# (f"An individual for {setup} was cloned without {fixed_node} " + +# f"node on root. Model was {c.ind.get_model()}") + +# # Mutation +# xmen = [est.toolbox_.mutate(c) for c in clones] +# xmen = [x for x in xmen if x is not None] +# assert len(xmen) > 0, "Mutation didn't worked for any individual" +# for x in xmen: +# assert x.prg.get_model().startswith(fixed_node), \ +# (f"An individual for {setup} was mutated without {fixed_node} " + +# f"node on root. Model was {x.ind.get_model()}") - # Crossover - cxmen = [] - [cxmen.extend(est.toolbox_.mate(c1, c2)) - for (c1, c2) in zip(clones[::2], clones[1::2])] - cxmen = [x for x in cxmen if x is not None] - assert len(cxmen) > 0, "Crossover didn't worked for any individual" - for cx in cxmen: - assert cx.prg.get_model().startswith(fixed_node), \ - (f"An individual for {setup} was crossovered without {fixed_node} " + - f"node on root. Model was {cx.ind.get_model()}") +# # Crossover +# cxmen = [] +# [cxmen.extend(est.toolbox_.mate(c1, c2)) +# for (c1, c2) in zip(clones[::2], clones[1::2])] +# cxmen = [x for x in cxmen if x is not None] +# assert len(cxmen) > 0, "Crossover didn't worked for any individual" +# for cx in cxmen: +# assert cx.prg.get_model().startswith(fixed_node), \ +# (f"An individual for {setup} was crossovered without {fixed_node} " + +# f"node on root. Model was {cx.ind.get_model()}") - # Originals still the same - for p, p_original_model in zip(pop, pop_models): - assert p.prg.get_model() == p_original_model, \ - "Variation operator changed the original model." +# # Originals still the same +# for p, p_original_model in zip(pop, pop_models): +# assert p.prg.get_model() == p_original_model, \ +# "Variation operator changed the original model." -# def test_random_state(): # TODO: make it work -# test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) -# test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], -# [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T +# # def test_random_state(): # TODO: make it work +# # test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) +# # test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], +# # [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T -# est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) -# est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) +# # est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) +# # est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) -# assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \ -# "random state failed to generate same results" \ No newline at end of file +# # assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \ +# # "random state failed to generate same results" \ No newline at end of file diff --git a/tests/python/test_optimization.py b/tests/python/test_optimization.py index 06cb0339..711b07db 100644 --- a/tests/python/test_optimization.py +++ b/tests/python/test_optimization.py @@ -1,305 +1,305 @@ -#!/usr/bin/env python3 - -import brush -import pytest -import numpy as np -import pandas as pd -from pmlb import fetch_data -from sklearn.utils import resample - -import _brush -import json - -import traceback -import logging - -@pytest.fixture -def optimize_addition_positive_weights(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Add", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": True } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_addition_negative_weights(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Add", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": True } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_subtraction_positive_weights(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Sub", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": True } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_subtraction_negative_weights(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Sub", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": True } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_multiply(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Mul", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(np.prod(learned_weights), 2.0*3.0, atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_divide(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_divide_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Div", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights[0], 2.0/3.0, atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_sqrt_outer_weight(): - data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Sqrt", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": False } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_sqrt_inner_weight(): - data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Sqrt", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [4.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_sin_outer_weight(): - data = _brush.read_csv("docs/examples/datasets/d_2_sin_x1.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Sin", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": False } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_sin_inner_weight(): - data = _brush.read_csv("docs/examples/datasets/d_sin_0_25x1.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Sin", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [0.25], atol=1e-2) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_notable_product_weight(): - data = _brush.read_csv("docs/examples/datasets/d_square_x1_plus_2_x1_x2_plus_square_x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Add", "is_weighted": False }, - { "node_type":"Mul", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False }, - { "node_type":"Add", "is_weighted": False }, - { "node_type":"Square", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Square", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False } - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 1.0], atol=1e-2) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_3ary_prod_inner_weight(): - data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Prod", "is_weighted": False, - "arg_types" :["ArrayF", "ArrayF", "ArrayF"], - "ret_type" :"ArrayF", - "sig_hash" :5617655905677279916, - "sig_dual_hash":10188582206427064428, - "complete_hash":1786662244046809282 }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, - { "node_type":"Terminal", "feature":"x3", "is_weighted": False} - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_3ary_prod_outer_weight(): - data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Prod", "is_weighted": True, - "arg_types" :["ArrayF", "ArrayF", "ArrayF"], - "ret_type" :"ArrayF", - "sig_hash" :5617655905677279916, - "sig_dual_hash":10188582206427064428, - "complete_hash":1786662244046809282 }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, - { "node_type":"Terminal", "feature":"x3", "is_weighted": False} - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) - - return (data, json_program, weight_check) - -@pytest.fixture -def optimize_constant_weight(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") - - json_program = { - "Tree": [ - { "node_type":"Mul", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, - { "node_type":"Mul", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, - { "node_type":"Constant", "feature":"C", "is_weighted": True }, - ], - "is_fitted_":False - } - - weight_check = lambda learned_weights: np.allclose(learned_weights, [6.0], atol=1e-2) - - return (data, json_program, weight_check) - -@pytest.mark.parametrize( - 'optimization_problem', ['optimize_addition_positive_weights', - 'optimize_addition_negative_weights', - 'optimize_subtraction_positive_weights', - 'optimize_subtraction_negative_weights', - 'optimize_multiply', - 'optimize_divide', - 'optimize_sqrt_outer_weight', - 'optimize_sqrt_inner_weight', - 'optimize_sin_outer_weight', - 'optimize_sin_inner_weight', - 'optimize_notable_product_weight', - 'optimize_3ary_prod_inner_weight', - 'optimize_3ary_prod_outer_weight', - 'optimize_constant_weight' - ]) -def test_optimizer(optimization_problem, request): - - data, json_program, weight_check = request.getfixturevalue(optimization_problem) - - print( "initial json: {}\n", json_program) - prg = _brush.program.Regressor(json_program) - print( "program:", prg.get_model()) - - # fit model - print( "fit") - prg.fit(data) - print( "predict") - y_pred = prg.predict(data) - - learned_weights = prg.get_weights(); - print('learned weights:', learned_weights) - - assert np.sum(np.square(data.y-y_pred)) <= 1e-3 - assert np.allclose(data.y, y_pred, atol=1e-3) - assert weight_check(learned_weights) \ No newline at end of file +# #!/usr/bin/env python3 + +# import brush +# import pytest +# import numpy as np +# import pandas as pd +# from pmlb import fetch_data +# from sklearn.utils import resample + +# import _brush +# import json + +# import traceback +# import logging + +# @pytest.fixture +# def optimize_addition_positive_weights(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Add", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_addition_negative_weights(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Add", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_subtraction_positive_weights(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Sub", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_subtraction_negative_weights(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Sub", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_multiply(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Mul", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(np.prod(learned_weights), 2.0*3.0, atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_divide(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_divide_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Div", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights[0], 2.0/3.0, atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_sqrt_outer_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Sqrt", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": False } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_sqrt_inner_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Sqrt", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [4.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_sin_outer_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_2_sin_x1.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Sin", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": False } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_sin_inner_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_sin_0_25x1.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Sin", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [0.25], atol=1e-2) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_notable_product_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_square_x1_plus_2_x1_x2_plus_square_x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Add", "is_weighted": False }, +# { "node_type":"Mul", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False }, +# { "node_type":"Add", "is_weighted": False }, +# { "node_type":"Square", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Square", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False } +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 1.0], atol=1e-2) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_3ary_prod_inner_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Prod", "is_weighted": False, +# "arg_types" :["ArrayF", "ArrayF", "ArrayF"], +# "ret_type" :"ArrayF", +# "sig_hash" :5617655905677279916, +# "sig_dual_hash":10188582206427064428, +# "complete_hash":1786662244046809282 }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, +# { "node_type":"Terminal", "feature":"x3", "is_weighted": False} +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_3ary_prod_outer_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Prod", "is_weighted": True, +# "arg_types" :["ArrayF", "ArrayF", "ArrayF"], +# "ret_type" :"ArrayF", +# "sig_hash" :5617655905677279916, +# "sig_dual_hash":10188582206427064428, +# "complete_hash":1786662244046809282 }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, +# { "node_type":"Terminal", "feature":"x3", "is_weighted": False} +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) + +# return (data, json_program, weight_check) + +# @pytest.fixture +# def optimize_constant_weight(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") + +# json_program = { +# "Tree": [ +# { "node_type":"Mul", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, +# { "node_type":"Mul", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, +# { "node_type":"Constant", "feature":"C", "is_weighted": True }, +# ], +# "is_fitted_":False +# } + +# weight_check = lambda learned_weights: np.allclose(learned_weights, [6.0], atol=1e-2) + +# return (data, json_program, weight_check) + +# @pytest.mark.parametrize( +# 'optimization_problem', ['optimize_addition_positive_weights', +# 'optimize_addition_negative_weights', +# 'optimize_subtraction_positive_weights', +# 'optimize_subtraction_negative_weights', +# 'optimize_multiply', +# 'optimize_divide', +# 'optimize_sqrt_outer_weight', +# 'optimize_sqrt_inner_weight', +# 'optimize_sin_outer_weight', +# 'optimize_sin_inner_weight', +# 'optimize_notable_product_weight', +# 'optimize_3ary_prod_inner_weight', +# 'optimize_3ary_prod_outer_weight', +# 'optimize_constant_weight' +# ]) +# def test_optimizer(optimization_problem, request): + +# data, json_program, weight_check = request.getfixturevalue(optimization_problem) + +# print( "initial json: {}\n", json_program) +# prg = _brush.program.Regressor(json_program) +# print( "program:", prg.get_model()) + +# # fit model +# print( "fit") +# prg.fit(data) +# print( "predict") +# y_pred = prg.predict(data) + +# learned_weights = prg.get_weights(); +# print('learned weights:', learned_weights) + +# assert np.sum(np.square(data.y-y_pred)) <= 1e-3 +# assert np.allclose(data.y, y_pred, atol=1e-3) +# assert weight_check(learned_weights) \ No newline at end of file diff --git a/tests/python/test_params.py b/tests/python/test_params.py index 03d08bc4..26c8e3f2 100644 --- a/tests/python/test_params.py +++ b/tests/python/test_params.py @@ -6,90 +6,90 @@ import numpy as np -def test_param_random_state(): - # Check if make_regressor, mutation and crossover will create the same expressions - test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) - test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], - [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T +# def test_param_random_state(): +# # Check if make_regressor, mutation and crossover will create the same expressions +# test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) +# test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], +# [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T - data = _brush.Dataset(test_X, test_y) - SS = _brush.SearchSpace(data) +# data = _brush.Dataset(test_X, test_y) +# SS = _brush.SearchSpace(data) - _brush.set_random_state(123) +# _brush.set_random_state(123) - first_run = [] - for d in range(1,4): - for s in range(1,20): - prg = SS.make_regressor(d, s) - prg = prg.mutate() +# first_run = [] +# for d in range(1,4): +# for s in range(1,20): +# prg = SS.make_regressor(d, s) +# prg = prg.mutate() - if prg != None: prg = prg.cross(prg) - if prg != None: first_run.append(prg.get_model()) +# if prg != None: prg = prg.cross(prg) +# if prg != None: first_run.append(prg.get_model()) - assert len(first_run) > 0, "either mutation or crossover is always failing" +# assert len(first_run) > 0, "either mutation or crossover is always failing" - _brush.set_random_state(123) +# _brush.set_random_state(123) - second_run = [] - for d in range(1,4): - for s in range(1,20): - prg = SS.make_regressor(d, s) - prg = prg.mutate() +# second_run = [] +# for d in range(1,4): +# for s in range(1,20): +# prg = SS.make_regressor(d, s) +# prg = prg.mutate() - if prg != None: prg = prg.cross(prg) - if prg != None: second_run.append(prg.get_model()) +# if prg != None: prg = prg.cross(prg) +# if prg != None: second_run.append(prg.get_model()) - assert len(second_run) > 0, "either mutation or crossover is always failing" - - for fr, sr in zip(first_run, second_run): - assert fr==sr, "random state failed to generate same expressions" - - -def _change_and_wait(config): - "Will change the mutation weights to set only the `index` to 1, then wait " - "`seconts` to retrieve the _brush PARAMS and print weight values" - index, seconds = config - - # Sample configuration - params = { - 'verbosity': False, - 'pop_size' : 100, - 'max_gen' : 100, - 'max_depth': 5, - 'max_size' : 50, - 'mutation_options': {'point' : 0.0, - 'insert' : 0.0, - 'delete' : 0.0, - 'subtree' : 0.0, - 'toggle_weight_on' : 0.0, - 'toggle_weight_off': 0.0} - } - - # We need to guarantee order to use the index correctly - mutations = ['point', 'insert', 'delete', 'subtree', 'toggle_weight_on', 'toggle_weight_off'] - - for i, m in enumerate(mutations): - params['mutation_options'][m] = 0 if i != index else 1.0 - - print(f"(Thread id {index}{seconds}) Setting mutation {mutations[index]} to 1 and wait {seconds} seconds") - - _brush.set_params(params) - time.sleep(seconds) +# assert len(second_run) > 0, "either mutation or crossover is always failing" + +# for fr, sr in zip(first_run, second_run): +# assert fr==sr, "random state failed to generate same expressions" + + +# def _change_and_wait(config): +# "Will change the mutation weights to set only the `index` to 1, then wait " +# "`seconts` to retrieve the _brush PARAMS and print weight values" +# index, seconds = config + +# # Sample configuration +# params = { +# 'verbosity': False, +# 'pop_size' : 100, +# 'max_gen' : 100, +# 'max_depth': 5, +# 'max_size' : 50, +# 'mutation_options': {'point' : 0.0, +# 'insert' : 0.0, +# 'delete' : 0.0, +# 'subtree' : 0.0, +# 'toggle_weight_on' : 0.0, +# 'toggle_weight_off': 0.0} +# } + +# # We need to guarantee order to use the index correctly +# mutations = ['point', 'insert', 'delete', 'subtree', 'toggle_weight_on', 'toggle_weight_off'] + +# for i, m in enumerate(mutations): +# params['mutation_options'][m] = 0 if i != index else 1.0 + +# print(f"(Thread id {index}{seconds}) Setting mutation {mutations[index]} to 1 and wait {seconds} seconds") + +# _brush.set_params(params) +# time.sleep(seconds) - print(f"(Thread id {index}{seconds}) Retrieving PARAMS: {_brush.get_params()['mutation_options']}") +# print(f"(Thread id {index}{seconds}) Retrieving PARAMS: {_brush.get_params()['mutation_options']}") - assert params['mutation_options']==_brush.get_params()['mutation_options'], \ - f"(Thread id {index}{seconds}) BRUSH FAILED TO KEEP SEPARATE INSTANCES OF `PARAMS` BETWEEN MULTIPLE THREADS" +# assert params['mutation_options']==_brush.get_params()['mutation_options'], \ +# f"(Thread id {index}{seconds}) BRUSH FAILED TO KEEP SEPARATE INSTANCES OF `PARAMS` BETWEEN MULTIPLE THREADS" -def test_global_PARAMS_sharing(): - print("By default, all threads starts with all mutations having weight zero.") +# def test_global_PARAMS_sharing(): +# print("By default, all threads starts with all mutations having weight zero.") - scale = 0.25 # Scale the time of each thread (for human manual checking) - - # Checking if brush's PARAMS can be modified inside a pool without colateral effects. - # Each configuration will start in the same order as they are listed, but they - # will finish in different times. They are all modifying the brush's PARAMS. - Pool(processes=3).map(_change_and_wait, [(0, 3*scale), - (1, 1*scale), - (2, 2*scale)]) +# scale = 0.25 # Scale the time of each thread (for human manual checking) + +# # Checking if brush's PARAMS can be modified inside a pool without colateral effects. +# # Each configuration will start in the same order as they are listed, but they +# # will finish in different times. They are all modifying the brush's PARAMS. +# Pool(processes=3).map(_change_and_wait, [(0, 3*scale), +# (1, 1*scale), +# (2, 2*scale)]) \ No newline at end of file diff --git a/tests/python/test_program.py b/tests/python/test_program.py index 78356bee..70aa067e 100644 --- a/tests/python/test_program.py +++ b/tests/python/test_program.py @@ -32,60 +32,60 @@ def test_make_program(test_data): prg = SS.make_regressor(d, s) print(f"Tree model for depth {d}, size {s}:", prg.get_model()) -def test_fit_regressor(test_data): - test_X, test_y = test_data - data = _brush.Dataset(test_X, test_y) - SS = _brush.SearchSpace(data) - # pytest.set_trace() - for d in range(1,4): - for s in range(1,20): - prg = SS.make_regressor(d, s) - print(f"Tree model for depth {d}, size {s}:", prg.get_model()) - # prg.fit(data) - y = prg.fit(data).predict(data) - print(y) +# def test_fit_regressor(test_data): +# test_X, test_y = test_data +# data = _brush.Dataset(test_X, test_y) +# SS = _brush.SearchSpace(data) +# # pytest.set_trace() +# for d in range(1,4): +# for s in range(1,20): +# prg = SS.make_regressor(d, s) +# print(f"Tree model for depth {d}, size {s}:", prg.get_model()) +# # prg.fit(data) +# y = prg.fit(data).predict(data) +# print(y) -def test_fit_classifier(): - df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') - data = _brush.Dataset(df.drop(columns='target'), df['target']) - SS = _brush.SearchSpace(data) - # pytest.set_trace() - for d in range(1,4): - for s in range(1,20): - prg = SS.make_classifier(d, s) - print(f"Tree model for depth {d}, size {s}:", prg.get_model()) - print(f"fitting {prg.get_model()}") - # prg.fit(data) - y = prg.fit(data).predict(data) - print(y) +# def test_fit_classifier(): +# df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') +# data = _brush.Dataset(df.drop(columns='target'), df['target']) +# SS = _brush.SearchSpace(data) +# # pytest.set_trace() +# for d in range(1,4): +# for s in range(1,20): +# prg = SS.make_classifier(d, s) +# print(f"Tree model for depth {d}, size {s}:", prg.get_model()) +# print(f"fitting {prg.get_model()}") +# # prg.fit(data) +# y = prg.fit(data).predict(data) +# print(y) -def test_json_regressor(): - data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") - json_program = { - "Tree": [ - { "node_type":"Add", "is_weighted": False }, - { "node_type":"Terminal", "feature":"x1", "is_weighted": True}, - { "node_type":"Terminal", "feature":"x2", "is_weighted": True} - ], - "is_fitted_":False - } - print( "initial json: {}\n", json_program) - prg = _brush.program.Regressor(json_program) - print( "program:", prg.get_model()) - # fit model - print( "fit") - prg.fit(data) - print( "predict") - y_pred = prg.predict(data) +# def test_json_regressor(): +# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") +# json_program = { +# "Tree": [ +# { "node_type":"Add", "is_weighted": False }, +# { "node_type":"Terminal", "feature":"x1", "is_weighted": True}, +# { "node_type":"Terminal", "feature":"x2", "is_weighted": True} +# ], +# "is_fitted_":False +# } +# print( "initial json: {}\n", json_program) +# prg = _brush.program.Regressor(json_program) +# print( "program:", prg.get_model()) +# # fit model +# print( "fit") +# prg.fit(data) +# print( "predict") +# y_pred = prg.predict(data) - learned_weights = prg.get_weights() - print('learned weights:', learned_weights) +# learned_weights = prg.get_weights() +# print('learned weights:', learned_weights) - true_weights = [2.0, 3.0] +# true_weights = [2.0, 3.0] - assert np.sum(np.abs(data.y-y_pred)) <= 1e-4 - #assert all(round(i,4) == round(j, 4) for i,j in zip(learned_weights, true_weights)) - np.allclose(learned_weights, true_weights, atol=1e-4) +# assert np.sum(np.abs(data.y-y_pred)) <= 1e-4 +# #assert all(round(i,4) == round(j, 4) for i,j in zip(learned_weights, true_weights)) +# np.allclose(learned_weights, true_weights, atol=1e-4) # def test_serialization(): # data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") From 344b93ee586ea5d4c927950e82720b3060cbd7f0 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Dec 2023 21:38:13 -0300 Subject: [PATCH 104/199] Moving python files outside src directory --- brush/__init__.py | 2 - brush/pybrush.py => pybrush/BrushEstimator.py | 7 +- pybrush/DeapEstimator.py | 600 ++++++++++++++++++ pybrush/__init__.py | 8 + brush/versionstr.py => pybrush/_versionstr.py | 0 pybrush/deap_api/__init__.py | 1 + pybrush/deap_api/nsga2.py | 114 ++++ setup.py | 16 +- 8 files changed, 735 insertions(+), 13 deletions(-) delete mode 100644 brush/__init__.py rename brush/pybrush.py => pybrush/BrushEstimator.py (83%) create mode 100644 pybrush/DeapEstimator.py create mode 100644 pybrush/__init__.py rename brush/versionstr.py => pybrush/_versionstr.py (100%) create mode 100644 pybrush/deap_api/__init__.py create mode 100644 pybrush/deap_api/nsga2.py diff --git a/brush/__init__.py b/brush/__init__.py deleted file mode 100644 index 8e705ae4..00000000 --- a/brush/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .estimator import BrushClassifier, BrushRegressor -from _brush import Dataset, SearchSpace \ No newline at end of file diff --git a/brush/pybrush.py b/pybrush/BrushEstimator.py similarity index 83% rename from brush/pybrush.py rename to pybrush/BrushEstimator.py index d0665a1c..4440bc55 100644 --- a/brush/pybrush.py +++ b/pybrush/BrushEstimator.py @@ -1,13 +1,14 @@ -from _brush import CBrush, Dataset, SearchSpace # TODO: stop calling cbrush, rename it +from _brush import Dataset, SearchSpace # TODO: stop calling cbrush, rename it from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin -# TODO? LOGGER AND ARCHIVE +# TODO: LOGGER AND ARCHIVE # TODO: GET DOCUMENTATION BACK class BrushEstimator(BaseEstimator): def __init__(self): - self.cbrush_ = CBrush() + # self.cbrush_ = CBrush() + pass def fit(self, X, y, Z=None): pass diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py new file mode 100644 index 00000000..67d32bbd --- /dev/null +++ b/pybrush/DeapEstimator.py @@ -0,0 +1,600 @@ +""" +sklearn-compatible wrapper for GP analyses. + +See brushgp.cpp for Python (via pybind11) modules that give more fine-grained +control of the underlying GP objects. +""" +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.utils.validation import check_is_fitted +# from sklearn.metrics import mean_squared_error +import numpy as np +import pandas as pd +# import deap as dp +from deap import algorithms, base, creator, tools +# from tqdm import tqdm +from types import NoneType +from sklearn.metrics import average_precision_score +from sklearn.preprocessing import MinMaxScaler +import _brush +from pybrush.deap_api import nsga2, DeapIndividual +# from _brush import Dataset, SearchSpace + + +class DeapEstimator(BaseEstimator): + """ + This is the base class for Deap-based Brush estimators. + This class shouldn't be called directly; instead, call a child class like + :py:class:`DeapRegressor ` or :py:class:`DeapClassifier `. + All of the shared parameters are documented here. + + Parameters + ---------- + mode : str, default 'classification' + The mode of the estimator. Used by subclasses + pop_size : int, default 100 + Population size. + max_gen : int, default 100 + Maximum iterations of the algorithm. + verbosity : int, default 0 + Controls level of printouts. + max_depth : int, default 0 + Maximum depth of GP trees in the GP program. Use 0 for no limit. + max_size : int, default 0 + Maximum number of nodes in a tree. Use 0 for no limit. + n_islands : int, default 5 + Number of independent islands to use in evolutionary framework. + Ignored if `algorithm!="nsga2island"`. + mig_prob : float, default 0.05 + Probability of occuring a migration between two random islands at the + end of a generation, must be between 0 and 1. + cx_prob : float, default 1/7 + Probability of applying the crossover variation when generating the offspring, + must be between 0 and 1. + Given that there are `n` mutations, and either crossover or mutation is + used to generate each individual in the offspring (but not both at the + same time), we want to have by default an uniform probability between + crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and + `1/n` for each mutation, we can achieve an uniform distribution. + mutation_options : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} + A dictionary with keys naming the types of mutation and floating point + values specifying the fraction of total mutations to do with that method. + The probability of having a mutation is `(1-cx_prob)` and, in case the mutation + is applied, then each mutation option is sampled based on the probabilities + defined in `mutation_options`. The set of probabilities should add up to 1.0. + functions: dict[str,float] or list[str], default {} + A dictionary with keys naming the function set and values giving the probability + of sampling them, or a list of functions which will be weighted uniformly. + If empty, all available functions are included in the search space. + initialization : {"uniform", "max_size"}, default "uniform" + Distribution of sizes on the initial population. If `max_size`, then every + expression is created with `max_size` nodes. If `uniform`, size will be + uniformly distributed between 1 and `max_size`. + objectives : list[str], default ["error", "size"] + list with one or more objectives to use. Options are `"error", "size", "complexity"`. + If `"error"` is used, then it will be the mean squared error for regression, + and accuracy for classification. + algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2" + Which Evolutionary Algorithm framework to use to evolve the population. + weights_init : bool, default True + Whether the search space should initialize the sampling weights of terminal nodes + based on the correlation with the output y. If `False`, then all terminal nodes + will have the same probability of 1.0. + validation_size : float, default 0.0 + Percentage of samples to use as a hold-out partition. These samples are used + to calculate statistics during evolution, but not used to train the models. + The `best_estimator_` will be selected using this partition. If zero, then + the same data used for training is used for validation. + batch_size : float, default 1.0 + Percentage of training data to sample every generation. If `1.0`, then + all data is used. Very small values can improve execution time, but + also lead to underfit. + random_state: int or None, default None + If int, then the value is used to seed the c++ random generator; if None, + then a seed will be generated using a non-deterministic generator. It is + important to notice that, even if the random state is fixed, it is + unlikely that running brush using multiple threads will have the same + results. This happens because the Operating System's scheduler is + responsible to choose which thread will run at any given time, thus + reproductibility is not guaranteed. + + Attributes + ---------- + best_estimator_ : _brush.Program + The final model picked from training. Used in subsequent calls to :func:`predict`. + archive_ : list[deap_api.DeapIndividual] + The final population from training. + data_ : _brush.Dataset + The complete data in Brush format. + train_ : _brush.Dataset + Partition of `data_` containing `(1-validation_size)`% of the data, in Brush format. + validation_ : _brush.Dataset + Partition of `data_` containing `(validation_size)`% of the data, in Brush format. + search_space_ : a Brush `SearchSpace` object. + Holds the operators and terminals and sampling utilities to update programs. + toolbox_ : deap.Toolbox + The toolbox used by DEAP for EA algorithm. + """ + + def __init__( + self, + mode='classification', + pop_size=100, + max_gen=100, + verbosity=0, + max_depth=3, + max_size=20, + n_islands=5, + mig_prob=0.05, + cx_prob= 1/7, + mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, + "toggle_weight_on":1/6, "toggle_weight_off":1/6}, + functions: list[str]|dict[str,float] = {}, + initialization="uniform", + algorithm="nsga2", + objectives=["error", "size"], + random_state=None, + weights_init=True, + validation_size: float = 0.0, + batch_size: float = 1.0 + ): + self.pop_size=pop_size + self.max_gen=max_gen + self.verbosity=verbosity + self.algorithm=algorithm + self.mode=mode + self.max_depth=max_depth + self.max_size=max_size + self.n_islands=n_islands + self.mig_prob=mig_prob + self.cx_prob=cx_prob + self.mutation_options=mutation_options + self.functions=functions + self.objectives=objectives + self.initialization=initialization + self.random_state=random_state + self.batch_size=batch_size + self.weights_init=weights_init + self.validation_size=validation_size + + + def _setup_toolbox(self, data_train, data_validation): + """Setup the deap toolbox""" + toolbox: base.Toolbox = base.Toolbox() + + # creator.create is used to "create new functions", and takes at least + # 2 arguments: the name of the newly created class and a base class + + # Cleaning possible previous classes that are model-dependent (clf and reg are differente) + if hasattr(creator, "FitnessMulti"): + del creator.FitnessMulti + if hasattr(creator, "Individual"): + del creator.Individual + + # Minimizing/maximizing problem: negative/positive weight, respectively. + # Our classification is using the error as a metric + # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness + creator.create("FitnessMulti", base.Fitness, weights=self.weights) + + # create Individual class, inheriting from self.Individual with a fitness attribute + creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) + + toolbox.register("Clone", lambda ind: creator.Individual(ind.prg.copy())) + + toolbox.register("mate", self._crossover) + toolbox.register("mutate", self._mutate) + + # When solving multi-objective problems, selection and survival must + # support this feature. This means that these selection operators must + # accept a tuple of fitnesses as argument) + if self.algorithm=="nsga2" or self.algorithm=="nsga2island": + toolbox.register("select", tools.selTournamentDCD) + toolbox.register("survive", tools.selNSGA2) + elif self.algorithm=="ga" or self.algorithm=="gaisland": + toolbox.register("select", tools.selTournament, tournsize=3) + def offspring(pop, MU): return pop[-MU:] + toolbox.register("survive", offspring) + + # toolbox.population will return a list of elements by calling toolbox.individual + toolbox.register("createRandom", self._make_individual) + toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) + + toolbox.register("get_objectives", lambda: self.objectives) + toolbox.register("getBatch", data_train.get_batch) + toolbox.register("evaluate", self._fitness_function, data=data_train) + toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation) + + return toolbox + + + def _crossover(self, ind1, ind2): + offspring = [] + + for i,j in [(ind1,ind2),(ind2,ind1)]: + attempts = 0 + child = None + while (attempts < 3 and child is None): + attempts = attempts + 1 + child = self.variator_.cross(i.prg, j.prg) + + if child is not None: + child = creator.Individual(child) + + offspring.extend([child]) + + # so we always need to have two elements to unpack inside `offspring` + return offspring[0], offspring[1] + + + def _mutate(self, ind1): + # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),) + attempts = 0 + offspring = None + print("starting mutation") + while (attempts < 3 and offspring is None): + print("attempt", attempts) + offspring = self.variator_.mutate(ind1.prg) + print("got offspring") + + if offspring is not None: + print('and it wasnt none') + return creator.Individual(offspring) + attempts = attempts + 1 + + print("i failed") + return None + + + def fit(self, X, y): + """ + Fit an estimator to X,y. + + Parameters + ---------- + X : np.ndarray + 2-d array of input data. + y : np.ndarray + 1-d array of (boolean) target values. + """ + _brush.set_params(self.get_params()) + + if self.random_state is not None: + _brush.set_random_state(self.random_state) + + self.feature_names_ = [] + if isinstance(X, pd.DataFrame): + self.feature_names_ = X.columns.to_list() + + self.data_ = self._make_data(X, y, + feature_names=self.feature_names_, + validation_size=self.validation_size) + + if isinstance(self.functions, list): + self.functions_ = {k:1.0 for k in self.functions} + else: + self.functions_ = self.functions + + # set n classes if relevant + if self.mode=="classification": + self.n_classes_ = len(np.unique(y)) + + # Including necessary functions for classification programs. This + # is needed so the search space can create the hash and mapping of + # the functions. + if self.n_classes_ == 2 and "Logistic" not in self.functions_: + self.functions_["Logistic"] = 1.0 + # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. + # self.functions_["Softmax"] = 1.0 + + # Weight of each objective (+ for maximization, - for minimization) + obj_weight = { + "error" : +1.0 if self.mode=="classification" else -1.0, + "size" : -1.0, + "complexity" : -1.0 + } + self.weights = [obj_weight[w] for w in self.objectives] + + # These have a default behavior to return something meaningfull if + # no values are set + self.train_ = self.data_.get_training_data() + self.train_.set_batch_size(self.batch_size) + self.validation_ = self.data_.get_validation_data() + + self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) + + # TODO: use variation operator here instead of these functions + # TODO: store parameters in Parameter and use it to create the variator, selector, survivor, etc. + self.parameters_ = _brush.Parameters() + if self.mode == "classification": + self.variator_ = _brush.ClassifierVariator(self.parameters_, self.search_space_) + elif self.mode == "regressor": + self.variator_ = _brush.RegressorVariator(self.parameters_, self.search_space_) + else: + raise("Unsupported mode") + + self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) + + # nsga2 and ga differ in the toolbox + self.archive_, self.logbook_ = nsga2( + self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, + (0.0 0: + print(f'best model {self.best_estimator_.get_model()}' + + f' with size {self.best_estimator_.size()}, ' + + f' depth {self.best_estimator_.depth()}, ' + + f' and fitness {self.archive_[final_ind_idx].fitness}') + + return self + + def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): + # This function should not partition data (since it may be used in `predict`). + # partitioning is done by `fit`. Feature names should be inferred + # before calling _make_data (so predict can be made with np arrays or + # pd dataframes). + + if isinstance(y, pd.Series): + y = y.values + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + if isinstance(y, NoneType): + return _brush.Dataset(X=X, + feature_names=feature_names, validation_size=validation_size) + + return _brush.Dataset(X=X, y=y, + feature_names=feature_names, validation_size=validation_size) + + + def predict(self, X): + """Predict using the best estimator in the archive. """ + + check_is_fitted(self) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = _brush.Dataset(X=X, ref_dataset=self.data_, + feature_names=self.feature_names_) + + # data = self._make_data(X, feature_names=self.feature_names_) + + return self.best_estimator_.predict(data) + + # def _setup_population(self): + # """initialize programs""" + # if self.mode == 'classification': + # generate = self.search_space_.make_classifier + # else: + # generate = self.search_space_.make_regressor + + # programs = [ + # DeapIndividual(generate(self.max_depth, self.max_size)) + # for i in range(self.pop_size) + # ] + # # return [self._create_deap_individual_(p) for p in programs] + # return programs + + def get_params(self, deep=True): + out = dict() + for (key, value) in self.__dict__.items(): + if not key.endswith('_'): + if deep and hasattr(value, "get_params") and not isinstance(value, type): + deep_items = value.get_params().items() + out.update((key + "__" + k, val) for k, val in deep_items) + out[key] = value + return out + + +class DeapClassifier(DeapEstimator,ClassifierMixin): + """Deap-based Brush for classification. + + For options, see :py:class:`DeapEstimator `. + + Examples + -------- + >>> import pandas as pd + >>> df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + >>> X = df.drop(columns='target') + >>> y = df['target'] + >>> from pybrush import DeapClassifier + >>> est = DeapClassifier() + >>> est.fit(X,y) + >>> print('score:', est.score(X,y)) + """ + def __init__( self, **kwargs): + super().__init__(mode='classification',**kwargs) + + def _error(self, ind, data: _brush.Dataset): + #return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0] + return average_precision_score(data.y, ind.prg.predict(data)) + + def _fitness_validation(self, ind, data: _brush.Dataset): + # Fitness without fitting the expression, used with validation data + + ind_objectives = { + "error" : self._error(ind, data), + "size" : ind.prg.size(), + "complexity": ind.prg.complexity() + } + return [ ind_objectives[obj] for obj in self.objectives ] + + def _fitness_function(self, ind, data: _brush.Dataset): + ind.prg.fit(data) + + return self._fitness_validation(ind, data) + + def _make_individual(self): + # C++'s PTC2-based `make_individual` will create a tree of at least + # the given size. By uniformly sampling the size, we can instantiate a + # population with more diversity + + if self.initialization not in ["uniform", "max_size"]: + raise ValueError(f"Invalid argument value for `initialization`. " + f"expected 'max_size' or 'uniform'. got {self.initialization}") + + return creator.Individual( + self.search_space_.make_classifier( + self.max_depth,(0 if self.initialization=='uniform' else self.max_size)) + if self.n_classes_ == 2 else + self.search_space_.make_multiclass_classifier( + self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) + ) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + + """ + + check_is_fitted(self) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = _brush.Dataset(X=X, ref_dataset=self.data_, + feature_names=self.feature_names_) + + # data = self._make_data(X, feature_names=self.feature_names_) + + prob = self.best_estimator_.predict_proba(data) + + if self.n_classes_ <= 2: + prob = np.hstack( (np.ones(X.shape[0]).reshape(-1,1), prob.reshape(-1,1)) ) + prob[:, 0] -= prob[:, 1] + + return prob + + +class DeapRegressor(DeapEstimator, RegressorMixin): + """Deap-based Brush for regression. + + For options, see :py:class:`DeapEstimator `. + + Examples + -------- + >>> import pandas as pd + >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') + >>> X = df.drop(columns='label') + >>> y = df['label'] + >>> from pybrush import DeapRegressor + >>> est = DeapRegressor() + >>> est.fit(X,y) + >>> print('score:', est.score(X,y)) + """ + def __init__(self, **kwargs): + super().__init__(mode='regressor',**kwargs) + + def _error(self, ind, data: _brush.Dataset): + MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) + if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf + MSE = np.inf + + return MSE + + def _fitness_validation(self, ind, data: _brush.Dataset): + # Fitness without fitting the expression, used with validation data + + ind_objectives = { + "error" : self._error(ind, data), + "size" : ind.prg.size(), + "complexity": ind.prg.complexity() + } + return [ ind_objectives[obj] for obj in self.objectives ] + + def _fitness_function(self, ind, data: _brush.Dataset): + ind.prg.fit(data) + + return self._fitness_validation(ind, data) + + def _make_individual(self): + if self.initialization not in ["uniform", "max_size"]: + raise ValueError(f"Invalid argument value for `initialization`. " + f"expected 'max_size' or 'uniform'. got {self.initialization}") + + # No arguments (or zero): brush will use PARAMS passed in set_params. + # max_size is sampled between 1 and params['max_size'] if zero is provided + return creator.Individual( + self.search_space_.make_regressor( + self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) + ) + +# Under development +# class DeapRepresenter(DeapEstimator, TransformerMixin): +# """Deap-based Brush for representation learning. + +# For options, see :py:class:`DeapEstimator `. + +# Examples +# -------- +# >>> import pandas as pd +# >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') +# >>> X = df.drop(columns='label') +# >>> y = df['label'] +# >>> from pybrush import DeapRegressor +# >>> est = DeapRegressor() +# >>> est.fit(X,y) +# >>> print('score:', est.score(X,y)) +# """ +# def __init__(self, **kwargs): +# super().__init__(mode='regressor',**kwargs) + +# def _fitness_function(self, ind, data: _brush.Dataset): +# ind.prg.fit(data) +# return ( +# # todo: need to return a matrix from X for this +# np.sum((data.get_X()- ind.prg.predict(data))**2), +# ind.prg.size() +# ) + +# def _make_individual(self): +# return creator.Individual( +# self.search_space_.make_representer(self.max_depth, self.max_size) +# ) + +# def transform(self, X): +# """Transform X using the best estimator in the archive. """ +# return self.predict(X) \ No newline at end of file diff --git a/pybrush/__init__.py b/pybrush/__init__.py new file mode 100644 index 00000000..da941360 --- /dev/null +++ b/pybrush/__init__.py @@ -0,0 +1,8 @@ +# Interfaces for Brush classes. Use to prototype with Brush +from _brush import Dataset, SearchSpace, Parameters # TODO: make individual wrapper, Individual + +# Brush's original EA algorithm +from pybrush.BrushEstimator import BrushClassifier, BrushRegressor + +# Prototyping an EA using brush classes, but other EA framework +from pybrush.DeapEstimator import DeapClassifier, DeapRegressor \ No newline at end of file diff --git a/brush/versionstr.py b/pybrush/_versionstr.py similarity index 100% rename from brush/versionstr.py rename to pybrush/_versionstr.py diff --git a/pybrush/deap_api/__init__.py b/pybrush/deap_api/__init__.py new file mode 100644 index 00000000..d74636f0 --- /dev/null +++ b/pybrush/deap_api/__init__.py @@ -0,0 +1 @@ +from pybrush.deap_api.nsga2 import nsga2, DeapIndividual # TODO: use brush individual instead of deap \ No newline at end of file diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py new file mode 100644 index 00000000..c6862d79 --- /dev/null +++ b/pybrush/deap_api/nsga2.py @@ -0,0 +1,114 @@ +from deap import tools +from deap.benchmarks.tools import hypervolume +import numpy as np +import functools + +class DeapIndividual(): + """Class that wraps brush program for creator.Individual class from DEAP.""" + def __init__(self, prg): + self.prg = prg + +def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): + # NGEN = 250 + # MU = 100 + # CXPB = 0.9 + # rnd_flt: random number generator to sample crossover prob + + def calculate_statistics(ind): + on_train = ind.fitness.values + on_val = toolbox.evaluateValidation(ind) + + return (*on_train, *on_val) + + stats = tools.Statistics(calculate_statistics) + + stats.register("avg", np.nanmean, axis=0) + stats.register("med", np.nanmedian, axis=0) + stats.register("std", np.nanstd, axis=0) + stats.register("min", np.nanmin, axis=0) + stats.register("max", np.nanmax, axis=0) + + logbook = tools.Logbook() + logbook.header = ['gen', 'evals'] + \ + [f"{stat} {partition} O{objective}" + for stat in ['avg', 'med', 'std', 'min', 'max'] + for partition in ['train', 'val'] + for objective in toolbox.get_objectives()] + + pop = toolbox.population(n=MU) + + # OBS: evaluate calls fit in the individual. It is different from using it to predict. The + # function evaluateValidation don't call the fit + fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) + for ind, fit in zip(pop, fitnesses): + ind.fitness.values = fit + + # This is just to assign the crowding distance to the individuals + # no actual selection is done + pop = toolbox.survive(pop, len(pop)) + + record = stats.compile(pop) + logbook.record(gen=0, evals=len(pop), **record) + + if verbosity > 0: + print(logbook.stream) + + # Begin the generational process + for gen in range(1, NGEN): + batch = toolbox.getBatch() # batch will be a random subset only if it was not defined as the size of the train set. + # everytime this function is called, a new random batch is generated. + if (use_batch): # recalculate the fitness for the parents + # use_batch is false if batch_size is different from train set size. + # If we're using batch, we need to re-evaluate every model (without changing its weights). + # evaluateValidation doesnt fit the weights + fitnesses = toolbox.map( + functools.partial(toolbox.evaluateValidation, data=batch), pop) + + for ind, fit in zip(pop, fitnesses): + ind.fitness.values = fit + + # Vary the population + # offspring = tools.selTournamentDCD(pop, len(pop)) + parents = toolbox.select(pop, len(pop)) + # offspring = [toolbox.clone(ind) for ind in offspring] + offspring = [] + for ind1, ind2 in zip(parents[::2], parents[1::2]): + off1, off2 = None, None + if rnd_flt() < CXPB: # either mutation or crossover. + off1, off2 = toolbox.mate(ind1, ind2) + else: + off1 = toolbox.mutate(ind1) + off2 = toolbox.mutate(ind2) + + if off1 is not None: # Mutation worked. first we fit, then add to offspring + # Evaluate (instead of evaluateValidation) to fit the weights of the offspring + off1.fitness.values = toolbox.evaluate(off1) + if use_batch: # Adjust fitness to the same data as parents + off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) + offspring.extend([off1]) + + if off2 is not None: + off2.fitness.values = toolbox.evaluate(off2) + if use_batch: + off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) + offspring.extend([off2]) + + # Select the next generation population (no sorting before this step, as + # survive==offspring will cut it in half) + pop = toolbox.survive(pop + offspring, MU) + + pop.sort(key=lambda x: x.fitness, reverse=True) + + record = stats.compile(pop) + logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) + + if verbosity > 0: + print(logbook.stream) + + if verbosity > 0: + print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) + + archive = tools.ParetoFront() + archive.update(pop) + + return archive, logbook \ No newline at end of file diff --git a/setup.py b/setup.py index 0dd66c13..e9359ec5 100644 --- a/setup.py +++ b/setup.py @@ -99,11 +99,11 @@ def build_extension(self, ext): ) # # # Clean old build/ directory if it exists -# try: -# remove_tree("./build") -# print("Removed old build directory.") -# except FileNotFoundError: -# print("No existing build directory found - skipping.") +try: + remove_tree("./build") + print("Removed old build directory.") +except FileNotFoundError: + print("No existing build directory found - skipping.") setup( name="pybrush", @@ -117,9 +117,9 @@ def build_extension(self, ext): project_urls={ "Bug Tracker": "https://github.com/lacava/brush/issues", }, - package_dir={"": "src"}, - packages=find_packages(where="src"), - # cmake_install_dir="src/brush", + package_dir={"": "."}, + packages=find_packages(where="."), + #cmake_install_dir="src/", python_requires=">=3.6", install_requires=[ 'numpy', From bf81b8b0f6ca21f1c2e9a800697cd639c0e56e65 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Dec 2023 21:39:35 -0300 Subject: [PATCH 105/199] Improved individuals, parameters, types and variation --- src/cbrush.h | 4 ++-- src/individual.h | 42 ++++++++++++++++++++++++++++++++++++++---- src/params.h | 12 ++++++++++-- src/types.h | 18 ++++++++++++++++++ src/variation.cpp | 27 +++++++++++++++++++++++---- 5 files changed, 91 insertions(+), 12 deletions(-) diff --git a/src/cbrush.h b/src/cbrush.h index 2cf104ba..c8abf667 100644 --- a/src/cbrush.h +++ b/src/cbrush.h @@ -29,7 +29,7 @@ using namespace Eval; // using namespace variation; template -class CBrush{ +class CBrush{ // TODO: rename it to BrushEstimator public: CBrush() : params(Parameters()) @@ -45,7 +45,7 @@ class CBrush{ inline void set_is_fitted(bool f){is_fitted=f;} inline bool get_is_fitted(){return is_fitted;} - // TODO: WRAPPER SHOULD SET ALL THESE + // TODO: WRAPPER SHOULD SET ALL THESE (by changing the inner parameter instance) void set_pop_size(int pop_size){ params.pop_size = pop_size; }; int get_pop_size(){ return params.pop_size; }; diff --git a/src/individual.h b/src/individual.h index 2e479ef1..f13d8aef 100644 --- a/src/individual.h +++ b/src/individual.h @@ -9,7 +9,7 @@ namespace Pop{ template class Individual{ -private: +public: // TODO: make these private (and work with nlohman json) Program program; ///< executable data structure // store just info that we dont have a getter. size, depth, complexity: they can all be obtained with program. @@ -26,8 +26,7 @@ class Individual{ unsigned int rank; ///< pareto front rank float crowd_dist; ///< crowding distance on the Pareto front vector obj; ///< objectives for use with Pareto selection - -public: + Individual() { fitness = -1; @@ -62,6 +61,7 @@ class Individual{ string get_model() { return program.get_model(); }; size_t get_size() { return program.size(); }; size_t get_depth() { return program.depth(); }; + Program& get_program() { return program; }; // setters and getters size_t set_complexity() { @@ -70,17 +70,51 @@ class Individual{ }; // sets and returns it size_t get_complexity() const { return complexity; }; + // TODO: USE setters and getters intead of accessing it directly + void set_fitness(float f){ fitness=f; }; + float get_fitness() const { return fitness; }; + + void set_fitness_v(float f_v){ fitness_v=f_v; }; + float get_fitness_v() const { return fitness_v; }; + void set_rank(unsigned r){ rank=r; }; size_t get_rank() const { return rank; }; void set_crowd_dist(unsigned cd){ crowd_dist=cd; }; - size_t get_crow_dist() const { return crowd_dist; }; + size_t get_crowd_dist() const { return crowd_dist; }; /// set obj vector given a string of objective names void set_obj(const vector&); int check_dominance(const Individual& b) const; }; + +// serialization for Individual +template +void to_json(json &j, const Individual &p) +{ + j = json{ + {"program", p.program}, + {"fitness", p.fitness}, + {"fitness_v", p.fitness_v}, + {"complexity", p.complexity}, + {"rank", p.rank}, + {"crowd_dist", p.crowd_dist} + }; +} + +template +void from_json(const json &j, Individual& p) +{ + j.at("program").get_to( p.program ); + j.at("fitness").get_to( p.fitness ); + j.at("fitness_v").get_to( p.fitness_v ); + j.at("complexity").get_to( p.complexity ); + j.at("rank").get_to( p.rank ); + j.at("crowd_dist").get_to( p.crowd_dist ); +} + + } // Pop } // Brush diff --git a/src/params.h b/src/params.h index 68c476cf..688c221b 100644 --- a/src/params.h +++ b/src/params.h @@ -15,8 +15,8 @@ namespace Brush struct Parameters { public: - // TODO: setters and getters for all parameters? (and do checks in setters?) - + // TODO: setters and getters for all parameters? (and do checks in setters?). Also make them private, and use the getters and setters in the code + // settings int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) //int verbosity = 0; // TODO: implement log and verbosity @@ -46,6 +46,7 @@ struct Parameters {"toggle_weight_on", 0.167}, {"toggle_weight_off", 0.167} }; + float cx_prob=0.2; ///< cross rate for variation float mig_prob = 0.05; @@ -68,6 +69,13 @@ struct Parameters Parameters(){}; ~Parameters(){}; + + void set_pop_size(int new_pop_size){ pop_size = new_pop_size; }; + int get_pop_size(){ return pop_size; }; + + + void set_mutation_probs(std::map new_mutation_probs){ mutation_probs = new_mutation_probs; }; + std::map get_mutation_probs(){ return mutation_probs; }; }; // Global (deprecated) params diff --git a/src/types.h b/src/types.h index 5badc481..65e08cdd 100644 --- a/src/types.h +++ b/src/types.h @@ -80,6 +80,24 @@ typedef Program ClassifierProgram; typedef Program MulticlassClassifierProgram; typedef Program RepresenterProgram; +//////////////////////////////////////////////////////////////////////////////// +// Individual +namespace Pop { + template class Individual; +} +typedef Pop::Individual RegressorIndividual; +typedef Pop::Individual ClassifierIndividual; +typedef Pop::Individual MulticlassClassifierIndividual; +typedef Pop::Individual RepresenterIndividual; + +//////////////////////////////////////////////////////////////////////////////// +// Estimator +using PT = ProgramType; +template class CBrush; +typedef CBrush RegressorEstimator; +typedef CBrush ClassifierEstimator; +typedef CBrush MulticlassClassifierEstimator; +typedef CBrush RepresenterEstimator; //////////////////////////////////////////////////////////////////////////////// // Data diff --git a/src/variation.cpp b/src/variation.cpp index 8b0a7c99..a7db8325 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -498,18 +498,31 @@ std::optional> Variation::cross( template std::optional> Variation::mutate(const Program& parent) { + std::cout << "selecting options" << parameters.mutation_probs.size() << std::endl; auto options = parameters.mutation_probs; - if (std::all_of(options.begin(), options.end(), - [](const auto& kv) { return kv.second<=0.0; }) - ) + std::cout << "selecting options2" << options.size() << std::endl; + + bool all_zero = true; + for (auto &it : parameters.mutation_probs) { + std::cout << it.first << it.second << std::endl; + if (it.second > 0.0) { + all_zero = false; + break; + } + } + + if (all_zero) { // No mutation can be successfully applied to this solution + std::cout << "no viable one" << std::endl; return std::nullopt; } + std::cout << "selecting (not all are zero)" << std::endl; // choose a valid mutation option - string choice = r.random_choice(options); + string choice = r.random_choice(parameters.mutation_probs); + std::cout << "picked mutation" << choice << std::endl; // TODO: this could be improved (specially with the Variation class) std::unique_ptr mutation; if (choice == "point") @@ -535,8 +548,10 @@ std::optional> Variation::mutate(const Program& parent) HANDLE_ERROR_THROW(msg); } + std::cout << "cloning parent" << std::endl; Program child(parent); + std::cout << "findind spot" << std::endl; // choose location by weighted sampling of program auto weights = mutation->find_spots(child.Tree); @@ -544,18 +559,22 @@ std::optional> Variation::mutate(const Program& parent) return w<=0.0; })) { // There is no spot that has a probability to be selected + std::cout << "no spots" << std::endl; return std::nullopt; } + std::cout << "apickingt spot" << std::endl; // apply the mutation and check if it succeeded auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), weights.begin(), weights.end()); + std::cout << "mutating" << std::endl; // Every mutation here works inplace, so they return bool instead of // std::optional to indicare the result of their manipulation over the // program tree. Here we call the mutation function and return the result bool success = (*mutation)(child.Tree, spot); + std::cout << "returning" << std::endl; if (success && ( (child.size() <= parameters.max_size) && (child.depth() <= parameters.max_depth) )){ From 3ecfb44fbd84cbf96ff3ffe6eec240e0002a7e8a Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Dec 2023 21:40:20 -0300 Subject: [PATCH 106/199] Lots of new bindings. Rework to make previous keep working --- src/bindings/bind_cbrush.cpp | 14 +++--- src/bindings/bind_cbrush.h | 29 +++++++++++++ src/bindings/bind_dataset.cpp | 69 ++++++++++++++++++++++++++++++ src/bindings/bind_individuals.cpp | 13 ++++++ src/bindings/bind_individuals.h | 30 +++++++++++++ src/bindings/bind_params.cpp | 30 +++++++++++++ src/bindings/bind_programs.cpp | 2 +- src/bindings/bind_programs.h | 3 +- src/bindings/bind_search_space.cpp | 50 ++++++++++++++++++++++ src/bindings/bind_variation.cpp | 19 ++++++++ src/bindings/bind_variation.h | 23 ++++++++++ src/bindings/module.cpp | 22 ++++++++-- 12 files changed, 291 insertions(+), 13 deletions(-) create mode 100644 src/bindings/bind_cbrush.h create mode 100644 src/bindings/bind_dataset.cpp create mode 100644 src/bindings/bind_params.cpp create mode 100644 src/bindings/bind_search_space.cpp create mode 100644 src/bindings/bind_variation.cpp create mode 100644 src/bindings/bind_variation.h diff --git a/src/bindings/bind_cbrush.cpp b/src/bindings/bind_cbrush.cpp index 90f917f3..16cd60ab 100644 --- a/src/bindings/bind_cbrush.cpp +++ b/src/bindings/bind_cbrush.cpp @@ -1,16 +1,14 @@ #include "module.h" -#include "../cbrush.h" -#include "../types.h" +#include "bind_cbrush.h" // TODO: rename it to bind_estimators namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; -// TODO: copy bind_programs.h to make the cbrush -void bind_cbrush(py::module& m) +void bind_estimators(py::module& m) { - py::class_>(m, "BrushRegressor") - .def(py::init([]() - { br::CBrush est; return est; })) - ; + bind_estimator(m, "Regressor"); + bind_estimator(m, "Classifier"); + bind_estimator(m, "MultiClassifier"); + bind_estimator(m, "Representer"); } \ No newline at end of file diff --git a/src/bindings/bind_cbrush.h b/src/bindings/bind_cbrush.h new file mode 100644 index 00000000..88284852 --- /dev/null +++ b/src/bindings/bind_cbrush.h @@ -0,0 +1,29 @@ +#include "module.h" +#include "../cbrush.h" + +using Reg = Brush::RegressorEstimator; +using Cls = Brush::ClassifierEstimator; +using Rep = Brush::RepresenterEstimator; +using MCls = Brush::MulticlassClassifierEstimator; + +namespace nl = nlohmann; +namespace br = Brush; + +using stream_redirect = py::call_guard; + +template +void bind_estimator(py::module& m, string name) +{ + // using RetType = std::conditional_t< + // std::is_same_v, ArrayXf, + // std::conditional_t, ArrayXb, + // std::conditional_t, ArrayXi, ArrayXXf>>>; + + // py::class_ ind(m, name.data() ); + // ind.def(py::init<>()) + // ; + // if constexpr (std::is_same_v) + // { + + // } +} \ No newline at end of file diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp new file mode 100644 index 00000000..ade036dc --- /dev/null +++ b/src/bindings/bind_dataset.cpp @@ -0,0 +1,69 @@ +#include "module.h" +#include "../data/data.h" +#include "../types.h" +#include "../data/io.h" +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +void bind_dataset(py::module & m) +{ + py::class_(m, "Dataset") + // construct from X, feature names (and optional validation and batch sizes) with constructor 3. + .def(py::init([](const Ref& X, + const vector& feature_names=vector(), + const float validation_size=0.0, + const float batch_size=1.0){ + return br::Data::Dataset( + X, feature_names, validation_size, batch_size); + }), + py::arg("X"), + py::arg("feature_names") = vector(), + py::arg("validation_size") = 0.0, + py::arg("batch_size") = 1.0 + ) + // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2. + .def(py::init([](const Ref& X, + const Ref& y, + const vector& feature_names=vector(), + const float validation_size=0.0, + const float batch_size=1.0){ + return br::Data::Dataset( + X, y, feature_names, {}, false, validation_size, batch_size); + }), + py::arg("X"), + py::arg("y"), + py::arg("feature_names") = vector(), + py::arg("validation_size") = 0.0, + py::arg("batch_size") = 1.0 + ) + // construct from X, feature names, but copying the feature types from a + // reference dataset with constructor 4. Useful for predicting (specially + // because the user can provide a single element matrix, or an array with + // no feature names). + .def(py::init([](const Ref& X, + const br::Data::Dataset& ref_dataset, + const vector& feature_names){ + return br::Data::Dataset(X, ref_dataset, feature_names); + }), + py::arg("X"), + py::arg("ref_dataset"), + py::arg("feature_names") + ) + + .def_readwrite("y", &br::Data::Dataset::y) + // .def_readwrite("features", &br::Data::Dataset::features) + .def("get_n_samples", &br::Data::Dataset::get_n_samples) + .def("get_n_features", &br::Data::Dataset::get_n_features) + .def("print", &br::Data::Dataset::print) + .def("get_batch", &br::Data::Dataset::get_batch) + .def("get_training_data", &br::Data::Dataset::get_training_data) + .def("get_validation_data", &br::Data::Dataset::get_validation_data) + .def("get_batch_size", &br::Data::Dataset::get_batch_size) + .def("set_batch_size", &br::Data::Dataset::set_batch_size) + .def("split", &br::Data::Dataset::split) + .def("get_X", &br::Data::Dataset::get_X) + ; + + m.def("read_csv", &br::Data::read_csv, py::arg("path"), py::arg("target"), py::arg("sep")=','); +} \ No newline at end of file diff --git a/src/bindings/bind_individuals.cpp b/src/bindings/bind_individuals.cpp index 0275ca93..659259c0 100644 --- a/src/bindings/bind_individuals.cpp +++ b/src/bindings/bind_individuals.cpp @@ -1 +1,14 @@ #include "module.h" +#include "bind_individuals.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +void bind_individuals(py::module& m) +{ + bind_individual(m, "Regressor"); + bind_individual(m, "Classifier"); + bind_individual(m, "MultiClassifier"); + bind_individual(m, "Representer"); +} \ No newline at end of file diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index e69de29b..ce34745c 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -0,0 +1,30 @@ +#include "module.h" +#include "../individual.h" + +using Reg = Brush::RegressorIndividual; +using Cls = Brush::ClassifierIndividual; +using Rep = Brush::RepresenterIndividual; +using MCls = Brush::MulticlassClassifierIndividual; + +namespace nl = nlohmann; +namespace br = Brush; + +using stream_redirect = py::call_guard; + +template +void bind_individual(py::module& m, string name) +{ + using RetType = std::conditional_t< + std::is_same_v, ArrayXf, + std::conditional_t, ArrayXb, + std::conditional_t, ArrayXi, ArrayXXf>>>; + + py::class_ ind(m, name.data() ); + ind.def(py::init<>()) + ; + if constexpr (std::is_same_v) + { + + } + +} \ No newline at end of file diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp new file mode 100644 index 00000000..dadbfaf6 --- /dev/null +++ b/src/bindings/bind_params.cpp @@ -0,0 +1,30 @@ +#include "module.h" +#include "../params.h" +#include "../util/rnd.h" + +namespace br = Brush; + +void bind_params(py::module& m) +{ + // py::object params = Brush::PARAMS; + // m.attr("PARAMS") = params; + + // py::class_(m, "Params", py::dynamic_attr()) + // .def(py::init<>()) + + m.def("set_params", &Brush::set_params); // TODO: delete this. use parameters class + + m.def("get_params", &br::get_params); + m.def("set_random_state", [](unsigned int seed) + { br::Util::r = *br::Util::Rnd::initRand(); + br::Util::r.set_seed(seed); }); + m.def("rnd_flt", [](){ return br::Util::r.rnd_flt(); }); + + py::class_(m, "Parameters") + .def(py::init([]() + { Brush::Parameters p; return p; })) + // TODO: define getters and setters, and create the bindings here. Make the Brush bindings use these here + .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) + .def_property("mutation_probs", &Brush::Parameters::get_mutation_probs, &Brush::Parameters::set_mutation_probs) + ; +} \ No newline at end of file diff --git a/src/bindings/bind_programs.cpp b/src/bindings/bind_programs.cpp index 136bae5f..3d8b1ce6 100644 --- a/src/bindings/bind_programs.cpp +++ b/src/bindings/bind_programs.cpp @@ -11,6 +11,7 @@ namespace nl = nlohmann; void bind_programs(py::module& m) { + // fitness is used to prototype with deap API. TODO: replace deapIndividual with brush individual (once it gets implemented) py::class_(m, "Fitness", py::dynamic_attr()) .def(py::init<>()) .def_readwrite("values", &br::Fitness::values) @@ -21,5 +22,4 @@ void bind_programs(py::module& m) bind_program(m, "Classifier"); bind_program(m, "MultiClassifier"); bind_program(m, "Representer"); - } \ No newline at end of file diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index 7ba5d585..92370401 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -77,7 +77,8 @@ void bind_program(py::module& m, string name) "predict from Dataset object") .def("predict_proba", static_cast &X)>(&T::predict_proba), - "predict from X data"); + "predict from X data") + ; } } \ No newline at end of file diff --git a/src/bindings/bind_search_space.cpp b/src/bindings/bind_search_space.cpp new file mode 100644 index 00000000..c86e2fba --- /dev/null +++ b/src/bindings/bind_search_space.cpp @@ -0,0 +1,50 @@ +#include "module.h" +#include "../search_space.h" +#include "../program/program.h" +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +using stream_redirect = py::call_guard; + +void bind_search_space(py::module &m) +{ + // Notice: We change the interface for SearchSpace a little bit by + // constructing it with a Dataset object, rather than initializing it as an + // empty struct and then calling init() with the Dataset object. + py::class_(m, "SearchSpace") + .def(py::init([](br::Data::Dataset data, bool weights_init=true){ + SearchSpace SS; + SS.init(data, {}, weights_init); + return SS; + }), + py::arg("data"), + py::arg("weights_init") = true ) + .def(py::init&, + bool>(), + py::arg("data"), + py::arg("user_ops"), + py::arg("weights_init") = true ) + .def("make_regressor", &br::SearchSpace::make_regressor, + py::arg("max_d") = 0, + py::arg("max_size") = 0, + py::arg("params") = Brush::Parameters() ) + .def("make_classifier", &br::SearchSpace::make_classifier, + py::arg("max_d") = 0, + py::arg("max_size") = 0, + py::arg("params") = Brush::Parameters() ) + .def("make_multiclass_classifier", + &br::SearchSpace::make_multiclass_classifier, + py::arg("max_d") = 0, + py::arg("max_size") = 0, + py::arg("params") = Brush::Parameters() ) + .def("make_representer", &br::SearchSpace::make_representer, + py::arg("max_d") = 0, + py::arg("max_size") = 0, + py::arg("params") = Brush::Parameters() ) + .def("print", + &br::SearchSpace::print, + stream_redirect() + ) + ; +} \ No newline at end of file diff --git a/src/bindings/bind_variation.cpp b/src/bindings/bind_variation.cpp new file mode 100644 index 00000000..0a772c7c --- /dev/null +++ b/src/bindings/bind_variation.cpp @@ -0,0 +1,19 @@ +#include "module.h" +#include "bind_variation.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +// using Reg = br::Program; +// using Cls = br::Program; +// using Rep = br::Program; +// using MCls = br::Program; + +void bind_variations(py::module& m) +{ + bind_variation(m, "RegressorVariator"); + bind_variation(m, "ClassifierVariator"); + bind_variation(m, "MultiClassifierVariator"); + bind_variation(m, "RepresenterVariator"); +} \ No newline at end of file diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h new file mode 100644 index 00000000..e7cda407 --- /dev/null +++ b/src/bindings/bind_variation.h @@ -0,0 +1,23 @@ +#include "module.h" +#include "../variation.h" +#include "../variation.cpp" // TODO: figure out why im having symbol errors + +namespace py = pybind11; +namespace nl = nlohmann; +namespace br = Brush; + +template +void bind_variation(py::module& m, string name) +{ + using Class = br::Var::Variation; + + // TODO: make variation a non-templated class + py::class_ vary(m, name.data() ); + + vary.def(py::init<>([](br::Parameters& p, br::SearchSpace& ss){ + Class variation(p, ss); + return variation; })) + .def("mutate", &Class::mutate, py::return_value_policy::automatic) + .def("cross", &Class::cross, py::return_value_policy::automatic) + ; +} \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 86694de4..21524853 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -15,9 +15,13 @@ license: GNU/GPL v3 namespace py = pybind11; // forward declarations +void bind_params(py::module &); +void bind_dataset(py::module &); +void bind_search_space(py::module &); void bind_programs(py::module &); +void bind_variations(py::module &); void bind_individuals(py::module &); -void bind_cbrush(py::module &); +void bind_estimators(py::module &); PYBIND11_MODULE(_brush, m) { @@ -26,8 +30,20 @@ PYBIND11_MODULE(_brush, m) { #else m.attr("__version__") = "dev"; #endif - bind_cbrush(m); + // bind_cbrush(m); + + bind_params(m); + bind_dataset(m); + bind_search_space(m); + bind_variations(m); + py::module_ m2 = m.def_submodule("program", "Contains Program classes."); bind_programs(m2); - bind_individuals(m2); + + py::module_ m3 = m.def_submodule("individual", "Contains Individual classes."); + bind_individuals(m3); + + py::module_ m4 = m.def_submodule("Estimator", "Contains Estimator classes."); + bind_estimators(m4); + } From 99ba4ce0b4ab962c4600d238ab9b3ab036e6f8ac Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 11 Dec 2023 21:41:00 -0300 Subject: [PATCH 107/199] Updated tests to run with all the rework At this point, the deap API still work, and all the stuff we need to implement the island are there. I need to implement a lot of testings, and make sure the island work with the cpp implementations. After that, I'm gonna do a cleaning up in the code (there's a lot of TODOs that I'm leaving to fix). Finally, I'll be ready to open a PR into master branch --- tests/python/test_brush.py | 154 -------- tests/python/test_deap_api.py | 155 ++++++++ tests/python/test_optimization.py | 609 +++++++++++++++--------------- tests/python/test_program.py | 130 +++---- 4 files changed, 524 insertions(+), 524 deletions(-) delete mode 100644 tests/python/test_brush.py create mode 100644 tests/python/test_deap_api.py diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py deleted file mode 100644 index 28de19d9..00000000 --- a/tests/python/test_brush.py +++ /dev/null @@ -1,154 +0,0 @@ -# #!/usr/bin/env python3 -# import brush -# import pytest -# import numpy as np -# import pandas as pd -# from pmlb import fetch_data -# from sklearn.utils import resample - -# import traceback -# import logging - -# @pytest.fixture -# def brush_args(): -# return dict( -# max_gen=10, -# pop_size=20, -# max_size=50, -# max_depth=6, -# cx_prob= 1/7, -# mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, -# "toggle_weight_on":1/6, "toggle_weight_off":1/6}, -# ) - -# @pytest.fixture -# def classification_setup(): -# df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') -# X = df.drop(columns='target') -# y = df['target'] - -# return brush.BrushClassifier, X, y - -# @pytest.fixture -# def multiclass_classification_setup(): -# df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') -# X = df.drop(columns='target') -# y = df['target'] - -# return brush.BrushClassifier, X, y - -# @pytest.fixture -# def regression_setup(): -# df = pd.read_csv('docs/examples/datasets/d_enc.csv') -# X = df.drop(columns='label') -# y = df['label'] - -# return brush.BrushRegressor, X, y - -# @pytest.mark.parametrize('setup,algorithm', -# [('classification_setup', 'nsga2island'), -# ('classification_setup', 'nsga2' ), -# ('classification_setup', 'gaisland' ), -# ('classification_setup', 'ga' ), -# ('regression_setup', 'nsga2island'), -# ('regression_setup', 'nsga2' ), -# ('regression_setup', 'gaisland' ), -# ('regression_setup', 'ga' )]) -# def test_fit(setup, algorithm, brush_args, request): -# """Testing common utilities related to fitting and generic brush estimator. -# """ - -# Estimator, X, y = request.getfixturevalue(setup) - -# brush_args["algorithm"] = algorithm -# try: -# est = Estimator(**brush_args) -# est.fit(X, y) - -# print('score:',est.score(X,y)) - -# except Exception as e: -# pytest.fail(f"Unexpected Exception caught: {e}") -# logging.error(traceback.format_exc()) - -# @pytest.mark.parametrize('setup', -# [('classification_setup'), -# ('multiclass_classification_setup')]) -# def test_predict_proba(setup, brush_args, request): - -# Estimator, X, y = request.getfixturevalue(setup) - -# est = Estimator(**brush_args) -# est.fit(X, y) - -# y_prob = est.predict_proba(X) -# assert len(y_prob.shape) == 2, "predict_proba should be 2-dimensional" -# assert y_prob.shape[1] >= 2, \ -# "every class should have its own column (even for binary clf)" - -# @pytest.mark.parametrize('setup,fixed_node', [ -# ('classification_setup', 'Logistic'), -# # ('multiclass_classification_setup', 'Softmax') -# ]) -# def test_fixed_nodes(setup, fixed_node, brush_args, request): -# # Classification has a fixed root that should not change after mutation or crossover - -# Estimator, X, y = request.getfixturevalue(setup) - -# est = Estimator(**brush_args) -# est.fit(X, y) # Calling fit to make it create the setup toolbox and variation functions - -# for i in range(10): -# # Initial population -# pop = est.toolbox_.population(n=100) -# pop_models = [] -# for p in pop: -# pop_models.append(p.prg.get_model()) -# assert p.prg.get_model().startswith(fixed_node), \ -# (f"An individual for {setup} was criated without {fixed_node} " + -# f"node on root. Model was {p.ind.get_model()}") - -# # Clones -# clones = [est.toolbox_.Clone(p) for p in pop] -# for c in clones: -# assert c.prg.get_model().startswith(fixed_node), \ -# (f"An individual for {setup} was cloned without {fixed_node} " + -# f"node on root. Model was {c.ind.get_model()}") - -# # Mutation -# xmen = [est.toolbox_.mutate(c) for c in clones] -# xmen = [x for x in xmen if x is not None] -# assert len(xmen) > 0, "Mutation didn't worked for any individual" -# for x in xmen: -# assert x.prg.get_model().startswith(fixed_node), \ -# (f"An individual for {setup} was mutated without {fixed_node} " + -# f"node on root. Model was {x.ind.get_model()}") - -# # Crossover -# cxmen = [] -# [cxmen.extend(est.toolbox_.mate(c1, c2)) -# for (c1, c2) in zip(clones[::2], clones[1::2])] -# cxmen = [x for x in cxmen if x is not None] -# assert len(cxmen) > 0, "Crossover didn't worked for any individual" -# for cx in cxmen: -# assert cx.prg.get_model().startswith(fixed_node), \ -# (f"An individual for {setup} was crossovered without {fixed_node} " + -# f"node on root. Model was {cx.ind.get_model()}") - -# # Originals still the same -# for p, p_original_model in zip(pop, pop_models): -# assert p.prg.get_model() == p_original_model, \ -# "Variation operator changed the original model." - - - -# # def test_random_state(): # TODO: make it work -# # test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) -# # test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], -# # [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T - -# # est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) -# # est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) - -# # assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \ -# # "random state failed to generate same results" \ No newline at end of file diff --git a/tests/python/test_deap_api.py b/tests/python/test_deap_api.py new file mode 100644 index 00000000..deed1df9 --- /dev/null +++ b/tests/python/test_deap_api.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +import pybrush +import pytest +import numpy as np +import pandas as pd +from pmlb import fetch_data +from sklearn.utils import resample + +import traceback +import logging + +# TODO: get deap api back and implement it as deap_nsga2 (or something like that. the idea is that it can be used as a reference. I could even do a documentation prototyping_with_brush.ipynb) +@pytest.fixture +def brush_args(): + return dict( + max_gen=10, + pop_size=20, + max_size=50, + max_depth=6, + cx_prob= 1/7, + mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, + "toggle_weight_on":1/6, "toggle_weight_off":1/6}, + ) + +@pytest.fixture +def classification_setup(): + df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + X = df.drop(columns='target') + y = df['target'] + + return pybrush.DeapClassifier, X, y + +@pytest.fixture +def multiclass_classification_setup(): + df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + X = df.drop(columns='target') + y = df['target'] + + return pybrush.DeapClassifier, X, y + +@pytest.fixture +def regression_setup(): + df = pd.read_csv('docs/examples/datasets/d_enc.csv') + X = df.drop(columns='label') + y = df['label'] + + return pybrush.DeapRegressor, X, y + +@pytest.mark.parametrize('setup,algorithm', + [('classification_setup', 'nsga2island'), + ('classification_setup', 'nsga2' ), + ('classification_setup', 'gaisland' ), + ('classification_setup', 'ga' ), + ('regression_setup', 'nsga2island'), + ('regression_setup', 'nsga2' ), + ('regression_setup', 'gaisland' ), + ('regression_setup', 'ga' )]) +def test_fit(setup, algorithm, brush_args, request): + """Testing common utilities related to fitting and generic brush estimator. + """ + + Estimator, X, y = request.getfixturevalue(setup) + + brush_args["algorithm"] = algorithm + try: + est = Estimator(**brush_args) + est.fit(X, y) + + print('score:',est.score(X,y)) + + except Exception as e: + pytest.fail(f"Unexpected Exception caught: {e}") + logging.error(traceback.format_exc()) + +@pytest.mark.parametrize('setup', + [('classification_setup'), + ('multiclass_classification_setup')]) +def test_predict_proba(setup, brush_args, request): + + Estimator, X, y = request.getfixturevalue(setup) + + est = Estimator(**brush_args) + est.fit(X, y) + + y_prob = est.predict_proba(X) + assert len(y_prob.shape) == 2, "predict_proba should be 2-dimensional" + assert y_prob.shape[1] >= 2, \ + "every class should have its own column (even for binary clf)" + +@pytest.mark.parametrize('setup,fixed_node', [ + ('classification_setup', 'Logistic'), + # ('multiclass_classification_setup', 'Softmax') + ]) +def test_fixed_nodes(setup, fixed_node, brush_args, request): + # Classification has a fixed root that should not change after mutation or crossover + + Estimator, X, y = request.getfixturevalue(setup) + + est = Estimator(**brush_args) + est.fit(X, y) # Calling fit to make it create the setup toolbox and variation functions + + for i in range(10): + # Initial population + pop = est.toolbox_.population(n=100) + pop_models = [] + for p in pop: + pop_models.append(p.prg.get_model()) + assert p.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was criated without {fixed_node} " + + f"node on root. Model was {p.ind.get_model()}") + + # Clones + clones = [est.toolbox_.Clone(p) for p in pop] + for c in clones: + assert c.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was cloned without {fixed_node} " + + f"node on root. Model was {c.ind.get_model()}") + + # Mutation + xmen = [est.toolbox_.mutate(c) for c in clones] + xmen = [x for x in xmen if x is not None] + assert len(xmen) > 0, "Mutation didn't worked for any individual" + for x in xmen: + assert x.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was mutated without {fixed_node} " + + f"node on root. Model was {x.ind.get_model()}") + + # Crossover + cxmen = [] + [cxmen.extend(est.toolbox_.mate(c1, c2)) + for (c1, c2) in zip(clones[::2], clones[1::2])] + cxmen = [x for x in cxmen if x is not None] + assert len(cxmen) > 0, "Crossover didn't worked for any individual" + for cx in cxmen: + assert cx.prg.get_model().startswith(fixed_node), \ + (f"An individual for {setup} was crossovered without {fixed_node} " + + f"node on root. Model was {cx.ind.get_model()}") + + # Originals still the same + for p, p_original_model in zip(pop, pop_models): + assert p.prg.get_model() == p_original_model, \ + "Variation operator changed the original model." + + + +# def test_random_state(): # TODO: make it work +# test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) +# test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], +# [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T + +# est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) +# est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) + +# assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \ +# "random state failed to generate same results" \ No newline at end of file diff --git a/tests/python/test_optimization.py b/tests/python/test_optimization.py index 711b07db..7eab2743 100644 --- a/tests/python/test_optimization.py +++ b/tests/python/test_optimization.py @@ -1,305 +1,304 @@ -# #!/usr/bin/env python3 - -# import brush -# import pytest -# import numpy as np -# import pandas as pd -# from pmlb import fetch_data -# from sklearn.utils import resample - -# import _brush -# import json - -# import traceback -# import logging - -# @pytest.fixture -# def optimize_addition_positive_weights(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Add", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_addition_negative_weights(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Add", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_subtraction_positive_weights(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Sub", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_subtraction_negative_weights(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Sub", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": True } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_multiply(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Mul", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(np.prod(learned_weights), 2.0*3.0, atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_divide(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_divide_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Div", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights[0], 2.0/3.0, atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_sqrt_outer_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Sqrt", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": False } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_sqrt_inner_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Sqrt", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [4.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_sin_outer_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_2_sin_x1.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Sin", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": False } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_sin_inner_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_sin_0_25x1.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Sin", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [0.25], atol=1e-2) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_notable_product_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_square_x1_plus_2_x1_x2_plus_square_x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Add", "is_weighted": False }, -# { "node_type":"Mul", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False }, -# { "node_type":"Add", "is_weighted": False }, -# { "node_type":"Square", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Square", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False } -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 1.0], atol=1e-2) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_3ary_prod_inner_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Prod", "is_weighted": False, -# "arg_types" :["ArrayF", "ArrayF", "ArrayF"], -# "ret_type" :"ArrayF", -# "sig_hash" :5617655905677279916, -# "sig_dual_hash":10188582206427064428, -# "complete_hash":1786662244046809282 }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, -# { "node_type":"Terminal", "feature":"x3", "is_weighted": False} -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_3ary_prod_outer_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Prod", "is_weighted": True, -# "arg_types" :["ArrayF", "ArrayF", "ArrayF"], -# "ret_type" :"ArrayF", -# "sig_hash" :5617655905677279916, -# "sig_dual_hash":10188582206427064428, -# "complete_hash":1786662244046809282 }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, -# { "node_type":"Terminal", "feature":"x3", "is_weighted": False} -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) - -# return (data, json_program, weight_check) - -# @pytest.fixture -# def optimize_constant_weight(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") - -# json_program = { -# "Tree": [ -# { "node_type":"Mul", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, -# { "node_type":"Mul", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, -# { "node_type":"Constant", "feature":"C", "is_weighted": True }, -# ], -# "is_fitted_":False -# } - -# weight_check = lambda learned_weights: np.allclose(learned_weights, [6.0], atol=1e-2) - -# return (data, json_program, weight_check) - -# @pytest.mark.parametrize( -# 'optimization_problem', ['optimize_addition_positive_weights', -# 'optimize_addition_negative_weights', -# 'optimize_subtraction_positive_weights', -# 'optimize_subtraction_negative_weights', -# 'optimize_multiply', -# 'optimize_divide', -# 'optimize_sqrt_outer_weight', -# 'optimize_sqrt_inner_weight', -# 'optimize_sin_outer_weight', -# 'optimize_sin_inner_weight', -# 'optimize_notable_product_weight', -# 'optimize_3ary_prod_inner_weight', -# 'optimize_3ary_prod_outer_weight', -# 'optimize_constant_weight' -# ]) -# def test_optimizer(optimization_problem, request): - -# data, json_program, weight_check = request.getfixturevalue(optimization_problem) - -# print( "initial json: {}\n", json_program) -# prg = _brush.program.Regressor(json_program) -# print( "program:", prg.get_model()) - -# # fit model -# print( "fit") -# prg.fit(data) -# print( "predict") -# y_pred = prg.predict(data) - -# learned_weights = prg.get_weights(); -# print('learned weights:', learned_weights) - -# assert np.sum(np.square(data.y-y_pred)) <= 1e-3 -# assert np.allclose(data.y, y_pred, atol=1e-3) -# assert weight_check(learned_weights) \ No newline at end of file +#!/usr/bin/env python3 + +import pytest +import numpy as np +import pandas as pd +from pmlb import fetch_data +from sklearn.utils import resample + +import _brush +import json + +import traceback +import logging + +@pytest.fixture +def optimize_addition_positive_weights(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Add", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": True } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_addition_negative_weights(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Add", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": True } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_subtraction_positive_weights(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_subtract_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Sub", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": True } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 3.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_subtraction_negative_weights(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Sub", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": True } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, -3.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_multiply(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Mul", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(np.prod(learned_weights), 2.0*3.0, atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_divide(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_divide_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Div", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights[0], 2.0/3.0, atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_sqrt_outer_weight(): + data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Sqrt", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": False } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_sqrt_inner_weight(): + data = _brush.read_csv("docs/examples/datasets/d_2_sqrt_x1.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Sqrt", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [4.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_sin_outer_weight(): + data = _brush.read_csv("docs/examples/datasets/d_2_sin_x1.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Sin", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": False } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0], atol=1e-3) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_sin_inner_weight(): + data = _brush.read_csv("docs/examples/datasets/d_sin_0_25x1.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Sin", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [0.25], atol=1e-2) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_notable_product_weight(): + data = _brush.read_csv("docs/examples/datasets/d_square_x1_plus_2_x1_x2_plus_square_x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Add", "is_weighted": False }, + { "node_type":"Mul", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False }, + { "node_type":"Add", "is_weighted": False }, + { "node_type":"Square", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Square", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False } + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [2.0, 1.0], atol=1e-2) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_3ary_prod_inner_weight(): + data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Prod", "is_weighted": False, + "arg_types" :["ArrayF", "ArrayF", "ArrayF"], + "ret_type" :"ArrayF", + "sig_hash" :5617655905677279916, + "sig_dual_hash":10188582206427064428, + "complete_hash":1786662244046809282 }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, + { "node_type":"Terminal", "feature":"x3", "is_weighted": False} + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_3ary_prod_outer_weight(): + data = _brush.read_csv("docs/examples/datasets/d_5x1_multiply_x2_multiply_x3.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Prod", "is_weighted": True, + "arg_types" :["ArrayF", "ArrayF", "ArrayF"], + "ret_type" :"ArrayF", + "sig_hash" :5617655905677279916, + "sig_dual_hash":10188582206427064428, + "complete_hash":1786662244046809282 }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, + { "node_type":"Terminal", "feature":"x3", "is_weighted": False} + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [5.0], atol=1e-2) + + return (data, json_program, weight_check) + +@pytest.fixture +def optimize_constant_weight(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_multiply_3x2.csv","target") + + json_program = { + "Tree": [ + { "node_type":"Mul", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": False}, + { "node_type":"Mul", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x2", "is_weighted": False}, + { "node_type":"Constant", "feature":"C", "is_weighted": True }, + ], + "is_fitted_":False + } + + weight_check = lambda learned_weights: np.allclose(learned_weights, [6.0], atol=1e-2) + + return (data, json_program, weight_check) + +@pytest.mark.parametrize( + 'optimization_problem', ['optimize_addition_positive_weights', + 'optimize_addition_negative_weights', + 'optimize_subtraction_positive_weights', + 'optimize_subtraction_negative_weights', + 'optimize_multiply', + 'optimize_divide', + 'optimize_sqrt_outer_weight', + 'optimize_sqrt_inner_weight', + 'optimize_sin_outer_weight', + 'optimize_sin_inner_weight', + 'optimize_notable_product_weight', + 'optimize_3ary_prod_inner_weight', + 'optimize_3ary_prod_outer_weight', + 'optimize_constant_weight' + ]) +def test_optimizer(optimization_problem, request): + + data, json_program, weight_check = request.getfixturevalue(optimization_problem) + + print( "initial json: {}\n", json_program) + prg = _brush.program.Regressor(json_program) + print( "program:", prg.get_model()) + + # fit model + print( "fit") + prg.fit(data) + print( "predict") + y_pred = prg.predict(data) + + learned_weights = prg.get_weights(); + print('learned weights:', learned_weights) + + assert np.sum(np.square(data.y-y_pred)) <= 1e-3 + assert np.allclose(data.y, y_pred, atol=1e-3) + assert weight_check(learned_weights) \ No newline at end of file diff --git a/tests/python/test_program.py b/tests/python/test_program.py index 70aa067e..e1933c18 100644 --- a/tests/python/test_program.py +++ b/tests/python/test_program.py @@ -32,77 +32,77 @@ def test_make_program(test_data): prg = SS.make_regressor(d, s) print(f"Tree model for depth {d}, size {s}:", prg.get_model()) -# def test_fit_regressor(test_data): -# test_X, test_y = test_data -# data = _brush.Dataset(test_X, test_y) -# SS = _brush.SearchSpace(data) -# # pytest.set_trace() -# for d in range(1,4): -# for s in range(1,20): -# prg = SS.make_regressor(d, s) -# print(f"Tree model for depth {d}, size {s}:", prg.get_model()) -# # prg.fit(data) -# y = prg.fit(data).predict(data) -# print(y) +def test_fit_regressor(test_data): + test_X, test_y = test_data + data = _brush.Dataset(test_X, test_y) + SS = _brush.SearchSpace(data) + # pytest.set_trace() + for d in range(1,4): + for s in range(1,20): + prg = SS.make_regressor(d, s) + print(f"Tree model for depth {d}, size {s}:", prg.get_model()) + # prg.fit(data) + y = prg.fit(data).predict(data) + print(y) -# def test_fit_classifier(): -# df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') -# data = _brush.Dataset(df.drop(columns='target'), df['target']) -# SS = _brush.SearchSpace(data) -# # pytest.set_trace() -# for d in range(1,4): -# for s in range(1,20): -# prg = SS.make_classifier(d, s) -# print(f"Tree model for depth {d}, size {s}:", prg.get_model()) -# print(f"fitting {prg.get_model()}") -# # prg.fit(data) -# y = prg.fit(data).predict(data) -# print(y) +def test_fit_classifier(): + df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + data = _brush.Dataset(df.drop(columns='target'), df['target']) + SS = _brush.SearchSpace(data) + # pytest.set_trace() + for d in range(1,4): + for s in range(1,20): + prg = SS.make_classifier(d, s) + print(f"Tree model for depth {d}, size {s}:", prg.get_model()) + print(f"fitting {prg.get_model()}") + # prg.fit(data) + y = prg.fit(data).predict(data) + print(y) -# def test_json_regressor(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") -# json_program = { -# "Tree": [ -# { "node_type":"Add", "is_weighted": False }, -# { "node_type":"Terminal", "feature":"x1", "is_weighted": True}, -# { "node_type":"Terminal", "feature":"x2", "is_weighted": True} -# ], -# "is_fitted_":False -# } -# print( "initial json: {}\n", json_program) -# prg = _brush.program.Regressor(json_program) -# print( "program:", prg.get_model()) -# # fit model -# print( "fit") -# prg.fit(data) -# print( "predict") -# y_pred = prg.predict(data) +def test_json_regressor(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") + json_program = { + "Tree": [ + { "node_type":"Add", "is_weighted": False }, + { "node_type":"Terminal", "feature":"x1", "is_weighted": True}, + { "node_type":"Terminal", "feature":"x2", "is_weighted": True} + ], + "is_fitted_":False + } + print( "initial json: {}\n", json_program) + prg = _brush.program.Regressor(json_program) + print( "program:", prg.get_model()) + # fit model + print( "fit") + prg.fit(data) + print( "predict") + y_pred = prg.predict(data) -# learned_weights = prg.get_weights() -# print('learned weights:', learned_weights) + learned_weights = prg.get_weights() + print('learned weights:', learned_weights) -# true_weights = [2.0, 3.0] + true_weights = [2.0, 3.0] -# assert np.sum(np.abs(data.y-y_pred)) <= 1e-4 -# #assert all(round(i,4) == round(j, 4) for i,j in zip(learned_weights, true_weights)) -# np.allclose(learned_weights, true_weights, atol=1e-4) + assert np.sum(np.abs(data.y-y_pred)) <= 1e-4 + #assert all(round(i,4) == round(j, 4) for i,j in zip(learned_weights, true_weights)) + np.allclose(learned_weights, true_weights, atol=1e-4) -# def test_serialization(): -# data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") -# SS = _brush.SearchSpace(data) +def test_serialization(): + data = _brush.read_csv("docs/examples/datasets/d_2x1_plus_3x2.csv","target") + SS = _brush.SearchSpace(data) -# for d in range(1,4): -# for s in range(1, 20): -# prg = SS.make_regressor(d, s) -# prg.fit(data) -# print(f"Initial Model:", prg.get_model()) -# y_pred = prg.predict(data) -# pgr_pickle = pickle.dumps(prg) + for d in range(1,4): + for s in range(1, 20): + prg = SS.make_regressor(d, s) + prg.fit(data) + print(f"Initial Model:", prg.get_model()) + y_pred = prg.predict(data) + pgr_pickle = pickle.dumps(prg) -# new_pgr = pickle.loads(pgr_pickle) -# new_pgr.fit(data) -# print(f"Loaded Model:", new_pgr.get_model()) -# new_y_pred = new_pgr.predict(data) + new_pgr = pickle.loads(pgr_pickle) + #new_pgr.fit(data) + print(f"Loaded Model:", new_pgr.get_model()) + new_y_pred = new_pgr.predict(data) -# assert prg.get_model() == new_pgr.get_model() -# assert np.allclose(new_y_pred, y_pred, atol=1e-3) \ No newline at end of file + assert prg.get_model() == new_pgr.get_model() + assert np.allclose(new_y_pred, y_pred, atol=1e-3, equal_nan=True) \ No newline at end of file From 37c73d0e99a819e333ef7a6e807e8057a23c0544 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 7 Feb 2024 08:38:18 -0300 Subject: [PATCH 108/199] Updated DEAP to use brush's individuals --- src/bindings/bind_cbrush.cpp | 14 -- src/bindings/bind_cbrush.h | 29 --- src/bindings/bind_estimators.cpp | 16 ++ src/bindings/bind_estimators.h | 33 +++ src/bindings/bind_individuals.cpp | 48 ++++- src/bindings/bind_individuals.h | 49 +++-- src/bindings/bind_params.cpp | 12 +- src/bindings/bind_population.cpp | 21 ++ src/bindings/bind_population.h | 20 ++ src/bindings/bind_programs.cpp | 7 - src/bindings/bind_programs.h | 1 - src/bindings/bind_selection.cpp | 21 ++ src/bindings/bind_selection.h | 22 ++ src/bindings/bind_variation.cpp | 1 + src/bindings/bind_variation.h | 3 +- src/bindings/module.cpp | 11 +- src/{cbrush.cpp => estimator.cpp} | 26 +-- src/{cbrush.h => estimator.h} | 30 ++- src/eval/evaluation.cpp | 51 ++++- src/eval/evaluation.h | 4 +- src/eval/fitness.cpp | 0 src/eval/fitness.h | 3 + src/individual.cpp | 71 ++++--- src/individual.h | 288 ++++++++++++++++++++++----- src/params.h | 32 ++- src/population.cpp | 41 ++-- src/population.h | 10 +- src/program/program.h | 7 +- src/selection/nsga2.cpp | 84 +++++--- src/selection/nsga2.h | 14 +- src/selection/selection.cpp | 11 +- src/selection/selection.h | 48 +---- src/selection/selection_operator.cpp | 29 +++ src/selection/selection_operator.h | 45 +++++ src/types.h | 10 +- src/variation.cpp | 24 +-- 36 files changed, 813 insertions(+), 323 deletions(-) delete mode 100644 src/bindings/bind_cbrush.cpp delete mode 100644 src/bindings/bind_cbrush.h create mode 100644 src/bindings/bind_estimators.cpp create mode 100644 src/bindings/bind_estimators.h create mode 100644 src/bindings/bind_population.cpp create mode 100644 src/bindings/bind_population.h create mode 100644 src/bindings/bind_selection.cpp create mode 100644 src/bindings/bind_selection.h rename src/{cbrush.cpp => estimator.cpp} (82%) rename src/{cbrush.h => estimator.h} (89%) create mode 100644 src/eval/fitness.cpp create mode 100644 src/eval/fitness.h create mode 100644 src/selection/selection_operator.cpp create mode 100644 src/selection/selection_operator.h diff --git a/src/bindings/bind_cbrush.cpp b/src/bindings/bind_cbrush.cpp deleted file mode 100644 index 16cd60ab..00000000 --- a/src/bindings/bind_cbrush.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "module.h" -#include "bind_cbrush.h" // TODO: rename it to bind_estimators - -namespace py = pybind11; -namespace br = Brush; -namespace nl = nlohmann; - -void bind_estimators(py::module& m) -{ - bind_estimator(m, "Regressor"); - bind_estimator(m, "Classifier"); - bind_estimator(m, "MultiClassifier"); - bind_estimator(m, "Representer"); -} \ No newline at end of file diff --git a/src/bindings/bind_cbrush.h b/src/bindings/bind_cbrush.h deleted file mode 100644 index 88284852..00000000 --- a/src/bindings/bind_cbrush.h +++ /dev/null @@ -1,29 +0,0 @@ -#include "module.h" -#include "../cbrush.h" - -using Reg = Brush::RegressorEstimator; -using Cls = Brush::ClassifierEstimator; -using Rep = Brush::RepresenterEstimator; -using MCls = Brush::MulticlassClassifierEstimator; - -namespace nl = nlohmann; -namespace br = Brush; - -using stream_redirect = py::call_guard; - -template -void bind_estimator(py::module& m, string name) -{ - // using RetType = std::conditional_t< - // std::is_same_v, ArrayXf, - // std::conditional_t, ArrayXb, - // std::conditional_t, ArrayXi, ArrayXXf>>>; - - // py::class_ ind(m, name.data() ); - // ind.def(py::init<>()) - // ; - // if constexpr (std::is_same_v) - // { - - // } -} \ No newline at end of file diff --git a/src/bindings/bind_estimators.cpp b/src/bindings/bind_estimators.cpp new file mode 100644 index 00000000..cb7cb4af --- /dev/null +++ b/src/bindings/bind_estimators.cpp @@ -0,0 +1,16 @@ +#include "module.h" +#include "bind_estimators.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +void bind_estimators(py::module& m) +{ + bind_estimator(m, "BrushRegressorEstimator"); + bind_estimator(m, "BrushClassifierEstimator"); + + // TODO: make these work + bind_estimator(m, "BrushMultiClassifierEstimator"); + bind_estimator(m, "BrushRepresenterEstimator"); +} \ No newline at end of file diff --git a/src/bindings/bind_estimators.h b/src/bindings/bind_estimators.h new file mode 100644 index 00000000..55dd0661 --- /dev/null +++ b/src/bindings/bind_estimators.h @@ -0,0 +1,33 @@ +#include "module.h" +#include "../estimator.h" + +using Reg = Brush::RegressorEstimator; +using Cls = Brush::ClassifierEstimator; +using Rep = Brush::RepresenterEstimator; +using MCls = Brush::MulticlassClassifierEstimator; + +namespace nl = nlohmann; +namespace br = Brush; + +using stream_redirect = py::call_guard; + +template +void bind_estimator(py::module& m, string name) +{ + using RetType = std::conditional_t< + std::is_same_v, ArrayXf, + std::conditional_t, ArrayXb, + std::conditional_t, ArrayXi, ArrayXXf>>>; + + py::class_ estimator(m, name.data() ); + estimator.def(py::init<>()) + .def_property("pop_size", &T::get_pop_size, &T::set_pop_size) + .def_property("gens", &T::get_gens, &T::set_gens) + ; + + // specialization for subclasses + if constexpr (std::is_same_v) + { + + } +} \ No newline at end of file diff --git a/src/bindings/bind_individuals.cpp b/src/bindings/bind_individuals.cpp index 659259c0..568f8a54 100644 --- a/src/bindings/bind_individuals.cpp +++ b/src/bindings/bind_individuals.cpp @@ -5,10 +5,50 @@ namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; + void bind_individuals(py::module& m) { - bind_individual(m, "Regressor"); - bind_individual(m, "Classifier"); - bind_individual(m, "MultiClassifier"); - bind_individual(m, "Representer"); + // fitness is used to prototype with deap API. TODO: replace deapIndividual with brush individual (once it gets implemented) + py::class_(m, "Fitness", py::dynamic_attr()) + .def(py::init<>()) + .def(py::init&>(), "Constructor with weights") + .def_property("values", &br::Fitness::get_values, &br::Fitness::set_values) + .def_property_readonly("weights", &br::Fitness::get_weights) + .def_property_readonly("wvalues", &br::Fitness::get_wvalues) + .def("dominates", &Fitness::dominates) + .def("clearValues", &Fitness::clearValues, "Clear the weighted values vector") + .def_property("rank", &Fitness::get_rank, &Fitness::set_rank) + .def_property("loss", &Fitness::get_loss, &Fitness::set_loss) + .def_property("loss_v", &Fitness::get_loss_v, &Fitness::set_loss_v) + .def_property("crowding_dist", &Fitness::get_crowding_dist, &Fitness::set_crowding_dist) + + .def("valid", &Fitness::valid, "Check if the fitness is valid") + .def("__hash__", &Fitness::hash, py::is_operator()) + .def("__eq__", &Fitness::operator==, py::is_operator()) + .def("__ne__", &Fitness::operator!=, py::is_operator()) + .def("__lt__", &Fitness::operator<, py::is_operator()) + .def("__gt__", &Fitness::operator>, py::is_operator()) + .def("__le__", &Fitness::operator<=, py::is_operator()) + .def("__ge__", &Fitness::operator>=, py::is_operator()) + // .def("__str__", &Fitness::toString, "String representation of the Fitness object") + // .def("__repr__", &Fitness::repr, "Representation for debugging the Fitness object") + .def(py::pickle( + [](const br::Fitness &f) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = f; + return j; + }, + [](nl::json j) { // __setstate__ + br::Fitness f = j; + return f; + } + ) + ) + ; + + bind_individual(m, "RegressorIndividual"); + bind_individual(m, "ClassifierIndividual"); + bind_individual(m, "MultiClassifierIndividual"); + bind_individual(m, "RepresenterIndividual"); } \ No newline at end of file diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index ce34745c..72e411f4 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -1,30 +1,51 @@ #include "module.h" -#include "../individual.h" -using Reg = Brush::RegressorIndividual; -using Cls = Brush::ClassifierIndividual; -using Rep = Brush::RepresenterIndividual; -using MCls = Brush::MulticlassClassifierIndividual; +#include "../individual.h" namespace nl = nlohmann; namespace br = Brush; using stream_redirect = py::call_guard; -template +// TODO: unify PT or T +template void bind_individual(py::module& m, string name) { - using RetType = std::conditional_t< - std::is_same_v, ArrayXf, - std::conditional_t, ArrayXb, - std::conditional_t, ArrayXi, ArrayXXf>>>; + using Class = br::Pop::Individual; - py::class_ ind(m, name.data() ); + py::class_ ind(m, name.data() ); ind.def(py::init<>()) + .def(py::init([](br::Program& prg){ Class i(prg); + return i; }) + ) + .def(py::init([](const json& j){ br::Program prg = j; + Class i(prg); + return i; }) + ) + .def("init", &Class::init) + .def_property("objectives", &Class::get_objectives, &Class::set_objectives) + .def_property_readonly("program", &Class::get_program) // program cannot be changed by the user. Either create a new instance with the program as argument (so it will be a clone), or call init() (TODO: I should make init reset the attributes in the cpp end to avoid reseting the program but keeping the attributes) + .def_property_readonly("fitness", &Class::get_fitness) // program cannot be changed by the user. Either create a new instance with the program as argument (so it will be a clone), or call init() (TODO: I should make init reset the attributes in the cpp end to avoid reseting the program but keeping the attributes) + // .def_property("fitness", &Class::get_fitness, &Class::set_fitness) + // .def_property("complexity", &Class::get_complexity, &Class::set_complexity) + .def(py::pickle( + [](const Class &p) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = p; + return j; + }, + [](nl::json j) { // __setstate__ + Class p = j; + return p; + } + ) + ) ; - if constexpr (std::is_same_v) - { - } + // if constexpr (std::is_same_v) + // { + + // } } \ No newline at end of file diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index dadbfaf6..2d95f4d7 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -23,8 +23,18 @@ void bind_params(py::module& m) py::class_(m, "Parameters") .def(py::init([]() { Brush::Parameters p; return p; })) - // TODO: define getters and setters, and create the bindings here. Make the Brush bindings use these here .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) + .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) + .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) + .def_property("max_depth", &Brush::Parameters::get_max_depth, &Brush::Parameters::set_max_depth) + .def_property("max_size", &Brush::Parameters::get_max_size, &Brush::Parameters::set_max_size) + .def_property("objectives", &Brush::Parameters::get_objectives, &Brush::Parameters::set_objectives) + .def_property("sel", &Brush::Parameters::get_sel, &Brush::Parameters::set_sel) + .def_property("surv", &Brush::Parameters::get_surv, &Brush::Parameters::set_surv) + .def_property("cx_prob", &Brush::Parameters::get_cx_prob, &Brush::Parameters::set_cx_prob) + .def_property("mig_prob", &Brush::Parameters::get_mig_prob, &Brush::Parameters::set_mig_prob) + .def_property("functions", &Brush::Parameters::get_functions, &Brush::Parameters::set_functions) .def_property("mutation_probs", &Brush::Parameters::get_mutation_probs, &Brush::Parameters::set_mutation_probs) + ; } \ No newline at end of file diff --git a/src/bindings/bind_population.cpp b/src/bindings/bind_population.cpp new file mode 100644 index 00000000..11b012e8 --- /dev/null +++ b/src/bindings/bind_population.cpp @@ -0,0 +1,21 @@ +#include "module.h" +#include "bind_population.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +// using Reg = br::Program; +// using Cls = br::Program; +// using Rep = br::Program; +// using MCls = br::Program; + +void bind_populations(py::module& m) +{ + // TODO: make them a single class + bind_population(m, "RegressorPopulation"); + bind_population(m, "ClassifierPopulation"); + + bind_population(m, "MultiClassifierPopulation"); + bind_population(m, "RepresenterPopulation"); +} \ No newline at end of file diff --git a/src/bindings/bind_population.h b/src/bindings/bind_population.h new file mode 100644 index 00000000..53fbcd6a --- /dev/null +++ b/src/bindings/bind_population.h @@ -0,0 +1,20 @@ +#include "module.h" +#include "../population.h" +#include "../population.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) + +namespace py = pybind11; +namespace nl = nlohmann; +namespace br = Brush; + +template +void bind_population(py::module& m, string name) +{ + using Class = br::Pop::Population; + + // TODO: make population a non-templated class + py::class_ pop(m, name.data() ); + + // TODO: access individuals by index + pop.def(py::init<>()) + ; +} \ No newline at end of file diff --git a/src/bindings/bind_programs.cpp b/src/bindings/bind_programs.cpp index 3d8b1ce6..905b3dfa 100644 --- a/src/bindings/bind_programs.cpp +++ b/src/bindings/bind_programs.cpp @@ -11,13 +11,6 @@ namespace nl = nlohmann; void bind_programs(py::module& m) { - // fitness is used to prototype with deap API. TODO: replace deapIndividual with brush individual (once it gets implemented) - py::class_(m, "Fitness", py::dynamic_attr()) - .def(py::init<>()) - .def_readwrite("values", &br::Fitness::values) - .def_readwrite("valid", &br::Fitness::valid) - ; - bind_program(m, "Regressor"); bind_program(m, "Classifier"); bind_program(m, "MultiClassifier"); diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index 92370401..49ca8ff7 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -24,7 +24,6 @@ void bind_program(py::module& m, string name) .def(py::init( [](const json& j){ T p = j; return p; }) ) - .def_readwrite("fitness", &T::fitness) .def("fit", static_cast(&T::fit), "fit from Dataset object") diff --git a/src/bindings/bind_selection.cpp b/src/bindings/bind_selection.cpp new file mode 100644 index 00000000..e2a74442 --- /dev/null +++ b/src/bindings/bind_selection.cpp @@ -0,0 +1,21 @@ +#include "module.h" +#include "bind_selection.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +// using Reg = br::Program; +// using Cls = br::Program; +// using Rep = br::Program; +// using MCls = br::Program; + +void bind_selections(py::module& m) +{ + // TODO: make them a single class + bind_selection(m, "RegressorSelector"); + bind_selection(m, "ClassifierSelector"); + + bind_selection(m, "MultiClassifierSelector"); + bind_selection(m, "RepresenterSelector"); +} \ No newline at end of file diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h new file mode 100644 index 00000000..1c101fee --- /dev/null +++ b/src/bindings/bind_selection.h @@ -0,0 +1,22 @@ +#include "module.h" +#include "../selection/selection.h" +#include "../selection/selection.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) + +namespace py = pybind11; +namespace nl = nlohmann; +namespace br = Brush; + +template +void bind_selection(py::module& m, string name) +{ + using Class = br::Sel::Selection; + + // TODO: make selection a non-templated class + py::class_ sel(m, name.data() ); + + sel.def(py::init<>()) + .def(py::init( + [](string type, bool survival){ Class s(type, survival); return s; }) + ) + ; +} \ No newline at end of file diff --git a/src/bindings/bind_variation.cpp b/src/bindings/bind_variation.cpp index 0a772c7c..0b66a1b7 100644 --- a/src/bindings/bind_variation.cpp +++ b/src/bindings/bind_variation.cpp @@ -14,6 +14,7 @@ void bind_variations(py::module& m) { bind_variation(m, "RegressorVariator"); bind_variation(m, "ClassifierVariator"); + bind_variation(m, "MultiClassifierVariator"); bind_variation(m, "RepresenterVariator"); } \ No newline at end of file diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index e7cda407..74ab2564 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,6 +1,6 @@ #include "module.h" #include "../variation.h" -#include "../variation.cpp" // TODO: figure out why im having symbol errors +#include "../variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) namespace py = pybind11; namespace nl = nlohmann; @@ -19,5 +19,6 @@ void bind_variation(py::module& m, string name) return variation; })) .def("mutate", &Class::mutate, py::return_value_policy::automatic) .def("cross", &Class::cross, py::return_value_policy::automatic) + // .def("vary", &Class::vary) // apply variation to the population TODO: implement it: wrap a list of individuals into a population, modify it, return as a vector of individuals (so we dont have to expose population to python) ; } \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 21524853..30c8e926 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -20,7 +20,9 @@ void bind_dataset(py::module &); void bind_search_space(py::module &); void bind_programs(py::module &); void bind_variations(py::module &); +void bind_selections(py::module &); void bind_individuals(py::module &); +void bind_populations(py::module &); void bind_estimators(py::module &); PYBIND11_MODULE(_brush, m) { @@ -30,20 +32,23 @@ PYBIND11_MODULE(_brush, m) { #else m.attr("__version__") = "dev"; #endif + // main algorithm // bind_cbrush(m); + // data structures to store solutions bind_params(m); bind_dataset(m); bind_search_space(m); bind_variations(m); + // bind_selections(m); + bind_populations(m); + // solutions py::module_ m2 = m.def_submodule("program", "Contains Program classes."); bind_programs(m2); py::module_ m3 = m.def_submodule("individual", "Contains Individual classes."); bind_individuals(m3); - py::module_ m4 = m.def_submodule("Estimator", "Contains Estimator classes."); - bind_estimators(m4); - + // bind_estimators(m); } diff --git a/src/cbrush.cpp b/src/estimator.cpp similarity index 82% rename from src/cbrush.cpp rename to src/estimator.cpp index 539b49e1..c542665e 100644 --- a/src/cbrush.cpp +++ b/src/estimator.cpp @@ -1,4 +1,5 @@ -#include "cbrush.h" +#include "estimator.h" + #include @@ -6,7 +7,7 @@ namespace Brush{ /// @brief initialize Feat object for fitting. template -void CBrush::init() +void Estimator::init() { if (params.n_jobs!=0) // TODO: change this to set taskflow jobs omp_set_num_threads(params.n_jobs); @@ -34,7 +35,7 @@ void CBrush::init() } template -bool CBrush::update_best(const Dataset& data, bool val) +bool Estimator::update_best(const Dataset& data, bool val) { float bs; bs = this->best_loss; @@ -73,7 +74,7 @@ bool CBrush::update_best(const Dataset& data, bool val) template -void CBrush::run_generation(unsigned int g, Dataset &data) +void Estimator::run_generation(unsigned int g, Dataset &data) { // https://taskflow.github.io/taskflow/ParallelIterations.html tf::Executor executor; @@ -85,17 +86,17 @@ void CBrush::run_generation(unsigned int g, Dataset &data) auto batch = data.get_batch(); // will return the original dataset if it is set to dont use batch vector> island_parents; - island_parents.resize(pop.n_islands); - taskflow.for_each_index(0, pop.n_islands, 1, [&](int island) { + island_parents.resize(pop.num_islands); + taskflow.for_each_index(0, pop.num_islands, 1, [&](int island) { tuple island_range = pop.get_island_range(island); // fit the weights with all training data - evaluator.fitness(pop, island_range, data, params, true, false); + evaluator.update_fitness(pop, island_range, data, params, true, false); evaluator.validation(pop, island_range, data, params, false); // TODO: if using batch, fitness should be called before selection to set the batch if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.fitness(pop, island_range, batch, params, false, false); + evaluator.update_fitness(pop, island_range, batch, params, false, false); // select parents vector parents = selector.select(pop, island_range, params, data); @@ -105,17 +106,17 @@ void CBrush::run_generation(unsigned int g, Dataset &data) vector survivors(pop.size()); pop.add_offspring_indexes(); - taskflow.for_each_index(0, pop.n_islands, 1, [&](int island) { + taskflow.for_each_index(0, pop.num_islands, 1, [&](int island) { tuple island_range = pop.get_island_range(island); // // variation to produce offspring variator.vary(pop, island_range, island_parents.at(island)); - evaluator.fitness(pop, island_range, data, params, true, true); + evaluator.update_fitness(pop, island_range, data, params, true, true); evaluator.validation(pop, island_range, data, params, true); if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.fitness(pop, island_range, batch, params, false, true); + evaluator.update_fitness(pop, island_range, batch, params, false, true); // select survivors from combined pool of parents and offspring auto island_survivors = survivor.survive(pop, island_range, params, data); @@ -135,13 +136,14 @@ void CBrush::run_generation(unsigned int g, Dataset &data) } template -void CBrush::fit(MatrixXf& X, VectorXf& y) +void Estimator::fit(MatrixXf& X, VectorXf& y) { this->init(); // TODO: fit method that takes different arguments? Dataset data(X, y); + //TODO: i need to make sure i initialize everything (pybind needs to have constructors without arguments to work, and i need to handle correcting these values before running) this->ss = SearchSpace(data, params.functions); this->pop = Population(params.pop_size, params.num_islands); this->evaluator = Evaluation(params.scorer_); diff --git a/src/cbrush.h b/src/estimator.h similarity index 89% rename from src/cbrush.h rename to src/estimator.h index c8abf667..3f3e9550 100644 --- a/src/cbrush.h +++ b/src/estimator.h @@ -3,41 +3,36 @@ copyright 2020 William La Cava license: GNU/GPL v3 */ -#ifndef CBrush_H -#define CBrush_H +#ifndef Estimator_H +#define Estimator_H +#include "./util/rnd.h" #include "init.h" -#include "population.h" #include "params.h" +#include "population.h" #include "./eval/evaluation.h" +#include "variation.h" #include "selection/selection.h" -#include "./util/rnd.h" #include "taskflow/taskflow.hpp" -// TODO: improve the includes (why does this lines below does not work?) -// #include "variation.h" -// #include "selection.h" - - namespace Brush { using namespace Pop; using namespace Sel; using namespace Eval; - -// using namespace variation; +using namespace Var; template -class CBrush{ // TODO: rename it to BrushEstimator +class Estimator{ public: - CBrush() + Estimator() : params(Parameters()) , ss(SearchSpace()) , variator(Variation(params, ss)) {}; - ~CBrush(){}; + ~Estimator(){}; void init(); //getters and setters for GA configuration --------------------------------- @@ -68,7 +63,7 @@ class CBrush{ // TODO: rename it to BrushEstimator void set_survival(string surv){ params.surv = surv; }; string get_survival(){ return params.surv; }; - void set_num_islands(int n_islands){ params.num_islands = n_islands; }; + void set_num_islands(int num_islands){ params.num_islands = num_islands; }; int get_num_islands(){ return params.num_islands; }; void set_objectives(const vector& obj){params.objectives = obj; }; @@ -86,9 +81,10 @@ class CBrush{ // TODO: rename it to BrushEstimator void set_cross_prob(float cross_prob){ params.cx_prob = cross_prob;}; float get_cross_prob(){ return params.cx_prob; }; + // TODO: MAKE functions work // sets available functions based on comma-separated list. - void set_functions(const vector& fns){ params.functions = fns; }; - unordered_map get_functions(){ return params.functions; }; + // void set_functions(const vector& fns){ params.functions = fns; }; + // unordered_map get_functions(){ return params.functions; }; void set_mutation_probs(std::map mutation_probs){ params.mutation_probs = mutation_probs;}; std::map get_mutation_probs(){ return params.mutation_probs; }; diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index ebed9669..e1fd0355 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -23,10 +23,10 @@ void Evaluation::validation(Population& pop, Individual& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work // if there is no validation data, - // set fitness_v to fitness and return ( this assumes that fitness on train was calculated previously.) + // set loss_v to loss and return ( this assumes that loss on train was calculated previously.) if (!data.use_validation) { - ind.fitness_v = ind.fitness; + ind.loss_v = ind.loss; continue; } @@ -35,21 +35,21 @@ void Evaluation::validation(Population& pop, if (!pass) { // TODO: stop doing this hardcoded? - ind.fitness_v = MAX_FLT; + ind.loss_v = MAX_FLT; } else { - // TODO: implement the class weights and use it here (and on fitness) + // TODO: implement the class weights and use it here (and on loss) VectorXf y_pred = ind.program.predict(data.get_validation_data()); assign_fit(ind, y_pred, data, params, true); } - ind.set_obj(params.objectives); + // ind.set_obj(params.objectives); } } // fitness of population template -void Evaluation::fitness(Population& pop, +void Evaluation::update_fitness(Population& pop, int island, const Dataset& data, const Parameters& params, @@ -57,6 +57,7 @@ void Evaluation::fitness(Population& pop, bool offspring ) { + //TODO: it could use the validation_loss auto idxs = pop.get_island_indexes(island); int start = 0; @@ -71,7 +72,8 @@ void Evaluation::fitness(Population& pop, if (!pass) { - ind.fitness = MAX_FLT; + // TODO: check if score was nan and assign the max float + ind.fitness.loss = MAX_FLT; ind.error = MAX_FLT*VectorXf::Ones(data.y.size()); } else @@ -86,7 +88,7 @@ void Evaluation::fitness(Population& pop, } } -// assign fitness to program +// assign loss to program template void Evaluation::assign_fit(Individual& ind, VectorXf& y_pred, const Dataset& data, @@ -97,14 +99,41 @@ void Evaluation::assign_fit(Individual& ind, float f = S.score(data.y, y_pred, loss, params.class_weights); if (val) - { - ind.fitness_v = f; + { // TODO: use this function to decide wether to take loss from validation or training + ind.fitness.loss_v = f; } else { - ind.fitness = f; + // TODO: setter for loss and loss_v + ind.fitness.loss = f; ind.error = loss; } + ind.fitness.size = ind.program.size(); + ind.fitness.complexity = ind.program.complexity(); + ind.fitness.depth = ind.program.depth(); + + ind.set_objectives(params.objectives); + + vector values; + values.resize(0); + + for (const auto& n : ind.get_objectives()) + { + if (n.compare("error")==0) + values.push_back(f); // fitness on training data, not validation. + // if you use batch, this value will change every generation + else if (n.compare("complexity")==0) + values.push_back(ind.program.complexity()); + else if (n.compare("size")==0) + values.push_back(ind.program.size()); + else if (n.compare("depth")==0) + values.push_back(ind.program.depth()); + else + HANDLE_ERROR_THROW(n+" is not a known objective"); + } + + // will use inner attributes to set the fitness object + ind.fitness.set_values(values); } } // Pop diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 743deca7..c421db52 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -35,10 +35,12 @@ class Evaluation { bool offspring = false ); + // TODO: set objectives + // TODO: evaluation bind // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps) // TODO: MAKE it work for classification (do I need to have a way to set accuracy as a minimization problem?) /// fitness of population. - void fitness(Population& pop, + void update_fitness(Population& pop, int island, const Dataset& data, const Parameters& params, diff --git a/src/eval/fitness.cpp b/src/eval/fitness.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/eval/fitness.h b/src/eval/fitness.h new file mode 100644 index 00000000..c7c35660 --- /dev/null +++ b/src/eval/fitness.h @@ -0,0 +1,3 @@ +// Minimizing/maximizing problem: negative/positive weight, respectively. + +// TODO: move fitness here \ No newline at end of file diff --git a/src/individual.cpp b/src/individual.cpp index 39a94bb4..b03e9186 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -1,18 +1,53 @@ #include "individual.h" -namespace Brush{ -namespace Pop{ - -template -int Individual::check_dominance(const Individual& b) const +namespace Brush{ + +void to_json(json &j, const Fitness &f) +{ + j = json{ + {"values", f.values}, + {"weights", f.weights}, + {"wvalues", f.wvalues}, + {"loss", f.loss}, + {"loss_v", f.loss_v}, + {"complexity", f.complexity}, + {"size", f.size}, + {"depth", f.depth}, + {"dcounter", f.dcounter}, + {"dominated", f.dominated}, + {"rank", f.rank}, + {"crowding_dist", f.crowding_dist} + }; +} + +void from_json(const json &j, Fitness& f) +{ + j.at("values").get_to( f.values ); + j.at("weights").get_to( f.weights ); + j.at("wvalues").get_to( f.wvalues ); + j.at("loss").get_to( f.loss ); + j.at("loss_v").get_to( f.loss_v ); + j.at("complexity").get_to( f.complexity ); + j.at("size").get_to( f.size ); + j.at("depth").get_to( f.depth ); + j.at("dcounter").get_to( f.dcounter ); + j.at("dominated").get_to( f.dominated ); + j.at("rank").get_to( f.rank ); + j.at("crowding_dist").get_to( f.crowding_dist ); +} + + +int Fitness::dominates(const Fitness& b) const { int flag1 = 0, // to check if this has a smaller objective flag2 = 0; // to check if b has a smaller objective - for (int i=0; i b.obj.at(i)) + else if (get_wvalues().at(i) > b.get_wvalues().at(i)) flag2 = 1; } @@ -29,25 +64,9 @@ int Individual::check_dominance(const Individual& b) const return 0; } -template -void Individual::set_obj(const vector& objectives) -{ - obj.clear(); - - for (const auto& n : objectives) - { - if (n.compare("fitness")==0) - obj.push_back(fitness); // fitness on training data, not validation. - // if you use batch, this value will change every generation - else if (n.compare("complexity")==0) - obj.push_back(set_complexity()); - else if (n.compare("size")==0) - obj.push_back(program.size()); - else - HANDLE_ERROR_THROW(n+" is not a known objective"); - } -} +namespace Pop{ + } // Pop } // Brush \ No newline at end of file diff --git a/src/individual.h b/src/individual.h index f13d8aef..08a102a9 100644 --- a/src/individual.h +++ b/src/individual.h @@ -1,10 +1,177 @@ #ifndef INDIVIDUAL_H #define INDIVIDUAL_H +// #include "search_space.h" #include "program/program.h" -#include "search_space.h" + +#include + +using namespace nlohmann; + +template <> // this is intended to be used with DEAP. TODO: decide if im going to keep it +struct std::hash> { + std::size_t operator()(const std::vector& v) const { + std::size_t seed = v.size(); + for (const auto& elem : v) { + seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; namespace Brush{ + + +// TODO: separate declaration from implementation +// TODO: move fitness to eval folder +// TODO make a better use of this (in selection, when fitting, etc) (actually i need to start using it) +struct Fitness { + // Static map for weights associated with strings + // TODO: weights for different values. loss should be calculated duing runtime, based on the metric + inline static std::map weightsMap = { + {"error", -1.0}, // error should be the common error metrics for class (acc) and regression (mse) by default + {"complexity", -1.0}, + {"size", -1.0} + // Add more key-value pairs as needed + }; + + + float loss; ///< aggregate loss score + float loss_v; ///< aggregate validation loss score + + // TODO: maybe this should be all part of fitness, and individual should have only the fitness, program, and error (and objectives) + size_t complexity; + size_t size; + size_t depth; + unsigned int dcounter; ///< number of individuals this dominates + vector dominated; ///< individual indices this dominates + + unsigned int rank; ///< pareto front rank + float crowding_dist; ///< crowding distance on the Pareto front + + void set_loss(float f){ loss=f; }; + float get_loss() const { return loss; }; + + void set_loss_v(float f_v){ loss_v=f_v; }; + float get_loss_v() const { return loss_v; }; + + void set_rank(unsigned r){ rank=r; }; + size_t get_rank() const { return rank; }; + + void set_crowding_dist(float cd){ crowding_dist=cd; }; + float get_crowding_dist() const { return crowding_dist; }; + + vector values; + vector weights; + + // TODO: fitness could have a function size() + + // weighted values + vector wvalues; + + // Constructor with initializer list for weights + Fitness(const vector& w={}) : values(), wvalues(), weights(w) { } + + // Hash function + size_t hash() const { + std::size_t h = std::hash>{}(wvalues); + return h; + } + + vector get_weights() const { + return weights; + } + vector get_values() const { + return values; + } + vector get_wvalues() const { + return wvalues; + } + + // Method to set values + void set_values(vector& v) { + if (v.size() != weights.size()) { + throw std::length_error("Assigned values have not the same length than current values"); + } + // fmt::print("updated values\n"); + + values.resize(0); + for (const auto& element : v) { + values.push_back(element); + } + + wvalues.resize(weights.size()); + + // Perform element-wise multiplication + std::transform(v.begin(), v.end(), + weights.begin(), wvalues.begin(), + [](double a, double b) { + return a * b; + }); + } + + // Method to clear values + void clearValues() { + wvalues.clear(); + } + + bool valid() const { + return !wvalues.empty(); + } + + // Equality comparison + bool operator==(const Fitness& other) const { + return wvalues == other.wvalues; + } + + // Inequality comparison + bool operator!=(const Fitness& other) const { + return !(*this == other); + } + + // Less than comparison + bool operator<(const Fitness& other) const { + return std::lexicographical_compare(wvalues.begin(), wvalues.end(), + other.wvalues.begin(), other.wvalues.end()); + } + + // Greater than comparison + bool operator>(const Fitness& other) const { + return other < *this; + } + + // Less than or equal to comparison + bool operator<=(const Fitness& other) const { + return !(other < *this); + } + + // Greater than or equal to comparison + bool operator>=(const Fitness& other) const { + return !(*this < other); + } + + // String representation + std::string toString() const { + if (valid()) { + return "TODO: implement string representation"; //std::to_string(wvalues); + } else { + return "Tuple()"; + } + } + + // Representation for debugging + std::string repr() const { + return "Fitness(TODO: implement string representation)"; + } + + + /// set obj vector given a string of objective names + int dominates(const Fitness& b) const; +}; + +void to_json(json &j, const Fitness &f); +void from_json(const json &j, Fitness& f); + namespace Pop{ template @@ -16,31 +183,35 @@ class Individual{ VectorXf error; ///< training error (used in lexicase selectors) - float fitness; ///< aggregate fitness score - float fitness_v; ///< aggregate validation fitness score + Fitness fitness; ///< aggregate fitness score - size_t complexity; - unsigned int dcounter; ///< number of individuals this dominates - vector dominated; ///< individual indices this dominates - - unsigned int rank; ///< pareto front rank - float crowd_dist; ///< crowding distance on the Pareto front - vector obj; ///< objectives for use with Pareto selection + vector objectives; ///< objectives for use with Pareto selection Individual() { - fitness = -1; - fitness_v = -1; + // TODO: default value for fitness + // the fitness is used in evolutionary functions + // fitness = -1; - complexity=-1; + // loss is the aggregation of error vector, and can be user sppecified + // loss = -1; + // loss_v = -1; + + // complexity=-1; + // size=-1; + // depth=-1; - dcounter=-1; - rank=-1; - crowd_dist = -1; + // dcounter=-1; + // rank=-1; + // crowding_dist = -1; + + objectives = {"error", "complexity"}; }; Individual(Program& prg) : Individual() { program = prg; }; + // TODO: clone? maybe a constructor that takes another individual as arg and copies everything + void init(SearchSpace& ss, const Parameters& params) { program = ss.make_program>(params, 0, 0); @@ -50,71 +221,90 @@ class Individual{ // program = SS.make_program(params, params.max_depth, params.max_size); }; - // fitness, objetives, complexity, etc + // fitness, objetives, complexity, etc. TODO: create intermediate functions to interact with fitness and program? void fit(Dataset& data) { program.fit(data); }; auto predict(Dataset& data) { return program.predict(data); }; // TODO: predict proba and classification related methods. // TODO: This class should also have its own cpp wrapper. Update it into the deap api (the idea is that the user is still able to prototype with brush, I dont think we should disable that feature) - // just getters - string get_model() { return program.get_model(); }; - size_t get_size() { return program.size(); }; - size_t get_depth() { return program.depth(); }; + // just getters (TODO: use the attributes ) + string get_model() const { return program.get_model(); }; + size_t get_size() const { return program.size(); }; + size_t get_depth() const { return program.depth(); }; + size_t get_complexity() const { return program.complexity(); }; Program& get_program() { return program; }; - // setters and getters - size_t set_complexity() { - complexity = program.complexity(); - return complexity; - }; // sets and returns it - size_t get_complexity() const { return complexity; }; + void set_fitness(Fitness &f) { fitness=f; }; + Fitness& get_fitness() { return fitness; }; // TODO: USE setters and getters intead of accessing it directly - void set_fitness(float f){ fitness=f; }; - float get_fitness() const { return fitness; }; + // template + // void Individual::set_objectives(const vector& objectives) - void set_fitness_v(float f_v){ fitness_v=f_v; }; - float get_fitness_v() const { return fitness_v; }; + // TODO: fix to use these with fitness instead of with individual + unsigned int dcounter; ///< number of individuals this dominates + vector dominated; ///< individual indices this dominates + + unsigned int rank; ///< pareto front rank + float crowding_dist; ///< crowding distance on the Pareto front - void set_rank(unsigned r){ rank=r; }; - size_t get_rank() const { return rank; }; - void set_crowd_dist(unsigned cd){ crowd_dist=cd; }; - size_t get_crowd_dist() const { return crowd_dist; }; + vector get_objectives() const { return objectives; }; + void set_objectives(vector objs){ + objectives=objs; + + vector weights; + weights.resize(0); + for (const auto& obj : objectives) { + auto it = Fitness::weightsMap.find(obj); + if (it != Fitness::weightsMap.end()) { + weights.push_back(it->second); + } else { + // TODO: throw error here, unknown objective + std::cout << obj << " not found in the weight map." << std::endl; + } + } - /// set obj vector given a string of objective names - void set_obj(const vector&); - int check_dominance(const Individual& b) const; + fitness = Fitness(weights); + }; }; +// TODO: rename (something better (more meaningful) than p) // serialization for Individual template void to_json(json &j, const Individual &p) { j = json{ + // TODO: jsonify fitness struct, and new possible obj functions {"program", p.program}, {"fitness", p.fitness}, - {"fitness_v", p.fitness_v}, - {"complexity", p.complexity}, - {"rank", p.rank}, - {"crowd_dist", p.crowd_dist} + // {"loss", p.loss}, + // {"loss_v", p.loss_v}, + // {"complexity", p.complexity}, + // {"size", p.size}, + // {"depth", p.depth}, + // {"rank", p.rank}, + // {"crowding_dist", p.crowding_dist}, + {"objectives", p.objectives} }; } template void from_json(const json &j, Individual& p) -{ +{// TODO: figure out if this works with private attributes and try to actually make them private (and use getters and setters) j.at("program").get_to( p.program ); j.at("fitness").get_to( p.fitness ); - j.at("fitness_v").get_to( p.fitness_v ); - j.at("complexity").get_to( p.complexity ); - j.at("rank").get_to( p.rank ); - j.at("crowd_dist").get_to( p.crowd_dist ); + // j.at("loss").get_to( p.loss ); + // j.at("loss_v").get_to( p.loss_v ); + // j.at("complexity").get_to( p.complexity ); + // j.at("size").get_to( p.size ); + // j.at("depth").get_to( p.depth ); + // j.at("rank").get_to( p.rank ); + // j.at("crowding_dist").get_to( p.crowding_dist ); + j.at("objectives").get_to( p.objectives ); } - - } // Pop } // Brush diff --git a/src/params.h b/src/params.h index 688c221b..4ba637ed 100644 --- a/src/params.h +++ b/src/params.h @@ -31,7 +31,7 @@ struct Parameters int gens = 1000; unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size unsigned int max_size = 50; - vector objectives{"fitness","complexity"}; // error should be generic and deducted based on mode + vector objectives{"error","complexity"}; // error should be generic and deducted based on mode string sel = "nsga2"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; @@ -73,9 +73,39 @@ struct Parameters void set_pop_size(int new_pop_size){ pop_size = new_pop_size; }; int get_pop_size(){ return pop_size; }; + void set_gens(int new_gens){ gens = new_gens; }; + int get_gens(){ return gens; }; + void set_num_islands(int new_num_islands){ num_islands = new_num_islands; }; + int get_num_islands(){ return num_islands; }; + + void set_max_depth(unsigned new_max_depth){ max_depth = new_max_depth; }; + unsigned get_max_depth(){ return max_depth; }; + + void set_max_size(unsigned new_max_size){ max_size = new_max_size; }; + unsigned get_max_size(){ return max_size; }; + + void set_objectives(vector new_objectives){ objectives = new_objectives; }; + vector get_objectives(){ return objectives; }; + + void set_sel(string new_sel){ sel = new_sel; }; + string get_sel(){ return sel; }; + + void set_surv(string new_surv){ surv = new_surv; }; + string get_surv(){ return surv; }; + + void set_cx_prob(float new_cx_prob){ cx_prob = new_cx_prob; }; + float get_cx_prob(){ return cx_prob; }; + + void set_mig_prob(float new_mig_prob){ mig_prob = new_mig_prob; }; + float get_mig_prob(){ return mig_prob; }; + + //TODO: unify unordered or ordered void set_mutation_probs(std::map new_mutation_probs){ mutation_probs = new_mutation_probs; }; std::map get_mutation_probs(){ return mutation_probs; }; + + void set_functions(std::unordered_map new_functions){ functions = new_functions; }; + std::unordered_map get_functions(){ return functions; }; }; // Global (deprecated) params diff --git a/src/population.cpp b/src/population.cpp index a9e25903..72a1c9ec 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -9,7 +9,7 @@ Population::Population() individuals.resize(0); mig_prob = 0.0; pop_size = 0; - n_islands = 0; + num_islands = 0; } template @@ -17,18 +17,18 @@ void Population::init(SearchSpace& ss, const Parameters& params) { this->mig_prob = params.mig_prob; this->pop_size = params.pop_size; - this->n_islands=params.num_islands; + this->num_islands=params.num_islands; // Tuples with start and end indexes for each island. Number of individuals - // in each island can slightly differ if N_ISLANDS is not a divisor of p (popsize) - island_indexes.resize(n_islands); + // in each island can slightly differ if num_islands is not a divisor of p (popsize) + island_indexes.resize(num_islands); size_t p = pop_size; // population size - for (int i=0; i::init(SearchSpace& ss, const Parameters& params) { individuals.at(i) = std::make_shared>(); individuals.at(i)->init(ss, params); + individuals.at(i)->set_objectives(params.objectives); } } @@ -60,8 +61,8 @@ void Population::add_offspring_indexes(int island) size_t p = pop_size; // population size. prep_offspring slots will douple the population, adding the new expressions into the islands // this is going to be tricky (pay attention to delta and p use) - size_t idx_start = std::floor(island*p/n_islands); - size_t idx_end = std::floor((island+1)*p/n_islands); + size_t idx_start = std::floor(island*p/num_islands); + size_t idx_end = std::floor((island+1)*p/num_islands); auto delta = idx_end - idx_start; // island size @@ -83,20 +84,20 @@ void Population::update(vector> survivors) vector>> new_pop; new_pop.resize(2*pop_size); size_t i=0; - for (int j=0; jset_complexity(); + // new_pop.at(i)->set_complexity(); ++i; // this will fill just half of the pop } // need to make island point to original range - size_t idx_start = std::floor(j*pop_size/n_islands); - size_t idx_end = std::floor((j+1)*pop_size/n_islands); + size_t idx_start = std::floor(j*pop_size/num_islands); + size_t idx_end = std::floor((j+1)*pop_size/num_islands); auto delta = idx_end - idx_start; @@ -113,7 +114,7 @@ string Population::print_models(bool just_offspring, string sep) // not printing the island each individual belongs to string output = ""; - for (int j=0; j> Population::sorted_front(unsigned rank, bool ignore_of /* Returns individuals on the Pareto front, sorted by increasign complexity. */ vector> pf_islands; - pf_islands.resize(n_islands); + pf_islands.resize(num_islands); - for (int j=0;j pf; @@ -188,13 +189,13 @@ vector Population::hall_of_fame(unsigned rank, bool ignore_offspring) return pf; } - +// TODO: check why im getting core dump in migrate or NSGA2 template void Population::migrate() { // changes where island points to - if (n_islands==1) + if (num_islands==1) return; // we cant use more than half of population here @@ -202,7 +203,7 @@ void Population::migrate() auto global_hall_of_fame = hall_of_fame(1, true); // This is not thread safe (as it is now) - for (int island=0; island::migrate() } else { // from any other local hall of fame // finding other island indexes - vector other_islands(n_islands-1); + vector other_islands(num_islands-1); iota(other_islands.begin(), other_islands.end(), 0); // skipping current island diff --git a/src/population.h b/src/population.h index 4604bcab..77bd8dcc 100644 --- a/src/population.h +++ b/src/population.h @@ -1,10 +1,8 @@ #ifndef POPULATION_H #define POPULATION_H -#include "search_space.h" -#include "individual.h" -#include "program/program.h" #include "util/error.h" +#include "individual.h" using std::vector; using std::string; @@ -17,7 +15,7 @@ template class Population{ public: size_t pop_size; - unsigned int n_islands; + unsigned int num_islands; float mig_prob; vector>> individuals; @@ -27,7 +25,7 @@ class Population{ // - prepare offspring and update are not thread safe because we insert/delete elements from the array. vector> island_indexes; - // TODO: taskflow needs to use n_islands as n_jobs + // TODO: taskflow needs to use num_islands as n_jobs Population(); ~Population(){}; @@ -42,7 +40,7 @@ class Population{ vector get_island_indexes(int island){ return island_indexes.at(island); }; - /// update individual vector size, distributing the expressions in n_islands + /// update individual vector size, distributing the expressions in num_islands void add_offspring_indexes(int island); /// reduce programs to the indices in survivors. Not thread safe,as it removes elements diff --git a/src/program/program.h b/src/program/program.h index 44629aa4..edba219d 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -37,10 +37,6 @@ namespace Brush { typedef tree::pre_order_iterator Iter; typedef tree::post_order_iterator PostIter; -struct Fitness { - vector values; - bool valid; -}; using PT = ProgramType; // for unsupervised learning, classification and regression. @@ -68,8 +64,9 @@ template struct Program /// whether fit has been called bool is_fitted_; + /// fitness - Fitness fitness; + // Fitness fitness; /// the underlying tree tree Tree; diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 19ba0b99..f4b619f2 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -22,15 +22,15 @@ size_t NSGA2::tournament(Population& pop, size_t i, size_t j) const const Individual& ind1 = pop[i]; const Individual& ind2 = pop[j]; - int flag = ind1.check_dominance(ind2); + int flag = ind1.fitness.dominates(ind2.fitness); if (flag == 1) // ind1 dominates ind2 return i; else if (flag == -1) // ind2 dominates ind1 return j; - else if (ind1.crowd_dist > ind2.crowd_dist) + else if (ind1.crowding_dist > ind2.crowding_dist) return i; - else if (ind2.crowd_dist > ind1.crowd_dist) + else if (ind2.crowding_dist > ind1.crowding_dist) return j; else return i; @@ -67,13 +67,16 @@ vector NSGA2::survive(Population& pop, int island, const Parameters& params, const Dataset& d) { - size_t idx_start = std::floor(island*pop.size()/pop.n_islands); - size_t idx_end = std::floor((island+1)*pop.size()/pop.n_islands); + // fmt::print("starting\n"); + size_t idx_start = std::floor(island*pop.size()/pop.num_islands); + size_t idx_end = std::floor((island+1)*pop.size()/pop.num_islands); - auto original_size = idx_end - idx_start; // island size + int original_size = (idx_end - idx_start)/2; // original island size (survive must be called with an island with offfspring) auto island_pool = pop.get_island_indexes(island); + // fmt::print("indexes {} {}\n", idx_start, idx_end); + // set objectives (this is when the obj vector is updated.) // for loop below (originally performed in selection in FEAT) was moved to evaluation --- multiple islands may have the same individual @@ -81,30 +84,53 @@ vector NSGA2::survive(Population& pop, int island, // pop.individuals.at(island_pool[i])->set_obj(params.objectives); // fast non-dominated sort + // fmt::print("fast nds\n"); auto front = fast_nds(pop, island_pool); + // fmt::print("selecting...\n"); // Push back selected individuals until full - vector selected(0); + vector selected; + // fmt::print("created array...\n"); + selected.resize(0); + // fmt::print("resized...\n"); + int i = 0; - while ( selected.size() + front.at(i).size() < original_size ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) + + // fmt::print("starting loop...\n"); + // fmt::print("{}...\n",selected.size()); + // fmt::print("{}...\n", front.at(i).size()); + // fmt::print("{}...\n", original_size); + + while ( + i < front.size() + && ( selected.size() + front.at(i).size() < original_size ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) + ) { + // fmt::print("1...\n"); std::vector& Fi = front.at(i); // indices in front i + + // fmt::print("2...\n"); crowding_distance(pop, front, i); // calculate crowding in Fi - + + // fmt::print("3...\n"); for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi selected.push_back(Fi.at(j)); + // fmt::print("4...\n"); + ++i; } + // fmt::print("crowding distance\n"); crowding_distance(pop, front, i); // calculate crowding in final front to include std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); - fmt::print("adding last front)\n"); + // fmt::print("adding last front)\n"); const int extra = original_size - selected.size(); for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] selected.push_back(front.at(i).at(j)); + // fmt::print("returning\n"); return selected; } @@ -129,7 +155,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan const Individual& q = pop[island_pool[j]]; - int compare = p->check_dominance(q); + int compare = p->fitness.dominates(q.fitness); if (compare == 1) { // p dominates q //p.dominated.push_back(j); dom.push_back(island_pool[j]); @@ -141,12 +167,12 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan #pragma omp critical { - p->dcounter = dcount; - p->dominated.clear(); - p->dominated = dom; // dom will have values already referring to island indexes + p->fitness.dcounter = dcount; + p->fitness.dominated.clear(); + p->fitness.dominated = dom; // dom will have values already referring to island indexes if (p->dcounter == 0) { - p->set_rank(1); + p->fitness.set_rank(1); // front will have values already referring to island indexes front.at(0).push_back(island_pool[i]); } @@ -168,14 +194,14 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan const Individual& p = pop[fronti.at(i)]; // iterating over dominated individuals - for (int j = 0; j < p.dominated.size() ; ++j) { + for (int j = 0; j < p.fitness.dominated.size() ; ++j) { - auto q = pop.individuals.at(p.dominated.at(j)); - q->dcounter -= 1; + auto q = pop.individuals.at(p.fitness.dominated.at(j)); + q->fitness.dcounter -= 1; - if (q->dcounter == 0) { - q->set_rank(fi+1); - Q.push_back(p.dominated.at(j)); + if (q->fitness.dcounter == 0) { + q->fitness.set_rank(fi+1); + Q.push_back(p.fitness.dominated.at(j)); } } } @@ -196,26 +222,26 @@ void NSGA2::crowding_distance(Population& pop, vector>& front, const int fsize = F.size(); for (int i = 0; i < fsize; ++i) - pop.individuals.at(F.at(i))->crowd_dist = 0; + pop.individuals.at(F.at(i))->fitness.crowding_dist = 0; - const int limit = pop.individuals.at(0)->obj.size(); + const int limit = pop.individuals.at(0)->fitness.get_wvalues().size(); for (int m = 0; m < limit; ++m) { std::sort(F.begin(), F.end(), comparator_obj(pop,m)); // in the paper dist=INF for the first and last, in the code // this is only done to the first one or to the two first when size=2 - pop.individuals.at(F.at(0))->crowd_dist = std::numeric_limits::max(); + pop.individuals.at(F.at(0))->fitness.crowding_dist = std::numeric_limits::max(); if (fsize > 1) - pop.individuals.at(F.at(fsize-1))->crowd_dist = std::numeric_limits::max(); + pop.individuals.at(F.at(fsize-1))->fitness.crowding_dist = std::numeric_limits::max(); for (int i = 1; i < fsize-1; ++i) { - if (pop.individuals.at(F.at(i))->crowd_dist != std::numeric_limits::max()) + if (pop.individuals.at(F.at(i))->fitness.crowding_dist != std::numeric_limits::max()) { // crowd over obj - pop.individuals.at(F.at(i))->crowd_dist += - (pop.individuals.at(F.at(i+1))->obj.at(m) - pop.individuals.at(F.at(i-1))->obj.at(m)) - / (pop.individuals.at(F.at(fsize-1))->obj.at(m) - pop.individuals.at(F.at(0))->obj.at(m)); + pop.individuals.at(F.at(i))->fitness.crowding_dist += + (pop.individuals.at(F.at(i+1))->fitness.get_wvalues().at(m) - pop.individuals.at(F.at(i-1))->fitness.get_wvalues().at(m)) + / (pop.individuals.at(F.at(fsize-1))->fitness.get_wvalues().at(m) - pop.individuals.at(F.at(0))->fitness.get_wvalues().at(m)); } } } diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 966402db..05b14eaf 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -1,12 +1,7 @@ #ifndef NSGA2_H #define NSGA2_H -#include "selection.h" -#include "../init.h" -#include "../program/program.h" -#include "../population.h" -#include "../individual.h" -#include "../data/data.h" +#include "selection_operator.h" namespace Brush { namespace Sel { @@ -25,7 +20,7 @@ class NSGA2 : public SelectionOperator // if any of the islands have overlapping indexes, parallel access and modification should be ok (because i dont increase or decrease pop size, not change island ranges inside selection) - NSGA2(bool surv); + NSGA2(bool surv=false); ~NSGA2(){}; /// selection according to the survival scheme of NSGA-II @@ -60,7 +55,7 @@ class NSGA2 : public SelectionOperator if (ind1->rank < ind2->rank) return true; else if (ind1->rank == ind2->rank && - ind1->crowd_dist > ind2->crowd_dist) + ind1->crowding_dist > ind2->crowding_dist) return true; return false; }; @@ -75,7 +70,8 @@ class NSGA2 : public SelectionOperator comparator_obj(const Population& population, int index) : pop(population), m(index) {}; - bool operator() (int i, int j) { return pop[i].obj[m] < pop[j].obj[m]; }; + bool operator() (int i, int j) { + return pop[i].fitness.get_wvalues()[m] < pop[j].fitness.get_wvalues()[m]; }; }; size_t tournament(Population& pop, size_t i, size_t j) const; diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 8417c968..4bda6da3 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -1,5 +1,4 @@ #include "selection.h" -#include "nsga2.h" // TODO: organize all namespaces namespace Brush { @@ -8,6 +7,16 @@ namespace Sel { using namespace Brush; using namespace Pop; + +template +Selection::Selection() +{ + this->type = "nsga2"; + this->survival = false; + this->set_operator(); +} + + template Selection::Selection(string type, bool survival) { diff --git a/src/selection/selection.h b/src/selection/selection.h index 6b94cec3..17d3ac22 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -6,50 +6,14 @@ license: GNU/GPL v3 #ifndef SELECTION_H #define SELECTION_H -#include "../init.h" -#include "../params.h" -#include "../types.h" -#include "../population.h" -#include "../variation.h" +#include "selection_operator.h" +#include "nsga2.h" namespace Brush { namespace Sel { using namespace Brush; using namespace Pop; -using namespace Var; - -/*! - * @class SelectionOperator - * @brief base class for selection operators. - */ -template -class SelectionOperator -{ -public: - bool survival; - string name; - // TODO: implement lexicase - - // shoudn't have a constructor - // SelectionOperator(){}; - - virtual ~SelectionOperator(){}; - - virtual vector select(Population& pop, int island, - const Parameters& p, const Dataset& data) - { - HANDLE_ERROR_THROW("Undefined select() operation"); - return vector(); - }; - - virtual vector survive(Population& pop, int island, - const Parameters& p, const Dataset& data) - { - HANDLE_ERROR_THROW("Undefined select() operation"); - return vector(); - }; -}; // struct Parameters; // forward declaration of Parameters @@ -65,13 +29,7 @@ struct Selection string type; bool survival; - Selection() - { - this->type = "nsga2"; - this->survival = false; - this->set_operator(); - }; - + Selection(); ~Selection(){}; Selection(string type, bool survival); diff --git a/src/selection/selection_operator.cpp b/src/selection/selection_operator.cpp new file mode 100644 index 00000000..67d2f7bc --- /dev/null +++ b/src/selection/selection_operator.cpp @@ -0,0 +1,29 @@ +#include "selection_operator.h" + +namespace Brush { +namespace Sel { + +using namespace Brush; +using namespace Pop; + +template +SelectionOperator::~SelectionOperator(){}; + +template +vector SelectionOperator::select(Population& pop, int island, + const Parameters& p, const Dataset& data) +{ + HANDLE_ERROR_THROW("Undefined select() operation"); + return vector(); +}; + +template +vector SelectionOperator::survive(Population& pop, int island, + const Parameters& p, const Dataset& data) +{ + HANDLE_ERROR_THROW("Undefined select() operation"); + return vector(); +}; + +} // selection +} // Brush \ No newline at end of file diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h new file mode 100644 index 00000000..0ca38288 --- /dev/null +++ b/src/selection/selection_operator.h @@ -0,0 +1,45 @@ +#ifndef SELECTION_OPERATOR_H +#define SELECTION_OPERATOR_H + +// virtual class. selection must be made with static methods + +// #include "../init.h" +// #include "../data/data.h" +// #include "../types.h" +// #include "../params.h" +#include "../population.h" + +namespace Brush { +namespace Sel { + +using namespace Brush; +using namespace Pop; + +/*! + * @class SelectionOperator + * @brief base class for selection operators. + */ +template +class SelectionOperator +{ +public: + bool survival; + string name; + + // TODO: implement lexicase + + // shoudn't have a constructor + // SelectionOperator(){}; + + virtual ~SelectionOperator(); + + virtual vector select(Population& pop, int island, + const Parameters& p, const Dataset& data); + + virtual vector survive(Population& pop, int island, + const Parameters& p, const Dataset& data); +}; + +} // selection +} // Brush +#endif diff --git a/src/types.h b/src/types.h index 65e08cdd..4247ddda 100644 --- a/src/types.h +++ b/src/types.h @@ -93,11 +93,11 @@ typedef Pop::Individual RepresenterIndividual; //////////////////////////////////////////////////////////////////////////////// // Estimator using PT = ProgramType; -template class CBrush; -typedef CBrush RegressorEstimator; -typedef CBrush ClassifierEstimator; -typedef CBrush MulticlassClassifierEstimator; -typedef CBrush RepresenterEstimator; +template class Estimator; +typedef Estimator RegressorEstimator; +typedef Estimator ClassifierEstimator; +typedef Estimator MulticlassClassifierEstimator; +typedef Estimator RepresenterEstimator; //////////////////////////////////////////////////////////////////////////////// // Data diff --git a/src/variation.cpp b/src/variation.cpp index a7db8325..e2c96b42 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -498,14 +498,14 @@ std::optional> Variation::cross( template std::optional> Variation::mutate(const Program& parent) { - std::cout << "selecting options" << parameters.mutation_probs.size() << std::endl; + // std::cout << "selecting options" << parameters.mutation_probs.size() << std::endl; auto options = parameters.mutation_probs; - std::cout << "selecting options2" << options.size() << std::endl; + // std::cout << "selecting options2" << options.size() << std::endl; bool all_zero = true; for (auto &it : parameters.mutation_probs) { - std::cout << it.first << it.second << std::endl; + // std::cout << it.first << it.second << std::endl; if (it.second > 0.0) { all_zero = false; break; @@ -514,15 +514,15 @@ std::optional> Variation::mutate(const Program& parent) if (all_zero) { // No mutation can be successfully applied to this solution - std::cout << "no viable one" << std::endl; + // std::cout << "no viable one" << std::endl; return std::nullopt; } - std::cout << "selecting (not all are zero)" << std::endl; + // std::cout << "selecting (not all are zero)" << std::endl; // choose a valid mutation option string choice = r.random_choice(parameters.mutation_probs); - std::cout << "picked mutation" << choice << std::endl; + // std::cout << "picked mutation" << choice << std::endl; // TODO: this could be improved (specially with the Variation class) std::unique_ptr mutation; if (choice == "point") @@ -548,10 +548,10 @@ std::optional> Variation::mutate(const Program& parent) HANDLE_ERROR_THROW(msg); } - std::cout << "cloning parent" << std::endl; + // std::cout << "cloning parent" << std::endl; Program child(parent); - std::cout << "findind spot" << std::endl; + // std::cout << "findind spot" << std::endl; // choose location by weighted sampling of program auto weights = mutation->find_spots(child.Tree); @@ -559,22 +559,22 @@ std::optional> Variation::mutate(const Program& parent) return w<=0.0; })) { // There is no spot that has a probability to be selected - std::cout << "no spots" << std::endl; + // std::cout << "no spots" << std::endl; return std::nullopt; } - std::cout << "apickingt spot" << std::endl; + // std::cout << "apickingt spot" << std::endl; // apply the mutation and check if it succeeded auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), weights.begin(), weights.end()); - std::cout << "mutating" << std::endl; + // std::cout << "mutating" << std::endl; // Every mutation here works inplace, so they return bool instead of // std::optional to indicare the result of their manipulation over the // program tree. Here we call the mutation function and return the result bool success = (*mutation)(child.Tree, spot); - std::cout << "returning" << std::endl; + // std::cout << "returning" << std::endl; if (success && ( (child.size() <= parameters.max_size) && (child.depth() <= parameters.max_depth) )){ From 63b0b5fde10f555974d0e8d89e5911b4ee6520b6 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 7 Feb 2024 08:38:41 -0300 Subject: [PATCH 109/199] Updated python wrapper to use Brush's individual class --- pybrush/BrushEstimator.py | 45 ---------- pybrush/DeapEstimator.py | 168 +++++++++++++++++++++----------------- pybrush/__init__.py | 9 +- pybrush/_versionstr.py | 2 +- pybrush/deap_api/nsga2.py | 16 +++- 5 files changed, 111 insertions(+), 129 deletions(-) delete mode 100644 pybrush/BrushEstimator.py diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py deleted file mode 100644 index 4440bc55..00000000 --- a/pybrush/BrushEstimator.py +++ /dev/null @@ -1,45 +0,0 @@ - -from _brush import Dataset, SearchSpace # TODO: stop calling cbrush, rename it -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin - -# TODO: LOGGER AND ARCHIVE - -# TODO: GET DOCUMENTATION BACK -class BrushEstimator(BaseEstimator): - def __init__(self): - # self.cbrush_ = CBrush() - pass - - def fit(self, X, y, Z=None): - pass - - def predict(self,X,Z=None): - pass - - def transform(self,X,Z=None): - pass - - def fit_predict(self,X,y,Z=None): - pass - - def fit_transform(self,X,y,Z=None): - pass - - def score(self,X,y,Z=None): - pass - - -class BrushRegressor(BrushEstimator): - def __init__(self,**kwargs): - pass - - -class BrushClassifier(BrushEstimator): - def __init__(self,**kwargs): - pass - - def predict(self,X,Z=None): - pass - - def predict_proba(self,X,Z=None): - pass \ No newline at end of file diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 67d32bbd..a2f74a1e 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -18,8 +18,10 @@ import _brush from pybrush.deap_api import nsga2, DeapIndividual # from _brush import Dataset, SearchSpace +from pybrush import RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual +# TODO: LOGGER AND ARCHIVE class DeapEstimator(BaseEstimator): """ This is the base class for Deap-based Brush estimators. @@ -33,7 +35,7 @@ class DeapEstimator(BaseEstimator): The mode of the estimator. Used by subclasses pop_size : int, default 100 Population size. - max_gen : int, default 100 + gens : int, default 100 Maximum iterations of the algorithm. verbosity : int, default 0 Controls level of printouts. @@ -41,7 +43,7 @@ class DeapEstimator(BaseEstimator): Maximum depth of GP trees in the GP program. Use 0 for no limit. max_size : int, default 0 Maximum number of nodes in a tree. Use 0 for no limit. - n_islands : int, default 5 + num_islands : int, default 5 Number of independent islands to use in evolutionary framework. Ignored if `algorithm!="nsga2island"`. mig_prob : float, default 0.05 @@ -55,12 +57,12 @@ class DeapEstimator(BaseEstimator): same time), we want to have by default an uniform probability between crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and `1/n` for each mutation, we can achieve an uniform distribution. - mutation_options : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} + mutation_probs : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} A dictionary with keys naming the types of mutation and floating point values specifying the fraction of total mutations to do with that method. The probability of having a mutation is `(1-cx_prob)` and, in case the mutation is applied, then each mutation option is sampled based on the probabilities - defined in `mutation_options`. The set of probabilities should add up to 1.0. + defined in `mutation_probs`. The set of probabilities should add up to 1.0. functions: dict[str,float] or list[str], default {} A dictionary with keys naming the function set and values giving the probability of sampling them, or a list of functions which will be weighted uniformly. @@ -119,14 +121,14 @@ def __init__( self, mode='classification', pop_size=100, - max_gen=100, + gens=100, verbosity=0, max_depth=3, max_size=20, - n_islands=5, + num_islands=5, mig_prob=0.05, cx_prob= 1/7, - mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, + mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}, functions: list[str]|dict[str,float] = {}, initialization="uniform", @@ -138,16 +140,16 @@ def __init__( batch_size: float = 1.0 ): self.pop_size=pop_size - self.max_gen=max_gen + self.gens=gens self.verbosity=verbosity self.algorithm=algorithm self.mode=mode self.max_depth=max_depth self.max_size=max_size - self.n_islands=n_islands + self.num_islands=num_islands self.mig_prob=mig_prob self.cx_prob=cx_prob - self.mutation_options=mutation_options + self.mutation_probs=mutation_probs self.functions=functions self.objectives=objectives self.initialization=initialization @@ -164,21 +166,19 @@ def _setup_toolbox(self, data_train, data_validation): # creator.create is used to "create new functions", and takes at least # 2 arguments: the name of the newly created class and a base class - # Cleaning possible previous classes that are model-dependent (clf and reg are differente) - if hasattr(creator, "FitnessMulti"): - del creator.FitnessMulti if hasattr(creator, "Individual"): del creator.Individual - # Minimizing/maximizing problem: negative/positive weight, respectively. - # Our classification is using the error as a metric - # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness - creator.create("FitnessMulti", base.Fitness, weights=self.weights) - # create Individual class, inheriting from self.Individual with a fitness attribute - creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) - - toolbox.register("Clone", lambda ind: creator.Individual(ind.prg.copy())) + if self.mode == 'classification': + if self.n_classes_ == 2: + creator.create("Individual", ClassifierIndividual) + else: + creator.create("Individual", MultiClassifierIndividual) + else: + creator.create("Individual", RegressorIndividual) + + toolbox.register("Clone", lambda ind: creator.Individual(ind.program.copy())) toolbox.register("mate", self._crossover) toolbox.register("mutate", self._mutate) @@ -214,10 +214,12 @@ def _crossover(self, ind1, ind2): child = None while (attempts < 3 and child is None): attempts = attempts + 1 - child = self.variator_.cross(i.prg, j.prg) + child = self.variator_.cross(i.program, j.program) if child is not None: child = creator.Individual(child) + child.objectives = self.objectives + offspring.extend([child]) @@ -226,21 +228,23 @@ def _crossover(self, ind1, ind2): def _mutate(self, ind1): - # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),) + # offspring = (creator.Individual(ind1.program.mutate(self.search_space_)),) attempts = 0 offspring = None - print("starting mutation") + # print("starting mutation") while (attempts < 3 and offspring is None): - print("attempt", attempts) - offspring = self.variator_.mutate(ind1.prg) - print("got offspring") + # print("attempt", attempts) + offspring = self.variator_.mutate(ind1.program) + # print("got offspring") if offspring is not None: - print('and it wasnt none') - return creator.Individual(offspring) + # print('and it wasnt none') + xmen = creator.Individual(offspring) + xmen.objectives = self.objectives + return xmen attempts = attempts + 1 - print("i failed") + # print("i failed") return None @@ -255,7 +259,6 @@ def fit(self, X, y): y : np.ndarray 1-d array of (boolean) target values. """ - _brush.set_params(self.get_params()) if self.random_state is not None: _brush.set_random_state(self.random_state) @@ -301,9 +304,22 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) + # TODO: getters and setters in parameters. Use them to take the arguments from python and save in the cpp backend # TODO: use variation operator here instead of these functions # TODO: store parameters in Parameter and use it to create the variator, selector, survivor, etc. self.parameters_ = _brush.Parameters() + + self.parameters_.pop_size = self.pop_size + self.parameters_.gens = self.gens + self.parameters_.num_islands = self.num_islands + self.parameters_.max_depth = self.max_depth + self.parameters_.max_size = self.max_size + self.parameters_.objectives = self.objectives + self.parameters_.cx_prob = self.cx_prob + self.parameters_.mig_prob = self.mig_prob + self.parameters_.functions = self.functions + self.parameters_.mutation_probs = self.mutation_probs + if self.mode == "classification": self.variator_ = _brush.ClassifierVariator(self.parameters_, self.search_space_) elif self.mode == "regressor": @@ -315,7 +331,7 @@ def fit(self, X, y): # nsga2 and ga differ in the toolbox self.archive_, self.logbook_ = nsga2( - self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, + self.toolbox_, self.gens, self.pop_size, self.cx_prob, (0.0 0: print(f'best model {self.best_estimator_.get_model()}' + @@ -374,6 +390,33 @@ def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): feature_names=feature_names, validation_size=validation_size) + def _make_individual(self): + # C++'s PTC2-based `make_individual` will create a tree of at least + # the given size. By uniformly sampling the size, we can instantiate a + # population with more diversity + + if self.initialization not in ["uniform", "max_size"]: + raise ValueError(f"Invalid argument value for `initialization`. " + f"expected 'max_size' or 'uniform'. got {self.initialization}") + + # TODO: implement initialization with uniform or max_size + # No arguments (or zero): brush will use PARAMS passed in set_params. + # max_size is sampled between 1 and params['max_size'] if zero is provided + + # return creator.Individual( + # self.search_space_.make_classifier( + # self.max_depth,(0 if self.initialization=='uniform' else self.max_size)) + # if self.n_classes_ == 2 else + # self.search_space_.make_multiclass_classifier( + # self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) + # ) + + ind = creator.Individual() + ind.init(self.search_space_, self.parameters_) + ind.objectives = self.objectives + + return ind + def predict(self, X): """Predict using the best estimator in the archive. """ @@ -430,46 +473,30 @@ class DeapClassifier(DeapEstimator,ClassifierMixin): >>> from pybrush import DeapClassifier >>> est = DeapClassifier() >>> est.fit(X,y) - >>> print('score:', est.score(X,y)) + >>> # print('score:', est.score(X,y)) """ def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) def _error(self, ind, data: _brush.Dataset): - #return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0] - return average_precision_score(data.y, ind.prg.predict(data)) + #return (data.y==ind.program.predict(data)).sum() / data.y.shape[0] + return average_precision_score(data.y, ind.program.predict(data)) def _fitness_validation(self, ind, data: _brush.Dataset): # Fitness without fitting the expression, used with validation data ind_objectives = { "error" : self._error(ind, data), - "size" : ind.prg.size(), - "complexity": ind.prg.complexity() + "size" : ind.program.size(), + "complexity": ind.program.complexity() } return [ ind_objectives[obj] for obj in self.objectives ] def _fitness_function(self, ind, data: _brush.Dataset): - ind.prg.fit(data) + ind.program.fit(data) return self._fitness_validation(ind, data) - - def _make_individual(self): - # C++'s PTC2-based `make_individual` will create a tree of at least - # the given size. By uniformly sampling the size, we can instantiate a - # population with more diversity - - if self.initialization not in ["uniform", "max_size"]: - raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'max_size' or 'uniform'. got {self.initialization}") - - return creator.Individual( - self.search_space_.make_classifier( - self.max_depth,(0 if self.initialization=='uniform' else self.max_size)) - if self.n_classes_ == 2 else - self.search_space_.make_multiclass_classifier( - self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) - ) + def predict_proba(self, X): """Predict class probabilities for X. @@ -523,13 +550,13 @@ class DeapRegressor(DeapEstimator, RegressorMixin): >>> from pybrush import DeapRegressor >>> est = DeapRegressor() >>> est.fit(X,y) - >>> print('score:', est.score(X,y)) + >>> # print('score:', est.score(X,y)) """ def __init__(self, **kwargs): super().__init__(mode='regressor',**kwargs) def _error(self, ind, data: _brush.Dataset): - MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) + MSE = np.mean( (data.y-ind.program.predict(data))**2 ) if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf MSE = np.inf @@ -540,27 +567,16 @@ def _fitness_validation(self, ind, data: _brush.Dataset): ind_objectives = { "error" : self._error(ind, data), - "size" : ind.prg.size(), - "complexity": ind.prg.complexity() + "size" : ind.program.size(), + "complexity": ind.program.complexity() } return [ ind_objectives[obj] for obj in self.objectives ] def _fitness_function(self, ind, data: _brush.Dataset): - ind.prg.fit(data) + ind.program.fit(data) return self._fitness_validation(ind, data) - def _make_individual(self): - if self.initialization not in ["uniform", "max_size"]: - raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'max_size' or 'uniform'. got {self.initialization}") - - # No arguments (or zero): brush will use PARAMS passed in set_params. - # max_size is sampled between 1 and params['max_size'] if zero is provided - return creator.Individual( - self.search_space_.make_regressor( - self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) - ) # Under development # class DeapRepresenter(DeapEstimator, TransformerMixin): @@ -577,17 +593,17 @@ def _make_individual(self): # >>> from pybrush import DeapRegressor # >>> est = DeapRegressor() # >>> est.fit(X,y) -# >>> print('score:', est.score(X,y)) +# >>> # print('score:', est.score(X,y)) # """ # def __init__(self, **kwargs): # super().__init__(mode='regressor',**kwargs) # def _fitness_function(self, ind, data: _brush.Dataset): -# ind.prg.fit(data) +# ind.program.fit(data) # return ( # # todo: need to return a matrix from X for this -# np.sum((data.get_X()- ind.prg.predict(data))**2), -# ind.prg.size() +# np.sum((data.get_X()- ind.program.predict(data))**2), +# ind.program.size() # ) # def _make_individual(self): diff --git a/pybrush/__init__.py b/pybrush/__init__.py index da941360..9d80e5c6 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -1,8 +1,11 @@ # Interfaces for Brush classes. Use to prototype with Brush -from _brush import Dataset, SearchSpace, Parameters # TODO: make individual wrapper, Individual +from _brush import Dataset +from _brush import SearchSpace +from _brush import Parameters -# Brush's original EA algorithm -from pybrush.BrushEstimator import BrushClassifier, BrushRegressor +# Individuals +from _brush.individual import RegressorIndividual, \ + ClassifierIndividual, MultiClassifierIndividual # Prototyping an EA using brush classes, but other EA framework from pybrush.DeapEstimator import DeapClassifier, DeapRegressor \ No newline at end of file diff --git a/pybrush/_versionstr.py b/pybrush/_versionstr.py index 4d9b0682..ec03e2cc 100644 --- a/pybrush/_versionstr.py +++ b/pybrush/_versionstr.py @@ -1 +1 @@ -__version__="i-never-tested-that-thing" \ No newline at end of file +__version__="v1.0" \ No newline at end of file diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index c6862d79..eaf5bf87 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -5,8 +5,8 @@ class DeapIndividual(): """Class that wraps brush program for creator.Individual class from DEAP.""" - def __init__(self, prg): - self.prg = prg + def __init__(self, program): + self.program = program def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): # NGEN = 250 @@ -43,6 +43,8 @@ def calculate_statistics(ind): for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit + # print(0, pop[0].fitness.values, pop[0].fitness.weights) + # This is just to assign the crowding distance to the individuals # no actual selection is done pop = toolbox.survive(pop, len(pop)) @@ -67,6 +69,8 @@ def calculate_statistics(ind): for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit + # print(1, pop[0].fitness.values, pop[0].fitness.weights) + # Vary the population # offspring = tools.selTournamentDCD(pop, len(pop)) parents = toolbox.select(pop, len(pop)) @@ -93,6 +97,8 @@ def calculate_statistics(ind): off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) offspring.extend([off2]) + # print(2, offspring[0].fitness.values, offspring[0].fitness.weights) + # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) pop = toolbox.survive(pop + offspring, MU) @@ -104,9 +110,11 @@ def calculate_statistics(ind): if verbosity > 0: print(logbook.stream) + print(pop[0].program.get_model(), + pop[0].fitness.values, pop[0].fitness.weights, pop[0].fitness.wvalues) - if verbosity > 0: - print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) + # if verbosity > 0: + # print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) archive = tools.ParetoFront() archive.update(pop) From 618af55c86fc547bd0681fe47b02dfc2dd1ddb79 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 7 Feb 2024 08:39:20 -0300 Subject: [PATCH 110/199] Updated tests to work with Brush's individuals --- tests/cpp/test_params.cpp | 1 + tests/cpp/test_population.cpp | 19 +++++++++++-------- tests/python/test_deap_api.py | 16 ++++++++-------- tests/python/test_params.py | 11 ++++++----- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/cpp/test_params.cpp b/tests/cpp/test_params.cpp index e69de29b..d33bed0c 100644 --- a/tests/cpp/test_params.cpp +++ b/tests/cpp/test_params.cpp @@ -0,0 +1 @@ +// TODO: test it \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 64e897ad..09514736 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -4,11 +4,13 @@ #include "../../src/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers #include "../../src/eval/evaluation.cpp" #include "../../src/selection/nsga2.cpp" +#include "../../src/selection/selection_operator.cpp" #include "../../src/selection/selection.cpp" using namespace Brush::Pop; using namespace Brush::Sel; using namespace Brush::Eval; +using namespace Brush::Sel; TEST(Population, PopulationTests) { @@ -71,21 +73,21 @@ TEST(Population, PopulationTests) fmt::print("Performing all steps of an evolution (sequential, not parallel)\n"); for (int i=0; i<100; ++i) // update and prep offspring slots works properly { - vector> survivors(pop.n_islands); + vector> survivors(pop.num_islands); fmt::print("Fitting individuals\n"); // this must be done in one thread (or implement mutex), because we can have multiple islands pointing to same individuals - for (int j=0; j 0, "Mutation didn't worked for any individual" for x in xmen: - assert x.prg.get_model().startswith(fixed_node), \ + assert x.program.get_model().startswith(fixed_node), \ (f"An individual for {setup} was mutated without {fixed_node} " + f"node on root. Model was {x.ind.get_model()}") @@ -132,13 +132,13 @@ def test_fixed_nodes(setup, fixed_node, brush_args, request): cxmen = [x for x in cxmen if x is not None] assert len(cxmen) > 0, "Crossover didn't worked for any individual" for cx in cxmen: - assert cx.prg.get_model().startswith(fixed_node), \ + assert cx.program.get_model().startswith(fixed_node), \ (f"An individual for {setup} was crossovered without {fixed_node} " + f"node on root. Model was {cx.ind.get_model()}") # Originals still the same for p, p_original_model in zip(pop, pop_models): - assert p.prg.get_model() == p_original_model, \ + assert p.program.get_model() == p_original_model, \ "Variation operator changed the original model." diff --git a/tests/python/test_params.py b/tests/python/test_params.py index 26c8e3f2..22c6f568 100644 --- a/tests/python/test_params.py +++ b/tests/python/test_params.py @@ -6,6 +6,7 @@ import numpy as np +# TODO; get this to work again # def test_param_random_state(): # # Check if make_regressor, mutation and crossover will create the same expressions # test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) @@ -54,10 +55,10 @@ # params = { # 'verbosity': False, # 'pop_size' : 100, -# 'max_gen' : 100, +# 'gens' : 100, # 'max_depth': 5, # 'max_size' : 50, -# 'mutation_options': {'point' : 0.0, +# 'mutation_probs': {'point' : 0.0, # 'insert' : 0.0, # 'delete' : 0.0, # 'subtree' : 0.0, @@ -69,16 +70,16 @@ # mutations = ['point', 'insert', 'delete', 'subtree', 'toggle_weight_on', 'toggle_weight_off'] # for i, m in enumerate(mutations): -# params['mutation_options'][m] = 0 if i != index else 1.0 +# params['mutation_probs'][m] = 0 if i != index else 1.0 # print(f"(Thread id {index}{seconds}) Setting mutation {mutations[index]} to 1 and wait {seconds} seconds") # _brush.set_params(params) # time.sleep(seconds) -# print(f"(Thread id {index}{seconds}) Retrieving PARAMS: {_brush.get_params()['mutation_options']}") +# print(f"(Thread id {index}{seconds}) Retrieving PARAMS: {_brush.get_params()['mutation_probs']}") -# assert params['mutation_options']==_brush.get_params()['mutation_options'], \ +# assert params['mutation_probs']==_brush.get_params()['mutation_probs'], \ # f"(Thread id {index}{seconds}) BRUSH FAILED TO KEEP SEPARATE INSTANCES OF `PARAMS` BETWEEN MULTIPLE THREADS" # def test_global_PARAMS_sharing(): From 94290218b65472647791d42cb09c4059d0c808a2 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 7 Feb 2024 15:10:20 -0300 Subject: [PATCH 111/199] Updated variation operators to operate completely in C++ --- pybrush/DeapEstimator.py | 47 +---- pybrush/deap_api/nsga2.py | 27 +-- src/bindings/bind_variation.h | 2 +- src/individual.h | 5 +- src/population.cpp | 18 ++ src/population.h | 4 +- src/selection/nsga2.cpp | 5 +- src/selection/nsga2.h | 5 +- src/selection/selection.cpp | 8 +- src/selection/selection.h | 4 +- src/selection/selection_operator.cpp | 4 +- src/selection/selection_operator.h | 4 +- src/variation.cpp | 276 ++++++++++++++------------- src/variation.h | 5 +- tests/cpp/test_data.cpp | 17 +- tests/cpp/test_population.cpp | 4 +- tests/cpp/test_variation.cpp | 117 +++++++----- tests/python/test_deap_api.py | 2 +- 18 files changed, 285 insertions(+), 269 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index a2f74a1e..55851049 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -180,8 +180,8 @@ def _setup_toolbox(self, data_train, data_validation): toolbox.register("Clone", lambda ind: creator.Individual(ind.program.copy())) - toolbox.register("mate", self._crossover) - toolbox.register("mutate", self._mutate) + toolbox.register("mate", self.variator_.cross) + toolbox.register("mutate", self.variator_.mutate) # When solving multi-objective problems, selection and survival must # support this feature. This means that these selection operators must @@ -205,49 +205,6 @@ def offspring(pop, MU): return pop[-MU:] return toolbox - - def _crossover(self, ind1, ind2): - offspring = [] - - for i,j in [(ind1,ind2),(ind2,ind1)]: - attempts = 0 - child = None - while (attempts < 3 and child is None): - attempts = attempts + 1 - child = self.variator_.cross(i.program, j.program) - - if child is not None: - child = creator.Individual(child) - child.objectives = self.objectives - - - offspring.extend([child]) - - # so we always need to have two elements to unpack inside `offspring` - return offspring[0], offspring[1] - - - def _mutate(self, ind1): - # offspring = (creator.Individual(ind1.program.mutate(self.search_space_)),) - attempts = 0 - offspring = None - # print("starting mutation") - while (attempts < 3 and offspring is None): - # print("attempt", attempts) - offspring = self.variator_.mutate(ind1.program) - # print("got offspring") - - if offspring is not None: - # print('and it wasnt none') - xmen = creator.Individual(offspring) - xmen.objectives = self.objectives - return xmen - attempts = attempts + 1 - - # print("i failed") - return None - - def fit(self, X, y): """ Fit an estimator to X,y. diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index eaf5bf87..d8109fc0 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -76,26 +76,19 @@ def calculate_statistics(ind): parents = toolbox.select(pop, len(pop)) # offspring = [toolbox.clone(ind) for ind in offspring] offspring = [] - for ind1, ind2 in zip(parents[::2], parents[1::2]): - off1, off2 = None, None + for ind1, ind2 in zip(parents, parents[1:]): + off = None if rnd_flt() < CXPB: # either mutation or crossover. - off1, off2 = toolbox.mate(ind1, ind2) + off = toolbox.mate(ind1, ind2) else: - off1 = toolbox.mutate(ind1) - off2 = toolbox.mutate(ind2) + off = toolbox.mutate(ind1) - if off1 is not None: # Mutation worked. first we fit, then add to offspring + if off is not None: # Mutation worked. first we fit, then add to offspring # Evaluate (instead of evaluateValidation) to fit the weights of the offspring - off1.fitness.values = toolbox.evaluate(off1) + off.fitness.values = toolbox.evaluate(off) if use_batch: # Adjust fitness to the same data as parents - off1.fitness.values = toolbox.evaluateValidation(off1, data=batch) - offspring.extend([off1]) - - if off2 is not None: - off2.fitness.values = toolbox.evaluate(off2) - if use_batch: - off2.fitness.values = toolbox.evaluateValidation(off2, data=batch) - offspring.extend([off2]) + off.fitness.values = toolbox.evaluateValidation(off, data=batch) + offspring.extend([off]) # print(2, offspring[0].fitness.values, offspring[0].fitness.weights) @@ -110,8 +103,8 @@ def calculate_statistics(ind): if verbosity > 0: print(logbook.stream) - print(pop[0].program.get_model(), - pop[0].fitness.values, pop[0].fitness.weights, pop[0].fitness.wvalues) + print(pop[0].fitness.values, pop[0].fitness.weights, pop[0].fitness.wvalues, + pop[0].program.get_model(),) # if verbosity > 0: # print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 74ab2564..dc1bbe2b 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,6 +1,7 @@ #include "module.h" #include "../variation.h" #include "../variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) +#include "../population.h" namespace py = pybind11; namespace nl = nlohmann; @@ -19,6 +20,5 @@ void bind_variation(py::module& m, string name) return variation; })) .def("mutate", &Class::mutate, py::return_value_policy::automatic) .def("cross", &Class::cross, py::return_value_policy::automatic) - // .def("vary", &Class::vary) // apply variation to the population TODO: implement it: wrap a list of individuals into a population, modify it, return as a vector of individuals (so we dont have to expose population to python) ; } \ No newline at end of file diff --git a/src/individual.h b/src/individual.h index 08a102a9..7d60b0b0 100644 --- a/src/individual.h +++ b/src/individual.h @@ -222,7 +222,10 @@ class Individual{ }; // fitness, objetives, complexity, etc. TODO: create intermediate functions to interact with fitness and program? - void fit(Dataset& data) { program.fit(data); }; + void fit(Dataset& data) { + program.fit(data); + + }; auto predict(Dataset& data) { return program.predict(data); }; // TODO: predict proba and classification related methods. diff --git a/src/population.cpp b/src/population.cpp index 72a1c9ec..b1479172 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -12,6 +12,24 @@ Population::Population() num_islands = 0; } + +template +void Population::init(vector&>& individuals, const Parameters& params) +{ + this->mig_prob = params.mig_prob; + this->pop_size = params.pop_size; + this->num_islands=params.num_islands; + + // If the assert fails, execution stops, but for completeness, you can also throw an exception + if (individuals.size() != this->pop_size) { + throw std::runtime_error("Individual vector has different number of individuals than pop_size."); + } + individuals.resize(0); + for (const auto& ind : individuals) { + individuals.push_back( std::make_shared>(ind) ); + } +} + template void Population::init(SearchSpace& ss, const Parameters& params) { diff --git a/src/population.h b/src/population.h index 77bd8dcc..33ae10d9 100644 --- a/src/population.h +++ b/src/population.h @@ -27,12 +27,14 @@ class Population{ // TODO: taskflow needs to use num_islands as n_jobs Population(); - ~Population(){}; /// initialize population of programs with a starting model and/or from file void init(SearchSpace& ss, const Parameters& params); + // initialize based on list of individuals + void init(vector&>& individuals, const Parameters& params); + // TODO: init from file (like FEAT) /// returns population size (the effective size of the individuals) diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index f4b619f2..e7f65458 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -5,7 +5,6 @@ namespace Sel { using namespace Brush; using namespace Pop; -using namespace Data; using namespace Sel; template @@ -38,7 +37,7 @@ size_t NSGA2::tournament(Population& pop, size_t i, size_t j) const template vector NSGA2::select(Population& pop, int island, - const Parameters& params, const Dataset& d) + const Parameters& params) { auto island_pool = pop.get_island_indexes(island); @@ -64,7 +63,7 @@ vector NSGA2::select(Population& pop, int island, template vector NSGA2::survive(Population& pop, int island, - const Parameters& params, const Dataset& d) + const Parameters& params) { // fmt::print("starting\n"); diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 05b14eaf..7a890c7a 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -8,7 +8,6 @@ namespace Sel { using namespace Brush; using namespace Pop; -using namespace Data; using namespace Sel; template @@ -25,11 +24,11 @@ class NSGA2 : public SelectionOperator /// selection according to the survival scheme of NSGA-II vector select(Population& pop, int island, - const Parameters& p, const Dataset& d); + const Parameters& p); /// survival according to the survival scheme of NSGA-II vector survive(Population& pop, int island, - const Parameters& p, const Dataset& d); + const Parameters& p); //< Fast non-dominated sorting vector> fast_nds(Population&, vector&); diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 4bda6da3..9c97d0e5 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -49,17 +49,17 @@ void Selection::set_type(string in){ type = in; set_operator();} /// perform selection template vector Selection::select(Population& pop, int island, - const Parameters& params, const Dataset& data) + const Parameters& params) { - return pselector->select(pop, island, params, data); + return pselector->select(pop, island, params); } /// perform survival template vector Selection::survive(Population& pop, int island, - const Parameters& params, const Dataset& data) + const Parameters& params) { - return pselector->survive(pop, island, params, data); + return pselector->survive(pop, island, params); } } // selection diff --git a/src/selection/selection.h b/src/selection/selection.h index 17d3ac22..d415ca8b 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -41,11 +41,11 @@ struct Selection /// perform selection. selection uses a pop that has no offspring space vector select(Population& pop, int island, - const Parameters& params, const Dataset& data); + const Parameters& params); /// perform survival. uses a pop with offspring space vector survive(Population& pop, int island, - const Parameters& params, const Dataset& data); + const Parameters& params); }; // TODO: MAKE THIS WORK diff --git a/src/selection/selection_operator.cpp b/src/selection/selection_operator.cpp index 67d2f7bc..b0c628ca 100644 --- a/src/selection/selection_operator.cpp +++ b/src/selection/selection_operator.cpp @@ -11,7 +11,7 @@ SelectionOperator::~SelectionOperator(){}; template vector SelectionOperator::select(Population& pop, int island, - const Parameters& p, const Dataset& data) + const Parameters& p) { HANDLE_ERROR_THROW("Undefined select() operation"); return vector(); @@ -19,7 +19,7 @@ vector SelectionOperator::select(Population& pop, int island, template vector SelectionOperator::survive(Population& pop, int island, - const Parameters& p, const Dataset& data) + const Parameters& p) { HANDLE_ERROR_THROW("Undefined select() operation"); return vector(); diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h index 0ca38288..215b79e8 100644 --- a/src/selection/selection_operator.h +++ b/src/selection/selection_operator.h @@ -34,10 +34,10 @@ class SelectionOperator virtual ~SelectionOperator(); virtual vector select(Population& pop, int island, - const Parameters& p, const Dataset& data); + const Parameters& p); virtual vector survive(Population& pop, int island, - const Parameters& p, const Dataset& data); + const Parameters& p); }; } // selection diff --git a/src/variation.cpp b/src/variation.cpp index e2c96b42..96c608b5 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -353,7 +353,7 @@ class SubtreeMutation : public MutationBase * * If the cross succeeds, the child program can be accessed through the * `.value()` attribute of the `std::optional`. - * + * TODO: update this documentation (it doesnt take the program but the individual. also update mutation documentation) * This means that, if you use the cross as `auto opt = mutate(parent, SS)`, * either `opt==false` or `opt.value()` contains the child. * @@ -363,13 +363,13 @@ class SubtreeMutation : public MutationBase * @return `std::optional` that may contain the child program of type `T` */ template -std::optional> Variation::cross( - const Program& mom, const Program& dad) +std::optional> Variation::cross( + const Individual& mom, const Individual& dad) { /* subtree crossover between this and other, producing new Program */ // choose location by weighted sampling of program // TODO: why doesn't this copy the search space reference to child? - Program child(mom); + Program child(mom.program); // pick a subtree to replace vector child_weights(child.Tree.size()); @@ -395,70 +395,80 @@ std::optional> Variation::cross( return std::nullopt; } - auto child_spot = r.select_randomly(child.Tree.begin(), - child.Tree.end(), - child_weights.begin(), - child_weights.end() - ); - - auto child_ret_type = child_spot.node->data.ret_type; - - auto allowed_size = parameters.max_size - - ( child.size() - child.size_at(child_spot) ); - auto allowed_depth = parameters.max_depth - - ( child.depth_to_reach(child_spot) ); - // pick a subtree to insert. Selection is based on other_weights - Program other(dad); - - vector other_weights(other.Tree.size()); - - // iterator to get the size of subtrees inside transform - auto other_iter = other.Tree.begin(); - - // lambda function to check feasibility of solution and increment the iterator - const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { - int s = other.size_at( other_iter ); - int d = other.depth_at( other_iter ); - - std::advance(other_iter, 1); - return (s <= allowed_size) && (d <= allowed_depth); - }; - - // TODO: something like `is_valid_program` in FEAT - std::transform(other.Tree.begin(), other.Tree.end(), - other_weights.begin(), - [child_ret_type, check_and_incrm](const auto& n){ - // need to pick a node that has a matching output type to the child_spot. - // also need to check if swaping this node wouldn't exceed max_size - if (check_and_incrm() && (n.ret_type == child_ret_type)) - return n.get_prob_change(); - else - // setting the weight to zero to indicate a non-feasible crossover point - return 0.0f; - } - ); - - bool matching_spots_found = false; - for (const auto& w: other_weights) + Program other(dad.program); + + int attempts = 0; + while (++attempts <= 3) { - matching_spots_found = w > 0.0; - - if (matching_spots_found) { - auto other_spot = r.select_randomly( - other.Tree.begin(), - other.Tree.end(), - other_weights.begin(), - other_weights.end() - ); - - // fmt::print("other_spot : {}\n",other_spot.node->data); - // swap subtrees at child_spot and other_spot - // TODO: do I need to delete the removed node? - child.Tree.move_ontop(child_spot, other_spot); - return child; + auto child_spot = r.select_randomly(child.Tree.begin(), + child.Tree.end(), + child_weights.begin(), + child_weights.end() + ); + + auto child_ret_type = child_spot.node->data.ret_type; + + auto allowed_size = parameters.max_size - + ( child.size() - child.size_at(child_spot) ); + auto allowed_depth = parameters.max_depth - + ( child.depth_to_reach(child_spot) ); + + vector other_weights(other.Tree.size()); + + // iterator to get the size of subtrees inside transform + auto other_iter = other.Tree.begin(); + + // lambda function to check feasibility of solution and increment the iterator + const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { + int s = other.size_at( other_iter ); + int d = other.depth_at( other_iter ); + + std::advance(other_iter, 1); + return (s <= allowed_size) && (d <= allowed_depth); + }; + + // TODO: something like `is_valid_program` in FEAT + std::transform(other.Tree.begin(), other.Tree.end(), + other_weights.begin(), + [child_ret_type, check_and_incrm](const auto& n){ + // need to pick a node that has a matching output type to the child_spot. + // also need to check if swaping this node wouldn't exceed max_size + if (check_and_incrm() && (n.ret_type == child_ret_type)) + return n.get_prob_change(); + else + // setting the weight to zero to indicate a non-feasible crossover point + return 0.0f; + } + ); + + bool matching_spots_found = false; + for (const auto& w: other_weights) + { + // we found at least one weight that is non-zero + matching_spots_found = w > 0.0; + + if (matching_spots_found) { + auto other_spot = r.select_randomly( + other.Tree.begin(), + other.Tree.end(), + other_weights.begin(), + other_weights.end() + ); + + // fmt::print("other_spot : {}\n",other_spot.node->data); + // swap subtrees at child_spot and other_spot + // TODO: do I need to delete the removed node? + child.Tree.move_ontop(child_spot, other_spot); + + Individual ind(child); + ind.set_objectives(mom.get_objectives()); // it will have an invalid fitness + + return ind; + } } } + return std::nullopt; } @@ -496,7 +506,7 @@ std::optional> Variation::cross( * @return `std::optional` that may contain the child program of type `T` */ template -std::optional> Variation::mutate(const Program& parent) +std::optional> Variation::mutate(const Individual& parent) { // std::cout << "selecting options" << parameters.mutation_probs.size() << std::endl; auto options = parameters.mutation_probs; @@ -517,73 +527,81 @@ std::optional> Variation::mutate(const Program& parent) // std::cout << "no viable one" << std::endl; return std::nullopt; } + + int attempts = 0; + while(++attempts <= 3) + { + // std::cout << "selecting (not all are zero)" << std::endl; + // choose a valid mutation option + string choice = r.random_choice(parameters.mutation_probs); + + // std::cout << "picked mutation" << choice << std::endl; + // TODO: this could be improved (specially with the Variation class) + std::unique_ptr mutation; + if (choice == "point") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "insert") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "delete") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "toggle_weight_on") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "toggle_weight_off") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else if (choice == "subtree") + mutation = std::make_unique( + search_space,parameters.max_size, parameters.max_depth); + else { + std::string msg = fmt::format("{} not a valid mutation choice", choice); + HANDLE_ERROR_THROW(msg); + } - // std::cout << "selecting (not all are zero)" << std::endl; - // choose a valid mutation option - string choice = r.random_choice(parameters.mutation_probs); - - // std::cout << "picked mutation" << choice << std::endl; - // TODO: this could be improved (specially with the Variation class) - std::unique_ptr mutation; - if (choice == "point") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "insert") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "delete") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "toggle_weight_on") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "toggle_weight_off") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "subtree") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else { - std::string msg = fmt::format("{} not a valid mutation choice", choice); - HANDLE_ERROR_THROW(msg); - } - - // std::cout << "cloning parent" << std::endl; - Program child(parent); + // std::cout << "cloning parent" << std::endl; + Program child(parent.program); - // std::cout << "findind spot" << std::endl; - // choose location by weighted sampling of program - auto weights = mutation->find_spots(child.Tree); + // std::cout << "findind spot" << std::endl; + // choose location by weighted sampling of program + auto weights = mutation->find_spots(child.Tree); - if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { - return w<=0.0; - })) - { // There is no spot that has a probability to be selected - // std::cout << "no spots" << std::endl; - return std::nullopt; - } + if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { + return w<=0.0; + })) + { // There is no spot that has a probability to be selected + // std::cout << "no spots" << std::endl; + continue; + } - // std::cout << "apickingt spot" << std::endl; - // apply the mutation and check if it succeeded - auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), - weights.begin(), weights.end()); + // std::cout << "apickingt spot" << std::endl; + // apply the mutation and check if it succeeded + auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), + weights.begin(), weights.end()); - // std::cout << "mutating" << std::endl; - // Every mutation here works inplace, so they return bool instead of - // std::optional to indicare the result of their manipulation over the - // program tree. Here we call the mutation function and return the result - bool success = (*mutation)(child.Tree, spot); + // std::cout << "mutating" << std::endl; + // Every mutation here works inplace, so they return bool instead of + // std::optional to indicare the result of their manipulation over the + // program tree. Here we call the mutation function and return the result + bool success = (*mutation)(child.Tree, spot); - // std::cout << "returning" << std::endl; - if (success - && ( (child.size() <= parameters.max_size) - && (child.depth() <= parameters.max_depth) )){ + // std::cout << "returning" << std::endl; + if (success + && ( (child.size() <= parameters.max_size) + && (child.depth() <= parameters.max_depth) )){ - return child; - } else { + Individual ind(child); + ind.set_objectives(parent.get_objectives()); // it will have an invalid fitness - return std::nullopt; + return ind; + } else { + continue; + } } + + return std::nullopt; } template @@ -600,7 +618,7 @@ void Variation::vary(Population& pop, int island, for (unsigned i = start; i> opt=std::nullopt; // new individual + std::optional> opt=std::nullopt; // new individual // TODO: do it a certain number of times. after that, assume that variation cant // change individual and add it to the island failures // TODO: use island failures everytime that I'm iterating on the offspring of an @@ -615,19 +633,19 @@ void Variation::vary(Population& pop, int island, const Individual& dad = pop[ *r.select_randomly(parents.begin(), parents.end())]; - opt = cross(mom.program, dad.program); + opt = cross(mom, dad); } else // mutation { - opt = mutate(mom.program); + opt = mutate(mom); } if (opt) // no optional value was returned { - Program child = opt.value(); + Individual ind = opt.value(); - assert(child.size()>0); - pop.individuals.at(idxs.at(i)) = std::make_shared>(child); + assert(ind.program.size()>0); + pop.individuals.at(idxs.at(i)) = std::make_shared>(ind); } } } diff --git a/src/variation.h b/src/variation.h index 564066fc..71e727c3 100644 --- a/src/variation.h +++ b/src/variation.h @@ -31,6 +31,7 @@ using namespace Brush::Pop; namespace Brush { namespace Var { +// base for MUTATION variators class MutationBase { public: using Iter = tree::pre_order_iterator; @@ -120,8 +121,8 @@ class Variation }; // individual-level variations - std::optional> cross(const Program& mom, const Program& dad); - std::optional> mutate(const Program& parent); + std::optional> cross(const Individual& mom, const Individual& dad); + std::optional> mutate(const Individual& parent); /// method to handle variation of population void vary(Population& pop, int island, const vector& parents); diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index be8c0a9e..705830a6 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -94,7 +94,10 @@ TEST(Data, MixedVariableTypes) // creating and fitting a child Variation variator = Variation(params, SS); - std::optional opt = variator.mutate(PRG); + + Individual IND(PRG); + + std::optional> opt = variator.mutate(IND); if (!opt){ fmt::print("Mutation failed to create a child\n"); @@ -102,14 +105,22 @@ TEST(Data, MixedVariableTypes) else { auto Child = opt.value(); - fmt::print("Child model: {}\n", Child.get_model("compact", true)); + fmt::print("Child program model: {}\n", Child.program.get_model("compact", true)); fmt::print( "Child fit\n"); Child.fit(dt); fmt::print( "Child predict\n"); ArrayXf y_pred_child = Child.predict(dt); - fmt::print( "y_pred: {}\n", y_pred); + fmt::print( "y_pred: {}\n", y_pred_child); + + // should be the same as the fit and predict above + fmt::print( "Child program fit\n"); + Child.program.fit(dt); + + fmt::print( "Child program predict\n"); + ArrayXf y_pred_child_program = Child.program.predict(dt); + fmt::print( "y_pred: {}\n", y_pred_child_program); } } diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 09514736..acdcadce 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -91,7 +91,7 @@ TEST(Population, PopulationTests) { // just so we can call the update method fmt::print("Selection\n"); - vector parents = selector.select(pop, j, params, data); + vector parents = selector.select(pop, j, params); ASSERT_TRUE(parents.size() > 0); fmt::print("Preparing offspring\n"); @@ -105,7 +105,7 @@ TEST(Population, PopulationTests) evaluator.update_fitness(pop, j, data, params, true, true); fmt::print("survivors {}\n", j); - auto island_survivors = survivor.survive(pop, j, params, data); + auto island_survivors = survivor.survive(pop, j, params); survivors.at(j) = island_survivors; } diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index 7cae962b..b8dc37da 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -52,16 +52,17 @@ TEST(Variation, FixedRootDoesntChange) ASSERT_TRUE(root.get_prob_change()==0.0); ASSERT_TRUE(root.fixed==true); - auto opt_mutation = variator.mutate(PRG); + Individual IND(PRG); + auto opt_mutation = variator.mutate(IND); if (opt_mutation) { successes += 1; auto Mut_Child = opt_mutation.value(); fmt::print("After mutation : {}\n", - Mut_Child.get_model("compact", true)); + Mut_Child.program.get_model("compact", true)); - Node mut_child_root = *(Mut_Child.Tree.begin()); + Node mut_child_root = *(Mut_Child.program.Tree.begin()); ASSERT_TRUE(mut_child_root.node_type == NodeType::Logistic); ASSERT_TRUE(mut_child_root.ret_type == DataType::ArrayF); @@ -71,16 +72,18 @@ TEST(Variation, FixedRootDoesntChange) } ClassifierProgram PRG2 = SS.make_classifier(0, 0, params); - auto opt_cx = variator.cross(PRG, PRG2); + + Individual IND2(PRG2); + auto opt_cx = variator.cross(IND, IND2); if (opt_cx) { successes += 1; auto CX_Child = opt_cx.value(); fmt::print("After crossover: {}\n", - CX_Child.get_model("compact", true)); + CX_Child.program.get_model("compact", true)); - Node cx_child_root = *(CX_Child.Tree.begin()); + Node cx_child_root = *(CX_Child.program.Tree.begin()); ASSERT_TRUE(cx_child_root.node_type == NodeType::Logistic); ASSERT_TRUE(cx_child_root.ret_type == DataType::ArrayF); @@ -159,7 +162,10 @@ TEST(Variation, InsertMutationWorks) fmt::print("auto Child = PRG.mutate();\n"); // We should assume that it will be always the insert mutation - auto opt = variator.mutate(PRG); + + Individual IND(PRG); + + auto opt = variator.mutate(IND); if (opt){ successes += 1; @@ -170,8 +176,8 @@ TEST(Variation, InsertMutationWorks) "Initial Model: {}\n" "Mutated Model: {}\n", params.max_depth, params.max_size, - PRG.get_model("compact", true), - Child.get_model("compact", true) + IND.program.get_model("compact", true), + Child.program.get_model("compact", true) ); fmt::print("child fit\n"); @@ -179,18 +185,18 @@ TEST(Variation, InsertMutationWorks) y_pred = Child.predict(data); // since we successfully inserted a node, this should be always true - ASSERT_TRUE(Child.size() > PRG.size()); + ASSERT_TRUE(Child.program.size() > IND.program.size()); // maybe the insertion spot was a shorter branch than the maximum // depth. At least, xmen depth should be equal to its parent - ASSERT_TRUE(Child.depth() >= PRG.depth()); + ASSERT_TRUE(Child.program.depth() >= IND.program.depth()); } // lets also see if it always fails when the child exceeds the maximum limits - params.max_size = PRG.size(); - params.max_depth = PRG.depth(); + params.max_size = IND.program.size(); + params.max_depth = IND.program.depth(); - auto opt2 = variator.mutate(PRG); + auto opt2 = variator.mutate(IND); if (opt2){ // This shoudl't happen. We'll print then error auto Child2 = opt2.value(); @@ -204,8 +210,8 @@ TEST(Variation, InsertMutationWorks) "Initial Model: {}\n" "Mutated Model: {}\n", params.max_depth, params.max_size, - PRG.get_model("compact", true), - Child2.get_model("compact", true) + IND.program.get_model("compact", true), + Child2.program.get_model("compact", true) ); ASSERT_TRUE(opt2==std::nullopt); } @@ -266,7 +272,9 @@ TEST(Variation, Mutation) // applying mutation and checking if the optional result is non-empty fmt::print("auto Child = PRG.mutate();\n"); - auto opt = variator.mutate(PRG); + + Individual IND(PRG); + auto opt = variator.mutate(IND); if (!opt){ fmt::print( @@ -275,7 +283,7 @@ TEST(Variation, Mutation) "Initial Model: {}\n" "Mutation failed to create a child", d, s, - PRG.get_model("compact", true) + IND.program.get_model("compact", true) ); } else { @@ -287,8 +295,8 @@ TEST(Variation, Mutation) "Initial Model: {}\n" "Mutated Model: {}\n", d, s, - PRG.get_model("compact", true), - Child.get_model("compact", true) + IND.program.get_model("compact", true), + Child.program.get_model("compact", true) ); fmt::print("child fit\n"); @@ -296,7 +304,7 @@ TEST(Variation, Mutation) y_pred = Child.predict(data); // no collateral effect (parent still the same) - ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); + ASSERT_TRUE(PRG_model == IND.program.get_model("compact", true)); } } } @@ -348,7 +356,9 @@ TEST(Variation, MutationSizeAndDepthLimit) RegressorProgram PRG = SS.make_regressor(0, 0, params); auto PRG_model = PRG.get_model("compact", true); - auto opt = variator.mutate(PRG); + + Individual IND(PRG); + auto opt = variator.mutate(IND); if (!opt){ fmt::print( @@ -357,7 +367,7 @@ TEST(Variation, MutationSizeAndDepthLimit) "Initial Model: {}\n" "Mutation failed to create a child", d, s, - PRG.get_model("compact", true) + IND.program.get_model("compact", true) ); } else { @@ -377,23 +387,23 @@ TEST(Variation, MutationSizeAndDepthLimit) "Mutated depth: {}\n" "Mutated size : {}\n", d, s, - PRG.get_model("compact", true), - Child.get_model("compact", true), - Child.depth(), - Child.size() + IND.program.get_model("compact", true), + Child.program.get_model("compact", true), + Child.program.depth(), + Child.program.size() ); // Original didn't change - ASSERT_TRUE(PRG_model == PRG.get_model("compact", true)); + ASSERT_TRUE(PRG_model == IND.program.get_model("compact", true)); - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s); + ASSERT_TRUE(Child.program.size() > 0); + ASSERT_TRUE(Child.program.size() <= s); - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s); + ASSERT_TRUE(Child.program.size() > 0); + ASSERT_TRUE(Child.program.size() <= s); - ASSERT_TRUE(Child.depth() >= 0); - ASSERT_TRUE(Child.depth() <= d); + ASSERT_TRUE(Child.program.depth() >= 0); + ASSERT_TRUE(Child.program.depth() <= d); } } } @@ -451,7 +461,10 @@ TEST(Variation, Crossover) ArrayXf y_pred = PRG1.predict(data); fmt::print("cross one\n"); - auto opt = variator.cross(PRG1, PRG2); + Individual IND1(PRG1); + Individual IND2(PRG2); + auto opt = variator.cross(IND1, IND2); + if (!opt){ fmt::print( "=================================================\n" @@ -460,8 +473,8 @@ TEST(Variation, Crossover) "Original model 2: {}\n", "Crossover failed to create a child", d, s, - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) + IND1.program.get_model("compact", true), + IND2.program.get_model("compact", true) ); } else { @@ -470,20 +483,20 @@ TEST(Variation, Crossover) fmt::print( "Original model 1 after cross: {}\n" "Original model 2 after cross: {}\n", - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) + IND1.program.get_model("compact", true), + IND2.program.get_model("compact", true) ); fmt::print( "Crossed Model: {}\n" "=================================================\n", - Child.get_model("compact", true) + Child.program.get_model("compact", true) ); Child.fit(data); auto child_pred1 = Child.predict(data); // no collateral effect (parent still the same) - ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); - ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); + ASSERT_TRUE(PRG1_model == IND1.program.get_model("compact", true)); + ASSERT_TRUE(PRG2_model == IND2.program.get_model("compact", true)); } } } @@ -546,7 +559,9 @@ TEST(Variation, CrossoverSizeAndDepthLimit) ); fmt::print("cross\n"); - auto opt = variator.cross(PRG1, PRG2); + Individual IND1(PRG1); + Individual IND2(PRG2); + auto opt = variator.cross(IND1, IND2); if (!opt){ fmt::print("Crossover failed to create a child" @@ -560,20 +575,20 @@ TEST(Variation, CrossoverSizeAndDepthLimit) "Child Model depth: {}\n" "Child Model size : {}\n" "=================================================\n", - Child.get_model("compact", true), - Child.depth(), Child.size() + Child.program.get_model("compact", true), + Child.program.depth(), Child.program.size() ); // Original didn't change - ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); - ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); + ASSERT_TRUE(PRG1_model == IND1.program.get_model("compact", true)); + ASSERT_TRUE(PRG2_model == IND2.program.get_model("compact", true)); // Child is within restrictions - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s + 3*max_arity); + ASSERT_TRUE(Child.program.size() > 0); + ASSERT_TRUE(Child.program.size() <= s + 3*max_arity); - ASSERT_TRUE(Child.depth() >= 0); - ASSERT_TRUE(Child.depth() <= d); + ASSERT_TRUE(Child.program.depth() >= 0); + ASSERT_TRUE(Child.program.depth() <= d); } } } diff --git a/tests/python/test_deap_api.py b/tests/python/test_deap_api.py index 06d32438..0aaa8f2f 100644 --- a/tests/python/test_deap_api.py +++ b/tests/python/test_deap_api.py @@ -127,7 +127,7 @@ def test_fixed_nodes(setup, fixed_node, brush_args, request): # Crossover cxmen = [] - [cxmen.extend(est.toolbox_.mate(c1, c2)) + [cxmen.append(est.toolbox_.mate(c1, c2)) for (c1, c2) in zip(clones[::2], clones[1::2])] cxmen = [x for x in cxmen if x is not None] assert len(cxmen) > 0, "Crossover didn't worked for any individual" From ce838079fd2cde68e4d62cb42dbb028d5b8544b6 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Thu, 8 Feb 2024 21:43:54 -0300 Subject: [PATCH 112/199] Implemented interface to evaluate in c++ --- pybrush/DeapEstimator.py | 34 ++++------ pybrush/__init__.py | 1 + src/bindings/bind_evaluator.cpp | 16 +++++ src/bindings/bind_evaluator.h | 20 ++++++ src/bindings/module.cpp | 4 +- src/data/data.cpp | 3 + src/eval/evaluation.cpp | 93 +++++++++----------------- src/eval/evaluation.h | 38 +++++------ src/eval/metrics.cpp | 111 +++++++++++++++++++++++++++++++- src/eval/metrics.h | 22 ++++++- src/eval/scorer.h | 64 +++++++++++++++--- src/individual.h | 28 ++++---- src/program/program.h | 3 + tests/cpp/test_population.cpp | 3 +- 14 files changed, 311 insertions(+), 129 deletions(-) create mode 100644 src/bindings/bind_evaluator.cpp create mode 100644 src/bindings/bind_evaluator.h diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 55851049..b62162e5 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -139,6 +139,7 @@ def __init__( validation_size: float = 0.0, batch_size: float = 1.0 ): + self.pop_size=pop_size self.gens=gens self.verbosity=verbosity @@ -171,10 +172,9 @@ def _setup_toolbox(self, data_train, data_validation): # create Individual class, inheriting from self.Individual with a fitness attribute if self.mode == 'classification': - if self.n_classes_ == 2: - creator.create("Individual", ClassifierIndividual) - else: - creator.create("Individual", MultiClassifierIndividual) + creator.create("Individual", ClassifierIndividual + if self.n_classes_ == 2 else + MultiClassifierIndividual) else: creator.create("Individual", RegressorIndividual) @@ -261,11 +261,7 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) - # TODO: getters and setters in parameters. Use them to take the arguments from python and save in the cpp backend - # TODO: use variation operator here instead of these functions - # TODO: store parameters in Parameter and use it to create the variator, selector, survivor, etc. self.parameters_ = _brush.Parameters() - self.parameters_.pop_size = self.pop_size self.parameters_.gens = self.gens self.parameters_.num_islands = self.num_islands @@ -316,12 +312,12 @@ def fit(self, X, y): range(len(points)), key=lambda index: (points[index][0], points[index][1]) ) - self.best_estimator_ = self.archive_[final_ind_idx].program + self.best_estimator_ = self.archive_[final_ind_idx] if self.verbosity > 0: - print(f'best model {self.best_estimator_.get_model()}' + - f' with size {self.best_estimator_.size()}, ' + - f' depth {self.best_estimator_.depth()}, ' + + print(f'best model {self.best_estimator_.program.get_model()}' + + f' with size {self.best_estimator_.program.size()}, ' + + f' depth {self.best_estimator_.program.depth()}, ' + f' and fitness {self.archive_[final_ind_idx].fitness}') return self @@ -360,14 +356,6 @@ def _make_individual(self): # No arguments (or zero): brush will use PARAMS passed in set_params. # max_size is sampled between 1 and params['max_size'] if zero is provided - # return creator.Individual( - # self.search_space_.make_classifier( - # self.max_depth,(0 if self.initialization=='uniform' else self.max_size)) - # if self.n_classes_ == 2 else - # self.search_space_.make_multiclass_classifier( - # self.max_depth, (0 if self.initialization=='uniform' else self.max_size)) - # ) - ind = creator.Individual() ind.init(self.search_space_, self.parameters_) ind.objectives = self.objectives @@ -389,7 +377,7 @@ def predict(self, X): # data = self._make_data(X, feature_names=self.feature_names_) - return self.best_estimator_.predict(data) + return self.best_estimator_.program.predict(data) # def _setup_population(self): # """initialize programs""" @@ -435,6 +423,8 @@ class DeapClassifier(DeapEstimator,ClassifierMixin): def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) + # TODO: test with number of islands =1 + def _error(self, ind, data: _brush.Dataset): #return (data.y==ind.program.predict(data)).sum() / data.y.shape[0] return average_precision_score(data.y, ind.program.predict(data)) @@ -484,7 +474,7 @@ def predict_proba(self, X): # data = self._make_data(X, feature_names=self.feature_names_) - prob = self.best_estimator_.predict_proba(data) + prob = self.best_estimator_.program.predict_proba(data) if self.n_classes_ <= 2: prob = np.hstack( (np.ones(X.shape[0]).reshape(-1,1), prob.reshape(-1,1)) ) diff --git a/pybrush/__init__.py b/pybrush/__init__.py index 9d80e5c6..219c38e1 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -2,6 +2,7 @@ from _brush import Dataset from _brush import SearchSpace from _brush import Parameters +from _brush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator # Individuals from _brush.individual import RegressorIndividual, \ diff --git a/src/bindings/bind_evaluator.cpp b/src/bindings/bind_evaluator.cpp new file mode 100644 index 00000000..ae8a6450 --- /dev/null +++ b/src/bindings/bind_evaluator.cpp @@ -0,0 +1,16 @@ +#include "module.h" +#include "bind_evaluator.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +using stream_redirect = py::call_guard; + +void bind_evaluators(py::module &m) +{ + bind_evaluator(m, "RegressorEvaluator"); + bind_evaluator(m, "ClassifierEvaluator"); + bind_evaluator(m, "MultiClassifierEvaluator"); + bind_evaluator(m, "RepresenterEvaluator"); +} \ No newline at end of file diff --git a/src/bindings/bind_evaluator.h b/src/bindings/bind_evaluator.h new file mode 100644 index 00000000..4e02b5b5 --- /dev/null +++ b/src/bindings/bind_evaluator.h @@ -0,0 +1,20 @@ +#include "module.h" +#include "../eval/evaluation.h" +#include "../eval/evaluation.cpp" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +using stream_redirect = py::call_guard; + +template +void bind_evaluator(py::module& m, string name) +{ + using Class = br::Eval::Evaluation; + + py::class_ eval(m, name.data() ); + eval.def(py::init<>()) + .def("assign_fit", &Class::assign_fit) + ; +} \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 30c8e926..5b921432 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -24,6 +24,7 @@ void bind_selections(py::module &); void bind_individuals(py::module &); void bind_populations(py::module &); void bind_estimators(py::module &); +void bind_evaluators(py::module &); PYBIND11_MODULE(_brush, m) { @@ -41,7 +42,7 @@ PYBIND11_MODULE(_brush, m) { bind_search_space(m); bind_variations(m); // bind_selections(m); - bind_populations(m); + // bind_populations(m); // solutions py::module_ m2 = m.def_submodule("program", "Contains Program classes."); @@ -51,4 +52,5 @@ PYBIND11_MODULE(_brush, m) { bind_individuals(m3); // bind_estimators(m); + bind_evaluators(m); } diff --git a/src/data/data.cpp b/src/data/data.cpp index 858ba126..7819be7e 100644 --- a/src/data/data.cpp +++ b/src/data/data.cpp @@ -140,6 +140,9 @@ Dataset Dataset::operator()(const vector& idx) const return Dataset(new_features, new_y, this->classification); } + +// TODO: i need to improve how get batch works. Maybe a function to update batch indexes, and always using the same dataset? +// TODO: also, i need to make sure the get batch will sample only from training data and not test Dataset Dataset::get_batch() const { // will always return a new dataset, even when use_batch is false (this case, returns itself) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index e1fd0355..68bc0f72 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -4,49 +4,7 @@ namespace Brush{ namespace Eval{ -template -void Evaluation::validation(Population& pop, - int island, - const Dataset& data, - const Parameters& params, - bool offspring - ) -{ - auto idxs = pop.get_island_indexes(island); - - int start = 0; - if (offspring) - start = idxs.size()/2; - - for (unsigned i = start; i& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work - - // if there is no validation data, - // set loss_v to loss and return ( this assumes that loss on train was calculated previously.) - if (!data.use_validation) - { - ind.loss_v = ind.loss; - continue; - } - - bool pass = true; - - if (!pass) - { - // TODO: stop doing this hardcoded? - ind.loss_v = MAX_FLT; - } - else - { - // TODO: implement the class weights and use it here (and on loss) - VectorXf y_pred = ind.program.predict(data.get_validation_data()); - assign_fit(ind, y_pred, data, params, true); - } - // ind.set_obj(params.objectives); - } -} - +// TODO: merge validation and update fitness into one function // fitness of population template void Evaluation::update_fitness(Population& pop, @@ -54,7 +12,8 @@ void Evaluation::update_fitness(Population& pop, const Dataset& data, const Parameters& params, bool fit, - bool offspring + bool offspring, + bool validation ) { //TODO: it could use the validation_loss @@ -74,6 +33,7 @@ void Evaluation::update_fitness(Population& pop, { // TODO: check if score was nan and assign the max float ind.fitness.loss = MAX_FLT; + ind.fitness.loss_v = MAX_FLT; ind.error = MAX_FLT*VectorXf::Ones(data.y.size()); } else @@ -82,32 +42,40 @@ void Evaluation::update_fitness(Population& pop, if (fit) ind.program.fit(data); - VectorXf y_pred = ind.program.predict(data.get_training_data()); - assign_fit(ind, y_pred, data, params, false); + assign_fit(ind, data, params, validation); } } } // assign loss to program template -void Evaluation::assign_fit(Individual& ind, - VectorXf& y_pred, const Dataset& data, - const Parameters& params, bool val) +void Evaluation::assign_fit(Individual& ind, const Dataset& data, + const Parameters& params, bool val) { VectorXf loss; - - float f = S.score(data.y, y_pred, loss, params.class_weights); + using PT = ProgramType; - if (val) - { // TODO: use this function to decide wether to take loss from validation or training - ind.fitness.loss_v = f; - } - else - { - // TODO: setter for loss and loss_v - ind.fitness.loss = f; - ind.error = loss; - } + // we want the predict proba + using RetType = + typename std::conditional_t>>>; + + auto validation = data.get_validation_data(); + RetType y_pred_validation = ind.predict(validation).template cast(); + float f_v = S.score(validation.y, y_pred_validation, loss, params.class_weights); + + // TODO: implement the class weights and use it here (and on loss) + auto train = data.get_training_data(); + RetType y_pred = ind.predict(train).template cast(); + float f = S.score(train.y, y_pred, loss, params.class_weights); + + // TODO: setter for loss and loss_v + ind.error = loss; + ind.fitness.loss = f; + ind.fitness.loss_v = f_v; ind.fitness.size = ind.program.size(); ind.fitness.complexity = ind.program.complexity(); ind.fitness.depth = ind.program.depth(); @@ -120,8 +88,7 @@ void Evaluation::assign_fit(Individual& ind, for (const auto& n : ind.get_objectives()) { if (n.compare("error")==0) - values.push_back(f); // fitness on training data, not validation. - // if you use batch, this value will change every generation + values.push_back(val ? f_v : f); else if (n.compare("complexity")==0) values.push_back(ind.program.complexity()); else if (n.compare("size")==0) diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index c421db52..e4b91cd4 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -6,7 +6,6 @@ #include "../search_space.h" #include "../individual.h" -#include "../program/program.h" #include "../data/data.h" #include "scorer.h" #include "../population.h" @@ -22,19 +21,22 @@ namespace Eval { template class Evaluation { public: - Scorer S; - - Evaluation(string scorer="mse"): S(scorer) { this->S.set_scorer(scorer); }; + Scorer S; + + Evaluation(){ + string scorer; + if ( (T == Brush::ProgramType::MulticlassClassifier) + || (T == Brush::ProgramType::Representer) ) + scorer = "multi_log"; + else if (T == Brush::ProgramType::BinaryClassifier) + scorer = "log"; + else + scorer = "mse"; + + this->S.set_scorer(scorer); + }; ~Evaluation(){}; - /// validation of population. - void validation(Population& pop, - int island, - const Dataset& data, - const Parameters& params, - bool offspring = false - ); - // TODO: set objectives // TODO: evaluation bind // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps) @@ -45,15 +47,15 @@ class Evaluation { const Dataset& data, const Parameters& params, bool fit=true, - bool offspring = false + bool offspring = false, + bool validation=false ); - // TODO: implement other eval methods - - /// assign fitness to an individual. - void assign_fit(Individual& ind, VectorXf& y_pred, - const Dataset& data, const Parameters& params, bool val=false); + /// assign fitness to an individual. + void assign_fit(Individual& ind, const Dataset& data, + const Parameters& params, bool val=false); + // representation program (TODO: implement) }; } //selection diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index 8375dc26..96e3014c 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -7,12 +7,121 @@ namespace Eval { /// mean squared error float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, - const vector& weights) + const vector& class_weights) { loss = (yhat - y).array().pow(2); return loss.mean(); } +VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, + const vector& class_weights) +{ + float eps = pow(10,-10); + + VectorXf loss; + + float sum_weights = 0; + loss.resize(y.rows()); + for (unsigned i = 0; i < y.rows(); ++i) + { + if (predict_proba(i) < eps || 1 - predict_proba(i) < eps) + // clip probabilities since log loss is undefined for predict_proba=0 or predict_proba=1 + loss(i) = -(y(i)*log(eps) + (1-y(i))*log(1-eps)); + else + loss(i) = -(y(i)*log(predict_proba(i)) + (1-y(i))*log(1-predict_proba(i))); + if (loss(i)<0) + std::runtime_error("loss(i)= " + to_string(loss(i)) + + ". y = " + to_string(y(i)) + ", predict_proba(i) = " + + to_string(predict_proba(i))); + + if (!class_weights.empty()) + { + loss(i) = loss(i) * class_weights.at(y(i)); + sum_weights += class_weights.at(y(i)); + } + } + + if (sum_weights > 0) + loss = loss.array() / sum_weights * y.size(); // normalize weight contributions + + return loss; +} + +/// log loss +float mean_log_loss(const VectorXf& y, + const VectorXf& predict_proba, VectorXf& loss, + const vector& class_weights) +{ + + /* std::cout << "loss: " << loss.transpose() << "\n"; */ + loss = log_loss(y,predict_proba,class_weights); + return loss.mean(); +} + + +// multinomial log loss +VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, + const vector& class_weights) +{ + VectorXf loss = VectorXf::Zero(y.rows()); + + // TODO: needs to be the index of unique elements + // get class labels + // vector uc = unique( ArrayXi(y.cast()) ); + + // float eps = pow(10,-10); + // float sum_weights = 0; + // for (unsigned i = 0; i < y.rows(); ++i) + // { + // for (const auto& c : uc) + // { + // // for specific class + // ArrayXf yhat = predict_proba.col(int(c)); + // /* std::cout << "class " << c << "\n"; */ + + // /* float yi = y(i) == c ? 1.0 : 0.0 ; */ + // /* std::cout << "yi: " << yi << ", yhat(" << i << "): " << yhat(i) ; */ + // if (y(i) == c) + // { + // if (yhat(i) < eps || 1 - yhat(i) < eps) + // { + // // clip probabilities since log loss is undefined for yhat=0 or yhat=1 + // loss(i) += -log(eps); + // } + // else + // { + // loss(i) += -log(yhat(i)); + // } + // /* std::cout << ", loss(" << i << ") = " << loss(i); */ + // } + // /* std::cout << "\n"; */ + // } + // if (!class_weights.empty()){ + // /* std::cout << "weights.at(y(" << i << ")): " << class_weights.at(y(i)) << "\n"; */ + // loss(i) = loss(i)*class_weights.at(y(i)); + // sum_weights += class_weights.at(y(i)); + // } + // } + // if (sum_weights > 0) + // loss = loss.array() / sum_weights * y.size(); + + /* cout << "loss.mean(): " << loss.mean() << "\n"; */ + /* cout << "loss.sum(): " << loss.sum() << "\n"; */ + return loss; +} + + +float mean_multi_log_loss(const VectorXf& y, + const ArrayXXf& predict_proba, VectorXf& loss, + const vector& class_weights) +{ + loss = multi_log_loss(y, predict_proba, class_weights); + + /* std::cout << "loss: " << loss.transpose() << "\n"; */ + /* std::cout << "mean loss: " << loss.mean() << "\n"; */ + return loss.mean(); +} + // TODO: implement other metrics. Right know I have just the MSE diff --git a/src/eval/metrics.h b/src/eval/metrics.h index e640d19c..5565644a 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -10,8 +10,26 @@ namespace Eval { /// mean squared error float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, - const vector& weights=vector() ); - + const vector& class_weights=vector() ); + +// TODO: test cases for the metrics +// TODO: implement the metrics for classification + +/// log loss (2 methods below) +VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, + const vector& class_weights=vector()); + +float mean_log_loss(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, + const vector& class_weights = vector()); + +/// multinomial log loss (2 methods below) +VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, + const vector& class_weights=vector()); + +float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, + VectorXf& loss, + const vector& class_weights=vector()); + // TODO: implement other metrics. Right know I have just the MSE } // metrics diff --git a/src/eval/scorer.h b/src/eval/scorer.h index eb3f2298..2972af89 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -3,18 +3,21 @@ #include "metrics.h" #include "../util/error.h" +#include "../types.h" // code to evaluate GP programs. namespace Brush{ namespace Eval{ + +template // requires(P == PT::Regressor || P == PT::BinaryClassifier) +class Scorer +{ + typedef float (*funcPointer)(const VectorXf&, const VectorXf&, VectorXf&, const vector&); - -class Scorer -{ public: // map the string into a function to be called when calculating the score std::map score_hash; @@ -22,7 +25,10 @@ class Scorer // TODO: add more scores, include them here, add to score_hash Scorer(string scorer="mse") { + // TODO: use this idea of map functpointer to do the mutations score_hash["mse"] = &mse; + score_hash["log"] = &mean_log_loss; + // score_hash["multi_log"] = &mean_multi_log_loss; this->set_scorer(scorer); }; @@ -30,7 +36,7 @@ class Scorer void set_scorer(string scorer){ this->scorer = scorer; }; /* void set_scorer(string scorer); */ - float score(const VectorXf& y_true, VectorXf& y_pred, + float score(const VectorXf& y_true, const VectorXf& y_pred, VectorXf& loss, const vector& w) { // loss is an array passed by reference to store each prediction (used in lexicase) @@ -50,16 +56,54 @@ class Scorer return score_hash.at(this->scorer)(y_true, y_pred, loss, w); } }; +}; + + + +template + requires( P == PT::MulticlassClassifier || P == PT::Representer) +class Scorer

+{ - // overloaded score with no loss - float score(const VectorXf& y_true, VectorXf& y_pred, - vector w=vector()) +typedef float (*funcPointer)(const VectorXf&, + const ArrayXXf&, + VectorXf&, + const vector&); +public: + // map the string into a function to be called when calculating the score + std::map score_hash; + string scorer; + + Scorer(string scorer="multi_log") { + score_hash["multi_log"] = &mean_multi_log_loss; + + this->set_scorer(scorer); + }; + + void set_scorer(string scorer){ this->scorer = scorer; }; + + /* void set_scorer(string scorer); */ + float score(const VectorXf& y_true, const ArrayXXf& y_pred, + VectorXf& loss, const vector& w) { - VectorXf dummy; - return this->score(y_true, y_pred, dummy, w); + // loss is an array passed by reference to store each prediction (used in lexicase) + // weights are used to give more or less importance for a given sample. + // Every scorer must have the same function signature, but arent required to use all info + + if ( score_hash.find(this->scorer) == score_hash.end() ) + { + // not found + HANDLE_ERROR_THROW("Scoring function '" + this->scorer + + "' not defined"); + return 0.0; + } + else + { + // found + return score_hash.at(this->scorer)(y_true, y_pred, loss, w); + } }; }; - } } #endif diff --git a/src/individual.h b/src/individual.h index 7d60b0b0..a6a20b25 100644 --- a/src/individual.h +++ b/src/individual.h @@ -26,15 +26,6 @@ namespace Brush{ // TODO: move fitness to eval folder // TODO make a better use of this (in selection, when fitting, etc) (actually i need to start using it) struct Fitness { - // Static map for weights associated with strings - // TODO: weights for different values. loss should be calculated duing runtime, based on the metric - inline static std::map weightsMap = { - {"error", -1.0}, // error should be the common error metrics for class (acc) and regression (mse) by default - {"complexity", -1.0}, - {"size", -1.0} - // Add more key-value pairs as needed - }; - float loss; ///< aggregate loss score float loss_v; ///< aggregate validation loss score @@ -88,6 +79,7 @@ struct Fitness { return wvalues; } + // TODO: debug size, it is giving weird values // Method to set values void set_values(vector& v) { if (v.size() != weights.size()) { @@ -253,6 +245,20 @@ class Individual{ float crowding_dist; ///< crowding distance on the Pareto front + // Static map for weights associated with strings + // TODO: weights for different values. loss should be calculated duing runtime, based on the metric + inline static std::map weightsMap = []() { + std::map map = { + {"complexity", -1.0}, + {"size", -1.0} + // Add more key-value pairs as needed + }; + // example on how to have weight based on templated class + map["error"] = (T == Brush::ProgramType::Regressor) ? -1.0 : -1.0; + + return map; + }(); + vector get_objectives() const { return objectives; }; void set_objectives(vector objs){ objectives=objs; @@ -260,8 +266,8 @@ class Individual{ vector weights; weights.resize(0); for (const auto& obj : objectives) { - auto it = Fitness::weightsMap.find(obj); - if (it != Fitness::weightsMap.end()) { + auto it = weightsMap.find(obj); + if (it != weightsMap.end()) { weights.push_back(it->second); } else { // TODO: throw error here, unknown objective diff --git a/src/program/program.h b/src/program/program.h index edba219d..82b8bc06 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -57,6 +57,7 @@ template struct Program std::conditional_t>>>; + /// the type of output from the tree object using TreeType = std::conditional_t struct Program return out; } + // TODO: delete this declarations //////////////////////////////////////////////////////////////////////////// // Mutation & Crossover @@ -542,6 +544,7 @@ void Program::update_weights(const Dataset& d) }; +// TODO: delete this declarations //////////////////////////////////////////////////////////////////////////////// // mutation and crossover // template diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index acdcadce..3e6e5466 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -36,7 +36,7 @@ TEST(Population, PopulationTests) Population pop = Population(); // aux classes (they are not tested in-depth in this file) - Evaluation evaluator = Evaluation(params.scorer_); + Evaluation evaluator = Evaluation(); Selection selector = Selection(params.sel, false); Selection survivor = Selection(params.surv, true); Variation variator = Variation(params, SS); @@ -81,6 +81,7 @@ TEST(Population, PopulationTests) fmt::print("Island {}, individuals {}\n", j, pop.get_island_indexes(j)); // we can calculate the fitness for each island + // TODO: have a flag that is set to false everytime we change the individual, and true when we already evaluated it on training. Use this flag to avoid evaluating the same individual multiple times (specially because we have a parameter tuning step, which can give a leverage to individuals evaluated several times, as it will have more iterations in the gradient descent optimization) fmt::print("Fitness\n"); evaluator.update_fitness(pop, j, data, params, true, false); } From 5f392e21b47dae77d07d1a03bb04d91f57535066 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sat, 10 Feb 2024 09:13:26 -0300 Subject: [PATCH 113/199] Fixed evaluator not working for classification --- src/bindings/bind_evaluator.h | 1 + src/eval/evaluation.cpp | 20 +++----- src/eval/evaluation.h | 2 + src/eval/scorer.h | 86 +++++++++++++++++++++++++++++++++-- 4 files changed, 92 insertions(+), 17 deletions(-) diff --git a/src/bindings/bind_evaluator.h b/src/bindings/bind_evaluator.h index 4e02b5b5..139d9f7f 100644 --- a/src/bindings/bind_evaluator.h +++ b/src/bindings/bind_evaluator.h @@ -16,5 +16,6 @@ void bind_evaluator(py::module& m, string name) py::class_ eval(m, name.data() ); eval.def(py::init<>()) .def("assign_fit", &Class::assign_fit) + .def_property("scorer", &Class::get_scorer, &Class::set_scorer) ; } \ No newline at end of file diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index 68bc0f72..af394b6d 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -55,22 +55,13 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, VectorXf loss; using PT = ProgramType; - // we want the predict proba - using RetType = - typename std::conditional_t>>>; - - auto validation = data.get_validation_data(); - RetType y_pred_validation = ind.predict(validation).template cast(); - float f_v = S.score(validation.y, y_pred_validation, loss, params.class_weights); + Dataset validation = data.get_validation_data(); + float f_v = S.score(ind, validation, loss, params); // TODO: implement the class weights and use it here (and on loss) - auto train = data.get_training_data(); - RetType y_pred = ind.predict(train).template cast(); - float f = S.score(train.y, y_pred, loss, params.class_weights); + + Dataset train = data.get_training_data(); + float f = S.score(ind, train, loss, params); // TODO: setter for loss and loss_v ind.error = loss; @@ -103,5 +94,6 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, ind.fitness.set_values(values); } + } // Pop } // Brush \ No newline at end of file diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index e4b91cd4..911ed744 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -37,6 +37,8 @@ class Evaluation { }; ~Evaluation(){}; + void set_scorer(string scorer){this->S.set_scorer(scorer);}; + string get_scorer(){return this->S.get_scorer();}; // TODO: set objectives // TODO: evaluation bind // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps) diff --git a/src/eval/scorer.h b/src/eval/scorer.h index 2972af89..e00fa961 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -4,9 +4,13 @@ #include "metrics.h" #include "../util/error.h" #include "../types.h" +// #include "../individual.h" // code to evaluate GP programs. namespace Brush{ + +using namespace Pop; + namespace Eval{ @@ -14,6 +18,11 @@ template // requires(P == PT::Regressor || P == PT::BinaryClassi class Scorer { +using RetType = + typename std::conditional_t

>; + typedef float (*funcPointer)(const VectorXf&, const VectorXf&, VectorXf&, @@ -26,14 +35,14 @@ typedef float (*funcPointer)(const VectorXf&, // TODO: add more scores, include them here, add to score_hash Scorer(string scorer="mse") { // TODO: use this idea of map functpointer to do the mutations - score_hash["mse"] = &mse; - score_hash["log"] = &mean_log_loss; + score_hash["mse"] = &mse; // score_hash["multi_log"] = &mean_multi_log_loss; this->set_scorer(scorer); }; void set_scorer(string scorer){ this->scorer = scorer; }; + string get_scorer(){return this->scorer; }; /* void set_scorer(string scorer); */ float score(const VectorXf& y_true, const VectorXf& y_pred, @@ -56,15 +65,76 @@ typedef float (*funcPointer)(const VectorXf&, return score_hash.at(this->scorer)(y_true, y_pred, loss, w); } }; + + float score(Individual

& ind, Dataset& data, + VectorXf& loss, const Parameters& params) + { + RetType y_pred = ind.predict(data); + return score(data.y, y_pred, loss, params.class_weights); + } }; +// TODO: improve this so we dont have a lot of different declarations +template + requires( P == PT::BinaryClassifier) +class Scorer

+{ + +using RetType = ArrayXf; + +typedef float (*funcPointer)(const VectorXf&, + const VectorXf&, + VectorXf&, + const vector&); +public: + // map the string into a function to be called when calculating the score + std::map score_hash; + string scorer; + + Scorer(string scorer="multi_log") { + score_hash["log"] = &mean_log_loss; + + this->set_scorer(scorer); + }; + + void set_scorer(string scorer){ this->scorer = scorer; }; + string get_scorer(){return this->scorer; }; + + /* void set_scorer(string scorer); */ + float score(const VectorXf& y_true, const VectorXf& y_pred, + VectorXf& loss, const vector& w) + { + if ( score_hash.find(this->scorer) == score_hash.end() ) + { + // not found + HANDLE_ERROR_THROW("Scoring function '" + this->scorer + + "' not defined"); + return 0.0; + } + else + { + // found + return score_hash.at(this->scorer)(y_true, y_pred, loss, w); + } + }; + + float score(Individual

& ind, Dataset& data, + VectorXf& loss, const Parameters& params) + { + // TODO: individual should have a wrapper to predict proba + RetType y_pred = ind.program.predict_proba(data); // .template cast(); + return score(data.y, y_pred, loss, params.class_weights); + } +}; template - requires( P == PT::MulticlassClassifier || P == PT::Representer) + requires(P == PT::MulticlassClassifier) class Scorer

{ +using RetType = ArrayXXf; + typedef float (*funcPointer)(const VectorXf&, const ArrayXXf&, VectorXf&, @@ -81,6 +151,7 @@ typedef float (*funcPointer)(const VectorXf&, }; void set_scorer(string scorer){ this->scorer = scorer; }; + string get_scorer(){return this->scorer; }; /* void set_scorer(string scorer); */ float score(const VectorXf& y_true, const ArrayXXf& y_pred, @@ -103,7 +174,16 @@ typedef float (*funcPointer)(const VectorXf&, return score_hash.at(this->scorer)(y_true, y_pred, loss, w); } }; + + float score(Individual

& ind, Dataset& data, + VectorXf& loss, const Parameters& params) + { + // TODO: individual should have a wrapper to predict proba + RetType y_pred = ind.program.predict_proba(data); // .template cast(); + return score(data.y, y_pred, loss, params.class_weights); + } }; + } } #endif From 774c4482d52b4ac3aecf26266ac5c3f4e967687d Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sat, 10 Feb 2024 12:43:42 -0300 Subject: [PATCH 114/199] Fitness evaluation is now in c++ --- pybrush/DeapEstimator.py | 88 ++++++++++----------------------- pybrush/deap_api/nsga2.py | 39 +++------------ src/bindings/bind_variation.cpp | 1 - src/eval/metrics.h | 1 + 4 files changed, 32 insertions(+), 97 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index b62162e5..e69d1be3 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -15,10 +15,12 @@ from types import NoneType from sklearn.metrics import average_precision_score from sklearn.preprocessing import MinMaxScaler -import _brush -from pybrush.deap_api import nsga2, DeapIndividual +import _brush # TODO: stop using _brush and use whats in pybrush +import functools +from pybrush.deap_api import nsga2 # from _brush import Dataset, SearchSpace from pybrush import RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual +from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator # TODO: LOGGER AND ARCHIVE @@ -160,7 +162,7 @@ def __init__( self.validation_size=validation_size - def _setup_toolbox(self, data_train, data_validation): + def _setup_toolbox(self): """Setup the deap toolbox""" toolbox: base.Toolbox = base.Toolbox() @@ -175,8 +177,19 @@ def _setup_toolbox(self, data_train, data_validation): creator.create("Individual", ClassifierIndividual if self.n_classes_ == 2 else MultiClassifierIndividual) + self.eval_ = ( ClassifierEvaluator() + if self.n_classes_ == 2 else + MultiClassifierEvaluator() ) else: creator.create("Individual", RegressorIndividual) + self.eval_ = RegressorEvaluator() + + def assign_fit(ind, validation=False): + ind.program.fit(self.data_.get_training_data()) + self.eval_.assign_fit(ind, self.data_, self.parameters_, validation) + return ind + + toolbox.register("assign_fit", assign_fit) toolbox.register("Clone", lambda ind: creator.Individual(ind.program.copy())) @@ -199,9 +212,6 @@ def offspring(pop, MU): return pop[-MU:] toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) toolbox.register("get_objectives", lambda: self.objectives) - toolbox.register("getBatch", data_train.get_batch) - toolbox.register("evaluate", self._fitness_function, data=data_train) - toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation) return toolbox @@ -245,18 +255,10 @@ def fit(self, X, y): # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. # self.functions_["Softmax"] = 1.0 - # Weight of each objective (+ for maximization, - for minimization) - obj_weight = { - "error" : +1.0 if self.mode=="classification" else -1.0, - "size" : -1.0, - "complexity" : -1.0 - } - self.weights = [obj_weight[w] for w in self.objectives] - # These have a default behavior to return something meaningfull if # no values are set self.train_ = self.data_.get_training_data() - self.train_.set_batch_size(self.batch_size) + self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation self.validation_ = self.data_.get_validation_data() self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) @@ -274,13 +276,16 @@ def fit(self, X, y): self.parameters_.mutation_probs = self.mutation_probs if self.mode == "classification": - self.variator_ = _brush.ClassifierVariator(self.parameters_, self.search_space_) + self.variator_ = (_brush.ClassifierVariator + if self.n_classes_ == 2 else + _brush.MultiClassifierVariator + )(self.parameters_, self.search_space_) elif self.mode == "regressor": self.variator_ = _brush.RegressorVariator(self.parameters_, self.search_space_) else: raise("Unsupported mode") - self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) + self.toolbox_ = self._setup_toolbox() # nsga2 and ga differ in the toolbox self.archive_, self.logbook_ = nsga2( @@ -291,8 +296,8 @@ def fit(self, X, y): # Each individual is a point in the Multi-Objective space. We multiply # the fitness by the weights so greater numbers are always better - points = np.array([self.toolbox_.evaluateValidation(ind) for ind in self.archive_]) - points = points*np.array(self.weights) + points = np.array([self.toolbox_.assign_fit(ind, True).fitness.wvalues + for ind in self.archive_]) if self.validation_size==0.0: # Using the multi-criteria decision making on training data # Selecting the best estimator using training data @@ -424,27 +429,7 @@ def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) # TODO: test with number of islands =1 - - def _error(self, ind, data: _brush.Dataset): - #return (data.y==ind.program.predict(data)).sum() / data.y.shape[0] - return average_precision_score(data.y, ind.program.predict(data)) - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - - ind_objectives = { - "error" : self._error(ind, data), - "size" : ind.program.size(), - "complexity": ind.program.complexity() - } - return [ ind_objectives[obj] for obj in self.objectives ] - - def _fitness_function(self, ind, data: _brush.Dataset): - ind.program.fit(data) - - return self._fitness_validation(ind, data) - - + def predict_proba(self, X): """Predict class probabilities for X. @@ -502,29 +487,6 @@ class DeapRegressor(DeapEstimator, RegressorMixin): def __init__(self, **kwargs): super().__init__(mode='regressor',**kwargs) - def _error(self, ind, data: _brush.Dataset): - MSE = np.mean( (data.y-ind.program.predict(data))**2 ) - if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf - MSE = np.inf - - return MSE - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - - ind_objectives = { - "error" : self._error(ind, data), - "size" : ind.program.size(), - "complexity": ind.program.complexity() - } - return [ ind_objectives[obj] for obj in self.objectives ] - - def _fitness_function(self, ind, data: _brush.Dataset): - ind.program.fit(data) - - return self._fitness_validation(ind, data) - - # Under development # class DeapRepresenter(DeapEstimator, TransformerMixin): # """Deap-based Brush for representation learning. diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index d8109fc0..6211fe08 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -16,7 +16,8 @@ def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): def calculate_statistics(ind): on_train = ind.fitness.values - on_val = toolbox.evaluateValidation(ind) + # TODO: make this work again + on_val = ind.fitness.values #toolbox.evaluateValidation(ind) return (*on_train, *on_val) @@ -36,14 +37,7 @@ def calculate_statistics(ind): for objective in toolbox.get_objectives()] pop = toolbox.population(n=MU) - - # OBS: evaluate calls fit in the individual. It is different from using it to predict. The - # function evaluateValidation don't call the fit - fitnesses = toolbox.map(functools.partial(toolbox.evaluate), pop) - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # print(0, pop[0].fitness.values, pop[0].fitness.weights) + pop = list(toolbox.map(toolbox.assign_fit, pop)) # This is just to assign the crowding distance to the individuals # no actual selection is done @@ -57,41 +51,20 @@ def calculate_statistics(ind): # Begin the generational process for gen in range(1, NGEN): - batch = toolbox.getBatch() # batch will be a random subset only if it was not defined as the size of the train set. - # everytime this function is called, a new random batch is generated. - if (use_batch): # recalculate the fitness for the parents - # use_batch is false if batch_size is different from train set size. - # If we're using batch, we need to re-evaluate every model (without changing its weights). - # evaluateValidation doesnt fit the weights - fitnesses = toolbox.map( - functools.partial(toolbox.evaluateValidation, data=batch), pop) - - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # print(1, pop[0].fitness.values, pop[0].fitness.weights) - # Vary the population - # offspring = tools.selTournamentDCD(pop, len(pop)) parents = toolbox.select(pop, len(pop)) - # offspring = [toolbox.clone(ind) for ind in offspring] offspring = [] for ind1, ind2 in zip(parents, parents[1:]): off = None - if rnd_flt() < CXPB: # either mutation or crossover. + if rnd_flt() < CXPB: # either mutation or crossover off = toolbox.mate(ind1, ind2) else: off = toolbox.mutate(ind1) - if off is not None: # Mutation worked. first we fit, then add to offspring - # Evaluate (instead of evaluateValidation) to fit the weights of the offspring - off.fitness.values = toolbox.evaluate(off) - if use_batch: # Adjust fitness to the same data as parents - off.fitness.values = toolbox.evaluateValidation(off, data=batch) + if off is not None: # first we fit, then add to offspring offspring.extend([off]) - # print(2, offspring[0].fitness.values, offspring[0].fitness.weights) - + offspring = list(toolbox.map(toolbox.assign_fit, offspring)) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) pop = toolbox.survive(pop + offspring, MU) diff --git a/src/bindings/bind_variation.cpp b/src/bindings/bind_variation.cpp index 0b66a1b7..0a772c7c 100644 --- a/src/bindings/bind_variation.cpp +++ b/src/bindings/bind_variation.cpp @@ -14,7 +14,6 @@ void bind_variations(py::module& m) { bind_variation(m, "RegressorVariator"); bind_variation(m, "ClassifierVariator"); - bind_variation(m, "MultiClassifierVariator"); bind_variation(m, "RepresenterVariator"); } \ No newline at end of file diff --git a/src/eval/metrics.h b/src/eval/metrics.h index 5565644a..79d41378 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -30,6 +30,7 @@ float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, VectorXf& loss, const vector& class_weights=vector()); +// TODO: average_precision_score for classification // TODO: implement other metrics. Right know I have just the MSE } // metrics From 3388f5a374ff8b0af880e49aed585419c9e60d5f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 23 Feb 2024 19:04:16 -0300 Subject: [PATCH 115/199] Fixed ceres version (newer is crashing due google docs dep) --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 8e6a62d6..34f9e7bb 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - gcc >= 12.0 - gxx >= 12.0 - ninja - - ceres-solver + - ceres-solver=2.1.0 - taskflow - pybind11 #=2.6.2 - pytest #=6.2.4 From b6dcd9b47f8ebe447beccd0b624ff935d1040a0d Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 23 Feb 2024 19:05:51 -0300 Subject: [PATCH 116/199] Implemented working selection and survival in cpp --- src/estimator.cpp | 6 ++-- src/individual.cpp | 8 +++-- src/individual.h | 29 +++++++++++++---- src/params.h | 5 ++- src/population.cpp | 41 ++++++++++++++++++----- src/population.h | 5 ++- src/selection/nsga2.cpp | 72 ++++++++++++++++++++++++++++++++--------- src/selection/nsga2.h | 6 ++-- 8 files changed, 130 insertions(+), 42 deletions(-) diff --git a/src/estimator.cpp b/src/estimator.cpp index c542665e..4919bcf7 100644 --- a/src/estimator.cpp +++ b/src/estimator.cpp @@ -50,9 +50,9 @@ bool Estimator::update_best(const Dataset& data, bool val) if (ind.rank == 1) { if (val) - f = ind.fitness_v; + f = ind.fitness.loss_v; else - f = ind.fitness; + f = ind.fitness.loss; if (f < bs || (f == bs && ind.get_complexity() < this->best_complexity) @@ -103,7 +103,7 @@ void Estimator::run_generation(unsigned int g, Dataset &data) island_parents.at(island) = parents; }); - vector survivors(pop.size()); + vector survivors(params.pop_size); // TODO: check that I dont use pop.size() (or I use correctly, because it will return the size with the slots for the offspring) pop.add_offspring_indexes(); taskflow.for_each_index(0, pop.num_islands, 1, [&](int island) { diff --git a/src/individual.cpp b/src/individual.cpp index b03e9186..89d06ab4 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -45,12 +45,16 @@ int Fitness::dominates(const Fitness& b) const // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) // TODO: save fitness in an temporary variable and stop accessing it everytime for (int i=0; i b.get_wvalues().at(i) + || std::isnan(b.get_wvalues().at(i)) ) flag1 = 1; - else if (get_wvalues().at(i) > b.get_wvalues().at(i)) + if (get_wvalues().at(i) < b.get_wvalues().at(i) + || std::isnan(get_wvalues().at(i)) ) flag2 = 1; } + // the proper way of comparing weighted values is considering everything as a maximization problem + // (this is like deap does, and our fitness is inspired by them) if (flag1==1 && flag2==0) // there is at least one smaller objective for this and none // for b diff --git a/src/individual.h b/src/individual.h index a6a20b25..b10c2501 100644 --- a/src/individual.h +++ b/src/individual.h @@ -34,18 +34,25 @@ struct Fitness { size_t complexity; size_t size; size_t depth; + unsigned int dcounter; ///< number of individuals this dominates + vector dominated; ///< individual indices this dominates - unsigned int rank; ///< pareto front rank float crowding_dist; ///< crowding distance on the Pareto front + void set_dominated(vector& dom){ dominated=dom; }; + vector get_dominated() const { return dominated; }; + void set_loss(float f){ loss=f; }; float get_loss() const { return loss; }; void set_loss_v(float f_v){ loss_v=f_v; }; float get_loss_v() const { return loss_v; }; + void set_dcounter(unsigned int d){ dcounter=d; }; + unsigned int get_dcounter() const { return dcounter; }; + void set_rank(unsigned r){ rank=r; }; size_t get_rank() const { return rank; }; @@ -61,7 +68,12 @@ struct Fitness { vector wvalues; // Constructor with initializer list for weights - Fitness(const vector& w={}) : values(), wvalues(), weights(w) { } + Fitness(const vector& w={}) : values(), wvalues(), weights(w) { + dcounter = 0; + set_rank(0); + set_crowding_dist(0); + dominated.resize(0); + } // Hash function size_t hash() const { @@ -69,6 +81,9 @@ struct Fitness { return h; } + void set_weights(vector& w) { + weights = w; + } vector get_weights() const { return weights; } @@ -238,11 +253,11 @@ class Individual{ // void Individual::set_objectives(const vector& objectives) // TODO: fix to use these with fitness instead of with individual - unsigned int dcounter; ///< number of individuals this dominates - vector dominated; ///< individual indices this dominates + // unsigned int dcounter; ///< number of individuals this dominates + // vector dominated; ///< individual indices this dominates - unsigned int rank; ///< pareto front rank - float crowding_dist; ///< crowding distance on the Pareto front + // unsigned int rank; ///< pareto front rank + // float crowding_dist; ///< crowding distance on the Pareto front // Static map for weights associated with strings @@ -275,7 +290,7 @@ class Individual{ } } - fitness = Fitness(weights); + fitness.set_weights(weights); }; }; diff --git a/src/params.h b/src/params.h index 4ba637ed..915c06dc 100644 --- a/src/params.h +++ b/src/params.h @@ -35,7 +35,7 @@ struct Parameters string sel = "nsga2"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; - int num_islands=5; + int num_islands=1; // variation std::map mutation_probs = { @@ -76,6 +76,9 @@ struct Parameters void set_gens(int new_gens){ gens = new_gens; }; int get_gens(){ return gens; }; + void set_current_gen(unsigned int gen){ current_gen = gen; }; + unsigned int get_current_gen(){ return current_gen; }; + void set_num_islands(int new_num_islands){ num_islands = new_num_islands; }; int get_num_islands(){ return num_islands; }; diff --git a/src/population.cpp b/src/population.cpp index b1479172..b5207a30 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -14,19 +14,44 @@ Population::Population() template -void Population::init(vector&>& individuals, const Parameters& params) +void Population::init(vector>& new_individuals, const Parameters& params) { + if (new_individuals.size() != params.pop_size + && new_individuals.size() != 2*params.pop_size ) { + throw std::runtime_error("Individual vector has different number of individuals than pop_size. popsize is "+to_string(params.pop_size)+", number of individuals is " + to_string(new_individuals.size())); + } + this->mig_prob = params.mig_prob; this->pop_size = params.pop_size; this->num_islands=params.num_islands; + island_indexes.resize(num_islands); + // If the assert fails, execution stops, but for completeness, you can also throw an exception - if (individuals.size() != this->pop_size) { - throw std::runtime_error("Individual vector has different number of individuals than pop_size."); - } - individuals.resize(0); - for (const auto& ind : individuals) { - individuals.push_back( std::make_shared>(ind) ); + size_t p = pop_size; + + individuals.resize(2*p); + + for (int i=0; i>(new_individuals.at(j)); } } @@ -46,7 +71,7 @@ void Population::init(SearchSpace& ss, const Parameters& params) for (int i=0; i&>& individuals, const Parameters& params); + void init(vector>& individuals, const Parameters& params); // TODO: init from file (like FEAT) @@ -83,8 +83,7 @@ class Population{ SameFitComplexity(Population& p): pop(p){} bool operator()(size_t i, size_t j) { - return (pop[i].fitness == pop[j].fitness - && pop[i].get_complexity() == pop[j].get_complexity()); + return pop[i].fitness == pop[j].fitness; } }; }; diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index e7f65458..640db873 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -27,9 +27,9 @@ size_t NSGA2::tournament(Population& pop, size_t i, size_t j) const return i; else if (flag == -1) // ind2 dominates ind1 return j; - else if (ind1.crowding_dist > ind2.crowding_dist) + else if (ind1.fitness.crowding_dist > ind2.fitness.crowding_dist) return i; - else if (ind2.crowding_dist > ind1.crowding_dist) + else if (ind2.fitness.crowding_dist > ind1.fitness.crowding_dist) return j; else return i; @@ -39,16 +39,24 @@ template vector NSGA2::select(Population& pop, int island, const Parameters& params) { + // tournament selection. TODO: move this to tournament selection file, and throw not implemented error in nsga. auto island_pool = pop.get_island_indexes(island); // if this is first generation, just return indices to pop if (params.current_gen==0) return island_pool; - // setting the objectives + // setting the objectives (evaluator should do it. TODO: make sure it does) // for (unsigned int i=0; iset_obj(params.objectives); + // i am not sure if I need this update of rank and crowding distance (bc first generation is ignored by if above, and the other generations will always have individuals that went through survival, which already calculates this information. TODO: in the final algorithm, I need to make sure this is correct) + auto front = fast_nds(pop, island_pool); + for (size_t i = 0; i< front.size(); i++) + { + crowding_distance(pop, front, i); + } + vector selected(0); for (int i = 0; i < island_pool.size(); ++i) // selecting based on island_pool size { @@ -67,14 +75,13 @@ vector NSGA2::survive(Population& pop, int island, { // fmt::print("starting\n"); - size_t idx_start = std::floor(island*pop.size()/pop.num_islands); - size_t idx_end = std::floor((island+1)*pop.size()/pop.num_islands); - - int original_size = (idx_end - idx_start)/2; // original island size (survive must be called with an island with offfspring) auto island_pool = pop.get_island_indexes(island); - // fmt::print("indexes {} {}\n", idx_start, idx_end); + int original_size = params.pop_size/params.num_islands; // original island size (survive must be called with an island with offfspring) + + // fmt::print("original size {}\n", original_size); + // fmt::print("island size {}\n", island_pool.size()); // set objectives (this is when the obj vector is updated.) @@ -83,7 +90,7 @@ vector NSGA2::survive(Population& pop, int island, // pop.individuals.at(island_pool[i])->set_obj(params.objectives); // fast non-dominated sort - // fmt::print("fast nds\n"); + // fmt::print("fast nds for island {}\n", island); auto front = fast_nds(pop, island_pool); // fmt::print("selecting...\n"); @@ -96,9 +103,9 @@ vector NSGA2::survive(Population& pop, int island, int i = 0; // fmt::print("starting loop...\n"); - // fmt::print("{}...\n",selected.size()); - // fmt::print("{}...\n", front.at(i).size()); - // fmt::print("{}...\n", original_size); + // fmt::print("selected size {}...\n",selected.size()); + // fmt::print("first front size {}...\n", front.at(i).size()); + // fmt::print("goal is to select n individuals: {}...\n", original_size); while ( i < front.size() @@ -136,6 +143,11 @@ vector NSGA2::survive(Population& pop, int island, template vector> NSGA2::fast_nds(Population& pop, vector& island_pool) { + // this will update pareto dominance attributes in fitness class + // based on the population + + // fmt::print("inside fast nds with island pool of size {} from pop of size {}\n", island_pool.size(), pop.size()); + //< the Pareto fronts vector> front; @@ -170,13 +182,18 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan p->fitness.dominated.clear(); p->fitness.dominated = dom; // dom will have values already referring to island indexes - if (p->dcounter == 0) { + if (p->fitness.dcounter == 0) { + // fmt::print("pushing {}...\n", island_pool[i]); p->fitness.set_rank(1); // front will have values already referring to island indexes front.at(0).push_back(island_pool[i]); } + + // fmt::print("... index {} dominates {} ({}) and was dominated by {} ({})\n", island_pool[i], dom.size(), p->fitness.get_dominated().size(), dcount, p->fitness.get_dcounter()); } } + + // fmt::print("First front size {}...\n", front.at(0).size()); // using OpenMP can have different orders in the front.at(0) // so let's sort it so that the algorithm is deterministic @@ -185,6 +202,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan int fi = 1; while (front.at(fi-1).size() > 0) { + // fmt::print("starting front {} with size \n", fi, front.at(fi-1).size()); std::vector& fronti = front.at(fi-1); std::vector Q; @@ -192,39 +210,62 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan const Individual& p = pop[fronti.at(i)]; + // fmt::print("ind {} dominated {} \n", fronti.at(i), p.fitness.dominated.size()); + // iterating over dominated individuals for (int j = 0; j < p.fitness.dominated.size() ; ++j) { + // fmt::print("decreased counter of ind {} for {} to {} \n", j, p.fitness.dominated.at(j), pop.individuals.at(p.fitness.dominated.at(j))->fitness.dcounter); auto q = pop.individuals.at(p.fitness.dominated.at(j)); + + // fmt::print("decreased counter \n"); q->fitness.dcounter -= 1; if (q->fitness.dcounter == 0) { + // fmt::print("updated counter for ind {} \n", j); + q->fitness.set_rank(fi+1); Q.push_back(p.fitness.dominated.at(j)); } } } - fi += 1; front.push_back(Q); + // fmt::print("front {} ended with size {}...\n", fi, Q.size()); + + fi += 1; } + // fmt::print("finished\n"); + return front; } template void NSGA2::crowding_distance(Population& pop, vector>& front, int fronti) { + + // fmt::print("inside crowding distance for front {}...\n", fronti); + std::vector F = front.at(fronti); - if (F.size() == 0 ) return; + if (F.size() == 0 ){ + // fmt::print("empty front\n"); + return; + } const int fsize = F.size(); + // fmt::print("front size is {}...\n", fsize); for (int i = 0; i < fsize; ++i) pop.individuals.at(F.at(i))->fitness.crowding_dist = 0; + // fmt::print("reseted crowding distance for individuals in this front\n"); + const int limit = pop.individuals.at(0)->fitness.get_wvalues().size(); + // fmt::print("limit is {}\n", limit); + for (int m = 0; m < limit; ++m) { + // fmt::print("m {}\n", m); std::sort(F.begin(), F.end(), comparator_obj(pop,m)); @@ -238,6 +279,7 @@ void NSGA2::crowding_distance(Population& pop, vector>& front, { if (pop.individuals.at(F.at(i))->fitness.crowding_dist != std::numeric_limits::max()) { // crowd over obj + // TODO: this could be improved pop.individuals.at(F.at(i))->fitness.crowding_dist += (pop.individuals.at(F.at(i+1))->fitness.get_wvalues().at(m) - pop.individuals.at(F.at(i-1))->fitness.get_wvalues().at(m)) / (pop.individuals.at(F.at(fsize-1))->fitness.get_wvalues().at(m) - pop.individuals.at(F.at(0))->fitness.get_wvalues().at(m)); diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 7a890c7a..d7d9e834 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -51,10 +51,10 @@ class NSGA2 : public SelectionOperator auto ind1 = pop.individuals[i]; auto ind2 = pop.individuals[j]; - if (ind1->rank < ind2->rank) + if (ind1->fitness.rank < ind2->fitness.rank) return true; - else if (ind1->rank == ind2->rank && - ind1->crowding_dist > ind2->crowding_dist) + else if (ind1->fitness.rank == ind2->fitness.rank && + ind1->fitness.crowding_dist > ind2->fitness.crowding_dist) return true; return false; }; From f1bcdcec988b046b08e96ff45cf077c53ead9b6f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 23 Feb 2024 19:06:16 -0300 Subject: [PATCH 117/199] Updated wrapper to use cpp instead of deap sel and surv --- pybrush/DeapEstimator.py | 35 +++++++++++----- pybrush/__init__.py | 1 + pybrush/deap_api/nsga2.py | 51 ++++++++++++++++++++---- src/bindings/bind_individuals.cpp | 2 +- src/bindings/bind_params.cpp | 1 + src/bindings/bind_population.cpp | 2 +- src/bindings/bind_selection.cpp | 2 +- src/bindings/bind_selection.h | 66 ++++++++++++++++++++++++++++++- src/bindings/module.cpp | 2 +- 9 files changed, 140 insertions(+), 22 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index e69d1be3..bc4fb992 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -12,7 +12,6 @@ # import deap as dp from deap import algorithms, base, creator, tools # from tqdm import tqdm -from types import NoneType from sklearn.metrics import average_precision_score from sklearn.preprocessing import MinMaxScaler import _brush # TODO: stop using _brush and use whats in pybrush @@ -21,6 +20,7 @@ # from _brush import Dataset, SearchSpace from pybrush import RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator +from pybrush import RegressorSelector, ClassifierSelector, MultiClassifierSelector # TODO: LOGGER AND ARCHIVE @@ -127,7 +127,7 @@ def __init__( verbosity=0, max_depth=3, max_size=20, - num_islands=5, + num_islands=1, mig_prob=0.05, cx_prob= 1/7, mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, @@ -180,10 +180,24 @@ def _setup_toolbox(self): self.eval_ = ( ClassifierEvaluator() if self.n_classes_ == 2 else MultiClassifierEvaluator() ) + self.sel_ = ( ClassifierSelector("nsga2", False) + if self.n_classes_ == 2 else + MultiClassifierSelector("nsga2", False) ) + self.surv_ = ( ClassifierSelector("nsga2", True) + if self.n_classes_ == 2 else + MultiClassifierSelector("nsga2", True) ) else: creator.create("Individual", RegressorIndividual) + self.sel_ = RegressorSelector("nsga2", False) + self.surv_ = RegressorSelector("nsga2", True) self.eval_ = RegressorEvaluator() + toolbox.register("select", lambda pop: self.sel_.select(pop, self.parameters_)) + toolbox.register("survive", lambda pop: self.surv_.survive(pop, self.parameters_)) + + def update_current_gen(gen): self.parameters_.current_gen = gen + toolbox.register("update_current_gen", update_current_gen) + def assign_fit(ind, validation=False): ind.program.fit(self.data_.get_training_data()) self.eval_.assign_fit(ind, self.data_, self.parameters_, validation) @@ -199,13 +213,14 @@ def assign_fit(ind, validation=False): # When solving multi-objective problems, selection and survival must # support this feature. This means that these selection operators must # accept a tuple of fitnesses as argument) - if self.algorithm=="nsga2" or self.algorithm=="nsga2island": - toolbox.register("select", tools.selTournamentDCD) - toolbox.register("survive", tools.selNSGA2) - elif self.algorithm=="ga" or self.algorithm=="gaisland": - toolbox.register("select", tools.selTournament, tournsize=3) - def offspring(pop, MU): return pop[-MU:] - toolbox.register("survive", offspring) + # if self.algorithm=="nsga2" or self.algorithm=="nsga2island": + # toolbox.register("select", tools.selTournamentDCD) + # toolbox.register("survive", tools.selNSGA2) + # elif self.algorithm=="ga" or self.algorithm=="gaisland": + # toolbox.register("select", tools.selTournament, tournsize=3) + # def offspring(pop, MU): return pop[-MU:] + # toolbox.register("survive", offspring) + # toolbox.population will return a list of elements by calling toolbox.individual toolbox.register("createRandom", self._make_individual) @@ -340,7 +355,7 @@ def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): assert isinstance(X, np.ndarray) - if isinstance(y, NoneType): + if y is None: return _brush.Dataset(X=X, feature_names=feature_names, validation_size=validation_size) diff --git a/pybrush/__init__.py b/pybrush/__init__.py index 219c38e1..0d348a92 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -3,6 +3,7 @@ from _brush import SearchSpace from _brush import Parameters from _brush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator +from _brush import RegressorSelector, ClassifierSelector, MultiClassifierSelector # Individuals from _brush.individual import RegressorIndividual, \ diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index 6211fe08..520cec13 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -39,10 +39,6 @@ def calculate_statistics(ind): pop = toolbox.population(n=MU) pop = list(toolbox.map(toolbox.assign_fit, pop)) - # This is just to assign the crowding distance to the individuals - # no actual selection is done - pop = toolbox.survive(pop, len(pop)) - record = stats.compile(pop) logbook.record(gen=0, evals=len(pop), **record) @@ -51,10 +47,48 @@ def calculate_statistics(ind): # Begin the generational process for gen in range(1, NGEN): + + # this is used in cpp to decide if we are going to do some calculations or not + toolbox.update_current_gen(gen) + # Vary the population - parents = toolbox.select(pop, len(pop)) + # print("--"*20) + # print("pop before select") + # for p in pop: + # print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + + parents = toolbox.select(pop) # , len(pop) # select method from brush's cpp side will use the values in self.parameters_ to decide how many individuals it should select + + # print("--"*20) + # print("pop after select") + # for p in pop: + # print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + + # print("--"*20) + # print("selected parents") + # for p in parents: + # print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + offspring = [] - for ind1, ind2 in zip(parents, parents[1:]): + for ind1, ind2 in zip(parents, parents[1:]+parents[0:1]): off = None if rnd_flt() < CXPB: # either mutation or crossover off = toolbox.mate(ind1, ind2) @@ -64,10 +98,13 @@ def calculate_statistics(ind): if off is not None: # first we fit, then add to offspring offspring.extend([off]) + # filling offspring empty slots + offspring = offspring + toolbox.population(n=MU - len(offspring)) + offspring = list(toolbox.map(toolbox.assign_fit, offspring)) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) - pop = toolbox.survive(pop + offspring, MU) + pop = toolbox.survive(pop + offspring) pop.sort(key=lambda x: x.fitness, reverse=True) diff --git a/src/bindings/bind_individuals.cpp b/src/bindings/bind_individuals.cpp index 568f8a54..ce8d279b 100644 --- a/src/bindings/bind_individuals.cpp +++ b/src/bindings/bind_individuals.cpp @@ -50,5 +50,5 @@ void bind_individuals(py::module& m) bind_individual(m, "RegressorIndividual"); bind_individual(m, "ClassifierIndividual"); bind_individual(m, "MultiClassifierIndividual"); - bind_individual(m, "RepresenterIndividual"); + // bind_individual(m, "RepresenterIndividual"); } \ No newline at end of file diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 2d95f4d7..b1b3cb1a 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -25,6 +25,7 @@ void bind_params(py::module& m) { Brush::Parameters p; return p; })) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) + .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) .def_property("max_depth", &Brush::Parameters::get_max_depth, &Brush::Parameters::set_max_depth) .def_property("max_size", &Brush::Parameters::get_max_size, &Brush::Parameters::set_max_size) diff --git a/src/bindings/bind_population.cpp b/src/bindings/bind_population.cpp index 11b012e8..ccdd7203 100644 --- a/src/bindings/bind_population.cpp +++ b/src/bindings/bind_population.cpp @@ -17,5 +17,5 @@ void bind_populations(py::module& m) bind_population(m, "ClassifierPopulation"); bind_population(m, "MultiClassifierPopulation"); - bind_population(m, "RepresenterPopulation"); + // bind_population(m, "RepresenterPopulation"); } \ No newline at end of file diff --git a/src/bindings/bind_selection.cpp b/src/bindings/bind_selection.cpp index e2a74442..f8a8641c 100644 --- a/src/bindings/bind_selection.cpp +++ b/src/bindings/bind_selection.cpp @@ -17,5 +17,5 @@ void bind_selections(py::module& m) bind_selection(m, "ClassifierSelector"); bind_selection(m, "MultiClassifierSelector"); - bind_selection(m, "RepresenterSelector"); + // bind_selection(m, "RepresenterSelector"); } \ No newline at end of file diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index 1c101fee..2897d4b4 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -1,6 +1,17 @@ #include "module.h" +// TODO: figure out why im having symbol errors (if i dont include the cpp here as well) #include "../selection/selection.h" -#include "../selection/selection.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) +#include "../selection/selection.cpp" +#include "../selection/selection_operator.h" +#include "../selection/selection_operator.cpp" +#include "../selection/nsga2.h" +#include "../selection/nsga2.cpp" + +#include "../population.cpp" +#include "../population.h" + +// #include "../individual.h" +//#include "../selection/selection.cpp" namespace py = pybind11; namespace nl = nlohmann; @@ -18,5 +29,58 @@ void bind_selection(py::module& m, string name) .def(py::init( [](string type, bool survival){ Class s(type, survival); return s; }) ) + .def("select", [](Class &self, std::vector>& individuals, + const Parameters& params) { + + // auto sel = Class("nsga2", false); + auto pop = br::Pop::Population(); + + pop.init(individuals, params); + + vector selected = self.select(pop, 0, params); + + vector> pool; + pool.resize(0); + + for (size_t idx : selected) { + pool.push_back(pop[idx]); + } + + // returns references + return pool; + }) + .def("survive", [](Class &self, std::vector>& individuals, + const Parameters& params) { + + // auto sel = Class("nsga2", false); + auto pop = br::Pop::Population(); + + // std::cout << "created new population" << std::endl; + + pop.init(individuals, params); + + // std::cout << "called init with individuals" << std::endl; + + vector selected = self.survive(pop, 0, params); + + // std::cout << "survival" << std::endl; + + vector> pool; + + // std::cout << "starting to fill the pool" << std::endl; + + pool.resize(0); + + // std::cout << "pool is empty" << std::endl; + + for (size_t idx : selected) { + pool.push_back(pop[idx]); + } + + // std::cout << "pool has size" << pool.size() << std::endl; + + // returns references + return pool; + }) ; } \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 5b921432..b6f50f46 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -41,7 +41,7 @@ PYBIND11_MODULE(_brush, m) { bind_dataset(m); bind_search_space(m); bind_variations(m); - // bind_selections(m); + bind_selections(m); // bind_populations(m); // solutions From 95fec2904856f9152f64fec67f5d45be3ab20d8c Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 23 Feb 2024 19:06:33 -0300 Subject: [PATCH 118/199] Updated test --- tests/cpp/test_population.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 3e6e5466..bc092f59 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -73,6 +73,8 @@ TEST(Population, PopulationTests) fmt::print("Performing all steps of an evolution (sequential, not parallel)\n"); for (int i=0; i<100; ++i) // update and prep offspring slots works properly { + params.set_current_gen(i); + vector> survivors(pop.num_islands); fmt::print("Fitting individuals\n"); // this must be done in one thread (or implement mutex), because we can have multiple islands pointing to same individuals From 6d9369b8e55f9edc58dead39e6f78f91812c8b3f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 26 Feb 2024 17:40:02 -0300 Subject: [PATCH 119/199] Lexicase selection. Fix migration. Sel, surv, and migration bindings. --- pybrush/DeapEstimator.py | 10 +- pybrush/deap_api/nsga2.py | 38 +++++++- src/bindings/bind_params.cpp | 2 + src/bindings/bind_selection.h | 53 ++++++---- src/eval/evaluation.cpp | 1 + src/eval/scorer.h | 2 +- src/params.h | 11 ++- src/population.cpp | 10 +- src/selection/lexicase.cpp | 175 ++++++++++++++++++++++++++++++++++ src/selection/lexicase.h | 43 +++++++++ src/selection/nsga2.h | 2 +- src/selection/selection.cpp | 2 + src/selection/selection.h | 1 + src/util/rnd.h | 4 +- tests/cpp/test_population.cpp | 1 + 15 files changed, 328 insertions(+), 27 deletions(-) create mode 100644 src/selection/lexicase.cpp create mode 100644 src/selection/lexicase.h diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index bc4fb992..c97e676a 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -188,12 +188,15 @@ def _setup_toolbox(self): MultiClassifierSelector("nsga2", True) ) else: creator.create("Individual", RegressorIndividual) - self.sel_ = RegressorSelector("nsga2", False) + self.sel_ = RegressorSelector("lexicase", False) self.surv_ = RegressorSelector("nsga2", True) self.eval_ = RegressorEvaluator() toolbox.register("select", lambda pop: self.sel_.select(pop, self.parameters_)) - toolbox.register("survive", lambda pop: self.surv_.survive(pop, self.parameters_)) + toolbox.register("survive", lambda pop: self.surv_.survive(pop, self.parameters_)) + + # it could be both sel or surv. + toolbox.register("migrate", lambda pop: self.surv_.migrate(pop, self.parameters_)) def update_current_gen(gen): self.parameters_.current_gen = gen toolbox.register("update_current_gen", update_current_gen) @@ -259,6 +262,7 @@ def fit(self, X, y): self.functions_ = self.functions # set n classes if relevant + self.n_classes_ = 0 if self.mode=="classification": self.n_classes_ = len(np.unique(y)) @@ -279,6 +283,8 @@ def fit(self, X, y): self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) self.parameters_ = _brush.Parameters() + self.parameters_.classification = self.mode == "classification" + self.parameters_.n_classes = self.n_classes_ self.parameters_.pop_size = self.pop_size self.parameters_.gens = self.gens self.parameters_.num_islands = self.num_islands diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index 520cec13..c637ced0 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -46,7 +46,7 @@ def calculate_statistics(ind): print(logbook.stream) # Begin the generational process - for gen in range(1, NGEN): + for gen in range(1, NGEN+1): # this is used in cpp to decide if we are going to do some calculations or not toolbox.update_current_gen(gen) @@ -102,10 +102,46 @@ def calculate_statistics(ind): offspring = offspring + toolbox.population(n=MU - len(offspring)) offspring = list(toolbox.map(toolbox.assign_fit, offspring)) + + # print("--"*20) + # print("offspring") + # for p in offspring: + # print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) pop = toolbox.survive(pop + offspring) + # print("--"*20) + # print("pop after survival") + # for p in pop: + # print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + + pop = toolbox.migrate(pop) + + # print("--"*20) + # print("pop after migration") + # for p in pop: + # print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + pop.sort(key=lambda x: x.fitness, reverse=True) record = stats.compile(pop) diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index b1b3cb1a..81769365 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -27,6 +27,8 @@ void bind_params(py::module& m) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) + .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) + .def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification) .def_property("max_depth", &Brush::Parameters::get_max_depth, &Brush::Parameters::set_max_depth) .def_property("max_size", &Brush::Parameters::get_max_size, &Brush::Parameters::set_max_size) .def_property("objectives", &Brush::Parameters::get_objectives, &Brush::Parameters::set_objectives) diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index 2897d4b4..781f65d8 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -6,6 +6,8 @@ #include "../selection/selection_operator.cpp" #include "../selection/nsga2.h" #include "../selection/nsga2.cpp" +#include "../selection/lexicase.h" +#include "../selection/lexicase.cpp" #include "../population.cpp" #include "../population.h" @@ -37,13 +39,18 @@ void bind_selection(py::module& m, string name) pop.init(individuals, params); - vector selected = self.select(pop, 0, params); - vector> pool; pool.resize(0); - for (size_t idx : selected) { - pool.push_back(pop[idx]); + for (int island = 0; island < params.num_islands; ++island) + { + vector selected = self.select(pop, island, params); + + // std::cout << "selecting in island " << island << std::endl; + + for (size_t idx : selected) { + pool.push_back(pop[idx]); + } } // returns references @@ -55,30 +62,42 @@ void bind_selection(py::module& m, string name) // auto sel = Class("nsga2", false); auto pop = br::Pop::Population(); - // std::cout << "created new population" << std::endl; - pop.init(individuals, params); - // std::cout << "called init with individuals" << std::endl; + vector> pool; + pool.resize(0); - vector selected = self.survive(pop, 0, params); + for (int island = 0; island < params.num_islands; ++island) + { + vector selected = self.survive(pop, island, params); - // std::cout << "survival" << std::endl; + for (size_t idx : selected) { + pool.push_back(pop[idx]); + } + } - vector> pool; + // returns references + return pool; + }) + .def("migrate", [](Class &self, std::vector>& individuals, + const Parameters& params) { - // std::cout << "starting to fill the pool" << std::endl; + auto pop = br::Pop::Population(); + pop.init(individuals, params); + pop.migrate(); // this will modify island indexes inplace + + vector> pool; pool.resize(0); - // std::cout << "pool is empty" << std::endl; + for (int island = 0; island < params.num_islands; ++island) + { + vector selected = pop.get_island_indexes(island); - for (size_t idx : selected) { - pool.push_back(pop[idx]); + for (size_t idx : selected) { + pool.push_back(pop[idx]); + } } - - // std::cout << "pool has size" << pool.size() << std::endl; - // returns references return pool; }) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index af394b6d..c0675271 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -32,6 +32,7 @@ void Evaluation::update_fitness(Population& pop, if (!pass) { // TODO: check if score was nan and assign the max float + // TODO: better handling of nan or inf scores when doing selection and survival (and hall of fame and rank for migration) ind.fitness.loss = MAX_FLT; ind.fitness.loss_v = MAX_FLT; ind.error = MAX_FLT*VectorXf::Ones(data.y.size()); diff --git a/src/eval/scorer.h b/src/eval/scorer.h index e00fa961..5681da02 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -92,7 +92,7 @@ typedef float (*funcPointer)(const VectorXf&, std::map score_hash; string scorer; - Scorer(string scorer="multi_log") { + Scorer(string scorer="log") { score_hash["log"] = &mean_log_loss; this->set_scorer(scorer); diff --git a/src/params.h b/src/params.h index 915c06dc..65ad1a45 100644 --- a/src/params.h +++ b/src/params.h @@ -32,7 +32,7 @@ struct Parameters unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size unsigned int max_size = 50; vector objectives{"error","complexity"}; // error should be generic and deducted based on mode - string sel = "nsga2"; //selection method + string sel = "lexicase"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; int num_islands=1; @@ -53,7 +53,10 @@ struct Parameters string scorer_="mse"; ///< actual loss function used, determined by error // for classification (TODO: should I have these, or they could be just dataset arguments (except the ones needed to use in dataset constructor)) + bool classification; unsigned int n_classes; ///< number of classes for classification + + // TODO: set these values when creating the parameters in python side vector classes; ///< class labels vector class_weights; ///< weights for each class vector sample_weights; ///< weights for each sample @@ -103,6 +106,12 @@ struct Parameters void set_mig_prob(float new_mig_prob){ mig_prob = new_mig_prob; }; float get_mig_prob(){ return mig_prob; }; + void set_classification(bool c){ classification = c; }; + bool get_classification(){ return classification; }; + + void set_n_classes(unsigned int new_n_classes){ n_classes = new_n_classes; }; + unsigned int get_n_classes(){ return n_classes; }; + //TODO: unify unordered or ordered void set_mutation_probs(std::map new_mutation_probs){ mutation_probs = new_mutation_probs; }; std::map get_mutation_probs(){ return mutation_probs; }; diff --git a/src/population.cpp b/src/population.cpp index b5207a30..48e8345c 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -195,7 +195,7 @@ vector> Population::sorted_front(unsigned rank, bool ignore_of for (int i=0; irank == rank) + if (individuals.at(idxs.at(i))->fitness.rank == rank) pf.push_back(i); } @@ -222,7 +222,7 @@ vector Population::hall_of_fame(unsigned rank, bool ignore_offspring) for (unsigned int i =0; irank == rank) + if (individuals.at(i)->fitness.rank == rank) pf.push_back(i); } std::sort(pf.begin(),pf.end(),SortComplexity(*this)); @@ -253,14 +253,18 @@ void Population::migrate() { if (r() < mig_prob) { + // std::cout << "migrating in island" << island << std::endl; + size_t migrating_idx; // determine if incoming individual comes from global or local hall of fame if (r() < 0.5) { // from global hall of fame + // std::cout << "from hall of fame" << std::endl; migrating_idx = *r.select_randomly( global_hall_of_fame.begin(), global_hall_of_fame.end()); } else { // from any other local hall of fame + // std::cout << "from other island" << std::endl; // finding other island indexes vector other_islands(num_islands-1); iota(other_islands.begin(), other_islands.end(), 0); @@ -281,6 +285,8 @@ void Population::migrate() island_fronts.at(other_island).begin(), island_fronts.at(other_island).end()); } + + // std::cout << "index " << i << " of island " << island << " is now" << migrating_idx << std::endl; island_indexes.at(island).at(i) = migrating_idx; } diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp new file mode 100644 index 00000000..911499b3 --- /dev/null +++ b/src/selection/lexicase.cpp @@ -0,0 +1,175 @@ +#include "lexicase.h" + +namespace Brush { +namespace Sel { + +using namespace Brush; +using namespace Pop; +using namespace Sel; + +template +Lexicase::Lexicase(bool surv) +{ + this->name = "lexicase"; + this->survival = surv; +} + +template +vector Lexicase::select(Population& pop, int island, + const Parameters& params) +{ + // this one can be executed in parallel because it is just reading the errors. This + // method assumes that the expressions have been fitted previously, and their respective + // error vectors are filled + + auto island_pool = pop.get_island_indexes(island); + + // if this is first generation, just return indices to pop + if (params.current_gen==0) + return island_pool; + + //< number of samples + unsigned int N = pop.individuals.at(0)->error.size(); + + //< number of individuals + unsigned int P = island_pool.size(); + + // define epsilon + ArrayXf epsilon = ArrayXf::Zero(N); + + // if output is continuous, use epsilon lexicase + if (!params.classification || params.scorer_.compare("log")==0 + || params.scorer_.compare("multi_log")==0) + { + // for each sample, calculate epsilon + for (int i = 0; ierror(i); + } + epsilon(i) = mad(case_errors); + } + } + + // selection pool + vector starting_pool; + for (int i = 0; i < island_pool.size(); ++i) + { + starting_pool.push_back(i); + } + assert(starting_pool.size() == P); + + vector selected(P,0); // selected individuals + + #pragma omp parallel for + for (unsigned int i = 0; i cases; // cases (samples) + if (params.classification && !params.class_weights.empty()) + { + // for classification problems, weight case selection + // by class weights + vector choices(N); + std::iota(choices.begin(), choices.end(),0); + + vector sample_weights = params.sample_weights; + + for (unsigned i = 0; i choice_idxs(N-i); + std::iota(choice_idxs.begin(),choice_idxs.end(),0); + + size_t idx = *r.select_randomly( + choice_idxs.begin(), choice_idxs.end(), + sample_weights.begin(), sample_weights.end()); + + cases.push_back(choices.at(idx)); + choices.erase(choices.begin() + idx); + + sample_weights.erase(sample_weights.begin() + idx); + } + } + else + { // otherwise, choose cases randomly + cases.resize(N); + std::iota(cases.begin(),cases.end(),0); + r.shuffle(cases.begin(),cases.end()); // shuffle cases + } + vector pool = starting_pool; // initial pool + vector winner; // winners + + bool pass = true; // checks pool size and number of cases + unsigned int h = 0; // case count + + float epsilon_threshold; + + while(pass){ // main loop + epsilon_threshold = 0; + + winner.resize(0); // winners + // minimum error on case + float minfit = std::numeric_limits::max(); + + // get minimum + for (size_t j = 0; jerror(cases[h]) < minfit) + minfit = pop.individuals.at(pool[j])->error(cases[h]); + + // criteria to stay in pool + epsilon_threshold = minfit+epsilon[cases[h]]; + + // select best + for (size_t j = 0; jerror(cases[h]) + <= epsilon_threshold) + winner.push_back(pool[j]); + + ++h; // next case + // only keep going if needed + pass = (winner.size()>1 && h= cases.size()) + winner.push_back(*r.select_randomly( + pool.begin(), pool.end()) ); + else + pass = true; + } + else + pool = winner; // reduce pool to remaining individuals + } + + assert(winner.size()>0); + + //if more than one winner, pick randomly + selected.at(i) = *r.select_randomly( + winner.begin(), winner.end() ); + } + + if (selected.size() != island_pool.size()) + { + std::cout << "selected: " ; + for (auto s: selected) std::cout << s << " "; std::cout << "\n"; + HANDLE_ERROR_THROW("Lexicase did not select correct number of \ + parents"); + } + + return selected; +} + + +template +vector Lexicase::survive(Population& pop, int island, + const Parameters& params) +{ + /* Lexicase survival */ + HANDLE_ERROR_THROW("Lexicase survival not implemented"); + return vector(); +} + + +} +} diff --git a/src/selection/lexicase.h b/src/selection/lexicase.h new file mode 100644 index 00000000..2f4365d5 --- /dev/null +++ b/src/selection/lexicase.h @@ -0,0 +1,43 @@ +#ifndef LEXICASE_H +#define LEXICASE_H + +#include "selection_operator.h" +#include "../util/utils.h" + + +namespace Brush { +namespace Sel { + + +using namespace Brush; +using namespace Pop; +using namespace Sel; + + +////////////////////////////////////////////////////////////// Declarations +/*! +* @class Lexicase +* @brief Lexicase selection operator. +*/ + +template +class Lexicase : public SelectionOperator +{ +public: + Lexicase(bool surv=false); + ~Lexicase(){}; + + /// function returns a set of selected indices from pop + vector select(Population& pop, int island, + const Parameters& p); + + /// lexicase survival + vector survive(Population& pop, int island, + const Parameters& p); +}; + + +} +} + +#endif \ No newline at end of file diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index d7d9e834..8ba735b4 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -23,7 +23,7 @@ class NSGA2 : public SelectionOperator ~NSGA2(){}; /// selection according to the survival scheme of NSGA-II - vector select(Population& pop, int island, + vector select(Population& pop, int island, const Parameters& p); /// survival according to the survival scheme of NSGA-II diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 9c97d0e5..3a7c01ba 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -33,6 +33,8 @@ void Selection::set_operator() { if (this->type == "nsga2") pselector = new NSGA2(survival); + else if (this->type == "lexicase") + pselector = new Lexicase(survival); else HANDLE_ERROR_THROW("Undefined Selection Operator " + this->type + "\n"); diff --git a/src/selection/selection.h b/src/selection/selection.h index d415ca8b..32574416 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -8,6 +8,7 @@ license: GNU/GPL v3 #include "selection_operator.h" #include "nsga2.h" +#include "lexicase.h" namespace Brush { namespace Sel { diff --git a/src/util/rnd.h b/src/util/rnd.h index 0a682e54..21e90ed1 100644 --- a/src/util/rnd.h +++ b/src/util/rnd.h @@ -136,13 +136,13 @@ namespace Brush { namespace Util{ if(w.size() == 0) { fmt::format("w size = {} and v size = {}, returning uniform random choice\n", - w.size(), v.size()); + w.size(), v.size()); return random_choice(v); } if(w.size() != v.size()) { fmt::format("w ({}) != v size ({}), returning uniform random choice\n", - w.size(), v.size()); + w.size(), v.size()); return random_choice(v); } else diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index bc092f59..5f051079 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -4,6 +4,7 @@ #include "../../src/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers #include "../../src/eval/evaluation.cpp" #include "../../src/selection/nsga2.cpp" +#include "../../src/selection/lexicase.cpp" #include "../../src/selection/selection_operator.cpp" #include "../../src/selection/selection.cpp" From 02c5058e98d1028293c2c3d49afa2b99d0242f78 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 27 Feb 2024 10:49:17 -0300 Subject: [PATCH 120/199] Bug fix in lexicase selecting from the wrong island --- src/selection/lexicase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index 911499b3..8fda38d9 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -57,7 +57,7 @@ vector Lexicase::select(Population& pop, int island, vector starting_pool; for (int i = 0; i < island_pool.size(); ++i) { - starting_pool.push_back(i); + starting_pool.push_back(island_pool[i]); } assert(starting_pool.size() == P); From baaf37c0e41e2030c1cc70a52438c04076c63e04 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 27 Feb 2024 10:51:29 -0300 Subject: [PATCH 121/199] pop variation entirely in cpp --- pybrush/DeapEstimator.py | 1 + pybrush/deap_api/nsga2.py | 147 +++++++++++++++++----------------- src/bindings/bind_variation.h | 38 +++++++++ src/population.h | 2 + src/variation.cpp | 52 ++++++------ 5 files changed, 142 insertions(+), 98 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index c97e676a..a292bf1e 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -212,6 +212,7 @@ def assign_fit(ind, validation=False): toolbox.register("mate", self.variator_.cross) toolbox.register("mutate", self.variator_.mutate) + toolbox.register("vary_pop", lambda pop: self.variator_.vary_pop(pop, self.parameters_)) # When solving multi-objective problems, selection and survival must # support this feature. This means that these selection operators must diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index c637ced0..3175e129 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -52,95 +52,96 @@ def calculate_statistics(ind): toolbox.update_current_gen(gen) # Vary the population - # print("--"*20) - # print("pop before select") - # for p in pop: - # print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) + print("--"*20) + print("pop before select") + for p in pop: + print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) parents = toolbox.select(pop) # , len(pop) # select method from brush's cpp side will use the values in self.parameters_ to decide how many individuals it should select - # print("--"*20) - # print("pop after select") - # for p in pop: - # print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) - - # print("--"*20) - # print("selected parents") - # for p in parents: - # print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) - - offspring = [] - for ind1, ind2 in zip(parents, parents[1:]+parents[0:1]): - off = None - if rnd_flt() < CXPB: # either mutation or crossover - off = toolbox.mate(ind1, ind2) - else: - off = toolbox.mutate(ind1) + print("--"*20) + print("pop after select") + for p in pop: + print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + + print("--"*20) + print("selected parents") + for p in parents: + print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) + + # offspring = [] + # for ind1, ind2 in zip(parents, parents[1:]+parents[0:1]): + # off = None + # if rnd_flt() < CXPB: # either mutation or crossover + # off = toolbox.mate(ind1, ind2) + # else: + # off = toolbox.mutate(ind1) - if off is not None: # first we fit, then add to offspring - offspring.extend([off]) + # if off is not None: # first we fit, then add to offspring + # offspring.extend([off]) - # filling offspring empty slots - offspring = offspring + toolbox.population(n=MU - len(offspring)) + # # filling offspring empty slots + # offspring = offspring + toolbox.population(n=MU - len(offspring)) + offspring = toolbox.vary_pop(parents) offspring = list(toolbox.map(toolbox.assign_fit, offspring)) - # print("--"*20) - # print("offspring") - # for p in offspring: - # print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) + print("--"*20) + print("offspring") + for p in offspring: + print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) pop = toolbox.survive(pop + offspring) - # print("--"*20) - # print("pop after survival") - # for p in pop: - # print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) + print("--"*20) + print("pop after survival") + for p in pop: + print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) pop = toolbox.migrate(pop) - # print("--"*20) - # print("pop after migration") - # for p in pop: - # print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) + print("--"*20) + print("pop after migration") + for p in pop: + print(p.program.get_model()) + # print(p.fitness.values) + # print(p.fitness.weights) + # print(p.fitness.wvalues) + # print(p.fitness.rank) + # print(p.fitness.loss_v) + # print(p.fitness.crowding_dist) pop.sort(key=lambda x: x.fitness, reverse=True) diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index dc1bbe2b..02ab18de 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,6 +1,8 @@ #include "module.h" #include "../variation.h" #include "../variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) + +#include "../population.cpp" #include "../population.h" namespace py = pybind11; @@ -20,5 +22,41 @@ void bind_variation(py::module& m, string name) return variation; })) .def("mutate", &Class::mutate, py::return_value_policy::automatic) .def("cross", &Class::cross, py::return_value_policy::automatic) + .def("vary_pop", [](Class &self, std::vector>& individuals, const Parameters& params) { + + if (individuals.size() != params.pop_size) { + throw std::runtime_error("Individual vector has different number of individuals than pop_size. When calling variation, they should be the same. popsize is "+to_string(params.pop_size)+", number of individuals is " + to_string(individuals.size())); + } + + auto pop = br::Pop::Population(); + + pop.init(individuals, params); + + vector> pool; + pool.resize(0); + + for (int island = 0; island < params.num_islands; ++island) + { + // I am assuming the individual vector passed as argument will contain the selected parents already + vector parents = pop.get_island_indexes(island); + + // including offspring indexes (the vary method will store the offspring in the second half of the index vector) + pop.add_offspring_indexes(island); + + self.vary(pop, island, parents); + + // making copies of the second half of the island individuals + vector idxs = pop.get_island_indexes(island); + int start = idxs.size()/2; + for (unsigned i = start; i>> individuals; + // TODO: right now, the number of islands must be a divisor of the popsize, and cannot be greater than half of the popsize (it cant be the same as popsize). Should this behavior change? Also, write this in docs + // TODO: MAKE SURE THIS TWO ITEMS BELOW ARE TAKEN CARE IN THE MAIN LOOP AND IN TEST_POPULATION (I may need to create new methods for taking care of this) // - fitting, fitness calculation, and setting the objectives are not thread safe because we write in individual attributes. // - prepare offspring and update are not thread safe because we insert/delete elements from the array. diff --git a/src/variation.cpp b/src/variation.cpp index 96c608b5..69feeb94 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -619,35 +619,37 @@ void Variation::vary(Population& pop, int island, { // pass check for children undergoing variation std::optional> opt=std::nullopt; // new individual - // TODO: do it a certain number of times. after that, assume that variation cant - // change individual and add it to the island failures - // TODO: use island failures everytime that I'm iterating on the offspring of an - // island (with island range) - while (!opt) + + const Individual& mom = pop[ + *r.select_randomly(parents.begin(), parents.end())]; + + if ( r() < parameters.cx_prob) // crossover { - const Individual& mom = pop[ + const Individual& dad = pop[ *r.select_randomly(parents.begin(), parents.end())]; - - if ( r() < parameters.cx_prob) // crossover - { - const Individual& dad = pop[ - *r.select_randomly(parents.begin(), parents.end())]; - - opt = cross(mom, dad); - } - else // mutation - { - opt = mutate(mom); - } + + opt = cross(mom, dad); + } + else // mutation + { + opt = mutate(mom); + } - if (opt) // no optional value was returned - { - Individual ind = opt.value(); + // mutation and crossover will already perform 3 attempts. If it fails, we just fill with a random individual + if (opt) // no optional value was returned + { + Individual ind = opt.value(); - assert(ind.program.size()>0); - pop.individuals.at(idxs.at(i)) = std::make_shared>(ind); - } - } + assert(ind.program.size()>0); + pop.individuals.at(idxs.at(i)) = std::make_shared>(ind); + } + else { + Individual new_ind; + new_ind.init(search_space, parameters); + new_ind.set_objectives(mom.get_objectives()); // it will have an invalid fitness + + pop.individuals.at(idxs.at(i)) = std::make_shared>(new_ind); + } } } From e5aed9c56fac2920574d8caf1d670460d8859789 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 1 Mar 2024 21:15:55 -0300 Subject: [PATCH 122/199] added annotation with todo --- src/selection/nsga2.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 640db873..6edf5041 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -78,6 +78,7 @@ vector NSGA2::survive(Population& pop, int island, auto island_pool = pop.get_island_indexes(island); + // TODO: do similar calculations in other selection survival and pop methods (so I dont mess up with anything) int original_size = params.pop_size/params.num_islands; // original island size (survive must be called with an island with offfspring) // fmt::print("original size {}\n", original_size); From de84b3c3e8983e37ba267c236f46d14daba77d64 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 1 Mar 2024 21:16:40 -0300 Subject: [PATCH 123/199] Replacing population in update is now better --- src/population.cpp | 14 ++++++++++---- src/population.h | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/population.cpp b/src/population.cpp index 48e8345c..145001d4 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -124,13 +124,14 @@ void Population::add_offspring_indexes(int island) template void Population::update(vector> survivors) { - vector>> new_pop; + vector> new_pop; new_pop.resize(2*pop_size); size_t i=0; for (int j=0; jset_complexity(); @@ -142,13 +143,18 @@ void Population::update(vector> survivors) size_t idx_start = std::floor(j*pop_size/num_islands); size_t idx_end = std::floor((j+1)*pop_size/num_islands); - auto delta = idx_end - idx_start; + auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start // inserting indexes of the offspring island_indexes.at(j).resize(delta); iota(island_indexes.at(j).begin(), island_indexes.at(j).end(), idx_start); } - individuals = new_pop; + this->individuals.resize(0); + for (auto ind : new_pop) + { + individuals.push_back( + std::make_shared>(ind) ); + } } template diff --git a/src/population.h b/src/population.h index 1296cbdf..fe9b653c 100644 --- a/src/population.h +++ b/src/population.h @@ -15,7 +15,7 @@ template class Population{ public: size_t pop_size; - unsigned int num_islands; + int num_islands; float mig_prob; vector>> individuals; From 9a0f36a0f30f3037a900a90855ba3920c12cab53 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 1 Mar 2024 21:17:08 -0300 Subject: [PATCH 124/199] Island algorithm with taskflow implemented it is time to debug! --- src/estimator.cpp | 321 ++++++++++++++++++++++++++++++++-------------- src/estimator.h | 121 ++++------------- src/params.h | 4 +- 3 files changed, 253 insertions(+), 193 deletions(-) diff --git a/src/estimator.cpp b/src/estimator.cpp index 4919bcf7..15025968 100644 --- a/src/estimator.cpp +++ b/src/estimator.cpp @@ -1,22 +1,78 @@ #include "estimator.h" + #include namespace Brush{ + +using namespace Pop; +using namespace Sel; +using namespace Eval; +using namespace Var; + /// @brief initialize Feat object for fitting. template void Estimator::init() { + std::cout << "inside init" << std::endl; + + // TODO: initialize (set operator) for survivor and selector + // initialize population with initial model and/or starting pop + if (params.n_jobs!=0) // TODO: change this to set taskflow jobs omp_set_num_threads(params.n_jobs); + + std::cout << "set number of threads" << std::endl; + r.set_seed(params.random_state); + std::cout << "set random state" << std::endl; + // set up the pop, variator, etc set_is_fitted(false); + std::cout << "is fitted is false" << std::endl; + + + this->pop = Population(); + std::cout << "created population" << std::endl; + + this->evaluator = Evaluation(); + std::cout << "created evaluator" << std::endl; + + this->selector = Selection(params.sel, false); + std::cout << "created selector" << std::endl; + + this->survivor = Selection(params.surv, true); + std::cout << "created survivor" << std::endl; - // TODO: INIT SEARCH SPACE AND VARIATION HERE + //TODO + ///return fraction of data to use for training + // float get_split(); + // /// set train fraction of dataset + // void set_split(float sp); + + // TODO + // int get_batch_size(){return params.bp.batch_size;}; + // void set_batch_size(int bs); + + // TODO + ///set number of threads (and use them in taskflow) + // void set_n_jobs(unsigned t); + // int get_n_jobs(){return omp_get_num_threads();}; + + ///set flag to use batch for training + // void set_use_batch(); + + // TODO getters and setters for the best solution found after evolution + // predict, transform, predict_proba, etc. + // get statistics + // load and save best individuals + // logger, save to file + // execution archive + // score functions + // fit methods (this will run the evolution) // TODO: implement stuff below // // start the clock @@ -34,36 +90,51 @@ void Estimator::init() } -template +template // TODO: use the dataset, or ignore it bool Estimator::update_best(const Dataset& data, bool val) { + std::cout << "updating best" << std::endl; + float bs; bs = this->best_loss; float f; - vector>& pop_ref =this->pop.individuals; // TODO: archive here? + // TODO: archive here? bool updated = false; - for (const auto& ind: pop_ref) + std::cout << "inside loop" << std::endl; + + vector hof = this->pop.hall_of_fame(1, true); + + std::cout << "got hof" << std::endl; + + // will look only in the first half of the population (this is intended to be done after survival step) + for (int i=0; i < hof.size(); ++i) { - if (ind.rank == 1) + // TODO: i guess the right way of doing this is using island indexes (or just take the hall of fame) + std::cout << "index" << hof[i] << std::endl; + const auto& ind = *pop.individuals.at(hof[i]); + + std::cout << ind.program.get_model() << std::endl; + + std::cout << "got individual of rank" << ind.fitness.rank << std::endl; + if (val) + f = ind.fitness.loss_v; + else + f = ind.fitness.loss; + + if (f < bs + || (f == bs && ind.fitness.complexity < this->best_complexity) + ) { - if (val) - f = ind.fitness.loss_v; - else - f = ind.fitness.loss; - - if (f < bs - || (f == bs && ind.get_complexity() < this->best_complexity) - ) - { - bs = f; - this->best_ind = ind; - this->best_complexity = ind.get_complexity(); - - updated = true; - } + std::cout << "updated" << std::endl; + + bs = f; + this->best_ind = ind; + this->best_complexity = ind.fitness.complexity; + + updated = true; } } @@ -74,95 +145,153 @@ bool Estimator::update_best(const Dataset& data, bool val) template -void Estimator::run_generation(unsigned int g, Dataset &data) +void Estimator::run(Dataset &data) { - // https://taskflow.github.io/taskflow/ParallelIterations.html - tf::Executor executor; - tf::Taskflow taskflow; // TODO: how to set number of threads? - - // TODO: implement custom behavior for first generation (specially regarding evaluator) - params.current_gen = g; + // It is up to the python side to create the dataset (we have a cool wrapper for that) + std::cout << "starting to run" << std::endl; - auto batch = data.get_batch(); // will return the original dataset if it is set to dont use batch - - vector> island_parents; - island_parents.resize(pop.num_islands); - taskflow.for_each_index(0, pop.num_islands, 1, [&](int island) { - tuple island_range = pop.get_island_range(island); - - // fit the weights with all training data - evaluator.update_fitness(pop, island_range, data, params, true, false); - evaluator.validation(pop, island_range, data, params, false); - - // TODO: if using batch, fitness should be called before selection to set the batch - if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.update_fitness(pop, island_range, batch, params, false, false); - - // select parents - vector parents = selector.select(pop, island_range, params, data); - island_parents.at(island) = parents; - }); - - vector survivors(params.pop_size); // TODO: check that I dont use pop.size() (or I use correctly, because it will return the size with the slots for the offspring) - pop.add_offspring_indexes(); - - taskflow.for_each_index(0, pop.num_islands, 1, [&](int island) { - tuple island_range = pop.get_island_range(island); + //TODO: i need to make sure i initialize everything (pybind needs to have constructors + // without arguments to work, and i need to handle correcting these values before running) + this->ss = SearchSpace(data, params.functions); + std::cout << "search space was set" << std::endl; - // // variation to produce offspring - variator.vary(pop, island_range, island_parents.at(island)); + this->init(); + std::cout << "estimator initialized" << std::endl; - evaluator.update_fitness(pop, island_range, data, params, true, true); - evaluator.validation(pop, island_range, data, params, true); + pop.init(this->ss, this->params); - if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.update_fitness(pop, island_range, batch, params, false, true); + std::cout << "pop initialized" << std::endl; + std::cout << pop.print_models() << std::endl; - // select survivors from combined pool of parents and offspring - auto island_survivors = survivor.survive(pop, island_range, params, data); - - auto [idx_start, idx_end] = island_range; - size_t delta = idx_end - idx_start; - for (unsigned i = 0; i -void Estimator::fit(MatrixXf& X, VectorXf& y) -{ - this->init(); + int threads; + if (params.n_jobs == -1) + threads = std::thread::hardware_concurrency(); + else if (params.n_jobs == 0) + threads = params.num_islands; + else + threads = params.n_jobs; - // TODO: fit method that takes different arguments? - Dataset data(X, y); + tf::Executor executor(threads); // TODO: executor could be an attribute (so I can move a lot of stuff here to init) + std::cout << "using n threads " << threads << std::endl; - //TODO: i need to make sure i initialize everything (pybind needs to have constructors without arguments to work, and i need to handle correcting these values before running) - this->ss = SearchSpace(data, params.functions); - this->pop = Population(params.pop_size, params.num_islands); - this->evaluator = Evaluation(params.scorer_); - this->selector = Selection(params.sel, false); - this->survivor = Selection(params.surv, true); + assert( (executor.num_workers() > 0) && "Invalid number of workers"); - // TODO: initialize (set operator) for survivor and selector - // initialize population with initial model and/or starting pop - pop.init(this->ss, this->params); + tf::Taskflow taskflow; - unsigned g = 0; - // continue until max gens is reached or max_time is up (if it is set) + // TODO: get references to all classes ( so they can be captured by taskflow) (like some private getters and setters) - while(g> island_parents; + island_parents.resize(pop.num_islands); + vector> survivors; + survivors.resize(pop.num_islands); - set_is_fitted(true); -} + std::cout << "vectors are created " << std::endl; + // TODO: progress bar? (it would be cool) + // heavily inspired in https://github.com/heal-research/operon/blob/main/source/algorithms/nsga2.cpp + auto [init, cond, body, back, done] = taskflow.emplace( + [&]() { /* done nothing to do */ }, // init (entry point for taskflow) + stop, // loop condition + + [&](tf::Subflow& subflow) { // loop body (evolutionary main loop) + std::cout << "inside body" << std::endl; + auto prepare_gen = subflow.emplace([&]() { + std::cout << "inside prepare gen" << std::endl; + std::cout << "generation " << generation << std::endl; + params.set_current_gen(generation); + batch = data.get_batch(); // will return the original dataset if it is set to dont use batch + + island_parents.clear(); + island_parents.resize(pop.num_islands); + + survivors.clear(); + survivors.resize(pop.num_islands); + + ++generation; + }).name("prepare generation");// set generation in params, get batch + + auto select_parents = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { + std::cout << "inside select parents" << std::endl; + evaluator.update_fitness(this->pop, island, data, params, true, false); // fit the weights with all training data + + // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk) + // TODO: if using batch, fitness should be called before selection to set the batch + if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) + evaluator.update_fitness(this->pop, island, batch, params, false, false); + + vector parents = selector.select(this->pop, island, params); + + island_parents.at(island) = parents; + }).name("select parents for each island"); + + auto generate_offspring = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { + std::cout << "inside generate offspring" << std::endl; + this->pop.add_offspring_indexes(island); // we just need to add them, not remove (they are removed in survival step, that will return a selection with the same number of individuals as the original island size) + + // // variation to produce offspring + variator.vary(this->pop, island, island_parents.at(island)); + + evaluator.update_fitness(this->pop, island, data, params, true, true); + // evaluator.validation(*this->pop, island_range, data, params, true); + + if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) + evaluator.update_fitness(this->pop, island, batch, params, false, true); + + // select survivors from combined pool of parents and offspring + vector island_survivors = survivor.survive(this->pop, island, params); + + survivors.at(island) = island_survivors; + }).name("generate offspring for each island"); + + auto survive = subflow.emplace([&]() { this->pop.update(survivors); }).name("survival of the fittest"); + + auto migration = subflow.emplace([&]() { this->pop.migrate(); }).name("migration between islands"); + + // TODO: update best, update log, increment generation counter (but not set in params) + auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); }).name("update best, log, archive"); + + // set-up subflow graph + prepare_gen.precede(select_parents); + select_parents.precede(generate_offspring); + generate_offspring.precede(survive); + survive.precede(migration); + migration.precede(finish_gen); + }, + + [&]() { return 0; }, // jump back to the next iteration + + [&]() { this->set_is_fitted(true); } // work done, report last gen and stop + ); // evolutionary loop + + init.name("init"); + cond.name("termination"); + body.name("main loop"); + back.name("back"); + done.name("done"); + taskflow.name("island_gp"); + + init.precede(cond); + cond.precede(body, done); + body.precede(back); + back.precede(cond); + + std::cout << "taskflow configured " << std::endl; + executor.run(taskflow); + executor.wait_for_all(); +} } \ No newline at end of file diff --git a/src/estimator.h b/src/estimator.h index 3f3e9550..7a54ac05 100644 --- a/src/estimator.h +++ b/src/estimator.h @@ -15,6 +15,8 @@ license: GNU/GPL v3 #include "selection/selection.h" #include "taskflow/taskflow.hpp" +#include + namespace Brush { @@ -26,108 +28,33 @@ using namespace Var; template class Estimator{ public: - Estimator() - : params(Parameters()) - , ss(SearchSpace()) - , variator(Variation(params, ss)) + Estimator(const Parameters& p=Parameters()) + : params(p) + , ss(SearchSpace()) // we need to initialize ss and variator. TODO: make them have a default way so we dont have to initialize here + , variator(Variation(params, ss)) {}; - + ~Estimator(){}; - void init(); - //getters and setters for GA configuration --------------------------------- - /// set flag indicating whether fit has been called - inline void set_is_fitted(bool f){is_fitted=f;} + // all hyperparameters are controlled by the parameter class. please refer to that to change something + inline Parameters& get_params(){return params;} + inline void set_params(Parameters& p){params=p;} + inline bool get_is_fitted(){return is_fitted;} - // TODO: WRAPPER SHOULD SET ALL THESE (by changing the inner parameter instance) + /// updates best score by searching in the population for the individual that best fits the given data + bool update_best(const Dataset& data, bool val=false); - void set_pop_size(int pop_size){ params.pop_size = pop_size; }; - int get_pop_size(){ return params.pop_size; }; - - void set_gens(int gens){ params.gens = gens; }; - int get_gens(){ return params.gens; }; - - void set_max_depth(unsigned int max_depth){ params.max_depth = max_depth; }; - int get_max_depth(){ return params.max_depth; }; - - void set_max_size(unsigned int max_size){ params.max_size = max_size; }; - int get_max_size(){ return params.max_size; }; - - void set_mode(string mode) { params.mode = mode; }; - string get_mode(){ return params.mode; }; - - void set_selection(string sel){ params.sel = sel; }; - string get_selection(){ return params.sel; }; - - void set_survival(string surv){ params.surv = surv; }; - string get_survival(){ return params.surv; }; - - void set_num_islands(int num_islands){ params.num_islands = num_islands; }; - int get_num_islands(){ return params.num_islands; }; - - void set_objectives(const vector& obj){params.objectives = obj; }; - auto get_objectives(){return params.objectives; }; - - void set_random_state(int random_state) { - params.random_state = random_state; - r.set_seed(params.random_state); - }; - int get_random_state() { return params.random_state; }; - - void set_mig_prob(float mig_prob){ params.mig_prob = mig_prob;}; - float get_mig_prob(){ return params.mig_prob; }; - - void set_cross_prob(float cross_prob){ params.cx_prob = cross_prob;}; - float get_cross_prob(){ return params.cx_prob; }; - - // TODO: MAKE functions work - // sets available functions based on comma-separated list. - // void set_functions(const vector& fns){ params.functions = fns; }; - // unordered_map get_functions(){ return params.functions; }; - - void set_mutation_probs(std::map mutation_probs){ params.mutation_probs = mutation_probs;}; - std::map get_mutation_probs(){ return params.mutation_probs; }; - - //TODO - ///return fraction of data to use for training - // float get_split(); - // /// set train fraction of dataset - // void set_split(float sp); - - // TODO - // int get_batch_size(){return params.bp.batch_size;}; - // void set_batch_size(int bs); - - // TODO - ///set number of threads (and use them in taskflow) - // void set_n_jobs(unsigned t); - // int get_n_jobs(){return omp_get_num_threads();}; - - ///set flag to use batch for training - // void set_use_batch(); - - // TODO getters and setters for the best solution found after evolution - // predict, transform, predict_proba, etc. - // get statistics - // load and save best individuals - // logger, save to file - // execution archive - // score functions - // fit methods (this will run the evolution) - - /// updates best score - bool update_best(const Dataset& data, bool val=false); + // TODO: im thinking about getting rid of these first two, and keep only the best ind + float best_loss; + int best_complexity; + Individual& get_best_ind(){return best_ind;}; /// train a model. TODO: take arguments needed to build the dataset. once we have it, go through params to set global options and use them - void fit(MatrixXf& X); - void fit(MatrixXf& X, VectorXf& y); + void run(Dataset &d); - bool is_fitted; ///< keeps track of whether fit was called. - - void run_generation(unsigned int g, Dataset &data); + Parameters params; ///< hyperparameters of brush, which the user can interact private: - Parameters params; ///< hyperparameters of brush SearchSpace ss; Population pop; ///< population of programs @@ -137,11 +64,15 @@ class Estimator{ Selection survivor; ///< survival algorithm // TODO: MISSING CLASSES: timer, archive, logger - float best_loss; - int best_complexity; Individual best_ind; + bool is_fitted; ///< keeps track of whether fit was called. + + void init(); + + /// set flag indicating whether fit has been called + inline void set_is_fitted(bool f){is_fitted=f;} - // calculate/print stats + // TODO: calculate/print stats }; } // Brush diff --git a/src/params.h b/src/params.h index 65ad1a45..1bcca676 100644 --- a/src/params.h +++ b/src/params.h @@ -35,7 +35,7 @@ struct Parameters string sel = "lexicase"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; - int num_islands=1; + int num_islands=5; // variation std::map mutation_probs = { @@ -68,7 +68,7 @@ struct Parameters float batch_size = 0.0; bool use_batch = false; ///< whether to use mini batch for training - int n_jobs = 1; ///< number of parallel jobs (TODO if -1, equals the number of islands?) + int n_jobs = 1; // -1; ///< number of parallel jobs -1 use all threads; 0 use same as number of islands; positive number specify the amouut of threads Parameters(){}; ~Parameters(){}; From 6d3c3cc026171ac4285f7165ebee43078d25cd41 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 1 Mar 2024 21:17:40 -0300 Subject: [PATCH 125/199] Updated tests, bindings, and wrapper --- pybrush/DeapEstimator.py | 7 +++++ pybrush/__init__.py | 5 +++- pybrush/deap_api/__init__.py | 2 +- pybrush/deap_api/nsga2.py | 50 -------------------------------- src/bindings/bind_estimators.cpp | 8 ++--- src/bindings/bind_estimators.h | 28 ++++++++++++++++-- src/bindings/bind_programs.h | 1 + src/bindings/module.cpp | 9 ++++-- tests/cpp/test_brush.cpp | 43 +++++++++++++++++++++++++++ tests/cpp/test_population.cpp | 2 ++ tests/cpp/testsHeader.h | 10 +++++++ 11 files changed, 103 insertions(+), 62 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index a292bf1e..e8e022af 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -304,9 +304,16 @@ def fit(self, X, y): )(self.parameters_, self.search_space_) elif self.mode == "regressor": self.variator_ = _brush.RegressorVariator(self.parameters_, self.search_space_) + + # from pybrush import RegressorEngine + # brush_estimator = RegressorEngine(self.parameters_) + # brush_estimator.run(self.data_) + # print(brush_estimator.is_fitted) + # print(brush_estimator.best_ind) else: raise("Unsupported mode") + self.toolbox_ = self._setup_toolbox() # nsga2 and ga differ in the toolbox diff --git a/pybrush/__init__.py b/pybrush/__init__.py index 0d348a92..6a4c518e 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -10,4 +10,7 @@ ClassifierIndividual, MultiClassifierIndividual # Prototyping an EA using brush classes, but other EA framework -from pybrush.DeapEstimator import DeapClassifier, DeapRegressor \ No newline at end of file +from pybrush.DeapEstimator import DeapClassifier, DeapRegressor + +# c++ learning engines. These are wrapped into a scikit-learn-like estimator in the python side +from _brush.engine import RegressorEngine, ClassifierEngine, MultiClassifierEngine \ No newline at end of file diff --git a/pybrush/deap_api/__init__.py b/pybrush/deap_api/__init__.py index d74636f0..e13697ee 100644 --- a/pybrush/deap_api/__init__.py +++ b/pybrush/deap_api/__init__.py @@ -1 +1 @@ -from pybrush.deap_api.nsga2 import nsga2, DeapIndividual # TODO: use brush individual instead of deap \ No newline at end of file +from pybrush.deap_api.nsga2 import nsga2 \ No newline at end of file diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index 3175e129..048cde56 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -3,11 +3,6 @@ import numpy as np import functools -class DeapIndividual(): - """Class that wraps brush program for creator.Individual class from DEAP.""" - def __init__(self, program): - self.program = program - def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt): # NGEN = 250 # MU = 100 @@ -69,37 +64,11 @@ def calculate_statistics(ind): print("pop after select") for p in pop: print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) print("--"*20) print("selected parents") for p in parents: print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) - - # offspring = [] - # for ind1, ind2 in zip(parents, parents[1:]+parents[0:1]): - # off = None - # if rnd_flt() < CXPB: # either mutation or crossover - # off = toolbox.mate(ind1, ind2) - # else: - # off = toolbox.mutate(ind1) - - # if off is not None: # first we fit, then add to offspring - # offspring.extend([off]) - - # # filling offspring empty slots - # offspring = offspring + toolbox.population(n=MU - len(offspring)) offspring = toolbox.vary_pop(parents) offspring = list(toolbox.map(toolbox.assign_fit, offspring)) @@ -108,12 +77,6 @@ def calculate_statistics(ind): print("offspring") for p in offspring: print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) @@ -123,12 +86,6 @@ def calculate_statistics(ind): print("pop after survival") for p in pop: print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) pop = toolbox.migrate(pop) @@ -136,13 +93,6 @@ def calculate_statistics(ind): print("pop after migration") for p in pop: print(p.program.get_model()) - # print(p.fitness.values) - # print(p.fitness.weights) - # print(p.fitness.wvalues) - # print(p.fitness.rank) - # print(p.fitness.loss_v) - # print(p.fitness.crowding_dist) - pop.sort(key=lambda x: x.fitness, reverse=True) record = stats.compile(pop) diff --git a/src/bindings/bind_estimators.cpp b/src/bindings/bind_estimators.cpp index cb7cb4af..5f908c54 100644 --- a/src/bindings/bind_estimators.cpp +++ b/src/bindings/bind_estimators.cpp @@ -7,10 +7,10 @@ namespace nl = nlohmann; void bind_estimators(py::module& m) { - bind_estimator(m, "BrushRegressorEstimator"); - bind_estimator(m, "BrushClassifierEstimator"); + bind_estimator(m, "RegressorEngine"); + bind_estimator(m, "ClassifierEngine"); // TODO: make these work - bind_estimator(m, "BrushMultiClassifierEstimator"); - bind_estimator(m, "BrushRepresenterEstimator"); + bind_estimator(m, "MultiClassifierEngine"); + bind_estimator(m, "RepresenterEngine"); } \ No newline at end of file diff --git a/src/bindings/bind_estimators.h b/src/bindings/bind_estimators.h index 55dd0661..e16aa719 100644 --- a/src/bindings/bind_estimators.h +++ b/src/bindings/bind_estimators.h @@ -1,5 +1,22 @@ #include "module.h" #include "../estimator.h" +#include "../estimator.cpp" + +// TODO: figure out why do I need to include the whole thing (otherwise it gives me symbol errors) +#include "../selection/selection.h" +#include "../selection/selection.cpp" +#include "../selection/selection_operator.h" +#include "../selection/selection_operator.cpp" +#include "../selection/nsga2.h" +#include "../selection/nsga2.cpp" +#include "../selection/lexicase.h" +#include "../selection/lexicase.cpp" + +#include "../eval/evaluation.h" +#include "../eval/evaluation.cpp" + +#include "../population.cpp" +#include "../population.h" using Reg = Brush::RegressorEstimator; using Cls = Brush::ClassifierEstimator; @@ -13,7 +30,7 @@ using stream_redirect = py::call_guard void bind_estimator(py::module& m, string name) -{ +{ using RetType = std::conditional_t< std::is_same_v, ArrayXf, std::conditional_t, ArrayXb, @@ -21,8 +38,13 @@ void bind_estimator(py::module& m, string name) py::class_ estimator(m, name.data() ); estimator.def(py::init<>()) - .def_property("pop_size", &T::get_pop_size, &T::set_pop_size) - .def_property("gens", &T::get_gens, &T::set_gens) + .def(py::init([](br::Parameters& p){ T e(p); + return e; }) + ) + .def_property("params", &T::get_params, &T::set_params) + .def_property_readonly("is_fitted", &T::get_is_fitted) + .def_property_readonly("best_ind", &T::get_best_ind) + .def("run", &T::run, "run from brush dataset") ; // specialization for subclasses diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index 49ca8ff7..81d5b294 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -71,6 +71,7 @@ void bind_program(py::module& m, string name) ; if constexpr (std::is_same_v) { + // TODO: have these in individual and wrapper prog.def("predict_proba", static_cast(&T::predict_proba), "predict from Dataset object") diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index b6f50f46..5c3f1ca7 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -40,8 +40,11 @@ PYBIND11_MODULE(_brush, m) { bind_params(m); bind_dataset(m); bind_search_space(m); + + // should these 4 below be exposed? bind_variations(m); bind_selections(m); + bind_evaluators(m); // bind_populations(m); // solutions @@ -50,7 +53,7 @@ PYBIND11_MODULE(_brush, m) { py::module_ m3 = m.def_submodule("individual", "Contains Individual classes."); bind_individuals(m3); - - // bind_estimators(m); - bind_evaluators(m); + + py::module_ m4 = m.def_submodule("engine", "Learning engines (used inside the python estimators)."); + bind_estimators(m4); } diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index e69de29b..7eae4660 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -0,0 +1,43 @@ +#include "testsHeader.h" +#include "../../src/search_space.h" +#include "../../src/program/program.h" +#include "../../src/program/dispatch_table.h" +#include "../../src/data/io.h" +#include "../../src/estimator.h" +#include "../../src/estimator.cpp" +#include "../../src/selection/selection.h" +#include "../../src/selection/selection_operator.h" +#include "../../src/selection/nsga2.h" +#include "../../src/selection/lexicase.h" +#include "../../src/eval/evaluation.h" +#include "../../src/population.h" + +// TODO: omg i need to figure out why my code only works if i import basically the whole stuff +#include "../../src/selection/selection.cpp" +#include "../../src/selection/selection_operator.cpp" +#include "../../src/selection/nsga2.cpp" +#include "../../src/selection/lexicase.cpp" +#include "../../src/eval/evaluation.cpp" +#include "../../src/population.cpp" + +TEST(Engine, EngineWorks) +{ + MatrixXf X(10,2); + ArrayXf y(10); + X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, + 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, + + 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, + 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; + + y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, + 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; + + Dataset data(X,y); + + Parameters params; + params.set_pop_size(10); + params.set_gens(10); + Brush::RegressorEstimator est(params); + est.run(data); +} \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 5f051079..410e5923 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -59,6 +59,8 @@ TEST(Population, PopulationTests) // Its size is actually the double, // but the real value goes just up to the middle (no offspring was initialized) + // TODO: put a lot of asserts here between the steps + for (int i=0; i Date: Sun, 10 Mar 2024 16:20:23 -0300 Subject: [PATCH 126/199] Fixed core dump due to parallel calculations --- src/selection/lexicase.cpp | 20 ++++++++- src/selection/nsga2.cpp | 87 +++++++++++++++++++++----------------- 2 files changed, 66 insertions(+), 41 deletions(-) diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index 8fda38d9..831ae730 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -18,22 +18,29 @@ template vector Lexicase::select(Population& pop, int island, const Parameters& params) { + cout << "select lexicase island " << island << endl; + // this one can be executed in parallel because it is just reading the errors. This // method assumes that the expressions have been fitted previously, and their respective // error vectors are filled auto island_pool = pop.get_island_indexes(island); + cout << "got indexes " << endl; + // if this is first generation, just return indices to pop if (params.current_gen==0) return island_pool; //< number of samples - unsigned int N = pop.individuals.at(0)->error.size(); + unsigned int N = pop.individuals.at(island_pool.at(0))->error.size(); //< number of individuals unsigned int P = island_pool.size(); + cout << "pool size is " << P << endl; + cout << "epsilon size is " << N << endl; + // define epsilon ArrayXf epsilon = ArrayXf::Zero(N); @@ -41,6 +48,8 @@ vector Lexicase::select(Population& pop, int island, if (!params.classification || params.scorer_.compare("log")==0 || params.scorer_.compare("multi_log")==0) { + cout << "using lexicase for regression " << endl; + // for each sample, calculate epsilon for (int i = 0; i Lexicase::select(Population& pop, int island, epsilon(i) = mad(case_errors); } } + assert(epsilon.size() == N); // selection pool vector starting_pool; @@ -63,12 +73,16 @@ vector Lexicase::select(Population& pop, int island, vector selected(P,0); // selected individuals - #pragma omp parallel for + // #pragma omp parallel for for (unsigned int i = 0; i cases; // cases (samples) if (params.classification && !params.class_weights.empty()) { + cout << "using WEIGHTED for classification " << endl; + // for classification problems, weight case selection // by class weights vector choices(N); @@ -147,6 +161,8 @@ vector Lexicase::select(Population& pop, int island, //if more than one winner, pick randomly selected.at(i) = *r.select_randomly( winner.begin(), winner.end() ); + + cout << "parallel end index " + to_string(i) << endl; } if (selected.size() != island_pool.size()) diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 6edf5041..1214b523 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -39,6 +39,8 @@ template vector NSGA2::select(Population& pop, int island, const Parameters& params) { + cout << "select nsga island" << island << endl; + // tournament selection. TODO: move this to tournament selection file, and throw not implemented error in nsga. auto island_pool = pop.get_island_indexes(island); @@ -74,15 +76,20 @@ vector NSGA2::survive(Population& pop, int island, const Parameters& params) { - // fmt::print("starting\n"); + fmt::print("starting survive\n"); - auto island_pool = pop.get_island_indexes(island); + cout << "survive nsga island " << island << endl; + + size_t idx_start = std::floor(island*params.pop_size/params.num_islands); + size_t idx_end = std::floor((island+1)*params.pop_size/params.num_islands); - // TODO: do similar calculations in other selection survival and pop methods (so I dont mess up with anything) - int original_size = params.pop_size/params.num_islands; // original island size (survive must be called with an island with offfspring) + auto original_size = idx_end - idx_start; // original island size (survive must be called with an island with offfspring) - // fmt::print("original size {}\n", original_size); - // fmt::print("island size {}\n", island_pool.size()); + fmt::print("original size {}\n", original_size); + + auto island_pool = pop.get_island_indexes(island); + + fmt::print("island size {}\n", island_pool.size()); // set objectives (this is when the obj vector is updated.) @@ -91,53 +98,53 @@ vector NSGA2::survive(Population& pop, int island, // pop.individuals.at(island_pool[i])->set_obj(params.objectives); // fast non-dominated sort - // fmt::print("fast nds for island {}\n", island); + fmt::print("fast nds for island {}\n", island); auto front = fast_nds(pop, island_pool); - // fmt::print("selecting...\n"); + fmt::print("selecting...\n"); // Push back selected individuals until full vector selected; - // fmt::print("created array...\n"); + fmt::print("created array...\n"); selected.resize(0); - // fmt::print("resized...\n"); + fmt::print("resized...\n"); int i = 0; - // fmt::print("starting loop...\n"); - // fmt::print("selected size {}...\n",selected.size()); - // fmt::print("first front size {}...\n", front.at(i).size()); - // fmt::print("goal is to select n individuals: {}...\n", original_size); + fmt::print("starting loop...\n"); + fmt::print("selected size {}...\n",selected.size()); + fmt::print("first front size {}...\n", front.at(i).size()); + fmt::print("goal is to select n individuals: {}...\n", original_size); while ( i < front.size() - && ( selected.size() + front.at(i).size() < original_size ) // (size/2) because we want to get to the original size (prepare_offspring_slots doubled it before survival operation) + && ( selected.size() + front.at(i).size() < original_size ) ) { - // fmt::print("1...\n"); + fmt::print("1...\n"); std::vector& Fi = front.at(i); // indices in front i - // fmt::print("2...\n"); + fmt::print("2...\n"); crowding_distance(pop, front, i); // calculate crowding in Fi - // fmt::print("3...\n"); + fmt::print("3...\n"); for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi selected.push_back(Fi.at(j)); - // fmt::print("4...\n"); + fmt::print("4...\n"); ++i; } - // fmt::print("crowding distance\n"); + fmt::print("crowding distance\n"); crowding_distance(pop, front, i); // calculate crowding in final front to include std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); - // fmt::print("adding last front)\n"); + fmt::print("adding last front)\n"); const int extra = original_size - selected.size(); for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] selected.push_back(front.at(i).at(j)); - // fmt::print("returning\n"); + fmt::print("returning\n"); return selected; } @@ -147,7 +154,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan // this will update pareto dominance attributes in fitness class // based on the population - // fmt::print("inside fast nds with island pool of size {} from pop of size {}\n", island_pool.size(), pop.size()); + fmt::print("inside fast nds with island pool of size {} from pop of size {} and\n", island_pool.size(), pop.size()); //< the Pareto fronts vector> front; @@ -155,6 +162,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan front.resize(1); front.at(0).clear(); + // this pragma must go alongside with the inner pragma omp critical (to avoid racing conditions) #pragma omp parallel for for (int i = 0; i < island_pool.size(); ++i) { @@ -179,22 +187,23 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan #pragma omp critical { + // TODO: dcounter rank etc should be local variables (because one individual can be in multiple islands) p->fitness.dcounter = dcount; p->fitness.dominated.clear(); p->fitness.dominated = dom; // dom will have values already referring to island indexes if (p->fitness.dcounter == 0) { - // fmt::print("pushing {}...\n", island_pool[i]); + fmt::print("pushing {}...\n", island_pool[i]); p->fitness.set_rank(1); // front will have values already referring to island indexes front.at(0).push_back(island_pool[i]); } - // fmt::print("... index {} dominates {} ({}) and was dominated by {} ({})\n", island_pool[i], dom.size(), p->fitness.get_dominated().size(), dcount, p->fitness.get_dcounter()); + fmt::print("... index {} dominates {} ({}) and was dominated by {} ({})\n", island_pool[i], dom.size(), p->fitness.get_dominated().size(), dcount, p->fitness.get_dcounter()); } } - // fmt::print("First front size {}...\n", front.at(0).size()); + fmt::print("First front size {}...\n", front.at(0).size()); // using OpenMP can have different orders in the front.at(0) // so let's sort it so that the algorithm is deterministic @@ -203,7 +212,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan int fi = 1; while (front.at(fi-1).size() > 0) { - // fmt::print("starting front {} with size \n", fi, front.at(fi-1).size()); + fmt::print("starting front {} with size {} \n", fi, front.at(fi-1).size()); std::vector& fronti = front.at(fi-1); std::vector Q; @@ -211,19 +220,19 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan const Individual& p = pop[fronti.at(i)]; - // fmt::print("ind {} dominated {} \n", fronti.at(i), p.fitness.dominated.size()); + fmt::print("ind {} dominated {} \n", fronti.at(i), p.fitness.dominated.size()); // iterating over dominated individuals for (int j = 0; j < p.fitness.dominated.size() ; ++j) { - // fmt::print("decreased counter of ind {} for {} to {} \n", j, p.fitness.dominated.at(j), pop.individuals.at(p.fitness.dominated.at(j))->fitness.dcounter); + fmt::print("decreased counter of ind {} for {} to {} \n", j, p.fitness.dominated.at(j), pop.individuals.at(p.fitness.dominated.at(j))->fitness.dcounter); auto q = pop.individuals.at(p.fitness.dominated.at(j)); - // fmt::print("decreased counter \n"); + fmt::print("decreased counter \n"); q->fitness.dcounter -= 1; if (q->fitness.dcounter == 0) { - // fmt::print("updated counter for ind {} \n", j); + fmt::print("updated counter for ind {} \n", j); q->fitness.set_rank(fi+1); Q.push_back(p.fitness.dominated.at(j)); @@ -232,12 +241,12 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan } front.push_back(Q); - // fmt::print("front {} ended with size {}...\n", fi, Q.size()); + fmt::print("front {} ended with size {}...\n", fi, Q.size()); fi += 1; } - // fmt::print("finished\n"); + fmt::print("finished\n"); return front; } @@ -246,27 +255,27 @@ template void NSGA2::crowding_distance(Population& pop, vector>& front, int fronti) { - // fmt::print("inside crowding distance for front {}...\n", fronti); + fmt::print("inside crowding distance for front {}...\n", fronti); std::vector F = front.at(fronti); if (F.size() == 0 ){ - // fmt::print("empty front\n"); + fmt::print("empty front\n"); return; } const int fsize = F.size(); - // fmt::print("front size is {}...\n", fsize); + fmt::print("front size is {}...\n", fsize); for (int i = 0; i < fsize; ++i) pop.individuals.at(F.at(i))->fitness.crowding_dist = 0; - // fmt::print("reseted crowding distance for individuals in this front\n"); + fmt::print("reseted crowding distance for individuals in this front\n"); const int limit = pop.individuals.at(0)->fitness.get_wvalues().size(); - // fmt::print("limit is {}\n", limit); + fmt::print("limit is {}\n", limit); for (int m = 0; m < limit; ++m) { - // fmt::print("m {}\n", m); + fmt::print("m {}\n", m); std::sort(F.begin(), F.end(), comparator_obj(pop,m)); From 61f9e59dd58f552797e08ba7c81d83fea2791977 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sun, 10 Mar 2024 16:22:53 -0300 Subject: [PATCH 127/199] Assert in eval (for debug reasons). Will be removed later --- src/eval/evaluation.cpp | 10 ++++------ src/eval/evaluation.h | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index c0675271..62a48414 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -12,18 +12,14 @@ void Evaluation::update_fitness(Population& pop, const Dataset& data, const Parameters& params, bool fit, - bool offspring, bool validation ) { //TODO: it could use the validation_loss auto idxs = pop.get_island_indexes(island); - int start = 0; - if (offspring) - start = idxs.size()/2; - - for (unsigned i = start; i& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work @@ -45,7 +41,9 @@ void Evaluation::update_fitness(Population& pop, assign_fit(ind, data, params, validation); } + ++counter; } + assert(counter > 0); } // assign loss to program diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 911ed744..242cf916 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -23,6 +23,7 @@ class Evaluation { public: Scorer S; + // TODO: make eval update loss_v accordingly, and set to th same as train loss if there is no batch or no validation Evaluation(){ string scorer; if ( (T == Brush::ProgramType::MulticlassClassifier) @@ -49,7 +50,6 @@ class Evaluation { const Dataset& data, const Parameters& params, bool fit=true, - bool offspring = false, bool validation=false ); From 23fbd84e173499697ad1ba6bb9f829e69f6468b9 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sun, 10 Mar 2024 16:23:59 -0300 Subject: [PATCH 128/199] added some comments --- src/individual.cpp | 7 ++++--- src/individual.h | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/individual.cpp b/src/individual.cpp index 89d06ab4..3a27b9a7 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -43,13 +43,14 @@ int Fitness::dominates(const Fitness& b) const flag2 = 0; // to check if b has a smaller objective // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) - // TODO: save fitness in an temporary variable and stop accessing it everytime for (int i=0; i b.get_wvalues().at(i) - || std::isnan(b.get_wvalues().at(i)) ) + || std::isnan(b.get_wvalues().at(i)) + ) flag1 = 1; if (get_wvalues().at(i) < b.get_wvalues().at(i) - || std::isnan(get_wvalues().at(i)) ) + || std::isnan(get_wvalues().at(i)) + ) flag2 = 1; } diff --git a/src/individual.h b/src/individual.h index b10c2501..e5b1fb76 100644 --- a/src/individual.h +++ b/src/individual.h @@ -35,8 +35,9 @@ struct Fitness { size_t size; size_t depth; - unsigned int dcounter; ///< number of individuals this dominates + // these can be different depending on the island the individual is + unsigned int dcounter; ///< number of individuals this dominates vector dominated; ///< individual indices this dominates unsigned int rank; ///< pareto front rank float crowding_dist; ///< crowding distance on the Pareto front From d768db928619b29ef0098fe56e11a4aaa5876fbe Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sun, 10 Mar 2024 16:24:57 -0300 Subject: [PATCH 129/199] Fixed core dumps. Forcing unused indexes in pop to be nullptr this makes it easier to catch wrong accesses. taskflow is working now. new parameter n_jobs --- pybrush/DeapEstimator.py | 13 ++- src/bindings/bind_params.cpp | 1 + src/estimator.cpp | 104 +++++++++++++++----- src/params.h | 3 + src/population.cpp | 183 ++++++++++++++++++++++++++--------- src/population.h | 8 +- src/variation.cpp | 11 ++- tests/cpp/test_brush.cpp | 54 ++++++++++- 8 files changed, 290 insertions(+), 87 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index e8e022af..4ce8cc65 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -128,6 +128,7 @@ def __init__( max_depth=3, max_size=20, num_islands=1, + n_jobs=1, mig_prob=0.05, cx_prob= 1/7, mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, @@ -151,6 +152,7 @@ def __init__( self.max_size=max_size self.num_islands=num_islands self.mig_prob=mig_prob + self.n_jobs=n_jobs self.cx_prob=cx_prob self.mutation_probs=mutation_probs self.functions=functions @@ -286,6 +288,7 @@ def fit(self, X, y): self.parameters_ = _brush.Parameters() self.parameters_.classification = self.mode == "classification" self.parameters_.n_classes = self.n_classes_ + self.parameters_.n_jobs = self.n_jobs self.parameters_.pop_size = self.pop_size self.parameters_.gens = self.gens self.parameters_.num_islands = self.num_islands @@ -305,11 +308,11 @@ def fit(self, X, y): elif self.mode == "regressor": self.variator_ = _brush.RegressorVariator(self.parameters_, self.search_space_) - # from pybrush import RegressorEngine - # brush_estimator = RegressorEngine(self.parameters_) - # brush_estimator.run(self.data_) - # print(brush_estimator.is_fitted) - # print(brush_estimator.best_ind) + from pybrush import RegressorEngine + brush_estimator = RegressorEngine(self.parameters_) + brush_estimator.run(self.data_) + print(brush_estimator.is_fitted) + print(brush_estimator.best_ind) else: raise("Unsupported mode") diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 81769365..39a4ffa6 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -28,6 +28,7 @@ void bind_params(py::module& m) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) + .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) .def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification) .def_property("max_depth", &Brush::Parameters::get_max_depth, &Brush::Parameters::set_max_depth) .def_property("max_size", &Brush::Parameters::get_max_size, &Brush::Parameters::set_max_size) diff --git a/src/estimator.cpp b/src/estimator.cpp index 15025968..2c238181 100644 --- a/src/estimator.cpp +++ b/src/estimator.cpp @@ -105,7 +105,7 @@ bool Estimator::update_best(const Dataset& data, bool val) std::cout << "inside loop" << std::endl; - vector hof = this->pop.hall_of_fame(1, true); + vector hof = this->pop.hall_of_fame(1); std::cout << "got hof" << std::endl; @@ -160,7 +160,7 @@ void Estimator::run(Dataset &data) pop.init(this->ss, this->params); - std::cout << "pop initialized" << std::endl; + std::cout << "pop initialized with size " << params.pop_size << " and " << params.num_islands << "islands" << std::endl; std::cout << pop.print_models() << std::endl; evaluator.set_scorer(params.scorer_); @@ -195,10 +195,8 @@ void Estimator::run(Dataset &data) // TODO: check that I dont use pop.size() (or I use correctly, because it will return the size with the slots for the offspring) // vectors to store each island separatedly - vector> island_parents; - island_parents.resize(pop.num_islands); + vector> island_parents; vector> survivors; - survivors.resize(pop.num_islands); std::cout << "vectors are created " << std::endl; // TODO: progress bar? (it would be cool) @@ -212,7 +210,7 @@ void Estimator::run(Dataset &data) std::cout << "inside body" << std::endl; auto prepare_gen = subflow.emplace([&]() { std::cout << "inside prepare gen" << std::endl; - std::cout << "generation " << generation << std::endl; + std::cout << " -------------------- generation " << generation << " -------------------- " << std::endl; params.set_current_gen(generation); batch = data.get_batch(); // will return the original dataset if it is set to dont use batch @@ -222,45 +220,94 @@ void Estimator::run(Dataset &data) survivors.clear(); survivors.resize(pop.num_islands); + for (int i=0; i< params.num_islands; i++){ + size_t idx_start = std::floor(i*params.pop_size/params.num_islands); + size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands); + + // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start + auto delta = idx_end - idx_start; + + survivors.at(i).clear(); + island_parents.at(i).clear(); + + survivors.at(i).resize(delta); + island_parents.at(i).resize(delta); + } + ++generation; }).name("prepare generation");// set generation in params, get batch auto select_parents = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { std::cout << "inside select parents" << std::endl; - evaluator.update_fitness(this->pop, island, data, params, true, false); // fit the weights with all training data + evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk) // TODO: if using batch, fitness should be called before selection to set the batch if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.update_fitness(this->pop, island, batch, params, false, false); + evaluator.update_fitness(this->pop, island, batch, params, false); vector parents = selector.select(this->pop, island, params); - island_parents.at(island) = parents; + for (int i=0; i< parents.size(); i++){ + std::cout << i << std::endl; + island_parents.at(island).at(i) = parents.at(i); + } }).name("select parents for each island"); - auto generate_offspring = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { - std::cout << "inside generate offspring" << std::endl; - this->pop.add_offspring_indexes(island); // we just need to add them, not remove (they are removed in survival step, that will return a selection with the same number of individuals as the original island size) - - // // variation to produce offspring - variator.vary(this->pop, island, island_parents.at(island)); - - evaluator.update_fitness(this->pop, island, data, params, true, true); - // evaluator.validation(*this->pop, island_range, data, params, true); + // this is not thread safe. But it is nice to keep out of parallel execution the bits of the + // code that uses random generators (i think this helps to having random_seed to work properly). Also, + // fit and evaluation are paralellized in survive_population, and these are expensive to run + auto generate_offspring = subflow.emplace([&]() { + + for (int island=0; island < params.num_islands; island++){ + std::cout << "inside generate offspring" << std::endl; + this->pop.add_offspring_indexes(island); // we just need to add them, not remove (they are removed in survival step, that will return a selection with the same number of individuals as the original island size) + + std::cout << "before vary" << std::endl; + + // // variation to produce offspring + variator.vary(this->pop, island, island_parents.at(island)); + std::cout << "before update fitness" << std::endl; + } + }).name("generate offspring for each island"); + + auto survive_population = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { + + evaluator.update_fitness(this->pop, island, data, params, true); + // evaluator.validation(*this->pop, island_range, data, params); + std::cout << "before batch update" << std::endl; if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) - evaluator.update_fitness(this->pop, island, batch, params, false, true); + evaluator.update_fitness(this->pop, island, batch, params, false); + std::cout << "before survive" << std::endl; // select survivors from combined pool of parents and offspring vector island_survivors = survivor.survive(this->pop, island, params); + std::cout << "before assign to survivors array" << std::endl; + + for (int i=0; i< island_survivors.size(); i++){ + std::cout << i << std::endl; + survivors.at(island).at(i) = island_survivors.at(i); + } + }).name("evaluate offspring and select survivors"); + + auto update_pop = subflow.emplace([&]() { + std::cout << "before updating survivors" << std::endl; + std::cout << pop.print_models() << std::endl; + this->pop.update(survivors); - survivors.at(island) = island_survivors; - }).name("generate offspring for each island"); - - auto survive = subflow.emplace([&]() { this->pop.update(survivors); }).name("survival of the fittest"); + std::cout << "after updating survivors" << std::endl; + std::cout << pop.print_models() << std::endl; + }).name("update population and detangle indexes"); - auto migration = subflow.emplace([&]() { this->pop.migrate(); }).name("migration between islands"); + auto migration = subflow.emplace([&]() { + std::cout << "before migrating" << std::endl; + std::cout << pop.print_models() << std::endl; + this->pop.migrate(); + + std::cout << "after migrating" << std::endl; + std::cout << pop.print_models() << std::endl; + }).name("migration between islands"); // TODO: update best, update log, increment generation counter (but not set in params) auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); }).name("update best, log, archive"); @@ -268,8 +315,9 @@ void Estimator::run(Dataset &data) // set-up subflow graph prepare_gen.precede(select_parents); select_parents.precede(generate_offspring); - generate_offspring.precede(survive); - survive.precede(migration); + generate_offspring.precede(survive_population); + survive_population.precede(update_pop); + update_pop.precede(migration); migration.precede(finish_gen); }, @@ -292,6 +340,10 @@ void Estimator::run(Dataset &data) std::cout << "taskflow configured " << std::endl; executor.run(taskflow); + + std::cout << "submitted jobs " << std::endl; + executor.wait_for_all(); + std::cout << "finished " << std::endl; } } \ No newline at end of file diff --git a/src/params.h b/src/params.h index 1bcca676..cb1af4a1 100644 --- a/src/params.h +++ b/src/params.h @@ -88,6 +88,9 @@ struct Parameters void set_max_depth(unsigned new_max_depth){ max_depth = new_max_depth; }; unsigned get_max_depth(){ return max_depth; }; + void set_n_jobs(int new_n_jobs){ n_jobs = new_n_jobs; }; + int get_n_jobs(){ return n_jobs; }; + void set_max_size(unsigned new_max_size){ max_size = new_max_size; }; unsigned get_max_size(){ return max_size; }; diff --git a/src/population.cpp b/src/population.cpp index 145001d4..cf0c04d8 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -31,6 +31,7 @@ void Population::init(vector>& new_individuals, const Parameter size_t p = pop_size; individuals.resize(2*p); + std::fill(individuals.begin(), individuals.end(), nullptr); for (int i=0; i::init(vector>& new_individuals, const Parameter island_indexes.at(i).begin() + delta, island_indexes.at(i).end(), p+idx_start); } + else + { + // // second half is space to the offspring (but we dont initialize them) + // individuals.at(i) = std::make_shared; + } }; for (int j=0; j< new_individuals.size(); j++) { @@ -71,7 +77,7 @@ void Population::init(SearchSpace& ss, const Parameters& params) for (int i=0; i::init(SearchSpace& ss, const Parameters& params) for (int i = 0; i< p; ++i) { + // first half will contain the initial population individuals.at(i) = std::make_shared>(); individuals.at(i)->init(ss, params); individuals.at(i)->set_objectives(params.objectives); + + // second half is space to the offspring (but we dont initialize them) + individuals.at(p+i) = nullptr; } + } /// update individual vector size and island indexes template void Population::add_offspring_indexes(int island) { + // TODO 2: i guess I dont need to do this (below) anymore + // TODO: find unused indexes and distribute them to the islands (I think islands can point to anywhere in the population. also make sure that the selection survival and mutation works like that) // reading and writing is thread-safe, as long as there's no overlap on island ranges. // manipulating a vector IS NOT thread-safe (inserting and erasing elements). // So, add_offspring_indexes and update should be the synchronization points, not @@ -110,7 +123,7 @@ void Population::add_offspring_indexes(int island) auto delta = idx_end - idx_start; // island size // inserting indexes of the offspring - island_indexes.at(island).resize(delta*2); + island_indexes.at(island).resize(island_indexes.at(island).size() + delta); iota( island_indexes.at(island).begin() + delta, island_indexes.at(island).end(), p+idx_start); @@ -124,41 +137,58 @@ void Population::add_offspring_indexes(int island) template void Population::update(vector> survivors) { + // this is the step that should end up cutting off half of the population vector> new_pop; - new_pop.resize(2*pop_size); - size_t i=0; + new_pop.resize(0); for (int j=0; jset_complexity(); - - ++i; // this will fill just half of the pop + new_pop.push_back( + *individuals.at(survivors.at(j).at(k)) ); } // need to make island point to original range size_t idx_start = std::floor(j*pop_size/num_islands); size_t idx_end = std::floor((j+1)*pop_size/num_islands); - auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start + // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start + auto delta = idx_end - idx_start; + + assert(delta == survivors.at(j).size() + && " migration ended up with a different popsize"); // inserting indexes of the offspring + island_indexes.at(j).clear(); island_indexes.at(j).resize(delta); iota(island_indexes.at(j).begin(), island_indexes.at(j).end(), idx_start); } + + assert(new_pop.size() == pop_size + && " update ended up with a different popsize"); + this->individuals.resize(0); for (auto ind : new_pop) { + // making hard copies of the individuals + json ind_copy = ind; + + // this will fill just half of the pop individuals.push_back( - std::make_shared>(ind) ); + std::make_shared>(ind_copy) ); + } + + assert(individuals.size() == pop_size + && " number of new individuals is different from pop size"); + + for (int i=0; i< pop_size; ++i) + { + // second half is space to the offspring (but we dont initialize them) + individuals.push_back(nullptr); } } template -string Population::print_models(bool just_offspring, string sep) +string Population::print_models(string sep) { // not printing the island each individual belongs to string output = ""; @@ -167,12 +197,9 @@ string Population::print_models(bool just_offspring, string sep) { output += "island " + to_string(j) + ":\n"; - int start = 0; - if (just_offspring) - start = island_indexes.at(j).size()/2; - - for (int k=start; k& ind = *individuals.at(island_indexes.at(j).at(k)).get(); output += ind.get_model() + sep; } @@ -181,7 +208,7 @@ string Population::print_models(bool just_offspring, string sep) } template -vector> Population::sorted_front(unsigned rank, bool ignore_offspring) +vector> Population::sorted_front(unsigned rank) { // this is used to migration and update archive at the end of a generation. expect islands without offspring @@ -194,11 +221,7 @@ vector> Population::sorted_front(unsigned rank, bool ignore_of auto idxs = island_indexes.at(j); vector pf; - auto end = idxs.size(); - if (ignore_offspring) - end = end/2; - - for (int i=0; ifitness.rank == rank) @@ -216,61 +239,75 @@ vector> Population::sorted_front(unsigned rank, bool ignore_of } template -vector Population::hall_of_fame(unsigned rank, bool ignore_offspring) +vector Population::hall_of_fame(unsigned rank) { + // TODO: remove this ignore offspring (things should work without it) // this is used to migration and update archive at the end of a generation. expect islands without offspring vector pf(0); - auto end = individuals.size(); - if (ignore_offspring) - end = end/2; - - for (unsigned int i =0; ifitness.rank == rank) - pf.push_back(i); + auto idxs = island_indexes.at(j); + for (int i=0; ifitness.rank == rank) + pf.push_back(idxs.at(i)); + } } std::sort(pf.begin(),pf.end(),SortComplexity(*this)); + auto it = std::unique(pf.begin(),pf.end(),SameFitComplexity(*this)); + pf.resize(std::distance(pf.begin(),it)); return pf; } -// TODO: check why im getting core dump in migrate or NSGA2 + template void Population::migrate() { // changes where island points to if (num_islands==1) - return; + return; // skipping. this only work because update is fixing island indexes // we cant use more than half of population here - auto island_fronts = sorted_front(1, true); - auto global_hall_of_fame = hall_of_fame(1, true); + std::cout << "finding island sorted fronts" << std::endl; + auto island_fronts = sorted_front(1); + + std::cout << "finding global hall of fame" << std::endl; + auto global_hall_of_fame = hall_of_fame(1); - // This is not thread safe (as it is now) + // This method is not thread safe (as it is now) + vector> new_island_indexes; + new_island_indexes.resize(num_islands); + + std::cout << "Looping" << std::endl; for (int island=0; island other_islands(num_islands-1); iota(other_islands.begin(), other_islands.end(), 0); @@ -279,7 +316,7 @@ void Population::migrate() auto it = other_islands.begin(); std::advance(it, island); for (;it != other_islands.end(); ++it) { - ++(*it); + ++(*it); // TODO: is this really skipping the current island? } // picking other island @@ -290,13 +327,69 @@ void Population::migrate() migrating_idx = *r.select_randomly( island_fronts.at(other_island).begin(), island_fronts.at(other_island).end()); + std::cout << "mig idx" << migrating_idx << std::endl; } - // std::cout << "index " << i << " of island " << island << " is now" << migrating_idx << std::endl; + std::cout << "index " << i << " of island " << island; + std::cout << " is now" << migrating_idx << std::endl; - island_indexes.at(island).at(i) = migrating_idx; + new_island_indexes.at(island).push_back(migrating_idx); } + else + { + new_island_indexes.at(island).push_back(idxs.at(i)); + } + } + } + // making hard copies (so the next generation starts with islands that does not share individuals + // this is particularly important to avoid multiple threads assigning different rank/crowdist/dcounter + // or different fitness) + + std::cout << "starting to consolidate pop" << std::endl; + vector> new_pop; + new_pop.resize(0); + for (int j=0; jindividuals.resize(0); + for (auto ind : new_pop) + { + // making hard copies of the individuals + json ind_copy = ind; + + // this will fill just half of the pop + individuals.push_back( + std::make_shared>(ind_copy) ); + } + for (int i=0; i< pop_size; ++i) + { + // second half is space to the offspring (but we dont initialize them) + individuals.push_back(nullptr); } } diff --git a/src/population.h b/src/population.h index fe9b653c..3cfa63c9 100644 --- a/src/population.h +++ b/src/population.h @@ -55,13 +55,13 @@ class Population{ const Individual& operator [](size_t i) {return *individuals.at(i);} /// return population equations. - string print_models(bool just_offspring=false, string sep="\n"); + string print_models(string sep="\n"); /// return complexity-sorted Pareto front indices for each island - vector> sorted_front(unsigned rank=1, bool ignore_offspring=false); + vector> sorted_front(unsigned rank=1); // pareto front ignoring island divisions - vector hall_of_fame(unsigned rank=1, bool ignore_offspring=false); + vector hall_of_fame(unsigned rank=1); // perform a migration in the population. Individuals from sorted front or hall of fame will replace others by the // probability set in parameters. Expects a population without offspring @@ -85,7 +85,7 @@ class Population{ SameFitComplexity(Population& p): pop(p){} bool operator()(size_t i, size_t j) { - return pop[i].fitness == pop[j].fitness; + return pop[i].get_complexity() == pop[j].get_complexity(); } }; }; diff --git a/src/variation.cpp b/src/variation.cpp index 69feeb94..85f73cf6 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -608,15 +608,18 @@ template void Variation::vary(Population& pop, int island, const vector& parents) { + // TODO: fill indexes with nullptr, istead of using second half auto idxs = pop.get_island_indexes(island); - // assumes it should save new individuals in second half of the island - int start = idxs.size()/2; - // TODO: fix pragma omp usage //#pragma omp parallel for - for (unsigned i = start; i> opt=std::nullopt; // new individual diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 7eae4660..7c4b387a 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -1,7 +1,8 @@ #include "testsHeader.h" + #include "../../src/search_space.h" #include "../../src/program/program.h" -#include "../../src/program/dispatch_table.h" +// #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" #include "../../src/estimator.h" #include "../../src/estimator.cpp" @@ -38,6 +39,53 @@ TEST(Engine, EngineWorks) Parameters params; params.set_pop_size(10); params.set_gens(10); - Brush::RegressorEstimator est(params); - est.run(data); + params.set_mig_prob(0.0); + + std::cout << "n jobs = 1" << std::endl; + params.set_n_jobs(1); + Brush::RegressorEstimator est5(params); + est5.run(data); + + std::cout << "n jobs = 2" << std::endl; + params.set_n_jobs(2); + Brush::RegressorEstimator est2(params); + est2.run(data); + + std::cout << "n jobs = -1" << std::endl; + params.set_n_jobs(-1); + Brush::RegressorEstimator est3(params); + est3.run(data); + + std::cout << "n jobs = 0" << std::endl; + params.set_n_jobs(0); + Brush::RegressorEstimator est4(params); + est4.run(data); + + std::cout << "testing migration" << std::endl; + + params.set_pop_size(10); + params.set_gens(10); + params.set_mig_prob(0.5); + + std::cout << "n jobs = 1" << std::endl; + params.set_n_jobs(1); + Brush::RegressorEstimator est6(params); + est6.run(data); + + std::cout << "n jobs = 2" << std::endl; + params.set_n_jobs(2); + Brush::RegressorEstimator est7(params); + est7.run(data); + + std::cout << "n jobs = -1" << std::endl; + params.set_n_jobs(-1); + Brush::RegressorEstimator est8(params); + est8.run(data); + + std::cout << "n jobs = 0" << std::endl; + params.set_n_jobs(0); + Brush::RegressorEstimator est9(params); + est9.run(data); + + // TODO: test classifier and multiclassifier } \ No newline at end of file From 4ede2fdd2415a4f472fb079d54cbe661e8f89e9a Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 12 Mar 2024 08:47:57 -0300 Subject: [PATCH 130/199] Replaced creator.Individual with brush's individual --- environment.yml | 2 +- pybrush/DeapEstimator.py | 29 +++++++++++------------------ 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/environment.yml b/environment.yml index 34f9e7bb..1ee8a71f 100644 --- a/environment.yml +++ b/environment.yml @@ -2,7 +2,7 @@ name: brush channels: - conda-forge dependencies: - - python #=3.8.2 + - python # =3.8.2 - cmake #=3.18.* - eigen #=3.4.* - fmt diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 4ce8cc65..54d4b67f 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -168,17 +168,11 @@ def _setup_toolbox(self): """Setup the deap toolbox""" toolbox: base.Toolbox = base.Toolbox() - # creator.create is used to "create new functions", and takes at least - # 2 arguments: the name of the newly created class and a base class - - if hasattr(creator, "Individual"): - del creator.Individual - # create Individual class, inheriting from self.Individual with a fitness attribute if self.mode == 'classification': - creator.create("Individual", ClassifierIndividual - if self.n_classes_ == 2 else - MultiClassifierIndividual) + self.Individual = ( ClassifierIndividual + if self.n_classes_ == 2 else + MultiClassifierIndividual) self.eval_ = ( ClassifierEvaluator() if self.n_classes_ == 2 else MultiClassifierEvaluator() ) @@ -189,7 +183,7 @@ def _setup_toolbox(self): if self.n_classes_ == 2 else MultiClassifierSelector("nsga2", True) ) else: - creator.create("Individual", RegressorIndividual) + self.Individual = RegressorIndividual self.sel_ = RegressorSelector("lexicase", False) self.surv_ = RegressorSelector("nsga2", True) self.eval_ = RegressorEvaluator() @@ -210,7 +204,7 @@ def assign_fit(ind, validation=False): toolbox.register("assign_fit", assign_fit) - toolbox.register("Clone", lambda ind: creator.Individual(ind.program.copy())) + toolbox.register("Clone", lambda ind: self.Individual(ind.program.copy())) toolbox.register("mate", self.variator_.cross) toolbox.register("mutate", self.variator_.mutate) @@ -308,15 +302,14 @@ def fit(self, X, y): elif self.mode == "regressor": self.variator_ = _brush.RegressorVariator(self.parameters_, self.search_space_) - from pybrush import RegressorEngine - brush_estimator = RegressorEngine(self.parameters_) - brush_estimator.run(self.data_) - print(brush_estimator.is_fitted) - print(brush_estimator.best_ind) + # from pybrush import RegressorEngine + # brush_estimator = RegressorEngine(self.parameters_) + # brush_estimator.run(self.data_) + # print(brush_estimator.is_fitted) + # print(brush_estimator.best_ind) else: raise("Unsupported mode") - self.toolbox_ = self._setup_toolbox() # nsga2 and ga differ in the toolbox @@ -393,7 +386,7 @@ def _make_individual(self): # No arguments (or zero): brush will use PARAMS passed in set_params. # max_size is sampled between 1 and params['max_size'] if zero is provided - ind = creator.Individual() + ind = self.Individual() ind.init(self.search_space_, self.parameters_) ind.objectives = self.objectives From 0b2ab960eb8da6ae2210d809d4b9fe8f3739af0a Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 12 Mar 2024 08:48:47 -0300 Subject: [PATCH 131/199] Renamed cpp's estimator to engine --- src/bindings/bind_engines.cpp | 16 +++ .../{bind_estimators.h => bind_engines.h} | 18 +-- src/bindings/bind_estimators.cpp | 16 --- src/bindings/bind_individuals.h | 1 - src/bindings/module.cpp | 6 +- src/{estimator.cpp => engine.cpp} | 104 +++++++++--------- src/{estimator.h => engine.h} | 11 +- src/types.h | 12 +- tests/cpp/test_brush.cpp | 20 ++-- tests/cpp/testsHeader.h | 2 +- tests/python/test_deap_api.py | 20 ++++ 11 files changed, 123 insertions(+), 103 deletions(-) create mode 100644 src/bindings/bind_engines.cpp rename src/bindings/{bind_estimators.h => bind_engines.h} (80%) delete mode 100644 src/bindings/bind_estimators.cpp rename src/{estimator.cpp => engine.cpp} (76%) rename src/{estimator.h => engine.h} (94%) diff --git a/src/bindings/bind_engines.cpp b/src/bindings/bind_engines.cpp new file mode 100644 index 00000000..619b7bbf --- /dev/null +++ b/src/bindings/bind_engines.cpp @@ -0,0 +1,16 @@ +#include "module.h" +#include "bind_engines.h" + +namespace py = pybind11; +namespace br = Brush; +namespace nl = nlohmann; + +void bind_engines(py::module& m) +{ + bind_engine(m, "RegressorEngine"); + bind_engine(m, "ClassifierEngine"); + + // TODO: make these work + bind_engine(m, "MultiClassifierEngine"); + bind_engine(m, "RepresenterEngine"); +} \ No newline at end of file diff --git a/src/bindings/bind_estimators.h b/src/bindings/bind_engines.h similarity index 80% rename from src/bindings/bind_estimators.h rename to src/bindings/bind_engines.h index e16aa719..185767d4 100644 --- a/src/bindings/bind_estimators.h +++ b/src/bindings/bind_engines.h @@ -1,6 +1,6 @@ #include "module.h" -#include "../estimator.h" -#include "../estimator.cpp" +#include "../engine.h" +#include "../engine.cpp" // TODO: figure out why do I need to include the whole thing (otherwise it gives me symbol errors) #include "../selection/selection.h" @@ -18,10 +18,10 @@ #include "../population.cpp" #include "../population.h" -using Reg = Brush::RegressorEstimator; -using Cls = Brush::ClassifierEstimator; -using Rep = Brush::RepresenterEstimator; -using MCls = Brush::MulticlassClassifierEstimator; +using Reg = Brush::RegressorEngine; +using Cls = Brush::ClassifierEngine; +using Rep = Brush::RepresenterEngine; +using MCls = Brush::MulticlassClassifierEngine; namespace nl = nlohmann; namespace br = Brush; @@ -29,15 +29,15 @@ namespace br = Brush; using stream_redirect = py::call_guard; template -void bind_estimator(py::module& m, string name) +void bind_engine(py::module& m, string name) { using RetType = std::conditional_t< std::is_same_v, ArrayXf, std::conditional_t, ArrayXb, std::conditional_t, ArrayXi, ArrayXXf>>>; - py::class_ estimator(m, name.data() ); - estimator.def(py::init<>()) + py::class_ engine(m, name.data() ); + engine.def(py::init<>()) .def(py::init([](br::Parameters& p){ T e(p); return e; }) ) diff --git a/src/bindings/bind_estimators.cpp b/src/bindings/bind_estimators.cpp deleted file mode 100644 index 5f908c54..00000000 --- a/src/bindings/bind_estimators.cpp +++ /dev/null @@ -1,16 +0,0 @@ -#include "module.h" -#include "bind_estimators.h" - -namespace py = pybind11; -namespace br = Brush; -namespace nl = nlohmann; - -void bind_estimators(py::module& m) -{ - bind_estimator(m, "RegressorEngine"); - bind_estimator(m, "ClassifierEngine"); - - // TODO: make these work - bind_estimator(m, "MultiClassifierEngine"); - bind_estimator(m, "RepresenterEngine"); -} \ No newline at end of file diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index 72e411f4..11ec9238 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -26,7 +26,6 @@ void bind_individual(py::module& m, string name) .def_property("objectives", &Class::get_objectives, &Class::set_objectives) .def_property_readonly("program", &Class::get_program) // program cannot be changed by the user. Either create a new instance with the program as argument (so it will be a clone), or call init() (TODO: I should make init reset the attributes in the cpp end to avoid reseting the program but keeping the attributes) .def_property_readonly("fitness", &Class::get_fitness) // program cannot be changed by the user. Either create a new instance with the program as argument (so it will be a clone), or call init() (TODO: I should make init reset the attributes in the cpp end to avoid reseting the program but keeping the attributes) - // .def_property("fitness", &Class::get_fitness, &Class::set_fitness) // .def_property("complexity", &Class::get_complexity, &Class::set_complexity) .def(py::pickle( [](const Class &p) { // __getstate__ diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 5c3f1ca7..09f2f267 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -23,7 +23,7 @@ void bind_variations(py::module &); void bind_selections(py::module &); void bind_individuals(py::module &); void bind_populations(py::module &); -void bind_estimators(py::module &); +void bind_engines(py::module &); void bind_evaluators(py::module &); PYBIND11_MODULE(_brush, m) { @@ -54,6 +54,6 @@ PYBIND11_MODULE(_brush, m) { py::module_ m3 = m.def_submodule("individual", "Contains Individual classes."); bind_individuals(m3); - py::module_ m4 = m.def_submodule("engine", "Learning engines (used inside the python estimators)."); - bind_estimators(m4); + py::module_ m4 = m.def_submodule("engine", "Learning engines (used inside the python s)."); + bind_engines(m4); } diff --git a/src/estimator.cpp b/src/engine.cpp similarity index 76% rename from src/estimator.cpp rename to src/engine.cpp index 2c238181..41efd1f8 100644 --- a/src/estimator.cpp +++ b/src/engine.cpp @@ -1,4 +1,4 @@ -#include "estimator.h" +#include "engine.h" #include @@ -14,9 +14,9 @@ using namespace Var; /// @brief initialize Feat object for fitting. template -void Estimator::init() +void Engine::init() { - std::cout << "inside init" << std::endl; + // std::cout << "inside init" << std::endl; // TODO: initialize (set operator) for survivor and selector // initialize population with initial model and/or starting pop @@ -24,28 +24,28 @@ void Estimator::init() if (params.n_jobs!=0) // TODO: change this to set taskflow jobs omp_set_num_threads(params.n_jobs); - std::cout << "set number of threads" << std::endl; + // std::cout << "set number of threads" << std::endl; r.set_seed(params.random_state); - std::cout << "set random state" << std::endl; + // std::cout << "set random state" << std::endl; // set up the pop, variator, etc set_is_fitted(false); - std::cout << "is fitted is false" << std::endl; + // std::cout << "is fitted is false" << std::endl; this->pop = Population(); - std::cout << "created population" << std::endl; + //std::cout << "created population" << std::endl; this->evaluator = Evaluation(); - std::cout << "created evaluator" << std::endl; + //std::cout << "created evaluator" << std::endl; this->selector = Selection(params.sel, false); - std::cout << "created selector" << std::endl; + //std::cout << "created selector" << std::endl; this->survivor = Selection(params.surv, true); - std::cout << "created survivor" << std::endl; + //std::cout << "created survivor" << std::endl; //TODO ///return fraction of data to use for training @@ -91,9 +91,9 @@ void Estimator::init() } template // TODO: use the dataset, or ignore it -bool Estimator::update_best(const Dataset& data, bool val) +bool Engine::update_best(const Dataset& data, bool val) { - std::cout << "updating best" << std::endl; + //std::cout << "updating best" << std::endl; float bs; bs = this->best_loss; @@ -103,22 +103,22 @@ bool Estimator::update_best(const Dataset& data, bool val) bool updated = false; - std::cout << "inside loop" << std::endl; + //std::cout << "inside loop" << std::endl; vector hof = this->pop.hall_of_fame(1); - std::cout << "got hof" << std::endl; + //std::cout << "got hof" << std::endl; // will look only in the first half of the population (this is intended to be done after survival step) for (int i=0; i < hof.size(); ++i) { // TODO: i guess the right way of doing this is using island indexes (or just take the hall of fame) - std::cout << "index" << hof[i] << std::endl; + //std::cout << "index" << hof[i] << std::endl; const auto& ind = *pop.individuals.at(hof[i]); - std::cout << ind.program.get_model() << std::endl; + //std::cout << ind.program.get_model() << std::endl; - std::cout << "got individual of rank" << ind.fitness.rank << std::endl; + //std::cout << "got individual of rank" << ind.fitness.rank << std::endl; if (val) f = ind.fitness.loss_v; else @@ -128,7 +128,7 @@ bool Estimator::update_best(const Dataset& data, bool val) || (f == bs && ind.fitness.complexity < this->best_complexity) ) { - std::cout << "updated" << std::endl; + //std::cout << "updated" << std::endl; bs = f; this->best_ind = ind; @@ -145,26 +145,26 @@ bool Estimator::update_best(const Dataset& data, bool val) template -void Estimator::run(Dataset &data) +void Engine::run(Dataset &data) { // It is up to the python side to create the dataset (we have a cool wrapper for that) - std::cout << "starting to run" << std::endl; + //std::cout << "starting to run" << std::endl; //TODO: i need to make sure i initialize everything (pybind needs to have constructors // without arguments to work, and i need to handle correcting these values before running) this->ss = SearchSpace(data, params.functions); - std::cout << "search space was set" << std::endl; + //std::cout << "search space was set" << std::endl; this->init(); - std::cout << "estimator initialized" << std::endl; + //std::cout << "Engine initialized" << std::endl; pop.init(this->ss, this->params); - std::cout << "pop initialized with size " << params.pop_size << " and " << params.num_islands << "islands" << std::endl; - std::cout << pop.print_models() << std::endl; + //std::cout << "pop initialized with size " << params.pop_size << " and " << params.num_islands << "islands" << std::endl; + //std::cout << pop.print_models() << std::endl; evaluator.set_scorer(params.scorer_); - std::cout << "evaluator configured. starting to run " << std::endl; + //std::cout << "evaluator configured. starting to run " << std::endl; Dataset &batch = data; @@ -177,7 +177,7 @@ void Estimator::run(Dataset &data) threads = params.n_jobs; tf::Executor executor(threads); // TODO: executor could be an attribute (so I can move a lot of stuff here to init) - std::cout << "using n threads " << threads << std::endl; + //std::cout << "using n threads " << threads << std::endl; assert( (executor.num_workers() > 0) && "Invalid number of workers"); @@ -185,11 +185,11 @@ void Estimator::run(Dataset &data) // TODO: get references to all classes ( so they can be captured by taskflow) (like some private getters and setters) - std::cout << "stop criteria is ready " << std::endl; + //std::cout << "stop criteria is ready " << std::endl; // stop criteria unsigned generation = 0; auto stop = [&]() { - std::cout << "inside stop " << std::endl; + //std::cout << "inside stop " << std::endl; return generation == params.gens; // TODO: max stall, max time, etc }; @@ -198,7 +198,7 @@ void Estimator::run(Dataset &data) vector> island_parents; vector> survivors; - std::cout << "vectors are created " << std::endl; + //std::cout << "vectors are created " << std::endl; // TODO: progress bar? (it would be cool) // heavily inspired in https://github.com/heal-research/operon/blob/main/source/algorithms/nsga2.cpp auto [init, cond, body, back, done] = taskflow.emplace( @@ -207,10 +207,10 @@ void Estimator::run(Dataset &data) stop, // loop condition [&](tf::Subflow& subflow) { // loop body (evolutionary main loop) - std::cout << "inside body" << std::endl; + //std::cout << "inside body" << std::endl; auto prepare_gen = subflow.emplace([&]() { - std::cout << "inside prepare gen" << std::endl; - std::cout << " -------------------- generation " << generation << " -------------------- " << std::endl; + //std::cout << "inside prepare gen" << std::endl; + //std::cout << " -------------------- generation " << generation << " -------------------- " << std::endl; params.set_current_gen(generation); batch = data.get_batch(); // will return the original dataset if it is set to dont use batch @@ -238,7 +238,7 @@ void Estimator::run(Dataset &data) }).name("prepare generation");// set generation in params, get batch auto select_parents = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { - std::cout << "inside select parents" << std::endl; + //std::cout << "inside select parents" << std::endl; evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk) @@ -249,7 +249,7 @@ void Estimator::run(Dataset &data) vector parents = selector.select(this->pop, island, params); for (int i=0; i< parents.size(); i++){ - std::cout << i << std::endl; + //std::cout << i << std::endl; island_parents.at(island).at(i) = parents.at(i); } }).name("select parents for each island"); @@ -260,14 +260,14 @@ void Estimator::run(Dataset &data) auto generate_offspring = subflow.emplace([&]() { for (int island=0; island < params.num_islands; island++){ - std::cout << "inside generate offspring" << std::endl; + //std::cout << "inside generate offspring" << std::endl; this->pop.add_offspring_indexes(island); // we just need to add them, not remove (they are removed in survival step, that will return a selection with the same number of individuals as the original island size) - std::cout << "before vary" << std::endl; + //std::cout << "before vary" << std::endl; // // variation to produce offspring variator.vary(this->pop, island, island_parents.at(island)); - std::cout << "before update fitness" << std::endl; + //std::cout << "before update fitness" << std::endl; } }).name("generate offspring for each island"); @@ -275,38 +275,38 @@ void Estimator::run(Dataset &data) evaluator.update_fitness(this->pop, island, data, params, true); // evaluator.validation(*this->pop, island_range, data, params); - std::cout << "before batch update" << std::endl; + //std::cout << "before batch update" << std::endl; if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) evaluator.update_fitness(this->pop, island, batch, params, false); - std::cout << "before survive" << std::endl; + //std::cout << "before survive" << std::endl; // select survivors from combined pool of parents and offspring vector island_survivors = survivor.survive(this->pop, island, params); - std::cout << "before assign to survivors array" << std::endl; + //std::cout << "before assign to survivors array" << std::endl; for (int i=0; i< island_survivors.size(); i++){ - std::cout << i << std::endl; + //std::cout << i << std::endl; survivors.at(island).at(i) = island_survivors.at(i); } }).name("evaluate offspring and select survivors"); auto update_pop = subflow.emplace([&]() { - std::cout << "before updating survivors" << std::endl; - std::cout << pop.print_models() << std::endl; + //std::cout << "before updating survivors" << std::endl; + //std::cout << pop.print_models() << std::endl; this->pop.update(survivors); - std::cout << "after updating survivors" << std::endl; - std::cout << pop.print_models() << std::endl; + //std::cout << "after updating survivors" << std::endl; + //std::cout << pop.print_models() << std::endl; }).name("update population and detangle indexes"); auto migration = subflow.emplace([&]() { - std::cout << "before migrating" << std::endl; - std::cout << pop.print_models() << std::endl; + //std::cout << "before migrating" << std::endl; + //std::cout << pop.print_models() << std::endl; this->pop.migrate(); - std::cout << "after migrating" << std::endl; - std::cout << pop.print_models() << std::endl; + //std::cout << "after migrating" << std::endl; + //std::cout << pop.print_models() << std::endl; }).name("migration between islands"); // TODO: update best, update log, increment generation counter (but not set in params) @@ -338,12 +338,12 @@ void Estimator::run(Dataset &data) body.precede(back); back.precede(cond); - std::cout << "taskflow configured " << std::endl; + //std::cout << "taskflow configured " << std::endl; executor.run(taskflow); - std::cout << "submitted jobs " << std::endl; + //std::cout << "submitted jobs " << std::endl; executor.wait_for_all(); - std::cout << "finished " << std::endl; + //std::cout << "finished " << std::endl; } } \ No newline at end of file diff --git a/src/estimator.h b/src/engine.h similarity index 94% rename from src/estimator.h rename to src/engine.h index 7a54ac05..bbdef45f 100644 --- a/src/estimator.h +++ b/src/engine.h @@ -3,8 +3,8 @@ copyright 2020 William La Cava license: GNU/GPL v3 */ -#ifndef Estimator_H -#define Estimator_H +#ifndef Engine_H +#define Engine_H #include "./util/rnd.h" #include "init.h" @@ -17,6 +17,7 @@ license: GNU/GPL v3 #include +// TODO: rename it to engine namespace Brush { @@ -26,15 +27,15 @@ using namespace Eval; using namespace Var; template -class Estimator{ +class Engine{ public: - Estimator(const Parameters& p=Parameters()) + Engine(const Parameters& p=Parameters()) : params(p) , ss(SearchSpace()) // we need to initialize ss and variator. TODO: make them have a default way so we dont have to initialize here , variator(Variation(params, ss)) {}; - ~Estimator(){}; + ~Engine(){}; // all hyperparameters are controlled by the parameter class. please refer to that to change something inline Parameters& get_params(){return params;} diff --git a/src/types.h b/src/types.h index 4247ddda..a4415389 100644 --- a/src/types.h +++ b/src/types.h @@ -91,13 +91,13 @@ typedef Pop::Individual MulticlassClassifierIndividual typedef Pop::Individual RepresenterIndividual; //////////////////////////////////////////////////////////////////////////////// -// Estimator +// Engine using PT = ProgramType; -template class Estimator; -typedef Estimator RegressorEstimator; -typedef Estimator ClassifierEstimator; -typedef Estimator MulticlassClassifierEstimator; -typedef Estimator RepresenterEstimator; +template class Engine; +typedef Engine RegressorEngine; +typedef Engine ClassifierEngine; +typedef Engine MulticlassClassifierEngine; +typedef Engine RepresenterEngine; //////////////////////////////////////////////////////////////////////////////// // Data diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 7c4b387a..8897c08c 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -4,8 +4,8 @@ #include "../../src/program/program.h" // #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" -#include "../../src/estimator.h" -#include "../../src/estimator.cpp" +#include "../../src/engine.h" +#include "../../src/engine.cpp" #include "../../src/selection/selection.h" #include "../../src/selection/selection_operator.h" #include "../../src/selection/nsga2.h" @@ -43,22 +43,22 @@ TEST(Engine, EngineWorks) std::cout << "n jobs = 1" << std::endl; params.set_n_jobs(1); - Brush::RegressorEstimator est5(params); + Brush::RegressorEngine est5(params); est5.run(data); std::cout << "n jobs = 2" << std::endl; params.set_n_jobs(2); - Brush::RegressorEstimator est2(params); + Brush::RegressorEngine est2(params); est2.run(data); std::cout << "n jobs = -1" << std::endl; params.set_n_jobs(-1); - Brush::RegressorEstimator est3(params); + Brush::RegressorEngine est3(params); est3.run(data); std::cout << "n jobs = 0" << std::endl; params.set_n_jobs(0); - Brush::RegressorEstimator est4(params); + Brush::RegressorEngine est4(params); est4.run(data); std::cout << "testing migration" << std::endl; @@ -69,22 +69,22 @@ TEST(Engine, EngineWorks) std::cout << "n jobs = 1" << std::endl; params.set_n_jobs(1); - Brush::RegressorEstimator est6(params); + Brush::RegressorEngine est6(params); est6.run(data); std::cout << "n jobs = 2" << std::endl; params.set_n_jobs(2); - Brush::RegressorEstimator est7(params); + Brush::RegressorEngine est7(params); est7.run(data); std::cout << "n jobs = -1" << std::endl; params.set_n_jobs(-1); - Brush::RegressorEstimator est8(params); + Brush::RegressorEngine est8(params); est8.run(data); std::cout << "n jobs = 0" << std::endl; params.set_n_jobs(0); - Brush::RegressorEstimator est9(params); + Brush::RegressorEngine est9(params); est9.run(data); // TODO: test classifier and multiclassifier diff --git a/tests/cpp/testsHeader.h b/tests/cpp/testsHeader.h index 4559c0a8..10016765 100644 --- a/tests/cpp/testsHeader.h +++ b/tests/cpp/testsHeader.h @@ -42,7 +42,7 @@ using std::stof; #include "../../src/eval/evaluation.h" #include "../../src/eval/metrics.h" #include "../../src/eval/scorer.h" -#include "../../src/estimator.h" +#include "../../src/engine.h" #include "../../src/variation.cpp" // TODO: is this ok? (otherwise I would have to create a test separated file, or move the implementation to the header) using namespace Brush; diff --git a/tests/python/test_deap_api.py b/tests/python/test_deap_api.py index 0aaa8f2f..3d44708c 100644 --- a/tests/python/test_deap_api.py +++ b/tests/python/test_deap_api.py @@ -87,6 +87,26 @@ def test_predict_proba(setup, brush_args, request): assert y_prob.shape[1] >= 2, \ "every class should have its own column (even for binary clf)" + +# @pytest.mark.parametrize('setup', +# [('regression_setup')]) +# def test_brush_engine(setup, brush_args, request): + +# Estimator, X, y = request.getfixturevalue(setup) + +# dataset = pybrush.Dataset(X=X, y=y) + +# # TODO: pybrush parameters could have named arguments +# params = pybrush.Parameters() +# params.pop_size = 10 +# params.gens = 10 +# params.num_islands = 1 + +# eng = pybrush.RegressorEngine(params) +# # eng.run(dataset) + + + @pytest.mark.parametrize('setup,fixed_node', [ ('classification_setup', 'Logistic'), # ('multiclass_classification_setup', 'Softmax') From b0b27fd9f12438afa0115a8785e1dcdbf4c3dd49 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 12 Mar 2024 08:49:04 -0300 Subject: [PATCH 132/199] fixed population and selection implementations --- src/population.cpp | 26 +++++++------- src/selection/lexicase.cpp | 20 +++++------ src/selection/nsga2.cpp | 74 +++++++++++++++++++------------------- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/src/population.cpp b/src/population.cpp index cf0c04d8..ded1f1be 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -274,17 +274,17 @@ void Population::migrate() return; // skipping. this only work because update is fixing island indexes // we cant use more than half of population here - std::cout << "finding island sorted fronts" << std::endl; + // std::cout << "finding island sorted fronts" << std::endl; auto island_fronts = sorted_front(1); - std::cout << "finding global hall of fame" << std::endl; + // std::cout << "finding global hall of fame" << std::endl; auto global_hall_of_fame = hall_of_fame(1); // This method is not thread safe (as it is now) vector> new_island_indexes; new_island_indexes.resize(num_islands); - std::cout << "Looping" << std::endl; + // std::cout << "Looping" << std::endl; for (int island=0; island::migrate() { if (r() < mig_prob) { - std::cout << "migrating in island " << island << std::endl; + // std::cout << "migrating in island " << island << std::endl; size_t migrating_idx; // determine if incoming individual comes from global or local hall of fame if (r() < 0.5) { // from global hall of fame - std::cout << "from hall of fame" << std::endl; + // std::cout << "from hall of fame" << std::endl; migrating_idx = *r.select_randomly( global_hall_of_fame.begin(), global_hall_of_fame.end()); - std::cout << "mig idx" << migrating_idx << std::endl; + // std::cout << "mig idx" << migrating_idx << std::endl; } else { // from any other local hall of fame - std::cout << "from other island" << std::endl; + // std::cout << "from other island" << std::endl; // finding other island indexes vector other_islands(num_islands-1); iota(other_islands.begin(), other_islands.end(), 0); @@ -327,11 +327,11 @@ void Population::migrate() migrating_idx = *r.select_randomly( island_fronts.at(other_island).begin(), island_fronts.at(other_island).end()); - std::cout << "mig idx" << migrating_idx << std::endl; + // std::cout << "mig idx" << migrating_idx << std::endl; } - std::cout << "index " << i << " of island " << island; - std::cout << " is now" << migrating_idx << std::endl; + // std::cout << "index " << i << " of island " << island; + // std::cout << " is now" << migrating_idx << std::endl; new_island_indexes.at(island).push_back(migrating_idx); } @@ -345,7 +345,7 @@ void Population::migrate() // this is particularly important to avoid multiple threads assigning different rank/crowdist/dcounter // or different fitness) - std::cout << "starting to consolidate pop" << std::endl; + // std::cout << "starting to consolidate pop" << std::endl; vector> new_pop; new_pop.resize(0); for (int j=0; j::migrate() iota(island_indexes.at(j).begin(), island_indexes.at(j).end(), idx_start); } - std::cout << "finished making copies" << std::endl; + // std::cout << "finished making copies" << std::endl; assert(new_pop.size() == pop_size && " migration ended up with a different popsize"); - std::cout << "filling individuals" << std::endl; + // std::cout << "filling individuals" << std::endl; this->individuals.resize(0); for (auto ind : new_pop) { diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index 831ae730..46b8b420 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -18,7 +18,7 @@ template vector Lexicase::select(Population& pop, int island, const Parameters& params) { - cout << "select lexicase island " << island << endl; + // cout << "select lexicase island " << island << endl; // this one can be executed in parallel because it is just reading the errors. This // method assumes that the expressions have been fitted previously, and their respective @@ -26,7 +26,7 @@ vector Lexicase::select(Population& pop, int island, auto island_pool = pop.get_island_indexes(island); - cout << "got indexes " << endl; + // cout << "got indexes " << endl; // if this is first generation, just return indices to pop if (params.current_gen==0) @@ -38,8 +38,8 @@ vector Lexicase::select(Population& pop, int island, //< number of individuals unsigned int P = island_pool.size(); - cout << "pool size is " << P << endl; - cout << "epsilon size is " << N << endl; + // cout << "pool size is " << P << endl; + // cout << "epsilon size is " << N << endl; // define epsilon ArrayXf epsilon = ArrayXf::Zero(N); @@ -48,7 +48,7 @@ vector Lexicase::select(Population& pop, int island, if (!params.classification || params.scorer_.compare("log")==0 || params.scorer_.compare("multi_log")==0) { - cout << "using lexicase for regression " << endl; + // cout << "using lexicase for regression " << endl; // for each sample, calculate epsilon for (int i = 0; i Lexicase::select(Population& pop, int island, // #pragma omp parallel for for (unsigned int i = 0; i cases; // cases (samples) if (params.classification && !params.class_weights.empty()) { - cout << "using WEIGHTED for classification " << endl; + // cout << "using WEIGHTED for classification " << endl; // for classification problems, weight case selection // by class weights @@ -162,13 +162,13 @@ vector Lexicase::select(Population& pop, int island, selected.at(i) = *r.select_randomly( winner.begin(), winner.end() ); - cout << "parallel end index " + to_string(i) << endl; + // cout << "parallel end index " + to_string(i) << endl; } if (selected.size() != island_pool.size()) { - std::cout << "selected: " ; - for (auto s: selected) std::cout << s << " "; std::cout << "\n"; + // std::cout << "selected: " ; + // for (auto s: selected) std::cout << s << " "; std::cout << "\n"; HANDLE_ERROR_THROW("Lexicase did not select correct number of \ parents"); } diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 1214b523..f8c286eb 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -39,7 +39,7 @@ template vector NSGA2::select(Population& pop, int island, const Parameters& params) { - cout << "select nsga island" << island << endl; + // cout << "select nsga island" << island << endl; // tournament selection. TODO: move this to tournament selection file, and throw not implemented error in nsga. auto island_pool = pop.get_island_indexes(island); @@ -76,20 +76,20 @@ vector NSGA2::survive(Population& pop, int island, const Parameters& params) { - fmt::print("starting survive\n"); + // fmt::print("starting survive\n"); - cout << "survive nsga island " << island << endl; + // cout << "survive nsga island " << island << endl; size_t idx_start = std::floor(island*params.pop_size/params.num_islands); size_t idx_end = std::floor((island+1)*params.pop_size/params.num_islands); auto original_size = idx_end - idx_start; // original island size (survive must be called with an island with offfspring) - fmt::print("original size {}\n", original_size); + // fmt::print("original size {}\n", original_size); auto island_pool = pop.get_island_indexes(island); - fmt::print("island size {}\n", island_pool.size()); + // fmt::print("island size {}\n", island_pool.size()); // set objectives (this is when the obj vector is updated.) @@ -98,53 +98,53 @@ vector NSGA2::survive(Population& pop, int island, // pop.individuals.at(island_pool[i])->set_obj(params.objectives); // fast non-dominated sort - fmt::print("fast nds for island {}\n", island); + // fmt::print("fast nds for island {}\n", island); auto front = fast_nds(pop, island_pool); - fmt::print("selecting...\n"); + // fmt::print("selecting...\n"); // Push back selected individuals until full vector selected; - fmt::print("created array...\n"); + // fmt::print("created array...\n"); selected.resize(0); - fmt::print("resized...\n"); + // fmt::print("resized...\n"); int i = 0; - fmt::print("starting loop...\n"); - fmt::print("selected size {}...\n",selected.size()); - fmt::print("first front size {}...\n", front.at(i).size()); - fmt::print("goal is to select n individuals: {}...\n", original_size); + // fmt::print("starting loop...\n"); + // fmt::print("selected size {}...\n",selected.size()); + // fmt::print("first front size {}...\n", front.at(i).size()); + // fmt::print("goal is to select n individuals: {}...\n", original_size); while ( i < front.size() && ( selected.size() + front.at(i).size() < original_size ) ) { - fmt::print("1...\n"); + // fmt::print("1...\n"); std::vector& Fi = front.at(i); // indices in front i - fmt::print("2...\n"); + // fmt::print("2...\n"); crowding_distance(pop, front, i); // calculate crowding in Fi - fmt::print("3...\n"); + // fmt::print("3...\n"); for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi selected.push_back(Fi.at(j)); - fmt::print("4...\n"); + // fmt::print("4...\n"); ++i; } - fmt::print("crowding distance\n"); + // fmt::print("crowding distance\n"); crowding_distance(pop, front, i); // calculate crowding in final front to include std::sort(front.at(i).begin(),front.at(i).end(),sort_n(pop)); - fmt::print("adding last front)\n"); + // fmt::print("adding last front)\n"); const int extra = original_size - selected.size(); for (int j = 0; j < extra; ++j) // Pt+1 = Pt+1 U Fi[1:N-|Pt+1|] selected.push_back(front.at(i).at(j)); - fmt::print("returning\n"); + // fmt::print("returning\n"); return selected; } @@ -154,7 +154,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan // this will update pareto dominance attributes in fitness class // based on the population - fmt::print("inside fast nds with island pool of size {} from pop of size {} and\n", island_pool.size(), pop.size()); + // fmt::print("inside fast nds with island pool of size {} from pop of size {} and\n", island_pool.size(), pop.size()); //< the Pareto fronts vector> front; @@ -193,17 +193,17 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan p->fitness.dominated = dom; // dom will have values already referring to island indexes if (p->fitness.dcounter == 0) { - fmt::print("pushing {}...\n", island_pool[i]); + // fmt::print("pushing {}...\n", island_pool[i]); p->fitness.set_rank(1); // front will have values already referring to island indexes front.at(0).push_back(island_pool[i]); } - fmt::print("... index {} dominates {} ({}) and was dominated by {} ({})\n", island_pool[i], dom.size(), p->fitness.get_dominated().size(), dcount, p->fitness.get_dcounter()); + // fmt::print("... index {} dominates {} ({}) and was dominated by {} ({})\n", island_pool[i], dom.size(), p->fitness.get_dominated().size(), dcount, p->fitness.get_dcounter()); } } - fmt::print("First front size {}...\n", front.at(0).size()); + // fmt::print("First front size {}...\n", front.at(0).size()); // using OpenMP can have different orders in the front.at(0) // so let's sort it so that the algorithm is deterministic @@ -212,7 +212,7 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan int fi = 1; while (front.at(fi-1).size() > 0) { - fmt::print("starting front {} with size {} \n", fi, front.at(fi-1).size()); + // fmt::print("starting front {} with size {} \n", fi, front.at(fi-1).size()); std::vector& fronti = front.at(fi-1); std::vector Q; @@ -220,19 +220,19 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan const Individual& p = pop[fronti.at(i)]; - fmt::print("ind {} dominated {} \n", fronti.at(i), p.fitness.dominated.size()); + // fmt::print("ind {} dominated {} \n", fronti.at(i), p.fitness.dominated.size()); // iterating over dominated individuals for (int j = 0; j < p.fitness.dominated.size() ; ++j) { - fmt::print("decreased counter of ind {} for {} to {} \n", j, p.fitness.dominated.at(j), pop.individuals.at(p.fitness.dominated.at(j))->fitness.dcounter); + // fmt::print("decreased counter of ind {} for {} to {} \n", j, p.fitness.dominated.at(j), pop.individuals.at(p.fitness.dominated.at(j))->fitness.dcounter); auto q = pop.individuals.at(p.fitness.dominated.at(j)); - fmt::print("decreased counter \n"); + // fmt::print("decreased counter \n"); q->fitness.dcounter -= 1; if (q->fitness.dcounter == 0) { - fmt::print("updated counter for ind {} \n", j); + // fmt::print("updated counter for ind {} \n", j); q->fitness.set_rank(fi+1); Q.push_back(p.fitness.dominated.at(j)); @@ -241,12 +241,12 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan } front.push_back(Q); - fmt::print("front {} ended with size {}...\n", fi, Q.size()); + // fmt::print("front {} ended with size {}...\n", fi, Q.size()); fi += 1; } - fmt::print("finished\n"); + // fmt::print("finished\n"); return front; } @@ -255,27 +255,27 @@ template void NSGA2::crowding_distance(Population& pop, vector>& front, int fronti) { - fmt::print("inside crowding distance for front {}...\n", fronti); + // fmt::print("inside crowding distance for front {}...\n", fronti); std::vector F = front.at(fronti); if (F.size() == 0 ){ - fmt::print("empty front\n"); + // fmt::print("empty front\n"); return; } const int fsize = F.size(); - fmt::print("front size is {}...\n", fsize); + // fmt::print("front size is {}...\n", fsize); for (int i = 0; i < fsize; ++i) pop.individuals.at(F.at(i))->fitness.crowding_dist = 0; - fmt::print("reseted crowding distance for individuals in this front\n"); + // fmt::print("reseted crowding distance for individuals in this front\n"); const int limit = pop.individuals.at(0)->fitness.get_wvalues().size(); - fmt::print("limit is {}\n", limit); + // fmt::print("limit is {}\n", limit); for (int m = 0; m < limit; ++m) { - fmt::print("m {}\n", m); + // fmt::print("m {}\n", m); std::sort(F.begin(), F.end(), comparator_obj(pop,m)); From e7259c6974d50f4e735a60810a47ddec053b08e8 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 12 Mar 2024 19:18:15 -0300 Subject: [PATCH 133/199] trying to solve GIL problem --- src/bindings/bind_engines.h | 2 +- src/engine.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index 185767d4..c166c058 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -44,7 +44,7 @@ void bind_engine(py::module& m, string name) .def_property("params", &T::get_params, &T::set_params) .def_property_readonly("is_fitted", &T::get_is_fitted) .def_property_readonly("best_ind", &T::get_best_ind) - .def("run", &T::run, "run from brush dataset") + .def("run", &T::run, py::call_guard(), "run from brush dataset") ; // specialization for subclasses diff --git a/src/engine.cpp b/src/engine.cpp index 41efd1f8..e8d535aa 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -345,5 +345,11 @@ void Engine::run(Dataset &data) executor.wait_for_all(); //std::cout << "finished " << std::endl; + + //When you have tasks that are created at runtime (e.g., subflow, + // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph. + + //std::cout << "dumping taskflow in json " << std::endl; + taskflow.dump(std::cout); } } \ No newline at end of file From f0e3d4313026ddd0814a4bbc60f7cd101fb548c3 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 13 Mar 2024 09:33:44 -0300 Subject: [PATCH 134/199] Fixed variator not being initialize --- src/engine.cpp | 6 ++++++ src/variation.h | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index e8d535aa..ca252610 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -41,6 +41,10 @@ void Engine::init() this->evaluator = Evaluation(); //std::cout << "created evaluator" << std::endl; + // TOD: make these classes have a default constructor, and stop recreating instances + this->variator.init(params, ss); + //std::cout << "initialized variator" << std::endl; + this->selector = Selection(params.sel, false); //std::cout << "created selector" << std::endl; @@ -241,6 +245,8 @@ void Engine::run(Dataset &data) //std::cout << "inside select parents" << std::endl; evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data + // TODO: individuals should have a flag is_fitted so we avoid re-fitting them + // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk) // TODO: if using batch, fitness should be called before selection to set the batch if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) diff --git a/src/variation.h b/src/variation.h index 71e727c3..13866568 100644 --- a/src/variation.h +++ b/src/variation.h @@ -103,8 +103,8 @@ template class Variation { private: - SearchSpace& search_space; - Parameters& parameters; + SearchSpace search_space; + Parameters parameters; public: Variation() = default; @@ -116,8 +116,8 @@ class Variation ~Variation() {}; void init(Parameters& params, SearchSpace& ss){ - parameters = params; - search_space = ss; + this->parameters = params; + this->search_space = ss; }; // individual-level variations From 2474025fc6f168248ec0513215e660e448d6695a Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 13 Mar 2024 13:07:58 -0300 Subject: [PATCH 135/199] Cleaning up some TODOs --- pybrush/DeapEstimator.py | 48 ++++++++++++++---------------- pybrush/__init__.py | 6 ++++ src/bindings/bind_individuals.cpp | 1 - src/bindings/bind_individuals.h | 4 +-- src/bindings/bind_params.cpp | 12 +------- src/bindings/bind_population.cpp | 21 ------------- src/bindings/bind_population.h | 20 ------------- src/bindings/module.cpp | 2 -- src/engine.cpp | 25 ---------------- src/engine.h | 3 +- src/eval/evaluation.cpp | 1 - src/eval/evaluation.h | 5 +--- src/eval/metrics.h | 2 -- src/individual.h | 24 +++------------ src/params.cpp | 5 ---- src/params.h | 10 ++----- src/population.h | 1 - src/program/program.h | 33 -------------------- src/selection/nsga2.cpp | 1 - src/selection/selection_operator.h | 2 -- src/variation.cpp | 2 -- tests/cpp/test_variation.cpp | 14 ++++----- 22 files changed, 47 insertions(+), 195 deletions(-) delete mode 100644 src/bindings/bind_population.cpp delete mode 100644 src/bindings/bind_population.h diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 54d4b67f..411dffe3 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -14,13 +14,15 @@ # from tqdm import tqdm from sklearn.metrics import average_precision_score from sklearn.preprocessing import MinMaxScaler -import _brush # TODO: stop using _brush and use whats in pybrush import functools from pybrush.deap_api import nsga2 -# from _brush import Dataset, SearchSpace from pybrush import RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator from pybrush import RegressorSelector, ClassifierSelector, MultiClassifierSelector +from pybrush import RegressorVariator, ClassifierVariator, MultiClassifierVariator +from pybrush import Parameters, Dataset, SearchSpace + +from pybrush brush_rng # TODO: LOGGER AND ARCHIVE @@ -103,15 +105,15 @@ class DeapEstimator(BaseEstimator): Attributes ---------- - best_estimator_ : _brush.Program + best_estimator_ : pybrush.Program The final model picked from training. Used in subsequent calls to :func:`predict`. archive_ : list[deap_api.DeapIndividual] The final population from training. - data_ : _brush.Dataset + data_ : pybrush.Dataset The complete data in Brush format. - train_ : _brush.Dataset + train_ : pybrush.Dataset Partition of `data_` containing `(1-validation_size)`% of the data, in Brush format. - validation_ : _brush.Dataset + validation_ : pybrush.Dataset Partition of `data_` containing `(validation_size)`% of the data, in Brush format. search_space_ : a Brush `SearchSpace` object. Holds the operators and terminals and sampling utilities to update programs. @@ -242,9 +244,6 @@ def fit(self, X, y): 1-d array of (boolean) target values. """ - if self.random_state is not None: - _brush.set_random_state(self.random_state) - self.feature_names_ = [] if isinstance(X, pd.DataFrame): self.feature_names_ = X.columns.to_list() @@ -277,9 +276,9 @@ def fit(self, X, y): self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation self.validation_ = self.data_.get_validation_data() - self.search_space_ = _brush.SearchSpace(self.train_, self.functions_, self.weights_init) + self.search_space_ = SearchSpace(self.train_, self.functions_, self.weights_init) - self.parameters_ = _brush.Parameters() + self.parameters_ = Parameters() self.parameters_.classification = self.mode == "classification" self.parameters_.n_classes = self.n_classes_ self.parameters_.n_jobs = self.n_jobs @@ -294,13 +293,16 @@ def fit(self, X, y): self.parameters_.functions = self.functions self.parameters_.mutation_probs = self.mutation_probs + if self.random_state is not None: + self.parameters_.random_state = self.random_state + if self.mode == "classification": - self.variator_ = (_brush.ClassifierVariator + self.variator_ = (ClassifierVariator if self.n_classes_ == 2 else - _brush.MultiClassifierVariator + MultiClassifierVariator )(self.parameters_, self.search_space_) elif self.mode == "regressor": - self.variator_ = _brush.RegressorVariator(self.parameters_, self.search_space_) + self.variator_ = RegressorVariator(self.parameters_, self.search_space_) # from pybrush import RegressorEngine # brush_estimator = RegressorEngine(self.parameters_) @@ -315,7 +317,7 @@ def fit(self, X, y): # nsga2 and ga differ in the toolbox self.archive_, self.logbook_ = nsga2( self.toolbox_, self.gens, self.pop_size, self.cx_prob, - (0.0(m, "Fitness", py::dynamic_attr()) .def(py::init<>()) .def(py::init&>(), "Constructor with weights") diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index 11ec9238..6074ca6f 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -24,8 +24,8 @@ void bind_individual(py::module& m, string name) ) .def("init", &Class::init) .def_property("objectives", &Class::get_objectives, &Class::set_objectives) - .def_property_readonly("program", &Class::get_program) // program cannot be changed by the user. Either create a new instance with the program as argument (so it will be a clone), or call init() (TODO: I should make init reset the attributes in the cpp end to avoid reseting the program but keeping the attributes) - .def_property_readonly("fitness", &Class::get_fitness) // program cannot be changed by the user. Either create a new instance with the program as argument (so it will be a clone), or call init() (TODO: I should make init reset the attributes in the cpp end to avoid reseting the program but keeping the attributes) + .def_property_readonly("program", &Class::get_program) + .def_property_readonly("fitness", &Class::get_fitness) // .def_property("complexity", &Class::get_complexity, &Class::set_complexity) .def(py::pickle( [](const Class &p) { // __getstate__ diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 39a4ffa6..df0d2640 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -6,23 +6,13 @@ namespace br = Brush; void bind_params(py::module& m) { - // py::object params = Brush::PARAMS; - // m.attr("PARAMS") = params; - - // py::class_(m, "Params", py::dynamic_attr()) - // .def(py::init<>()) - - m.def("set_params", &Brush::set_params); // TODO: delete this. use parameters class - - m.def("get_params", &br::get_params); m.def("set_random_state", [](unsigned int seed) { br::Util::r = *br::Util::Rnd::initRand(); br::Util::r.set_seed(seed); }); m.def("rnd_flt", [](){ return br::Util::r.rnd_flt(); }); py::class_(m, "Parameters") - .def(py::init([]() - { Brush::Parameters p; return p; })) + .def(py::init([](){ Brush::Parameters p; return p; })) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) diff --git a/src/bindings/bind_population.cpp b/src/bindings/bind_population.cpp deleted file mode 100644 index ccdd7203..00000000 --- a/src/bindings/bind_population.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "module.h" -#include "bind_population.h" - -namespace py = pybind11; -namespace br = Brush; -namespace nl = nlohmann; - -// using Reg = br::Program; -// using Cls = br::Program; -// using Rep = br::Program; -// using MCls = br::Program; - -void bind_populations(py::module& m) -{ - // TODO: make them a single class - bind_population(m, "RegressorPopulation"); - bind_population(m, "ClassifierPopulation"); - - bind_population(m, "MultiClassifierPopulation"); - // bind_population(m, "RepresenterPopulation"); -} \ No newline at end of file diff --git a/src/bindings/bind_population.h b/src/bindings/bind_population.h deleted file mode 100644 index 53fbcd6a..00000000 --- a/src/bindings/bind_population.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "module.h" -#include "../population.h" -#include "../population.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) - -namespace py = pybind11; -namespace nl = nlohmann; -namespace br = Brush; - -template -void bind_population(py::module& m, string name) -{ - using Class = br::Pop::Population; - - // TODO: make population a non-templated class - py::class_ pop(m, name.data() ); - - // TODO: access individuals by index - pop.def(py::init<>()) - ; -} \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 09f2f267..b4154c6e 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -22,7 +22,6 @@ void bind_programs(py::module &); void bind_variations(py::module &); void bind_selections(py::module &); void bind_individuals(py::module &); -void bind_populations(py::module &); void bind_engines(py::module &); void bind_evaluators(py::module &); @@ -45,7 +44,6 @@ PYBIND11_MODULE(_brush, m) { bind_variations(m); bind_selections(m); bind_evaluators(m); - // bind_populations(m); // solutions py::module_ m2 = m.def_submodule("program", "Contains Program classes."); diff --git a/src/engine.cpp b/src/engine.cpp index ca252610..05259208 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -51,24 +51,6 @@ void Engine::init() this->survivor = Selection(params.surv, true); //std::cout << "created survivor" << std::endl; - //TODO - ///return fraction of data to use for training - // float get_split(); - // /// set train fraction of dataset - // void set_split(float sp); - - // TODO - // int get_batch_size(){return params.bp.batch_size;}; - // void set_batch_size(int bs); - - // TODO - ///set number of threads (and use them in taskflow) - // void set_n_jobs(unsigned t); - // int get_n_jobs(){return omp_get_num_threads();}; - - ///set flag to use batch for training - // void set_use_batch(); - // TODO getters and setters for the best solution found after evolution // predict, transform, predict_proba, etc. // get statistics @@ -87,11 +69,6 @@ void Engine::init() // // reset statistics // this->stats = Log_Stats(); - - // params.use_batch = params.bp.batch_size>0; - - // TODO: initialize dataset and search space here or inside fit? - } template // TODO: use the dataset, or ignore it @@ -113,10 +90,8 @@ bool Engine::update_best(const Dataset& data, bool val) //std::cout << "got hof" << std::endl; - // will look only in the first half of the population (this is intended to be done after survival step) for (int i=0; i < hof.size(); ++i) { - // TODO: i guess the right way of doing this is using island indexes (or just take the hall of fame) //std::cout << "index" << hof[i] << std::endl; const auto& ind = *pop.individuals.at(hof[i]); diff --git a/src/engine.h b/src/engine.h index bbdef45f..23c8c26b 100644 --- a/src/engine.h +++ b/src/engine.h @@ -17,7 +17,6 @@ license: GNU/GPL v3 #include -// TODO: rename it to engine namespace Brush { @@ -51,7 +50,7 @@ class Engine{ int best_complexity; Individual& get_best_ind(){return best_ind;}; - /// train a model. TODO: take arguments needed to build the dataset. once we have it, go through params to set global options and use them + /// train the model void run(Dataset &d); Parameters params; ///< hyperparameters of brush, which the user can interact diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index 62a48414..6ec61e9a 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -4,7 +4,6 @@ namespace Brush{ namespace Eval{ -// TODO: merge validation and update fitness into one function // fitness of population template void Evaluation::update_fitness(Population& pop, diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 242cf916..81952938 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -40,10 +40,7 @@ class Evaluation { void set_scorer(string scorer){this->S.set_scorer(scorer);}; string get_scorer(){return this->S.get_scorer();}; - // TODO: set objectives - // TODO: evaluation bind - // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps) - // TODO: MAKE it work for classification (do I need to have a way to set accuracy as a minimization problem?) + /// fitness of population. void update_fitness(Population& pop, int island, diff --git a/src/eval/metrics.h b/src/eval/metrics.h index 79d41378..5f064db5 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -31,8 +31,6 @@ float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, const vector& class_weights=vector()); // TODO: average_precision_score for classification -// TODO: implement other metrics. Right know I have just the MSE - } // metrics } // Brush diff --git a/src/individual.h b/src/individual.h index e5b1fb76..fc33813a 100644 --- a/src/individual.h +++ b/src/individual.h @@ -27,15 +27,14 @@ namespace Brush{ // TODO make a better use of this (in selection, when fitting, etc) (actually i need to start using it) struct Fitness { + // the loss is used in evolutionary functions float loss; ///< aggregate loss score float loss_v; ///< aggregate validation loss score - // TODO: maybe this should be all part of fitness, and individual should have only the fitness, program, and error (and objectives) size_t complexity; size_t size; size_t depth; - // these can be different depending on the island the individual is unsigned int dcounter; ///< number of individuals this dominates vector dominated; ///< individual indices this dominates @@ -63,8 +62,6 @@ struct Fitness { vector values; vector weights; - // TODO: fitness could have a function size() - // weighted values vector wvalues; @@ -189,6 +186,8 @@ class Individual{ // store just info that we dont have a getter. size, depth, complexity: they can all be obtained with program. + // error is the aggregation of error vector, and can be user sppecified + VectorXf error; ///< training error (used in lexicase selectors) Fitness fitness; ///< aggregate fitness score @@ -197,22 +196,7 @@ class Individual{ Individual() { - // TODO: default value for fitness - // the fitness is used in evolutionary functions - // fitness = -1; - - // loss is the aggregation of error vector, and can be user sppecified - // loss = -1; - // loss_v = -1; - - // complexity=-1; - // size=-1; - // depth=-1; - - // dcounter=-1; - // rank=-1; - // crowding_dist = -1; - + // TODO: better initialization of arguments objectives = {"error", "complexity"}; }; diff --git a/src/params.cpp b/src/params.cpp index a2ddad92..add4b4fd 100644 --- a/src/params.cpp +++ b/src/params.cpp @@ -6,9 +6,4 @@ license: GNU/GPL v3 namespace Brush { - -nlohmann::json PARAMS; -void set_params(const ns::json& j) { PARAMS = j; } -ns::json get_params(){ return PARAMS;} - } // Brush diff --git a/src/params.h b/src/params.h index cb1af4a1..5132e678 100644 --- a/src/params.h +++ b/src/params.h @@ -21,7 +21,6 @@ struct Parameters int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) //int verbosity = 0; // TODO: implement log and verbosity - // TODO: every parameter should have a default value // Evolutionary stuff string mode="regression"; @@ -30,8 +29,10 @@ struct Parameters int pop_size = 100; int gens = 1000; unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size + unsigned int max_size = 50; vector objectives{"error","complexity"}; // error should be generic and deducted based on mode + string sel = "lexicase"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; @@ -53,6 +54,7 @@ struct Parameters string scorer_="mse"; ///< actual loss function used, determined by error // for classification (TODO: should I have these, or they could be just dataset arguments (except the ones needed to use in dataset constructor)) + bool classification; unsigned int n_classes; ///< number of classes for classification @@ -122,12 +124,6 @@ struct Parameters void set_functions(std::unordered_map new_functions){ functions = new_functions; }; std::unordered_map get_functions(){ return functions; }; }; - -// Global (deprecated) params -extern ns::json PARAMS; -void set_params(const ns::json& j); -ns::json get_params(); - } // Brush #endif diff --git a/src/population.h b/src/population.h index 3cfa63c9..5cab09b7 100644 --- a/src/population.h +++ b/src/population.h @@ -27,7 +27,6 @@ class Population{ // - prepare offspring and update are not thread safe because we insert/delete elements from the array. vector> island_indexes; - // TODO: taskflow needs to use num_islands as n_jobs Population(); ~Population(){}; diff --git a/src/program/program.h b/src/program/program.h index 82b8bc06..d694c171 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -500,22 +500,6 @@ template struct Program return out; } - // TODO: delete this declarations - //////////////////////////////////////////////////////////////////////////// - // Mutation & Crossover - - // /// @brief convenience wrapper for :cpp:func:`variation:mutate()` in variation.h - // /// @return a mutated version of this program - // std::optional> mutate() const; - - // /** - // * @brief convenience wrapper for :cpp:func:`variation:cross` in variation.h - // * - // * @param other another program to cross with this one. - // * @return a new version of this and the other program - // */ - // std::optional> cross(Program other) const; - /// @brief turns program tree into a linear program. /// @return a vector of nodes encoding the program in reverse polish notation vector linearize() const { @@ -544,23 +528,6 @@ void Program::update_weights(const Dataset& d) }; -// TODO: delete this declarations -//////////////////////////////////////////////////////////////////////////////// -// mutation and crossover -// template -// std::optional> Program::mutate() const -// { -// return Brush::Var::mutate(*this, this->SSref.value().get()); -// }; - -// /// swaps subtrees between this and other (note the pass by copy) -// template -// std::optional> Program::cross(Program other) const -// { -// return Brush::Var::cross(*this, other); -// }; - - //////////////////////////////////////////////////////////////////////////////// // serialization // serialization for program diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index f8c286eb..25992cfc 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -187,7 +187,6 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan #pragma omp critical { - // TODO: dcounter rank etc should be local variables (because one individual can be in multiple islands) p->fitness.dcounter = dcount; p->fitness.dominated.clear(); p->fitness.dominated = dom; // dom will have values already referring to island indexes diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h index 215b79e8..d4b3195a 100644 --- a/src/selection/selection_operator.h +++ b/src/selection/selection_operator.h @@ -26,8 +26,6 @@ class SelectionOperator bool survival; string name; - // TODO: implement lexicase - // shoudn't have a constructor // SelectionOperator(){}; diff --git a/src/variation.cpp b/src/variation.cpp index 85f73cf6..ff3b8e44 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -458,7 +458,6 @@ std::optional> Variation::cross( // fmt::print("other_spot : {}\n",other_spot.node->data); // swap subtrees at child_spot and other_spot - // TODO: do I need to delete the removed node? child.Tree.move_ontop(child_spot, other_spot); Individual ind(child); @@ -608,7 +607,6 @@ template void Variation::vary(Population& pop, int island, const vector& parents) { - // TODO: fill indexes with nullptr, istead of using second half auto idxs = pop.get_island_indexes(island); // TODO: fix pragma omp usage diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp index b8dc37da..6209237e 100644 --- a/tests/cpp/test_variation.cpp +++ b/tests/cpp/test_variation.cpp @@ -193,11 +193,11 @@ TEST(Variation, InsertMutationWorks) } // lets also see if it always fails when the child exceeds the maximum limits - params.max_size = IND.program.size(); - params.max_depth = IND.program.depth(); + variator.parameters.set_max_depth(IND.program.depth()); + variator.parameters.set_max_size(IND.program.size()); auto opt2 = variator.mutate(IND); - if (opt2){ // This shoudl't happen. We'll print then error + if (opt2){ // This shoudl't happen. We'll print the error auto Child2 = opt2.value(); std::cout << "Fail failed. Mutation weights:" << std::endl; @@ -205,15 +205,15 @@ TEST(Variation, InsertMutationWorks) std::cout << k << " : " << v << std::endl; fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" + "max depth = {}, max size= {}\n" "Initial Model: {}\n" - "Mutated Model: {}\n", + "Mutated Model: {}\n" + "=================================================\n", params.max_depth, params.max_size, IND.program.get_model("compact", true), Child2.program.get_model("compact", true) ); - ASSERT_TRUE(opt2==std::nullopt); + ASSERT_TRUE(opt2==std::nullopt); // this will fail, so we can see the log } } ASSERT_TRUE(successes > 0); From 17cb266baf75def966d6f7f8c19dca74df6ef0ec Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 13 Mar 2024 13:21:40 -0300 Subject: [PATCH 136/199] fixed typo --- pybrush/DeapEstimator.py | 2 +- pybrush/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 411dffe3..8ee9cb10 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -22,7 +22,7 @@ from pybrush import RegressorVariator, ClassifierVariator, MultiClassifierVariator from pybrush import Parameters, Dataset, SearchSpace -from pybrush brush_rng +from pybrush import brush_rng # TODO: LOGGER AND ARCHIVE diff --git a/pybrush/__init__.py b/pybrush/__init__.py index 77bbbfe2..8a9786b8 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -4,7 +4,7 @@ from _brush import Parameters # geting random floats -from _brush import rng_flt as brush_rng +from _brush import rnd_flt as brush_rng # Population modifiers from _brush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator From cde53e501dfc7d96196de1653a3bba0b19cb2634 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 13 Mar 2024 18:06:06 -0300 Subject: [PATCH 137/199] cleaning and refactoring --- src/bindings/bind_fitness.cpp | 51 +++++++++ src/bindings/bind_individuals.cpp | 38 ------ src/bindings/module.cpp | 13 ++- src/eval/evaluation.cpp | 15 ++- src/eval/fitness.cpp | 1 + src/eval/fitness.h | 182 ++++++++++++++++++++++++++++- src/eval/metrics.h | 1 - src/individual.h | 184 +----------------------------- src/population.cpp | 7 -- src/population.h | 6 - tests/cpp/test_brush.cpp | 18 +++ 11 files changed, 267 insertions(+), 249 deletions(-) create mode 100644 src/bindings/bind_fitness.cpp diff --git a/src/bindings/bind_fitness.cpp b/src/bindings/bind_fitness.cpp new file mode 100644 index 00000000..1133c5b9 --- /dev/null +++ b/src/bindings/bind_fitness.cpp @@ -0,0 +1,51 @@ +#include "module.h" + +#include "../eval/fitness.h" + +namespace nl = nlohmann; +namespace br = Brush; + +using stream_redirect = py::call_guard; + +template +void bind_fitness(py::module& m) +{ + py::class_(m, "Fitness", py::dynamic_attr()) + .def(py::init<>()) + .def(py::init&>(), "Constructor with weights") + .def_property("values", &br::Fitness::get_values, &br::Fitness::set_values) + .def_property_readonly("weights", &br::Fitness::get_weights) + .def_property_readonly("wvalues", &br::Fitness::get_wvalues) + .def("dominates", &br::Fitness::dominates) + .def("clearValues", &br::Fitness::clearValues, "Clear the weighted values vector") + .def_property("rank", &br::Fitness::get_rank, &br::Fitness::set_rank) + .def_property("loss", &br::Fitness::get_loss, &br::Fitness::set_loss) + .def_property("loss_v", &br::Fitness::get_loss_v, &br::Fitness::set_loss_v) + .def_property("crowding_dist", &br::Fitness::get_crowding_dist, &br::Fitness::set_crowding_dist) + + .def("valid", &br::Fitness::valid, "Check if the fitness is valid") + .def("__hash__", &br::Fitness::hash, py::is_operator()) + .def("__eq__", &br::Fitness::operator==, py::is_operator()) + .def("__ne__", &br::Fitness::operator!=, py::is_operator()) + .def("__lt__", &br::Fitness::operator<, py::is_operator()) + .def("__gt__", &br::Fitness::operator>, py::is_operator()) + .def("__le__", &br::Fitness::operator<=, py::is_operator()) + .def("__ge__", &br::Fitness::operator>=, py::is_operator()) + // .def("__str__", &br::Fitness::toString, "String representation of the Fitness object") + // .def("__repr__", &br::Fitness::repr, "Representation for debugging the Fitness object") + .def(py::pickle( + [](const br::Fitness &f) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = f; + return j; + }, + [](nl::json j) { // __setstate__ + br::Fitness f = j; + return f; + } + ) + ) + ; + +} \ No newline at end of file diff --git a/src/bindings/bind_individuals.cpp b/src/bindings/bind_individuals.cpp index 4d162098..8b5a9851 100644 --- a/src/bindings/bind_individuals.cpp +++ b/src/bindings/bind_individuals.cpp @@ -8,44 +8,6 @@ namespace nl = nlohmann; void bind_individuals(py::module& m) { - py::class_(m, "Fitness", py::dynamic_attr()) - .def(py::init<>()) - .def(py::init&>(), "Constructor with weights") - .def_property("values", &br::Fitness::get_values, &br::Fitness::set_values) - .def_property_readonly("weights", &br::Fitness::get_weights) - .def_property_readonly("wvalues", &br::Fitness::get_wvalues) - .def("dominates", &Fitness::dominates) - .def("clearValues", &Fitness::clearValues, "Clear the weighted values vector") - .def_property("rank", &Fitness::get_rank, &Fitness::set_rank) - .def_property("loss", &Fitness::get_loss, &Fitness::set_loss) - .def_property("loss_v", &Fitness::get_loss_v, &Fitness::set_loss_v) - .def_property("crowding_dist", &Fitness::get_crowding_dist, &Fitness::set_crowding_dist) - - .def("valid", &Fitness::valid, "Check if the fitness is valid") - .def("__hash__", &Fitness::hash, py::is_operator()) - .def("__eq__", &Fitness::operator==, py::is_operator()) - .def("__ne__", &Fitness::operator!=, py::is_operator()) - .def("__lt__", &Fitness::operator<, py::is_operator()) - .def("__gt__", &Fitness::operator>, py::is_operator()) - .def("__le__", &Fitness::operator<=, py::is_operator()) - .def("__ge__", &Fitness::operator>=, py::is_operator()) - // .def("__str__", &Fitness::toString, "String representation of the Fitness object") - // .def("__repr__", &Fitness::repr, "Representation for debugging the Fitness object") - .def(py::pickle( - [](const br::Fitness &f) { // __getstate__ - /* Return a tuple that fully encodes the state of the object */ - // return py::make_tuple(p.value(), p.extra()); - nl::json j = f; - return j; - }, - [](nl::json j) { // __setstate__ - br::Fitness f = j; - return f; - } - ) - ) - ; - bind_individual(m, "RegressorIndividual"); bind_individual(m, "ClassifierIndividual"); bind_individual(m, "MultiClassifierIndividual"); diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index b4154c6e..fbcd8e97 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -14,10 +14,15 @@ license: GNU/GPL v3 namespace py = pybind11; -// forward declarations +// forward declarations ------------------ + +// non-templated bindings void bind_params(py::module &); void bind_dataset(py::module &); void bind_search_space(py::module &); +void bind_fitness(py::module &); + +// templated bindings void bind_programs(py::module &); void bind_variations(py::module &); void bind_selections(py::module &); @@ -32,13 +37,11 @@ PYBIND11_MODULE(_brush, m) { #else m.attr("__version__") = "dev"; #endif - // main algorithm - // bind_cbrush(m); - - // data structures to store solutions + // data structures bind_params(m); bind_dataset(m); bind_search_space(m); + bind_fitness(m); // should these 4 below be exposed? bind_variations(m); diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index 6ec61e9a..c92cf84f 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -50,21 +50,20 @@ template void Evaluation::assign_fit(Individual& ind, const Dataset& data, const Parameters& params, bool val) { - VectorXf loss; + VectorXf errors; using PT = ProgramType; Dataset validation = data.get_validation_data(); - float f_v = S.score(ind, validation, loss, params); + float f_v = S.score(ind, validation, errors, params); - // TODO: implement the class weights and use it here (and on loss) + // TODO: implement the class weights and use it here (and on errors) Dataset train = data.get_training_data(); - float f = S.score(ind, train, loss, params); + float f = S.score(ind, train, errors, params); - // TODO: setter for loss and loss_v - ind.error = loss; - ind.fitness.loss = f; - ind.fitness.loss_v = f_v; + ind.error = errors; + ind.fitness.set_loss(f); + ind.fitness.set_loss_v(f_v); ind.fitness.size = ind.program.size(); ind.fitness.complexity = ind.program.complexity(); ind.fitness.depth = ind.program.depth(); diff --git a/src/eval/fitness.cpp b/src/eval/fitness.cpp index e69de29b..9e2de1ca 100644 --- a/src/eval/fitness.cpp +++ b/src/eval/fitness.cpp @@ -0,0 +1 @@ +#include "fitness.h" \ No newline at end of file diff --git a/src/eval/fitness.h b/src/eval/fitness.h index c7c35660..48d09631 100644 --- a/src/eval/fitness.h +++ b/src/eval/fitness.h @@ -1,3 +1,181 @@ -// Minimizing/maximizing problem: negative/positive weight, respectively. +#ifndef FITNESS_H +#define FITNESS_H + +#include +#include "../init.h" + + +using namespace nlohmann; + + +template <> // this is intended to be used with DEAP. TODO: decide if im going to keep it +struct std::hash> { + std::size_t operator()(const std::vector& v) const { + std::size_t seed = v.size(); + for (const auto& elem : v) { + seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +// TODO: separate declaration from implementation (for all classes. have a folder with headers and other with srcs, just like operon) +namespace Brush{ +struct Fitness { + // the loss is used in evolutionary functions + float loss; ///< aggregate loss score + float loss_v; ///< aggregate validation loss score + + size_t complexity; + size_t size; + size_t depth; + + // these can be different depending on the island the individual is + unsigned int dcounter; ///< number of individuals this dominates + vector dominated; ///< individual indices this dominates + unsigned int rank; ///< pareto front rank + float crowding_dist; ///< crowding distance on the Pareto front + + void set_dominated(vector& dom){ dominated=dom; }; + vector get_dominated() const { return dominated; }; + + void set_loss(float f){ loss=f; }; + float get_loss() const { return loss; }; + + void set_loss_v(float f_v){ loss_v=f_v; }; + float get_loss_v() const { return loss_v; }; + + void set_dcounter(unsigned int d){ dcounter=d; }; + unsigned int get_dcounter() const { return dcounter; }; + + void set_rank(unsigned r){ rank=r; }; + size_t get_rank() const { return rank; }; + + void set_crowding_dist(float cd){ crowding_dist=cd; }; + float get_crowding_dist() const { return crowding_dist; }; + + vector values; + vector weights; + + // weighted values + vector wvalues; + + // Constructor with initializer list for weights + Fitness(const vector& w={}) : values(), wvalues(), weights(w) { + dcounter = 0; + set_rank(0); + set_crowding_dist(0); + dominated.resize(0); + } + + // Hash function TODO: stop using it (i think only deap needs this) + size_t hash() const { + std::size_t h = std::hash>{}(wvalues); + return h; + } + + void set_weights(vector& w) { + weights = w; + } + vector get_weights() const { + return weights; + } + vector get_values() const { + return values; + } + vector get_wvalues() const { + return wvalues; + } + + // TODO: debug size, it is giving weird values + // Method to set values + void set_values(vector& v) { + if (v.size() != weights.size()) { + throw std::length_error("Assigned values have not the same length than current values"); + } + // fmt::print("updated values\n"); + + values.resize(0); + for (const auto& element : v) { + values.push_back(element); + } + + wvalues.resize(weights.size()); + + // Perform element-wise multiplication + std::transform(v.begin(), v.end(), + weights.begin(), wvalues.begin(), + [](double a, double b) { + return a * b; + }); + } + + // Method to clear values + void clearValues() { + wvalues.clear(); + } + + bool valid() const { + return !wvalues.empty(); + } + + // Equality comparison + bool operator==(const Fitness& other) const { + return wvalues == other.wvalues; + } + + // Inequality comparison + bool operator!=(const Fitness& other) const { + return !(*this == other); + } + + // Less than comparison + bool operator<(const Fitness& other) const { + // Minimizing/maximizing problem: negative/positive weight, respectively. + return std::lexicographical_compare(wvalues.begin(), wvalues.end(), + other.wvalues.begin(), other.wvalues.end()); + } + + // Greater than comparison + bool operator>(const Fitness& other) const { + return other < *this; + } + + // Less than or equal to comparison + bool operator<=(const Fitness& other) const { + return !(other < *this); + } + + // Greater than or equal to comparison + bool operator>=(const Fitness& other) const { + return !(*this < other); + } + + // String representation + std::string toString() const { + if (valid()) { + return "TODO: implement string representation"; //std::to_string(wvalues); + } else { + return "Tuple()"; + } + } + + // Representation for debugging + std::string repr() const { + return "Fitness(TODO: implement string representation)"; + } + + + /// set obj vector given a string of objective names + int dominates(const Fitness& b) const; +}; + +void to_json(json &j, const Fitness &f); +void from_json(const json &j, Fitness& f); + + +} +#endif + + -// TODO: move fitness here \ No newline at end of file diff --git a/src/eval/metrics.h b/src/eval/metrics.h index 5f064db5..fab075d1 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -13,7 +13,6 @@ float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, const vector& class_weights=vector() ); // TODO: test cases for the metrics -// TODO: implement the metrics for classification /// log loss (2 methods below) VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, diff --git a/src/individual.h b/src/individual.h index fc33813a..237b0423 100644 --- a/src/individual.h +++ b/src/individual.h @@ -1,184 +1,16 @@ #ifndef INDIVIDUAL_H #define INDIVIDUAL_H -// #include "search_space.h" #include "program/program.h" +#include "eval/fitness.h" #include using namespace nlohmann; -template <> // this is intended to be used with DEAP. TODO: decide if im going to keep it -struct std::hash> { - std::size_t operator()(const std::vector& v) const { - std::size_t seed = v.size(); - for (const auto& elem : v) { - seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; - } -}; - namespace Brush{ - - -// TODO: separate declaration from implementation -// TODO: move fitness to eval folder -// TODO make a better use of this (in selection, when fitting, etc) (actually i need to start using it) -struct Fitness { - - // the loss is used in evolutionary functions - float loss; ///< aggregate loss score - float loss_v; ///< aggregate validation loss score - - size_t complexity; - size_t size; - size_t depth; - - // these can be different depending on the island the individual is - unsigned int dcounter; ///< number of individuals this dominates - vector dominated; ///< individual indices this dominates - unsigned int rank; ///< pareto front rank - float crowding_dist; ///< crowding distance on the Pareto front - - void set_dominated(vector& dom){ dominated=dom; }; - vector get_dominated() const { return dominated; }; - - void set_loss(float f){ loss=f; }; - float get_loss() const { return loss; }; - - void set_loss_v(float f_v){ loss_v=f_v; }; - float get_loss_v() const { return loss_v; }; - - void set_dcounter(unsigned int d){ dcounter=d; }; - unsigned int get_dcounter() const { return dcounter; }; - - void set_rank(unsigned r){ rank=r; }; - size_t get_rank() const { return rank; }; - - void set_crowding_dist(float cd){ crowding_dist=cd; }; - float get_crowding_dist() const { return crowding_dist; }; - - vector values; - vector weights; - - // weighted values - vector wvalues; - - // Constructor with initializer list for weights - Fitness(const vector& w={}) : values(), wvalues(), weights(w) { - dcounter = 0; - set_rank(0); - set_crowding_dist(0); - dominated.resize(0); - } - - // Hash function - size_t hash() const { - std::size_t h = std::hash>{}(wvalues); - return h; - } - - void set_weights(vector& w) { - weights = w; - } - vector get_weights() const { - return weights; - } - vector get_values() const { - return values; - } - vector get_wvalues() const { - return wvalues; - } - - // TODO: debug size, it is giving weird values - // Method to set values - void set_values(vector& v) { - if (v.size() != weights.size()) { - throw std::length_error("Assigned values have not the same length than current values"); - } - // fmt::print("updated values\n"); - - values.resize(0); - for (const auto& element : v) { - values.push_back(element); - } - - wvalues.resize(weights.size()); - - // Perform element-wise multiplication - std::transform(v.begin(), v.end(), - weights.begin(), wvalues.begin(), - [](double a, double b) { - return a * b; - }); - } - - // Method to clear values - void clearValues() { - wvalues.clear(); - } - - bool valid() const { - return !wvalues.empty(); - } - - // Equality comparison - bool operator==(const Fitness& other) const { - return wvalues == other.wvalues; - } - - // Inequality comparison - bool operator!=(const Fitness& other) const { - return !(*this == other); - } - - // Less than comparison - bool operator<(const Fitness& other) const { - return std::lexicographical_compare(wvalues.begin(), wvalues.end(), - other.wvalues.begin(), other.wvalues.end()); - } - - // Greater than comparison - bool operator>(const Fitness& other) const { - return other < *this; - } - - // Less than or equal to comparison - bool operator<=(const Fitness& other) const { - return !(other < *this); - } - - // Greater than or equal to comparison - bool operator>=(const Fitness& other) const { - return !(*this < other); - } - - // String representation - std::string toString() const { - if (valid()) { - return "TODO: implement string representation"; //std::to_string(wvalues); - } else { - return "Tuple()"; - } - } - - // Representation for debugging - std::string repr() const { - return "Fitness(TODO: implement string representation)"; - } - - - /// set obj vector given a string of objective names - int dominates(const Fitness& b) const; -}; - -void to_json(json &j, const Fitness &f); -void from_json(const json &j, Fitness& f); - namespace Pop{ - + template class Individual{ public: // TODO: make these private (and work with nlohman json) @@ -202,8 +34,6 @@ class Individual{ Individual(Program& prg) : Individual() { program = prg; }; - // TODO: clone? maybe a constructor that takes another individual as arg and copies everything - void init(SearchSpace& ss, const Parameters& params) { program = ss.make_program>(params, 0, 0); @@ -237,14 +67,6 @@ class Individual{ // template // void Individual::set_objectives(const vector& objectives) - // TODO: fix to use these with fitness instead of with individual - // unsigned int dcounter; ///< number of individuals this dominates - // vector dominated; ///< individual indices this dominates - - // unsigned int rank; ///< pareto front rank - // float crowding_dist; ///< crowding distance on the Pareto front - - // Static map for weights associated with strings // TODO: weights for different values. loss should be calculated duing runtime, based on the metric inline static std::map weightsMap = []() { @@ -280,13 +102,11 @@ class Individual{ }; -// TODO: rename (something better (more meaningful) than p) // serialization for Individual template void to_json(json &j, const Individual &p) { j = json{ - // TODO: jsonify fitness struct, and new possible obj functions {"program", p.program}, {"fitness", p.fitness}, // {"loss", p.loss}, diff --git a/src/population.cpp b/src/population.cpp index ded1f1be..50fa1baf 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -107,13 +107,6 @@ void Population::init(SearchSpace& ss, const Parameters& params) template void Population::add_offspring_indexes(int island) { - // TODO 2: i guess I dont need to do this (below) anymore - // TODO: find unused indexes and distribute them to the islands (I think islands can point to anywhere in the population. also make sure that the selection survival and mutation works like that) - // reading and writing is thread-safe, as long as there's no overlap on island ranges. - // manipulating a vector IS NOT thread-safe (inserting and erasing elements). - // So, add_offspring_indexes and update should be the synchronization points, not - // operations performed concurrently - size_t p = pop_size; // population size. prep_offspring slots will douple the population, adding the new expressions into the islands // this is going to be tricky (pay attention to delta and p use) diff --git a/src/population.h b/src/population.h index 5cab09b7..0d3192e4 100644 --- a/src/population.h +++ b/src/population.h @@ -19,12 +19,6 @@ class Population{ float mig_prob; vector>> individuals; - - // TODO: right now, the number of islands must be a divisor of the popsize, and cannot be greater than half of the popsize (it cant be the same as popsize). Should this behavior change? Also, write this in docs - - // TODO: MAKE SURE THIS TWO ITEMS BELOW ARE TAKEN CARE IN THE MAIN LOOP AND IN TEST_POPULATION (I may need to create new methods for taking care of this) - // - fitting, fitness calculation, and setting the objectives are not thread safe because we write in individual attributes. - // - prepare offspring and update are not thread safe because we insert/delete elements from the array. vector> island_indexes; Population(); diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 8897c08c..946a467d 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -87,5 +87,23 @@ TEST(Engine, EngineWorks) Brush::RegressorEngine est9(params); est9.run(data); + // when popsize is not divisible by num_islands + std::cout << "popsize not divisible by num_islands" << std::endl; + params.set_pop_size(15); + params.set_gens(10); + params.set_num_islands(4); // fewer individuals in one island + params.set_n_jobs(1); + Brush::RegressorEngine est_not_div1(params); + est_not_div1.run(data); + + // TODO: logger + std::cout << "popsize not divisible by num_islands" << std::endl; + params.set_pop_size(10); + params.set_gens(10); + params.set_num_islands(3); // extra individuals in one island + params.set_n_jobs(1); + Brush::RegressorEngine est_not_div2(params); + est_not_div2.run(data); + // TODO: test classifier and multiclassifier } \ No newline at end of file From d7e47abfb75c35dfe8af7b8d6a8ccaf5851d3c9c Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 13 Mar 2024 18:24:32 -0300 Subject: [PATCH 138/199] fixed bind fitness not working --- src/bindings/bind_fitness.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bindings/bind_fitness.cpp b/src/bindings/bind_fitness.cpp index 1133c5b9..25bbc81c 100644 --- a/src/bindings/bind_fitness.cpp +++ b/src/bindings/bind_fitness.cpp @@ -7,7 +7,6 @@ namespace br = Brush; using stream_redirect = py::call_guard; -template void bind_fitness(py::module& m) { py::class_(m, "Fitness", py::dynamic_attr()) From 84dd4950bac0b45618891ff2d8dff6e5c5cb13a1 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Fri, 15 Mar 2024 18:43:05 -0300 Subject: [PATCH 139/199] Starting to implement pure c++ BrushEstimator --- pybrush/BrushEstimator.py | 345 ++++++++++++++++++++++++++++++++++++++ pybrush/DeapEstimator.py | 8 +- pybrush/__init__.py | 20 ++- src/bindings/module.cpp | 3 +- src/engine.cpp | 2 + src/engine.h | 2 +- 6 files changed, 365 insertions(+), 15 deletions(-) create mode 100644 pybrush/BrushEstimator.py diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py new file mode 100644 index 00000000..bb31abc5 --- /dev/null +++ b/pybrush/BrushEstimator.py @@ -0,0 +1,345 @@ +""" +sklearn-compatible wrapper for GP analyses. + +TODO: update this docstring +See brushgp.cpp for Python (via pybind11) modules that give more fine-grained +control of the underlying GP objects. +""" +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.utils.validation import check_is_fitted +# from sklearn.metrics import mean_squared_error +import numpy as np +import pandas as pd + +from _brush.individual import * # RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual +from _brush.engine import * # Regressor, Classifier, and MultiClassifier engines +from pybrush import Parameters, Dataset, SearchSpace +from pybrush import brush_rng + + +# TODO: LOGGER AND ARCHIVE +class BrushEstimator(BaseEstimator): + """ + This is the base class for Deap-based Brush estimators. + This class shouldn't be called directly; instead, call a child class like + :py:class:`DeapRegressor ` or :py:class:`DeapClassifier `. + All of the shared parameters are documented here. + + Parameters + ---------- + mode : str, default 'classification' + The mode of the estimator. Used by subclasses + pop_size : int, default 100 + Population size. + gens : int, default 100 + Maximum iterations of the algorithm. + verbosity : int, default 0 + Controls level of printouts. + max_depth : int, default 0 + Maximum depth of GP trees in the GP program. Use 0 for no limit. + max_size : int, default 0 + Maximum number of nodes in a tree. Use 0 for no limit. + num_islands : int, default 5 + Number of independent islands to use in evolutionary framework. + Ignored if `algorithm!="nsga2island"`. + mig_prob : float, default 0.05 + Probability of occuring a migration between two random islands at the + end of a generation, must be between 0 and 1. + cx_prob : float, default 1/7 + Probability of applying the crossover variation when generating the offspring, + must be between 0 and 1. + Given that there are `n` mutations, and either crossover or mutation is + used to generate each individual in the offspring (but not both at the + same time), we want to have by default an uniform probability between + crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and + `1/n` for each mutation, we can achieve an uniform distribution. + mutation_probs : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} + A dictionary with keys naming the types of mutation and floating point + values specifying the fraction of total mutations to do with that method. + The probability of having a mutation is `(1-cx_prob)` and, in case the mutation + is applied, then each mutation option is sampled based on the probabilities + defined in `mutation_probs`. The set of probabilities should add up to 1.0. + functions: dict[str,float] or list[str], default {} + A dictionary with keys naming the function set and values giving the probability + of sampling them, or a list of functions which will be weighted uniformly. + If empty, all available functions are included in the search space. + initialization : {"uniform", "max_size"}, default "uniform" + Distribution of sizes on the initial population. If `max_size`, then every + expression is created with `max_size` nodes. If `uniform`, size will be + uniformly distributed between 1 and `max_size`. + objectives : list[str], default ["error", "size"] + list with one or more objectives to use. Options are `"error", "size", "complexity"`. + If `"error"` is used, then it will be the mean squared error for regression, + and accuracy for classification. + algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2" + Which Evolutionary Algorithm framework to use to evolve the population. + weights_init : bool, default True + Whether the search space should initialize the sampling weights of terminal nodes + based on the correlation with the output y. If `False`, then all terminal nodes + will have the same probability of 1.0. + validation_size : float, default 0.0 + Percentage of samples to use as a hold-out partition. These samples are used + to calculate statistics during evolution, but not used to train the models. + The `best_estimator_` will be selected using this partition. If zero, then + the same data used for training is used for validation. + batch_size : float, default 1.0 + Percentage of training data to sample every generation. If `1.0`, then + all data is used. Very small values can improve execution time, but + also lead to underfit. + random_state: int or None, default None + If int, then the value is used to seed the c++ random generator; if None, + then a seed will be generated using a non-deterministic generator. It is + important to notice that, even if the random state is fixed, it is + unlikely that running brush using multiple threads will have the same + results. This happens because the Operating System's scheduler is + responsible to choose which thread will run at any given time, thus + reproductibility is not guaranteed. + + Attributes + ---------- + best_estimator_ : pybrush.Program + The final model picked from training. Used in subsequent calls to :func:`predict`. + archive_ : list[deap_api.DeapIndividual] + The final population from training. + data_ : pybrush.Dataset + The complete data in Brush format. + train_ : pybrush.Dataset + Partition of `data_` containing `(1-validation_size)`% of the data, in Brush format. + validation_ : pybrush.Dataset + Partition of `data_` containing `(validation_size)`% of the data, in Brush format. + search_space_ : a Brush `SearchSpace` object. + Holds the operators and terminals and sampling utilities to update programs. + toolbox_ : deap.Toolbox + The toolbox used by DEAP for EA algorithm. + """ + + def __init__( + self, + mode='classification', + pop_size=100, + gens=100, + verbosity=0, + max_depth=3, + max_size=20, + num_islands=1, + n_jobs=1, + mig_prob=0.05, + cx_prob= 1/7, + mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, + "toggle_weight_on":1/6, "toggle_weight_off":1/6}, + functions: list[str]|dict[str,float] = {}, + initialization="uniform", + algorithm="nsga2", + objectives=["error", "size"], + random_state=None, + weights_init=True, + validation_size: float = 0.0, + batch_size: float = 1.0 + ): + + self.pop_size=pop_size + self.gens=gens + self.verbosity=verbosity + self.algorithm=algorithm + self.mode=mode + self.max_depth=max_depth + self.max_size=max_size + self.num_islands=num_islands + self.mig_prob=mig_prob + self.n_jobs=n_jobs + self.cx_prob=cx_prob + self.mutation_probs=mutation_probs + self.functions=functions + self.objectives=objectives + self.initialization=initialization + self.random_state=random_state + self.batch_size=batch_size + self.weights_init=weights_init + self.validation_size=validation_size + + + def fit(self, X, y): + """ + Fit an estimator to X,y. + + Parameters + ---------- + X : np.ndarray + 2-d array of input data. + y : np.ndarray + 1-d array of (boolean) target values. + """ + + self.feature_names_ = [] + if isinstance(X, pd.DataFrame): + self.feature_names_ = X.columns.to_list() + + self.data_ = self._make_data(X, y, + feature_names=self.feature_names_, + validation_size=self.validation_size) + + if isinstance(self.functions, list): + self.functions_ = {k:1.0 for k in self.functions} + else: + self.functions_ = self.functions + + # set n classes if relevant + self.n_classes_ = 0 + if self.mode=="classification": + self.n_classes_ = len(np.unique(y)) + + # Including necessary functions for classification programs. This + # is needed so the search space can create the hash and mapping of + # the functions. + if self.n_classes_ == 2 and "Logistic" not in self.functions_: + self.functions_["Logistic"] = 1.0 + # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. + # self.functions_["Softmax"] = 1.0 + + # These have a default behavior to return something meaningfull if + # no values are set + self.train_ = self.data_.get_training_data() + self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation + self.validation_ = self.data_.get_validation_data() + + self.search_space_ = SearchSpace(self.train_, self.functions_, self.weights_init) + + self.parameters_ = Parameters() + self.parameters_.classification = self.mode == "classification" + self.parameters_.n_classes = self.n_classes_ + self.parameters_.n_jobs = self.n_jobs + self.parameters_.pop_size = self.pop_size + self.parameters_.gens = self.gens + self.parameters_.num_islands = self.num_islands + self.parameters_.max_depth = self.max_depth + self.parameters_.max_size = self.max_size + self.parameters_.objectives = self.objectives + self.parameters_.cx_prob = self.cx_prob + self.parameters_.mig_prob = self.mig_prob + self.parameters_.functions = self.functions + self.parameters_.mutation_probs = self.mutation_probs + + if self.random_state is not None: + self.parameters_.random_state = self.random_state + + self.engine_ = None + if self.mode == 'classification': + self.engine_ = ( ClassifierEngine + if self.n_classes_ == 2 else + MultiClassifierEngine)(self.parameters_) + else: + self.engine_ = RegressorEngine(self.parameters_) + + self.engine_.run(self.data_) + self.best_estimator_ = self.engine_.best_ind + + return self + + def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): + # This function should not partition data (since it may be used in `predict`). + # partitioning is done by `fit`. Feature names should be inferred + # before calling _make_data (so predict can be made with np arrays or + # pd dataframes). + + if isinstance(y, pd.Series): + y = y.values + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + if y is None: + return Dataset(X=X, + feature_names=feature_names, validation_size=validation_size) + + return Dataset(X=X, y=y, + feature_names=feature_names, validation_size=validation_size) + + def predict(self, X): + """Predict using the best estimator in the archive. """ + + check_is_fitted(self) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = Dataset(X=X, ref_dataset=self.data_, + feature_names=self.feature_names_) + + # data = self._make_data(X, feature_names=self.feature_names_) + + return self.best_estimator_.program.predict(data) + + def get_params(self, deep=True): + out = dict() + for (key, value) in self.__dict__.items(): + if not key.endswith('_'): + if deep and hasattr(value, "get_params") and not isinstance(value, type): + deep_items = value.get_params().items() + out.update((key + "__" + k, val) for k, val in deep_items) + out[key] = value + return out + + +class BrushClassifier(BrushEstimator,ClassifierMixin): + """Deap-based Brush for classification. + + For options, see :py:class:`DeapEstimator `. + + Examples + -------- + >>> import pandas as pd + >>> df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + >>> X = df.drop(columns='target') + >>> y = df['target'] + >>> from pybrush import DeapClassifier + >>> est = DeapClassifier() + >>> est.fit(X,y) + >>> # print('score:', est.score(X,y)) + """ + def __init__( self, **kwargs): + super().__init__(mode='classification',**kwargs) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + + """ + + check_is_fitted(self) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = Dataset(X=X, ref_dataset=self.data_, + feature_names=self.feature_names_) + + # data = self._make_data(X, feature_names=self.feature_names_) + + prob = self.best_estimator_.program.predict_proba(data) + + if self.n_classes_ <= 2: + prob = np.hstack( (np.ones(X.shape[0]).reshape(-1,1), prob.reshape(-1,1)) ) + prob[:, 0] -= prob[:, 1] + + return prob + + +class BrushRegressor(BrushEstimator, RegressorMixin): + def __init__(self, **kwargs): + super().__init__(mode='regressor',**kwargs) \ No newline at end of file diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 8ee9cb10..12b29a9b 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -16,7 +16,7 @@ from sklearn.preprocessing import MinMaxScaler import functools from pybrush.deap_api import nsga2 -from pybrush import RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual +from pybrush import individual from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator from pybrush import RegressorSelector, ClassifierSelector, MultiClassifierSelector from pybrush import RegressorVariator, ClassifierVariator, MultiClassifierVariator @@ -172,9 +172,9 @@ def _setup_toolbox(self): # create Individual class, inheriting from self.Individual with a fitness attribute if self.mode == 'classification': - self.Individual = ( ClassifierIndividual + self.Individual = ( individual.ClassifierIndividual if self.n_classes_ == 2 else - MultiClassifierIndividual) + individual.MultiClassifierIndividual) self.eval_ = ( ClassifierEvaluator() if self.n_classes_ == 2 else MultiClassifierEvaluator() ) @@ -185,7 +185,7 @@ def _setup_toolbox(self): if self.n_classes_ == 2 else MultiClassifierSelector("nsga2", True) ) else: - self.Individual = RegressorIndividual + self.Individual = individual.RegressorIndividual self.sel_ = RegressorSelector("lexicase", False) self.surv_ = RegressorSelector("nsga2", True) self.eval_ = RegressorEvaluator() diff --git a/pybrush/__init__.py b/pybrush/__init__.py index 8a9786b8..82e7c846 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -1,22 +1,24 @@ -# Interfaces for Brush classes. Use to prototype with Brush +# Interfaces for Brush data structures. Use to prototype with Brush from _brush import Dataset from _brush import SearchSpace from _brush import Parameters -# geting random floats +# geting random floats with the same engine from _brush import rnd_flt as brush_rng +# Individuals +from _brush import individual #RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual + +# c++ learning engines. These are wrapped into a scikit-learn-like estimator in the python side +from _brush import engine # RegressorEngine, ClassifierEngine, MultiClassifierEngine + + # Population modifiers from _brush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator from _brush import RegressorSelector, ClassifierSelector, MultiClassifierSelector from _brush import RegressorVariator, ClassifierVariator, MultiClassifierVariator +# -------------------- -# Individuals -from _brush.individual import RegressorIndividual, \ - ClassifierIndividual, MultiClassifierIndividual - +# -------------------- # Prototyping an EA using brush classes, but other EA framework from pybrush.DeapEstimator import DeapClassifier, DeapRegressor - -# c++ learning engines. These are wrapped into a scikit-learn-like estimator in the python side -from _brush.engine import RegressorEngine, ClassifierEngine, MultiClassifierEngine \ No newline at end of file diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index fbcd8e97..3c705f30 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -43,7 +43,8 @@ PYBIND11_MODULE(_brush, m) { bind_search_space(m); bind_fitness(m); - // should these 4 below be exposed? + // TODO: get rid of deap wrapper? + // should these 4 below be exposed? should i add them to submodules? bind_variations(m); bind_selections(m); bind_evaluators(m); diff --git a/src/engine.cpp b/src/engine.cpp index 05259208..d435c906 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -51,6 +51,8 @@ void Engine::init() this->survivor = Selection(params.surv, true); //std::cout << "created survivor" << std::endl; + this->best_loss = MAX_FLT; + this->best_complexity = MAX_FLT; // TODO getters and setters for the best solution found after evolution // predict, transform, predict_proba, etc. // get statistics diff --git a/src/engine.h b/src/engine.h index 23c8c26b..abc8a295 100644 --- a/src/engine.h +++ b/src/engine.h @@ -45,7 +45,7 @@ class Engine{ /// updates best score by searching in the population for the individual that best fits the given data bool update_best(const Dataset& data, bool val=false); - // TODO: im thinking about getting rid of these first two, and keep only the best ind + // TODO: best fitness instead of these. use fitness comparison float best_loss; int best_complexity; Individual& get_best_ind(){return best_ind;}; From f9a7070f2973f07ce91d125647671a112d52576f Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 19 Mar 2024 08:28:19 -0300 Subject: [PATCH 140/199] n_jobs and random_state in engine. new tests. --- src/engine.cpp | 4 +-- src/params.h | 8 +++-- tests/cpp/test_individuals.cpp | 0 tests/cpp/test_params.cpp | 54 +++++++++++++++++++++++++++++++++- tests/cpp/test_population.cpp | 1 + 5 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 tests/cpp/test_individuals.cpp diff --git a/src/engine.cpp b/src/engine.cpp index d435c906..0fb3484f 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -22,11 +22,11 @@ void Engine::init() // initialize population with initial model and/or starting pop if (params.n_jobs!=0) // TODO: change this to set taskflow jobs - omp_set_num_threads(params.n_jobs); + omp_set_num_threads(params.get_n_jobs()); // std::cout << "set number of threads" << std::endl; - r.set_seed(params.random_state); + r.set_seed(params.get_random_state()); // std::cout << "set random state" << std::endl; diff --git a/src/params.h b/src/params.h index 5132e678..c142b104 100644 --- a/src/params.h +++ b/src/params.h @@ -15,10 +15,9 @@ namespace Brush struct Parameters { public: - // TODO: setters and getters for all parameters? (and do checks in setters?). Also make them private, and use the getters and setters in the code + // TODO: make parameters private, and use the getters and setters in the code - // settings - int random_state; // TODO: constructor should set the global rng to random_state (if given, otherwise just let it work normally) + int random_state = 0; // by default, the rng generator will use any random seed if random_state is zero //int verbosity = 0; // TODO: implement log and verbosity // Evolutionary stuff @@ -75,6 +74,9 @@ struct Parameters Parameters(){}; ~Parameters(){}; + void set_random_state(int new_random_state){random_state = new_random_state; }; + int get_random_state(){ return random_state; }; + void set_pop_size(int new_pop_size){ pop_size = new_pop_size; }; int get_pop_size(){ return pop_size; }; diff --git a/tests/cpp/test_individuals.cpp b/tests/cpp/test_individuals.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/cpp/test_params.cpp b/tests/cpp/test_params.cpp index d33bed0c..67537277 100644 --- a/tests/cpp/test_params.cpp +++ b/tests/cpp/test_params.cpp @@ -1 +1,53 @@ -// TODO: test it \ No newline at end of file +#include "testsHeader.h" + +// +// #include "../../src/individual.cpp" +// #include "../../src/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers +// #include "../../src/eval/evaluation.cpp" +// #include "../../src/selection/nsga2.cpp" +// #include "../../src/selection/lexicase.cpp" +// #include "../../src/selection/selection_operator.cpp" +// #include "../../src/selection/selection.cpp" + +using namespace Brush::Pop; +using namespace Brush::Sel; +using namespace Brush::Eval; +using namespace Brush::Sel; + +TEST(Params, ParamsTests) +{ + + Parameters params; + + params.set_max_size(12); + ASSERT_EQ(params.max_size, 12); + ASSERT_EQ(params.get_max_size(), 12); + + params.set_max_depth(4); + ASSERT_EQ(params.max_depth, 4); + ASSERT_EQ(params.get_max_depth(), 4); + + params.set_max_depth(6); + ASSERT_EQ(params.max_depth, 6); + ASSERT_EQ(params.get_max_depth(), 6); + + params.set_objectives({"fitness","complexity"}); + ASSERT_EQ(params.get_objectives().size(), 2); + ASSERT_STREQ(params.get_objectives()[0].c_str(), "fitness"); + ASSERT_STREQ(params.get_objectives()[1].c_str(), "complexity"); + + // TODO: implement logger and verbosity and make this work + // string str1 = "Hello\n"; + // string str2 = logger.log("Hello", 0); + // ASSERT_STREQ(str1.c_str(), str2.c_str()); + + // str2 = logger.log("Hello", 2); + // ASSERT_STREQ(str1.c_str(), str2.c_str()); + + // str2 = logger.log("Hello", 3); + // ASSERT_STREQ(str1.c_str(), str2.c_str()); + + // ft.params.set_verbosity(2); + // ASSERT_EQ(ft.params.verbosity, 2); + // ASSERT_STREQ("", logger.log("Hello", 3).c_str()); +} diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 410e5923..1da967d6 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -72,6 +72,7 @@ TEST(Population, PopulationTests) fmt::print("Printing from population method:\n"); fmt::print("{}\n",pop.print_models()); // may yeld seg fault if string is too large for buffer + // this is basically the engine with some debug messages // island sizes increases and comes back to the same values after update fmt::print("Performing all steps of an evolution (sequential, not parallel)\n"); for (int i=0; i<100; ++i) // update and prep offspring slots works properly From a3c9650a643f59442c380639b79b2b9f62e04d1c Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 19 Mar 2024 09:17:18 -0300 Subject: [PATCH 141/199] TODO: check selection implementation --- src/selection/selection.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/selection/selection.h b/src/selection/selection.h index 32574416..f90c6795 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -18,6 +18,8 @@ using namespace Pop; // struct Parameters; // forward declaration of Parameters +// TODO: it seems that the selection is doing a poor job with the size. investigate it. + /*! * @class Selection * @brief interfaces with selection operators. From 8759c09d74230e75cca1fbb8c3b736093e14e229 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 19 Mar 2024 09:18:31 -0300 Subject: [PATCH 142/199] Saving and load population --- .gitignore | 3 +++ pybrush/__init__.py | 1 + src/bindings/bind_params.cpp | 2 ++ src/engine.cpp | 18 ++++++++++--- src/engine.h | 1 + src/params.h | 9 +++++++ src/population.cpp | 42 ++++++++++++++++++++++++++++++ src/population.h | 49 ++++++++++++++++++++++++++++++++++- tests/cpp/test_population.cpp | 3 +++ 9 files changed, 123 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 82c153cd..72ab359a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ *.vscode *.html +# files generated by running the test suite +tests/cpp/__* + tags build/ operon/ diff --git a/pybrush/__init__.py b/pybrush/__init__.py index 82e7c846..9abeb91b 100644 --- a/pybrush/__init__.py +++ b/pybrush/__init__.py @@ -22,3 +22,4 @@ # -------------------- # Prototyping an EA using brush classes, but other EA framework from pybrush.DeapEstimator import DeapClassifier, DeapRegressor +from pybrush.BrushEstimator import BrushClassifier, BrushRegressor diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index df0d2640..4dae39e2 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -16,6 +16,8 @@ void bind_params(py::module& m) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) + .def_property("load_population", &Brush::Parameters::get_load_population, &Brush::Parameters::set_load_population) + .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) diff --git a/src/engine.cpp b/src/engine.cpp index d435c906..ccbca9d1 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -2,6 +2,7 @@ #include +#include namespace Brush{ @@ -29,15 +30,20 @@ void Engine::init() r.set_seed(params.random_state); // std::cout << "set random state" << std::endl; - // set up the pop, variator, etc set_is_fitted(false); // std::cout << "is fitted is false" << std::endl; - - this->pop = Population(); + this->pop = Population(); //std::cout << "created population" << std::endl; + // TODO: load population into file + // TODO: if initializing from a population file, then this is where we should load previous models. + // three behaviors: if we have only 1 ind, then replicate it trought the entire pop + // if n_ind is the same as pop_size, load all models. if n_ind != pop_size, throw error + if (params.load_population != "") + this->pop.load(params.load_population); + this->evaluator = Evaluation(); //std::cout << "created evaluator" << std::endl; @@ -306,7 +312,11 @@ void Engine::run(Dataset &data) [&]() { return 0; }, // jump back to the next iteration - [&]() { this->set_is_fitted(true); } // work done, report last gen and stop + [&]() { + if (params.save_population != "") + this->pop.save(params.save_population); + this->set_is_fitted(true); + } // work done, report last gen and stop ); // evolutionary loop init.name("init"); diff --git a/src/engine.h b/src/engine.h index abc8a295..1cf47155 100644 --- a/src/engine.h +++ b/src/engine.h @@ -44,6 +44,7 @@ class Engine{ /// updates best score by searching in the population for the individual that best fits the given data bool update_best(const Dataset& data, bool val=false); + // TODO: hyperparameter to set how the best is picked (MCDM, best on val, pareto front, etc). one of the options should be getting the pareto front // TODO: best fitness instead of these. use fitness comparison float best_loss; diff --git a/src/params.h b/src/params.h index 5132e678..500f0ff0 100644 --- a/src/params.h +++ b/src/params.h @@ -70,6 +70,9 @@ struct Parameters float batch_size = 0.0; bool use_batch = false; ///< whether to use mini batch for training + string load_population = ""; + string save_population = ""; + int n_jobs = 1; // -1; ///< number of parallel jobs -1 use all threads; 0 use same as number of islands; positive number specify the amouut of threads Parameters(){}; @@ -80,6 +83,12 @@ struct Parameters void set_gens(int new_gens){ gens = new_gens; }; int get_gens(){ return gens; }; + + void set_load_population(string new_load_population){ load_population = new_load_population; }; + string get_load_population(){ return load_population; }; + + void set_save_population(string new_save_population){ save_population = new_save_population; }; + string get_save_population(){ return save_population; }; void set_current_gen(unsigned int gen){ current_gen = gen; }; unsigned int get_current_gen(){ return current_gen; }; diff --git a/src/population.cpp b/src/population.cpp index 50fa1baf..f08a01f0 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -103,6 +103,48 @@ void Population::init(SearchSpace& ss, const Parameters& params) } +template +void Population::save(string filename) +{ + std::ofstream out; + if (!filename.empty()) + out.open(filename); + else + out.open("pop.json"); + + json j; + to_json(j, *this); + out << j ; + out.close(); + logger.log("Saved population to file " + filename, 1); +} + +template +void Population::load(string filename) +{ + + // TODO: if initializing from a population file, then this is where we should load previous models. + // three behaviors: if we have only 1 ind, then replicate it trought the entire pop + // if n_ind is the same as pop_size, load all models. if n_ind != pop_size, throw error + + //TODO: replace with from_json(j, this) call + std::ifstream indata; + indata.open(filename); + if (!indata.good()) + HANDLE_ERROR_THROW("Invalid input file " + filename + "\n"); + + std::string line; + indata >> line; + + json j = json::parse(line); + from_json(j, *this); + + logger.log("Loaded population from " + filename + " of size = " + + to_string(this->size()),1); + + indata.close(); +} + /// update individual vector size and island indexes template void Population::add_offspring_indexes(int island) diff --git a/src/population.h b/src/population.h index 0d3192e4..cb0fe140 100644 --- a/src/population.h +++ b/src/population.h @@ -8,6 +8,40 @@ using std::vector; using std::string; using Eigen::Map; +// TODO: move this serialization elsewhere +// serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 +namespace nlohmann +{ +template +struct adl_serializer> +{ + static void to_json(json& j, const std::shared_ptr& opt) + { + if (opt) + { + j = *opt; + } + else + { + j = nullptr; + } + } + + static void from_json(const json& j, std::shared_ptr& opt) + { + if (j.is_null()) + { + opt = nullptr; + } + else + { + opt.reset(new T(j.get())); + } + } +}; +} + + namespace Brush { namespace Pop { @@ -16,7 +50,7 @@ class Population{ public: size_t pop_size; int num_islands; - float mig_prob; + float mig_prob; // TODO: mig_prob should not be part of population vector>> individuals; vector> island_indexes; @@ -31,6 +65,10 @@ class Population{ void init(vector>& individuals, const Parameters& params); // TODO: init from file (like FEAT) + // save serialized population + void save(string filename); + // load serialized population + void load(string filename); /// returns population size (the effective size of the individuals) int size() { return individuals.size(); }; @@ -83,6 +121,15 @@ class Population{ }; }; +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( + Population, individuals, island_indexes, pop_size, num_islands); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( + Population, individuals, island_indexes, pop_size, num_islands); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( + Population, individuals, island_indexes, pop_size, num_islands); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( + Population, individuals, island_indexes, pop_size, num_islands); + }// Pop }// Brush diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 410e5923..c7b93881 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -68,6 +68,7 @@ TEST(Population, PopulationTests) pop[i].program.get_model("compact", true)); } + pop.save("./tests/cpp/__pop_save_first_gen.json"); // print models fmt::print("Printing from population method:\n"); fmt::print("{}\n",pop.print_models()); // may yeld seg fault if string is too large for buffer @@ -139,5 +140,7 @@ TEST(Population, PopulationTests) } } } + pop.save("./tests/cpp/__pop_save_100_gen.json"); + pop.load("./tests/cpp/__pop_save_100_gen.json"); } From f1ab6b13d0a491adcb2c3bdf3ba0868193602de7 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 20 Mar 2024 12:04:21 -0300 Subject: [PATCH 143/199] Resolve offspring bottleneck --- pybrush/BrushEstimator.py | 3 +- pybrush/DeapEstimator.py | 1 - src/engine.cpp | 98 ++++++++++--------- src/population.cpp | 191 ++++++++++++++++++++++++++++++-------- 4 files changed, 205 insertions(+), 88 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index bb31abc5..56ad724d 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -17,7 +17,8 @@ from pybrush import brush_rng -# TODO: LOGGER AND ARCHIVE +# TODO: fix deap estimator breaking with num_islands > 1. write a documentation +# on how to use brush with deap class BrushEstimator(BaseEstimator): """ This is the base class for Deap-based Brush estimators. diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 12b29a9b..dfe9f609 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -25,7 +25,6 @@ from pybrush import brush_rng -# TODO: LOGGER AND ARCHIVE class DeapEstimator(BaseEstimator): """ This is the base class for Deap-based Brush estimators. diff --git a/src/engine.cpp b/src/engine.cpp index 63a74652..758ea559 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -22,7 +22,8 @@ void Engine::init() // TODO: initialize (set operator) for survivor and selector // initialize population with initial model and/or starting pop - if (params.n_jobs!=0) // TODO: change this to set taskflow jobs + // TODO: get rid of omp + if (params.n_jobs!=0) omp_set_num_threads(params.get_n_jobs()); // std::cout << "set number of threads" << std::endl; @@ -184,6 +185,26 @@ void Engine::run(Dataset &data) // vectors to store each island separatedly vector> island_parents; vector> survivors; + island_parents.clear(); + island_parents.resize(pop.num_islands); + + survivors.clear(); + survivors.resize(pop.num_islands); + + for (int i=0; i< params.num_islands; i++){ + size_t idx_start = std::floor(i*params.pop_size/params.num_islands); + size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands); + + // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start + auto delta = idx_end - idx_start; + + survivors.at(i).clear(); + island_parents.at(i).clear(); + + survivors.at(i).resize(delta); + island_parents.at(i).resize(delta); + } + //std::cout << "vectors are created " << std::endl; // TODO: progress bar? (it would be cool) @@ -201,30 +222,30 @@ void Engine::run(Dataset &data) params.set_current_gen(generation); batch = data.get_batch(); // will return the original dataset if it is set to dont use batch - island_parents.clear(); - island_parents.resize(pop.num_islands); + // island_parents.clear(); + // island_parents.resize(pop.num_islands); - survivors.clear(); - survivors.resize(pop.num_islands); + // survivors.clear(); + // survivors.resize(pop.num_islands); - for (int i=0; i< params.num_islands; i++){ - size_t idx_start = std::floor(i*params.pop_size/params.num_islands); - size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands); + // for (int i=0; i< params.num_islands; i++){ + // size_t idx_start = std::floor(i*params.pop_size/params.num_islands); + // size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands); - // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start - auto delta = idx_end - idx_start; + // // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start + // auto delta = idx_end - idx_start; - survivors.at(i).clear(); - island_parents.at(i).clear(); + // survivors.at(i).clear(); + // island_parents.at(i).clear(); - survivors.at(i).resize(delta); - island_parents.at(i).resize(delta); - } + // survivors.at(i).resize(delta); + // island_parents.at(i).resize(delta); + // } ++generation; }).name("prepare generation");// set generation in params, get batch - auto select_parents = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { + auto run_generation = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { //std::cout << "inside select parents" << std::endl; evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data @@ -241,26 +262,15 @@ void Engine::run(Dataset &data) //std::cout << i << std::endl; island_parents.at(island).at(i) = parents.at(i); } - }).name("select parents for each island"); - - // this is not thread safe. But it is nice to keep out of parallel execution the bits of the - // code that uses random generators (i think this helps to having random_seed to work properly). Also, - // fit and evaluation are paralellized in survive_population, and these are expensive to run - auto generate_offspring = subflow.emplace([&]() { - for (int island=0; island < params.num_islands; island++){ - //std::cout << "inside generate offspring" << std::endl; - this->pop.add_offspring_indexes(island); // we just need to add them, not remove (they are removed in survival step, that will return a selection with the same number of individuals as the original island size) - - //std::cout << "before vary" << std::endl; - - // // variation to produce offspring - variator.vary(this->pop, island, island_parents.at(island)); - //std::cout << "before update fitness" << std::endl; - } - }).name("generate offspring for each island"); - - auto survive_population = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { + //std::cout << "inside generate offspring" << std::endl; + this->pop.add_offspring_indexes(island); + + //std::cout << "before vary" << std::endl; + // // variation to produce offspring + variator.vary(this->pop, island, island_parents.at(island)); + //std::cout << "before update fitness" << std::endl; + evaluator.update_fitness(this->pop, island, data, params, true); // evaluator.validation(*this->pop, island_range, data, params); @@ -278,7 +288,7 @@ void Engine::run(Dataset &data) //std::cout << i << std::endl; survivors.at(island).at(i) = island_survivors.at(i); } - }).name("evaluate offspring and select survivors"); + }).name("runs one generation at each island in parallel"); auto update_pop = subflow.emplace([&]() { //std::cout << "before updating survivors" << std::endl; @@ -287,27 +297,22 @@ void Engine::run(Dataset &data) //std::cout << "after updating survivors" << std::endl; //std::cout << pop.print_models() << std::endl; - }).name("update population and detangle indexes"); - - auto migration = subflow.emplace([&]() { + //std::cout << "before migrating" << std::endl; //std::cout << pop.print_models() << std::endl; this->pop.migrate(); //std::cout << "after migrating" << std::endl; //std::cout << pop.print_models() << std::endl; - }).name("migration between islands"); + }).name("update, migrate and disentangle indexes between islands"); // TODO: update best, update log, increment generation counter (but not set in params) auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); }).name("update best, log, archive"); // set-up subflow graph - prepare_gen.precede(select_parents); - select_parents.precede(generate_offspring); - generate_offspring.precede(survive_population); - survive_population.precede(update_pop); - update_pop.precede(migration); - migration.precede(finish_gen); + prepare_gen.precede(run_generation); + run_generation.precede(update_pop); + update_pop.precede(finish_gen); }, [&]() { return 0; }, // jump back to the next iteration @@ -315,6 +320,7 @@ void Engine::run(Dataset &data) [&]() { if (params.save_population != "") this->pop.save(params.save_population); + this->set_is_fitted(true); } // work done, report last gen and stop ); // evolutionary loop diff --git a/src/population.cpp b/src/population.cpp index f08a01f0..3b5c6510 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -300,21 +300,144 @@ vector Population::hall_of_fame(unsigned rank) } +// template +// void Population::migrate() +// { +// // changes where island points to based on HOF and pareto fronts + +// if (num_islands==1) +// return; // skipping. this only work because update is fixing island indexes + +// // we cant use more than half of population here +// // std::cout << "finding island sorted fronts" << std::endl; +// auto island_fronts = sorted_front(1); + +// // std::cout << "finding global hall of fame" << std::endl; +// auto global_hall_of_fame = hall_of_fame(1); + +// // This method is not thread safe (as it is now) +// vector> new_island_indexes; +// new_island_indexes.resize(num_islands); + +// // std::cout << "Looping" << std::endl; +// for (int island=0; island other_islands(num_islands-1); +// iota(other_islands.begin(), other_islands.end(), 0); + +// // skipping current island +// auto it = other_islands.begin(); +// std::advance(it, island); +// for (;it != other_islands.end(); ++it) { +// ++(*it); // TODO: is this really skipping the current island? +// } + +// // picking other island +// int other_island = *r.select_randomly( +// other_islands.begin(), +// other_islands.end()); + +// migrating_idx = *r.select_randomly( +// island_fronts.at(other_island).begin(), +// island_fronts.at(other_island).end()); +// // std::cout << "mig idx" << migrating_idx << std::endl; +// } + +// // std::cout << "index " << i << " of island " << island; +// // std::cout << " is now" << migrating_idx << std::endl; + +// new_island_indexes.at(island).push_back(migrating_idx); +// } +// else +// { +// new_island_indexes.at(island).push_back(idxs.at(i)); +// } +// } +// } +// // making hard copies (so the next generation starts with islands that does not share individuals +// // this is particularly important to avoid multiple threads assigning different rank/crowdist/dcounter +// // or different fitness) + +// // std::cout << "starting to consolidate pop" << std::endl; +// vector> new_pop; +// new_pop.resize(0); +// for (int j=0; jindividuals.resize(0); +// for (auto ind : new_pop) +// { +// // making hard copies of the individuals +// json ind_copy = ind; + +// // this will fill just half of the pop +// individuals.push_back( +// std::make_shared>(ind_copy) ); +// } +// for (int i=0; i< pop_size; ++i) +// { +// // second half is space to the offspring (but we dont initialize them) +// individuals.push_back(nullptr); +// } +// } + + + template void Population::migrate() { - // changes where island points to + // changes where island points to by shuffling it if (num_islands==1) return; // skipping. this only work because update is fixing island indexes - // we cant use more than half of population here - // std::cout << "finding island sorted fronts" << std::endl; - auto island_fronts = sorted_front(1); - - // std::cout << "finding global hall of fame" << std::endl; - auto global_hall_of_fame = hall_of_fame(1); - // This method is not thread safe (as it is now) vector> new_island_indexes; new_island_indexes.resize(num_islands); @@ -332,39 +455,28 @@ void Population::migrate() // std::cout << "migrating in island " << island << std::endl; size_t migrating_idx; - // determine if incoming individual comes from global or local hall of fame - if (r() < 0.5) { // from global hall of fame - // std::cout << "from hall of fame" << std::endl; - migrating_idx = *r.select_randomly( - global_hall_of_fame.begin(), - global_hall_of_fame.end()); - - // std::cout << "mig idx" << migrating_idx << std::endl; - } - else { // from any other local hall of fame - // std::cout << "from other island" << std::endl; - // finding other island indexes - vector other_islands(num_islands-1); - iota(other_islands.begin(), other_islands.end(), 0); - - // skipping current island - auto it = other_islands.begin(); - std::advance(it, island); - for (;it != other_islands.end(); ++it) { - ++(*it); // TODO: is this really skipping the current island? - } - - // picking other island - int other_island = *r.select_randomly( - other_islands.begin(), - other_islands.end()); - - migrating_idx = *r.select_randomly( - island_fronts.at(other_island).begin(), - island_fronts.at(other_island).end()); - // std::cout << "mig idx" << migrating_idx << std::endl; + + vector other_islands(num_islands-1); + iota(other_islands.begin(), other_islands.end(), 0); + + // skipping current island + auto it = other_islands.begin(); + std::advance(it, island); + for (;it != other_islands.end(); ++it) { + ++(*it); // TODO: is this really skipping the current island? } + // picking other island + int other_island = *r.select_randomly( + other_islands.begin(), + other_islands.end()); + + migrating_idx = *r.select_randomly( + island_indexes.at(other_island).begin(), + island_indexes.at(other_island).end()); + // std::cout << "mig idx" << migrating_idx << std::endl; + + // std::cout << "index " << i << " of island " << island; // std::cout << " is now" << migrating_idx << std::endl; @@ -409,7 +521,6 @@ void Population::migrate() assert(new_pop.size() == pop_size && " migration ended up with a different popsize"); - // std::cout << "filling individuals" << std::endl; this->individuals.resize(0); for (auto ind : new_pop) From 28067b0cfcf44d09563f0a39ab0be521bcb30d9b Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 2 Apr 2024 15:06:26 -0300 Subject: [PATCH 144/199] Fixed PTC2 creating huge trees when max_depth is a high number --- src/program/program.h | 8 +++++++- src/search_space.cpp | 7 +++++-- src/variation.h | 3 +++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/program/program.h b/src/program/program.h index d694c171..5eb1454c 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -108,9 +108,13 @@ template struct Program // SplitBest has an optimizable decision tree consisting of 3 nodes // (terminal, arithmetic comparison, value) that needs to be taken - // into account + // into account. Split on will have an random decision tree that can + // have different sizes, but will also have the arithmetic comparison + // and a value. if (Is(node.node_type)) acc += 3; + else if (Is(node.node_type)) + acc += 2; if ( (include_weight && node.get_is_weighted()==true) && Isnt(node.node_type) ) @@ -147,6 +151,8 @@ template struct Program // into account if (Is(it.node->data.node_type)) acc += 3; + else if (Is(it.node->data.node_type)) + acc += 2; if ( (include_weight && it.node->data.get_is_weighted()==true) && Isnt(it.node->data.node_type) ) diff --git a/src/search_space.cpp b/src/search_space.cpp index dd076162..9adb9c20 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -254,6 +254,9 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // updating size accordingly to root node if (Is(root.node_type)) s += 3; + else if (Is(root.node_type)) + s += 2; + if ( root.get_is_weighted()==true && Isnt(root.node_type) ) s += 2; @@ -266,7 +269,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const queue.push_back(make_tuple(child_spot, a, d)); } - int max_arity = 3; + int max_arity = 4; Node n; // Now we actually start the PTC2 procedure to create the program tree @@ -287,7 +290,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const auto [qspot, t, d] = RandomDequeue(queue); /* cout << "current depth: " << d << endl; */ - if (d >= max_d) + if (d >= max_d || s >= max_size) { // choose terminal of matching type /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ diff --git a/src/variation.h b/src/variation.h index 13866568..0bca47dc 100644 --- a/src/variation.h +++ b/src/variation.h @@ -73,11 +73,14 @@ class MutationBase { [include_weight, &acc](auto& node){ ++acc; // the node operator or terminal + // TODO: the same size check occurs in search_space.cpp and program.h. Make a function (stop doing hardcoded) // SplitBest has an optimizable decision tree consisting of 3 nodes // (terminal, arithmetic comparison, value) that needs to be taken // into account if (Is(node.node_type)) acc += 3; + else if (Is(node.node_type)) + acc += 2; if ( (include_weight && node.get_is_weighted()==true) && Isnt(node.node_type) ) From 9f6be61eb89127dc5ab6224998a737b2db16144d Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 2 Apr 2024 17:01:51 -0300 Subject: [PATCH 145/199] Missing if statement in search space ptc2 --- src/search_space.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/search_space.cpp b/src/search_space.cpp index 9adb9c20..5f6fe447 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -341,8 +341,12 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // increment is different based on node weights ++s; + if (Is(n.node_type)) s += 3; + else if (Is(n.node_type)) + s += 2; + if ( n.get_is_weighted()==true && Isnt(n.node_type) ) s += 2; From 1efedd3315b9a2b839e460399ed9965552571b1b Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 10 Apr 2024 13:00:02 -0300 Subject: [PATCH 146/199] Fixed tree creation with logistic/softmax when not specified in function set --- pybrush/BrushEstimator.py | 25 +++++++++--------- src/bindings/bind_dataset.cpp | 16 ++++++++---- src/data/data.h | 8 +++--- src/program/optimizer/weight_optimizer.h | 1 + src/search_space.cpp | 33 +++++++++++++++++++++++- 5 files changed, 61 insertions(+), 22 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 56ad724d..834a8c0e 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -189,14 +189,6 @@ def fit(self, X, y): if self.mode=="classification": self.n_classes_ = len(np.unique(y)) - # Including necessary functions for classification programs. This - # is needed so the search space can create the hash and mapping of - # the functions. - if self.n_classes_ == 2 and "Logistic" not in self.functions_: - self.functions_["Logistic"] = 1.0 - # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. - # self.functions_["Softmax"] = 1.0 - # These have a default behavior to return something meaningfull if # no values are set self.train_ = self.data_.get_training_data() @@ -220,6 +212,10 @@ def fit(self, X, y): self.parameters_.functions = self.functions self.parameters_.mutation_probs = self.mutation_probs + self.parameters_.scorer_ = "mse" + if self.mode == "classification": + self.parameters_.scorer_ = "log" if self.n_classes_ == 2 else "multi_log" + if self.random_state is not None: self.parameters_.random_state = self.random_state @@ -251,10 +247,13 @@ def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): if y is None: return Dataset(X=X, - feature_names=feature_names, validation_size=validation_size) + feature_names=feature_names, c=self.mode == "classification", + validation_size=validation_size) return Dataset(X=X, y=y, - feature_names=feature_names, validation_size=validation_size) + feature_names=feature_names, c=self.mode == "classification", + validation_size=validation_size) + def predict(self, X): """Predict using the best estimator in the archive. """ @@ -266,8 +265,8 @@ def predict(self, X): assert isinstance(X, np.ndarray) - data = Dataset(X=X, ref_dataset=self.data_, - feature_names=self.feature_names_) + data = Dataset(X=X, ref_dataset=self.data_, c=self.mode == "classification", + feature_names=self.feature_names_) # data = self._make_data(X, feature_names=self.feature_names_) @@ -327,7 +326,7 @@ def predict_proba(self, X): assert isinstance(X, np.ndarray) - data = Dataset(X=X, ref_dataset=self.data_, + data = Dataset(X=X, ref_dataset=self.data_, c=True, feature_names=self.feature_names_) # data = self._make_data(X, feature_names=self.feature_names_) diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp index ade036dc..41cbb94a 100644 --- a/src/bindings/bind_dataset.cpp +++ b/src/bindings/bind_dataset.cpp @@ -12,13 +12,15 @@ void bind_dataset(py::module & m) // construct from X, feature names (and optional validation and batch sizes) with constructor 3. .def(py::init([](const Ref& X, const vector& feature_names=vector(), + const bool c=false, const float validation_size=0.0, const float batch_size=1.0){ return br::Data::Dataset( - X, feature_names, validation_size, batch_size); + X, feature_names, c, validation_size, batch_size); }), py::arg("X"), py::arg("feature_names") = vector(), + py::arg("c") = false, py::arg("validation_size") = 0.0, py::arg("batch_size") = 1.0 ) @@ -26,14 +28,16 @@ void bind_dataset(py::module & m) .def(py::init([](const Ref& X, const Ref& y, const vector& feature_names=vector(), + const bool c=false, const float validation_size=0.0, const float batch_size=1.0){ return br::Data::Dataset( - X, y, feature_names, {}, false, validation_size, batch_size); + X, y, feature_names, {}, c, validation_size, batch_size); }), py::arg("X"), py::arg("y"), py::arg("feature_names") = vector(), + py::arg("c") = false, py::arg("validation_size") = 0.0, py::arg("batch_size") = 1.0 ) @@ -43,12 +47,14 @@ void bind_dataset(py::module & m) // no feature names). .def(py::init([](const Ref& X, const br::Data::Dataset& ref_dataset, - const vector& feature_names){ - return br::Data::Dataset(X, ref_dataset, feature_names); + const vector& feature_names, + const bool c=false){ + return br::Data::Dataset(X, ref_dataset, feature_names, c); }), py::arg("X"), py::arg("ref_dataset"), - py::arg("feature_names") + py::arg("feature_names"), + py::arg("c") = false ) .def_readwrite("y", &br::Data::Dataset::y) diff --git a/src/data/data.h b/src/data/data.h index 358f0856..0f6ef69a 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -143,10 +143,11 @@ class Dataset /// 3. initialize data from X and feature names Dataset(const ArrayXXf& X, const vector& vn, + bool c = false, float validation_size = 0.0, float batch_size = 1.0 ) - : classification(false) + : classification(c) , features(make_features(X,map{},vn)) , validation_size(validation_size) , use_validation(validation_size > 0.0 && validation_size < 1.0) @@ -161,9 +162,10 @@ class Dataset //// reference dataset. Useful for bypass Brush's type sniffer and //// doing predictions with small number of samples Dataset(const ArrayXXf& X, const Dataset& ref_dataset, - const vector& vn + const vector& vn, + bool c = false ) - : classification(false) + : classification(c) , features(copy_and_make_features(X,ref_dataset,vn)) , validation_size(0.0) , use_validation(false) diff --git a/src/program/optimizer/weight_optimizer.h b/src/program/optimizer/weight_optimizer.h index 9b727fae..79a2d4fd 100644 --- a/src/program/optimizer/weight_optimizer.h +++ b/src/program/optimizer/weight_optimizer.h @@ -86,6 +86,7 @@ struct WeightOptimizer { if (program.get_n_weights() == 0) return; + // fmt::print("number of weights: {}\n",program.get_n_weights()); auto init_weights = program.get_weights(); diff --git a/src/search_space.cpp b/src/search_space.cpp index 5f6fe447..dbf206a2 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -6,6 +6,8 @@ namespace Brush{ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) { + // OBS: only for terminals! + // weights are initialized as the slope of the z-score of x and y. // If y has different length from X, we get a core dump here. @@ -180,9 +182,37 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user vector terminals = generate_terminals(d, weights_init); + // If it is a classification problem, we need to add the fixed root nodes + // (logistic for binary classification, softmax for multiclassification). + // Sometimes, the user may not specify these two nodes as candidates when + // sampling functions, so we check if they are already in the terminal set, and + // we add them with zero prob if they are not. They need to be in the func set + // when calling GenerateNodeMap, so the search_space will contain all the hashes + // and signatures for them (and they can be used only in program root). + // TODO: fix softmax and add it here + + // Copy the original map using the copy constructor + std::unordered_map extended_user_ops(user_ops); + + if (d.classification) + { + // Convert ArrayXf to std::vector for compatibility with std::set + std::vector vec(d.y.data(), d.y.data() + d.y.size()); + + std::set unique_classes(vec.begin(), vec.end()); + + if (unique_classes.size()==2 && (user_ops.find("Logistic") != user_ops.end())) { + extended_user_ops.insert({"Logistic", 0.0f}); + } + else if (user_ops.find("Softmax") != user_ops.end()) { + // extended_user_ops.insert({"Softmax", 0.0f}); + } + } + /* fmt::print("generate nodetype\n"); */ - GenerateNodeMap(user_ops, d.unique_data_types, + GenerateNodeMap(extended_user_ops, d.unique_data_types, std::make_index_sequence()); + // map terminals /* fmt::print("looping through terminals...\n"); */ for (const auto& term : terminals) @@ -246,6 +276,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // auto spot = Tree.set_head(n); /* cout << "inserting...\n"; */ auto spot = Tree.insert(Tree.begin(), root); + // node depth int d = 1; // current tree size From 4af5d4dc02f2ca38fe77070518b8df5df4f78050 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 10 Apr 2024 18:15:05 -0300 Subject: [PATCH 147/199] OffsetSum (sum with W as argument, not as weight) --- src/program/functions.h | 16 ++++ src/program/node.cpp | 14 ++++ src/program/nodetype.cpp | 1 + src/program/nodetype.h | 47 +++++------ src/program/operator.h | 160 +++++++++++++++++++++++++++++++++++++- src/program/program.h | 6 +- src/program/signatures.h | 3 +- src/program/split.h | 1 + src/program/tree_node.cpp | 13 ++-- src/search_space.cpp | 4 +- src/search_space.h | 7 +- src/variation.cpp | 12 ++- src/variation.h | 2 +- 13 files changed, 246 insertions(+), 40 deletions(-) diff --git a/src/program/functions.h b/src/program/functions.h index 144c96a1..66136b62 100644 --- a/src/program/functions.h +++ b/src/program/functions.h @@ -167,6 +167,7 @@ namespace Brush template inline auto operator()(const TimeSeries& t) { return t.prod(); } }; + /* sum */ template<> struct Function @@ -182,6 +183,21 @@ namespace Brush inline auto operator()(const TimeSeries& t) { return t.sum(); } }; + /* OffsetSum */ // TODO: IMPLEMENT + template<> + struct Function + { + template + inline auto operator()(const T& t) { return t.rowwise().sum(); } + + inline auto operator()(ArrayXXb t) { + return (t.rowwise().count().cast ()); + } + + template + inline auto operator()(const TimeSeries& t) { return t.sum(); } + }; + template<> struct Function { diff --git a/src/program/node.cpp b/src/program/node.cpp index 245ef1dc..93c44cc8 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -73,6 +73,19 @@ string Node::get_model(const vector& children) const noexcept children.at(2) ); } + else if (Is(node_type)){ + // weight is part of the model + string args = fmt::format("{},", W); + + for (int i = 0; i < children.size(); ++i){ + args += children.at(i); + if (i < children.size()-1) + args += ","; + } + + // TODO: rename it to just Sum (the user doesnt need to know the offset is fixed) + return fmt::format("OffsetSum({})", args); + } else{ string args = ""; for (int i = 0; i < children.size(); ++i){ @@ -178,6 +191,7 @@ void init_node_with_default_signature(Node& node) NT::Mean, NT::Median, NT::Sum, + NT::OffsetSum, NT::Prod, NT::Softmax >(n)) diff --git a/src/program/nodetype.cpp b/src/program/nodetype.cpp index d9224849..de7a6668 100644 --- a/src/program/nodetype.cpp +++ b/src/program/nodetype.cpp @@ -51,6 +51,7 @@ std::map NodeNameType = { {"Median", NodeType::Median}, {"Count", NodeType::Count}, {"Sum", NodeType::Sum}, + {"OffsetSum", NodeType::OffsetSum}, {"Prod", NodeType::Prod}, {"ArgMax", NodeType::ArgMax}, diff --git a/src/program/nodetype.h b/src/program/nodetype.h index 7d153ab7..4cdc1db1 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -62,22 +62,23 @@ enum class NodeType : uint64_t { // Each node type must have a complexity Max = 1UL << 24UL, Mean = 1UL << 25UL, Median = 1UL << 26UL, - Sum = 1UL << 27UL, - Prod = 1UL << 28UL, + Prod = 1UL << 27UL, + Sum = 1UL << 28UL, + OffsetSum = 1UL << 29UL, // Sum with weight as one of its arguments // Transformers - Softmax = 1UL << 29UL, + Softmax = 1UL << 30UL, // Binary - Add = 1UL << 30UL, - Sub = 1UL << 31UL, - Mul = 1UL << 32UL, - Div = 1UL << 33UL, - Pow = 1UL << 34UL, + Add = 1UL << 31UL, + Sub = 1UL << 32UL, + Mul = 1UL << 33UL, + Div = 1UL << 34UL, + Pow = 1UL << 35UL, //split - SplitBest = 1UL << 35UL, - SplitOn = 1UL << 36UL, + SplitBest = 1UL << 36UL, + SplitOn = 1UL << 37UL, // these ones change type /* Equals = 1UL << 39UL, */ @@ -87,29 +88,29 @@ enum class NodeType : uint64_t { // Each node type must have a complexity /* Geq = 1UL << 43UL, */ // boolean - And = 1UL << 37UL, - Or = 1UL << 38UL, - Not = 1UL << 39UL, + And = 1UL << 38UL, + Or = 1UL << 39UL, + Not = 1UL << 40UL, // Xor = 1UL << 39UL, // leaves (must be the last ones in this enum) - MeanLabel = 1UL << 40UL, - Constant = 1UL << 41UL, - Terminal = 1UL << 42UL, - ArgMax = 1UL << 43UL, // TODO: move before leaves - Count = 1UL << 44UL, + MeanLabel = 1UL << 41UL, + Constant = 1UL << 42UL, + Terminal = 1UL << 43UL, + ArgMax = 1UL << 44UL, // TODO: move before leaves + Count = 1UL << 45UL, // custom - CustomUnaryOp = 1UL << 44UL, - CustomBinaryOp = 1UL << 45UL, - CustomSplit = 1UL << 46UL + CustomUnaryOp = 1UL << 46UL, + CustomBinaryOp = 1UL << 47UL, + CustomSplit = 1UL << 48UL }; using UnderlyingNodeType = std::underlying_type_t; struct NodeTypes { // magic number keeping track of the number of different node types - static constexpr size_t Count = 43; + static constexpr size_t Count = 44; static constexpr size_t OpCount = Count-3; // subtracting leaves // returns the index of the given type in the NodeType enum @@ -196,6 +197,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM( NodeType, { {NodeType::Median,"Median" }, {NodeType::Count,"Count" }, {NodeType::Sum,"Sum" }, + {NodeType::OffsetSum,"OffsetSum" }, {NodeType::Prod,"Prod" }, {NodeType::ArgMax,"ArgMax" }, @@ -292,6 +294,7 @@ static constexpr bool NaryOp = is_in_v; diff --git a/src/program/operator.h b/src/program/operator.h index f59205fe..054a18f5 100644 --- a/src/program/operator.h +++ b/src/program/operator.h @@ -225,7 +225,6 @@ struct Operator } return this->apply(inputs); }; - }; ////////////////////////////////////////////////////////////////////////////////// @@ -303,6 +302,7 @@ struct Operator else return RetType::Constant(d.get_n_samples(), d.get_n_features(), w); }; + }; //////////////////////////////////////////////////////////////////////////// @@ -336,6 +336,164 @@ struct Operator }; }; +//////////////////////////////////////////////////////////////////////////// +// OffsetSum overload +template +struct Operator>> +{ + /** + * @brief set argument types to those of the signature unless: + * + * a) the operator is unary and there are more than one arguments + * b) the operator is binary and associative + * + * In the case of a) or b), arguments to the operator are stacked into an + * array and the operator is applied to that array + */ + using ArgTypes = conditional_t< + ((UnaryOp || NaryOp) && S::ArgCount > 1), + Array, + typename S::ArgTypes>; + + /// @brief return type of the operator + using RetType = typename S::RetType; + + /// @brief stores the argument count of the operator + static constexpr size_t ArgCount = S::ArgCount; + + /// utility for returning the type of the Nth argument + template + using NthType = typename S::NthType; + + /// set weight type + using W = typename S::WeightType; + + /// @brief wrapper function for the node function + static constexpr auto F = [](const auto& ...args) { + Function f; + return f(args...); + }; + + Operator() = default; + //////////////////////////////////////////////////////////////////////////////// + // Utilities to grab child outputs. + + /// get a std::array or eigen array of kids + template requires(is_std_array_v || is_eigen_array_v) + T get_kids(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const + { + T child_outputs; + using arg_type = std::conditional_t, + typename T::value_type, Array>; + if constexpr (is_eigen_array_v) + child_outputs.resize(d.get_n_samples(), Eigen::NoChange); + + TreeNode* sib = tn.first_child; + for (int i = 0; i < ArgCount; ++i) + { + if (sib == nullptr) + HANDLE_ERROR_THROW("bad sibling ptr in get kids"); + if constexpr (Fit){ + if constexpr(is_std_array_v) + child_outputs.at(i) = sib->fit(d); + else + child_outputs.col(i) = sib->fit(d); + } + else{ + if constexpr(is_std_array_v) + child_outputs.at(i) = sib->predict(d, weights); + else + child_outputs.col(i) = sib->predict(d, weights); + } + sib = sib->next_sibling; + } + return child_outputs; + }; + + /// gets one kid for a tuple of kids + template + NthType get_kid(const Dataset& d,TreeNode& tn, const W** weights ) const + { + auto sib = tree::sibling_iterator(tn.first_child) ; + sib += I; + if constexpr(Fit) + return sib->fit>(d); + else + return sib->predict>(d,weights); + }; + + /** + * @brief Makes and returns a tuple of child outputs + * + * @tparam T a tuple + * @tparam Is integer sequence + * @param d dataset + * @param tn a tree node + * @return a tuple with elements corresponding to each child node + */ + template requires(is_tuple_v) + T get_kids_seq(const Dataset& d, TreeNode& tn, const W** weights, std::index_sequence) const + { + return std::make_tuple(get_kid(d,tn,weights)...); + }; + + /// @brief get a std::tuple of kids. Used when child arguments are different types. + /// @tparam T argument types + /// @param d the dataset + /// @param tn the tree node + /// @param weights option pointer to a weight array, used in place of node weight + /// @return a tuple of the child arguments + template requires(is_tuple_v) + T get_kids(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const + { + return get_kids_seq(d, tn, weights, std::make_index_sequence{}); + }; + + /////////////////////////////////////////////////////////////////////////// + + /// @brief Apply node function in a functional style + /// @tparam T argument types + /// @param inputs the child node outputs + /// @return return values applying F to the inputs + template requires ( is_std_array_v || is_tuple_v) + RetType apply(const T& inputs) const + { + return std::apply(F, inputs); + } + + /// @brief Apply the node function like a function + /// @tparam T argument types + /// @param inputs the child node outputs + /// @return return values applying F to the inputs + template requires ( is_eigen_array_v && !is_std_array_v) + RetType apply(const T& inputs) const + { + return F(inputs); + } + + /// @brief evaluate the operator on the data. main entry point. + /// @tparam T argument types + /// @tparam Scalar the underlying scalar type of the return type + /// @param d dataset + /// @param tn tree node + /// @param weights option pointer to a weight array, used in place of node weight + /// @return output values from applying operator function + template + RetType eval(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const + { + auto inputs = get_kids(d, tn, weights); + if constexpr (is_one_of_v) + { + if (tn.data.get_is_weighted()) + { + auto w = util::get_weight(tn, weights); + return this->apply(inputs) + w; + } + } + return this->apply(inputs); + }; +}; + //////////////////////////////////////////////////////////////////////////// // Operator overloads // Split diff --git a/src/program/program.h b/src/program/program.h index 5eb1454c..28a366af 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -117,7 +117,7 @@ template struct Program acc += 2; if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) + && Isnt(node.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; @@ -155,7 +155,7 @@ template struct Program acc += 2; if ( (include_weight && it.node->data.get_is_weighted()==true) - && Isnt(it.node->data.node_type) ) + && Isnt(it.node->data.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; @@ -457,7 +457,7 @@ template struct Program // kid_id = kid_id.substr(2); if (kid->data.get_is_weighted() - && Isnt(kid->data.node_type)){ + && Isnt(kid->data.node_type)){ edge_label = fmt::format("{:.2f}",kid->data.W); } diff --git a/src/program/signatures.h b/src/program/signatures.h index 91daba28..b7d57474 100644 --- a/src/program/signatures.h +++ b/src/program/signatures.h @@ -295,6 +295,7 @@ struct Signatures struct Signatures, Signature - >; + >;// TODO: should I implement compatibility with integers? using naryTuple = NarySignatures_t; diff --git a/src/program/split.h b/src/program/split.h index b7078738..9b937ea8 100644 --- a/src/program/split.h +++ b/src/program/split.h @@ -181,6 +181,7 @@ namespace Split{ } } // namespace Split + //////////////////////////////////////////////////////////////////////////////// // Split operator overload template diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index 56036d08..a60de280 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -106,12 +106,13 @@ unordered_map operator_complexities = { {NodeType::During, 3}, // Reducers - {NodeType::Min , 3}, - {NodeType::Max , 3}, - {NodeType::Mean , 3}, - {NodeType::Median, 3}, - {NodeType::Sum , 2}, - {NodeType::Prod , 3}, + {NodeType::Min , 3}, + {NodeType::Max , 3}, + {NodeType::Mean , 3}, + {NodeType::Median , 3}, + {NodeType::Sum , 2}, + {NodeType::OffsetSum, 2}, + {NodeType::Prod , 3}, // Transformers {NodeType::Softmax, 4}, diff --git a/src/search_space.cpp b/src/search_space.cpp index dbf206a2..ee1724a9 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -289,7 +289,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const s += 2; if ( root.get_is_weighted()==true - && Isnt(root.node_type) ) + && Isnt(root.node_type) ) s += 2; //For each argument position a of n, Enqueue(a; g) @@ -379,7 +379,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const s += 2; if ( n.get_is_weighted()==true - && Isnt(n.node_type) ) + && Isnt(n.node_type) ) s += 2; /* cout << "current tree size: " << s << endl; */ diff --git a/src/search_space.h b/src/search_space.h index 7e57f470..83883360 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -606,12 +606,13 @@ struct SearchSpace const vector& unique_data_types ) { - bool use_all = user_ops.size() == 0; auto name = NodeTypeName[NT]; - //TODO: address this (whether weights should be included by default) - // bool weighted = (IsWeighable() && is_same_v); + bool weighted = false; + if (Is(NT)) // this has to have weights on by default + weighted = true; + auto n_maybe = CreateNode(unique_data_types, use_all, weighted); if (n_maybe){ diff --git a/src/variation.cpp b/src/variation.cpp index ff3b8e44..ff8f268f 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -187,10 +187,16 @@ class ToggleWeightOnMutation : public MutationBase if (size_with_weights(Tree) < max_size()) { std::transform(Tree.begin(), Tree.end(), weights.begin(), [&](const auto& n){ + // some nodetypes must always have a weight + if (Is(n.node_type)) + return 0.0f; + // only weighted nodes can be toggled off if (!n.get_is_weighted() && IsWeighable(n.ret_type)) + { return n.get_prob_change(); + } else return 0.0f; }); @@ -206,7 +212,7 @@ class ToggleWeightOnMutation : public MutationBase auto operator()(tree& Tree, Iter spot) const -> bool override { // cout << "toggle_weight_on mutation\n"; - + if (spot.node->data.get_is_weighted()==true // cant turn on whats already on || !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean) return false; // false indicates that mutation failed and should return std::nullopt @@ -236,6 +242,10 @@ class ToggleWeightOffMutation : public MutationBase std::transform(Tree.begin(), Tree.end(), weights.begin(), [&](const auto& n){ + // some nodetypes must always have a weight + if (Is(n.node_type)) + return 0.0f; + if (n.get_is_weighted() && IsWeighable(n.ret_type)) return n.get_prob_change(); diff --git a/src/variation.h b/src/variation.h index 0bca47dc..bbf3af63 100644 --- a/src/variation.h +++ b/src/variation.h @@ -83,7 +83,7 @@ class MutationBase { acc += 2; if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) + && Isnt(node.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; From a9b4cab752732f67e0710b7c896b48b6eb98e91e Mon Sep 17 00:00:00 2001 From: gAldeia Date: Fri, 12 Apr 2024 07:11:31 -0300 Subject: [PATCH 148/199] Fixed bad logic when adding Logistic to search space --- pybrush/BrushEstimator.py | 2 +- pybrush/DeapEstimator.py | 11 ++--------- src/search_space.cpp | 4 ++-- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 834a8c0e..9da806f2 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -195,7 +195,7 @@ def fit(self, X, y): self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation self.validation_ = self.data_.get_validation_data() - self.search_space_ = SearchSpace(self.train_, self.functions_, self.weights_init) + self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init) self.parameters_ = Parameters() self.parameters_.classification = self.mode == "classification" diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index dfe9f609..f51ac28f 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -261,21 +261,14 @@ def fit(self, X, y): if self.mode=="classification": self.n_classes_ = len(np.unique(y)) - # Including necessary functions for classification programs. This - # is needed so the search space can create the hash and mapping of - # the functions. - if self.n_classes_ == 2 and "Logistic" not in self.functions_: - self.functions_["Logistic"] = 1.0 - # elif "Softmax" not in self.functions_: # TODO: implement multiclassific. - # self.functions_["Softmax"] = 1.0 - # These have a default behavior to return something meaningfull if # no values are set self.train_ = self.data_.get_training_data() self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation + self.validation_ = self.data_.get_validation_data() - self.search_space_ = SearchSpace(self.train_, self.functions_, self.weights_init) + self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init) self.parameters_ = Parameters() self.parameters_.classification = self.mode == "classification" diff --git a/src/search_space.cpp b/src/search_space.cpp index ee1724a9..5fc8621e 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -201,10 +201,10 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user std::set unique_classes(vec.begin(), vec.end()); - if (unique_classes.size()==2 && (user_ops.find("Logistic") != user_ops.end())) { + if (unique_classes.size()==2 && (user_ops.find("Logistic") == user_ops.end())) { extended_user_ops.insert({"Logistic", 0.0f}); } - else if (user_ops.find("Softmax") != user_ops.end()) { + else if (user_ops.find("Softmax") == user_ops.end()) { // extended_user_ops.insert({"Softmax", 0.0f}); } } From 3908eb29fce31d1dd1d583575d5416c57971b555 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Sun, 14 Apr 2024 13:31:03 -0300 Subject: [PATCH 149/199] Better dot models for OffsetSum --- src/program/program.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/program/program.h b/src/program/program.h index 28a366af..d5a92677 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -441,6 +441,9 @@ template struct Program if (Is(parent->data.node_type)){ node_label = fmt::format("{}>{:.2f}?", parent->data.get_feature(), parent->data.W); } + if (Is(parent->data.node_type)){ + node_label = fmt::format("{:.2f} + Sum", parent->data.W); + } out += fmt::format("\"{}\" [label=\"{}\"];\n", parent_id, node_label); // add edges to the node's children From f96c594e1a5931297fe1b54a98c43abe8980f452 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Sun, 14 Apr 2024 13:31:44 -0300 Subject: [PATCH 150/199] Minor changes --- src/data/data.cpp | 1 + src/engine.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data/data.cpp b/src/data/data.cpp index 7819be7e..5ca46f1d 100644 --- a/src/data/data.cpp +++ b/src/data/data.cpp @@ -227,6 +227,7 @@ void Dataset::init() } } +// TODO: use integer instead of percentage (or even better, have both) float Dataset::get_batch_size() { return batch_size; } void Dataset::set_batch_size(float new_size) { batch_size = new_size; diff --git a/src/engine.cpp b/src/engine.cpp index 758ea559..9543ec37 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -304,7 +304,7 @@ void Engine::run(Dataset &data) //std::cout << "after migrating" << std::endl; //std::cout << pop.print_models() << std::endl; - }).name("update, migrate and disentangle indexes between islands"); + }).name("update, migrate and disentangle indexes between islands"); // TODO: update best, update log, increment generation counter (but not set in params) auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); }).name("update best, log, archive"); From 39a1e16a37e3baf7b6955055fb7d2fc434931123 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Sun, 14 Apr 2024 19:23:17 -0300 Subject: [PATCH 151/199] Implemented max_stall --- pybrush/BrushEstimator.py | 6 ++++++ src/bindings/bind_params.cpp | 1 + src/engine.cpp | 23 ++++++++++++++++++----- src/params.h | 12 +++++++++--- tests/cpp/test_params.cpp | 2 ++ 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 9da806f2..70f58550 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -34,6 +34,9 @@ class BrushEstimator(BaseEstimator): Population size. gens : int, default 100 Maximum iterations of the algorithm. + max_stall: int, optional (default: 0) + How many generations to continue after the validation loss has + stalled. If 0, not used. verbosity : int, default 0 Controls level of printouts. max_depth : int, default 0 @@ -119,6 +122,7 @@ def __init__( mode='classification', pop_size=100, gens=100, + max_stall=0, verbosity=0, max_depth=3, max_size=20, @@ -140,6 +144,7 @@ def __init__( self.pop_size=pop_size self.gens=gens + self.max_stall=max_stall self.verbosity=verbosity self.algorithm=algorithm self.mode=mode @@ -203,6 +208,7 @@ def fit(self, X, y): self.parameters_.n_jobs = self.n_jobs self.parameters_.pop_size = self.pop_size self.parameters_.gens = self.gens + self.parameters_.max_stall = self.max_stall self.parameters_.num_islands = self.num_islands self.parameters_.max_depth = self.max_depth self.parameters_.max_size = self.max_size diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 4dae39e2..f4c661c8 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -15,6 +15,7 @@ void bind_params(py::module& m) .def(py::init([](){ Brush::Parameters p; return p; })) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) + .def_property("max_stall", &Brush::Parameters::get_max_stall, &Brush::Parameters::set_max_stall) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) .def_property("load_population", &Brush::Parameters::get_load_population, &Brush::Parameters::set_load_population) .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) diff --git a/src/engine.cpp b/src/engine.cpp index 9543ec37..700aff7c 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -176,9 +176,14 @@ void Engine::run(Dataset &data) //std::cout << "stop criteria is ready " << std::endl; // stop criteria unsigned generation = 0; + unsigned stall_count = 0; + auto stop = [&]() { //std::cout << "inside stop " << std::endl; - return generation == params.gens; // TODO: max stall, max time, etc + // TODO: max time + return ( (generation == params.gens) + && (params.max_stall == 0 || stall_count < params.max_stall) + ); }; // TODO: check that I dont use pop.size() (or I use correctly, because it will return the size with the slots for the offspring) @@ -205,7 +210,6 @@ void Engine::run(Dataset &data) island_parents.at(i).resize(delta); } - //std::cout << "vectors are created " << std::endl; // TODO: progress bar? (it would be cool) // heavily inspired in https://github.com/heal-research/operon/blob/main/source/algorithms/nsga2.cpp @@ -242,7 +246,6 @@ void Engine::run(Dataset &data) // island_parents.at(i).resize(delta); // } - ++generation; }).name("prepare generation");// set generation in params, get batch auto run_generation = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { @@ -306,8 +309,18 @@ void Engine::run(Dataset &data) //std::cout << pop.print_models() << std::endl; }).name("update, migrate and disentangle indexes between islands"); - // TODO: update best, update log, increment generation counter (but not set in params) - auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); }).name("update best, log, archive"); + // TODO: update log and archive + auto finish_gen = subflow.emplace([&]() { + bool updated_best = this->update_best(data); + + if (generation == 0 || updated_best ) + stall_count = 0; + else + ++stall_count; + + ++generation; + + }).name("update best, log, archive, stall"); // set-up subflow graph prepare_gen.precede(run_generation); diff --git a/src/params.h b/src/params.h index b155d33a..1791763d 100644 --- a/src/params.h +++ b/src/params.h @@ -25,11 +25,14 @@ struct Parameters unsigned int current_gen = 1; - int pop_size = 100; - int gens = 1000; - unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size + // TODO: rename it to max_gens + int pop_size = 100; + int gens = 1000; + int max_stall = 0; + unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size unsigned int max_size = 50; + vector objectives{"error","complexity"}; // error should be generic and deducted based on mode string sel = "lexicase"; //selection method @@ -86,6 +89,9 @@ struct Parameters void set_gens(int new_gens){ gens = new_gens; }; int get_gens(){ return gens; }; + void set_max_stall(int new_max_stall){ max_stall = new_max_stall; }; + int get_max_stall(){ return max_stall; }; + void set_load_population(string new_load_population){ load_population = new_load_population; }; string get_load_population(){ return load_population; }; diff --git a/tests/cpp/test_params.cpp b/tests/cpp/test_params.cpp index 67537277..dc88d633 100644 --- a/tests/cpp/test_params.cpp +++ b/tests/cpp/test_params.cpp @@ -50,4 +50,6 @@ TEST(Params, ParamsTests) // ft.params.set_verbosity(2); // ASSERT_EQ(ft.params.verbosity, 2); // ASSERT_STREQ("", logger.log("Hello", 3).c_str()); + + // TODO: test termination criterion --- max stall, generations, time } From 2c438e01c5a6235cbedebdbc7c27faabafa16872 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Sun, 14 Apr 2024 20:27:00 -0300 Subject: [PATCH 152/199] Improving PTC2 to work with strong typed programs. Fixed infinite loop in PTC2 --- src/data/io.cpp | 7 ++--- src/search_space.cpp | 54 +++++++++++++++++++++----------------- tests/cpp/test_program.cpp | 14 ++++++++-- 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/data/io.cpp b/src/data/io.cpp index 8293f478..d81559ae 100755 --- a/src/data/io.cpp +++ b/src/data/io.cpp @@ -81,9 +81,10 @@ Dataset read_csv ( // check if endpoint is binary bool binary_endpoint = (y.array() == 0 || y.array() == 1).all(); - auto result = Dataset(features,y,binary_endpoint); - return result; - + // using constructor 1. (initializing data from a map) + auto result = Dataset(features, y, binary_endpoint); + + return result; } } // Brush diff --git a/src/search_space.cpp b/src/search_space.cpp index 5fc8621e..6aaf45cb 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -267,14 +267,14 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const auto Tree = tree(); - /* fmt::print("building program with max size {}, max depth {}",max_size,max_d); */ + fmt::print("building program with max size {}, max depth {}",max_size,max_d); // Queue of nodes that need children vector> queue; - /* cout << "chose " << n.name << endl; */ + cout << "root " << root.name << endl; // auto spot = Tree.set_head(n); - /* cout << "inserting...\n"; */ + cout << "inserting...\n"; auto spot = Tree.insert(Tree.begin(), root); // node depth @@ -295,7 +295,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const //For each argument position a of n, Enqueue(a; g) for (auto a : root.arg_types) { - /* cout << "queing a node of type " << DataTypeName[a] << endl; */ + cout << "queing a node of type " << DataTypeName[a] << endl; auto child_spot = Tree.append_child(spot); queue.push_back(make_tuple(child_spot, a, d)); } @@ -304,8 +304,8 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const Node n; // Now we actually start the PTC2 procedure to create the program tree - /* cout << "queue size: " << queue.size() << endl; */ - /* cout << "entering first while loop...\n"; */ + cout << "queue size: " << queue.size() << endl; + cout << "entering first while loop...\n"; while ( queue.size() + s < max_size && queue.size() > 0) { // including the queue size in the max_size, since each element in queue @@ -317,14 +317,14 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // always insert a non terminal (which by default has weights off). // this way, we can have PTC2 working properly. - /* cout << "queue size: " << queue.size() << endl; */ + cout << "queue size: " << queue.size() << endl; auto [qspot, t, d] = RandomDequeue(queue); - /* cout << "current depth: " << d << endl; */ + cout << "current depth: " << d << endl; if (d >= max_d || s >= max_size) { // choose terminal of matching type - /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ + cout << "getting " << DataTypeName[t] << " terminal\n"; // qspot = sample_terminal(t); // Tree.replace(qspot, sample_terminal(t)); // Tree.append_child(qspot, sample_terminal(t)); @@ -344,15 +344,22 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const else { //choose a nonterminal of matching type - /* cout << "getting op of type " << DataTypeName[t] << endl; */ + cout << "getting op of type " << DataTypeName[t] << endl; auto opt = sample_op(t); - /* cout << "chose " << n.name << endl; */ + cout << "chose " << n.name << endl; // TreeIter new_spot = Tree.append_child(qspot, n); // qspot = n; - if (!opt) { - queue.push_back(make_tuple(qspot, t, d)); - continue; + if (!opt) { // there is no operator for this node. sample a terminal instead + opt = sample_terminal(t); + } + + if (!opt) { // no operator nor terminal. weird. + auto msg = fmt::format("Failed to sample operator AND terminal of data type {} during PTC2.\n", DataTypeName[t]); + HANDLE_ERROR_THROW(msg); + + // queue.push_back(make_tuple(qspot, t, d)); + // continue; } n = opt.value(); @@ -362,7 +369,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // For each arg of n, add to queue for (auto a : n.arg_types) { - /* cout << "queing a node of type " << DataTypeName[a] << endl; */ + cout << "queing a node of type " << DataTypeName[a] << endl; // queue.push_back(make_tuple(new_spot, a, d+1)); auto child_spot = Tree.append_child(newspot); @@ -382,19 +389,20 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const && Isnt(n.node_type) ) s += 2; - /* cout << "current tree size: " << s << endl; */ + cout << "current tree size: " << s << endl; } - /* cout << "entering second while loop...\n"; */ + + cout << "entering second while loop...\n"; while (queue.size() > 0) { if (queue.size() == 0) break; - /* cout << "queue size: " << queue.size() << endl; */ + cout << "queue size: " << queue.size() << endl; auto [qspot, t, d] = RandomDequeue(queue); - /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ + cout << "getting " << DataTypeName[t] << " terminal\n"; // Tree.append_child(qspot, sample_terminal(t)); // qspot = sample_terminal(t); // auto newspot = Tree.replace(qspot, sample_terminal(t)); @@ -408,11 +416,9 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const auto newspot = Tree.replace(qspot, n); } - /* cout << "final tree:\n" */ - /* << Tree.begin().node->get_model() << "\n" */ - /* << Tree.begin().node->get_tree_model(true) << endl; */ - /* << Tree.get_model() << "\n" */ - /* << Tree.get_model(true) << endl; // pretty */ + cout << "final tree:\n" + << Tree.begin().node->get_model() << "\n" + << Tree.begin().node->get_tree_model(true) << endl; return Tree; }; diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index 69d5819b..e838d60d 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -97,6 +97,8 @@ TEST(Program, PredictWithWeights) Dataset data = Data::read_csv("docs/examples/datasets/d_enc.csv","label"); + ASSERT_FALSE(data.classification); + SearchSpace SS; SS.init(data); @@ -138,16 +140,20 @@ TEST(Program, FitClassifier) { Parameters params; - Dataset data = Data::read_csv("docs/examples/datasets/d_analcatdata_aids.csv","target"); + Dataset data = Data::read_csv("docs/examples/datasets/d_analcatdata_aids.csv", "target"); + + ASSERT_TRUE(data.classification); SearchSpace SS; + SS.init(data); for (int d = 1; d < 10; ++d) { for (int s = 1; s < 100; s+=10) { - params.max_size = s; params.max_depth = d; + params.max_size = s; + fmt::print( "Calling make_classifier...\n"); auto PRG = SS.make_classifier(0, 0, params); fmt::print( @@ -156,8 +162,12 @@ TEST(Program, FitClassifier) "=================================================\n", d, s, PRG.get_model("compact", true) ); + + fmt::print( "Fitting the model...\n"); PRG.fit(data); + fmt::print( "predict...\n"); auto y = PRG.predict(data); + fmt::print( "predict proba...\n"); auto yproba = PRG.predict_proba(data); } } From 7c364ea2a29e2abe44f92ba32ffe2bd414e76b7f Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 15 Apr 2024 08:36:58 -0300 Subject: [PATCH 153/199] Implemented max_time --- pybrush/BrushEstimator.py | 5 +++++ src/bindings/bind_params.cpp | 1 + src/engine.cpp | 8 +++++--- src/engine.h | 2 ++ src/params.h | 8 ++++++-- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 70f58550..009c49f9 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -34,6 +34,8 @@ class BrushEstimator(BaseEstimator): Population size. gens : int, default 100 Maximum iterations of the algorithm. + max_time: int, optional (default: -1) + Maximum time terminational criterion in seconds. If -1, not used. max_stall: int, optional (default: 0) How many generations to continue after the validation loss has stalled. If 0, not used. @@ -122,6 +124,7 @@ def __init__( mode='classification', pop_size=100, gens=100, + max_time=-1, max_stall=0, verbosity=0, max_depth=3, @@ -145,6 +148,7 @@ def __init__( self.pop_size=pop_size self.gens=gens self.max_stall=max_stall + self.max_time=max_time self.verbosity=verbosity self.algorithm=algorithm self.mode=mode @@ -209,6 +213,7 @@ def fit(self, X, y): self.parameters_.pop_size = self.pop_size self.parameters_.gens = self.gens self.parameters_.max_stall = self.max_stall + self.parameters_.max_time = self.max_time self.parameters_.num_islands = self.num_islands self.parameters_.max_depth = self.max_depth self.parameters_.max_size = self.max_size diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index f4c661c8..77e67a30 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -16,6 +16,7 @@ void bind_params(py::module& m) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) .def_property("max_stall", &Brush::Parameters::get_max_stall, &Brush::Parameters::set_max_stall) + .def_property("max_time", &Brush::Parameters::get_max_time, &Brush::Parameters::set_max_time) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) .def_property("load_population", &Brush::Parameters::get_load_population, &Brush::Parameters::set_load_population) .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) diff --git a/src/engine.cpp b/src/engine.cpp index 700aff7c..ab815592 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -60,6 +60,7 @@ void Engine::init() this->best_loss = MAX_FLT; this->best_complexity = MAX_FLT; + // TODO getters and setters for the best solution found after evolution // predict, transform, predict_proba, etc. // get statistics @@ -69,9 +70,8 @@ void Engine::init() // score functions // fit methods (this will run the evolution) - // TODO: implement stuff below - // // start the clock - // timer.Reset(); + // start the clock + timer.Reset(); // // signal handler // signal(SIGINT, my_handler); @@ -177,12 +177,14 @@ void Engine::run(Dataset &data) // stop criteria unsigned generation = 0; unsigned stall_count = 0; + float fraction = 0; auto stop = [&]() { //std::cout << "inside stop " << std::endl; // TODO: max time return ( (generation == params.gens) && (params.max_stall == 0 || stall_count < params.max_stall) + && (params.max_time == -1 || params.max_time > timer.Elapsed().count()) ); }; diff --git a/src/engine.h b/src/engine.h index 1cf47155..4947ba85 100644 --- a/src/engine.h +++ b/src/engine.h @@ -65,6 +65,8 @@ class Engine{ Selection survivor; ///< survival algorithm // TODO: MISSING CLASSES: timer, archive, logger + Timer timer; ///< start time of training + Individual best_ind; bool is_fitted; ///< keeps track of whether fit was called. diff --git a/src/params.h b/src/params.h index 1791763d..f46c73a8 100644 --- a/src/params.h +++ b/src/params.h @@ -25,10 +25,11 @@ struct Parameters unsigned int current_gen = 1; - // TODO: rename it to max_gens + // termination criteria int pop_size = 100; - int gens = 1000; + int gens = 1000; // TODO: rename it to max_gens int max_stall = 0; + int max_time = -1; unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size unsigned int max_size = 50; @@ -91,6 +92,9 @@ struct Parameters void set_max_stall(int new_max_stall){ max_stall = new_max_stall; }; int get_max_stall(){ return max_stall; }; + + void set_max_time(int new_max_time){ max_time = new_max_time; }; + int get_max_time(){ return max_time; }; void set_load_population(string new_load_population){ load_population = new_load_population; }; string get_load_population(){ return load_population; }; From c2d3648cd58c9917b3611d0f747ff0311990590e Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 16 Apr 2024 09:56:15 -0300 Subject: [PATCH 154/199] OffsetSum as an unary operator --- src/program/functions.h | 21 +++-- src/program/node.cpp | 6 +- src/program/operator.h | 175 ++++---------------------------------- src/program/program.h | 4 +- src/program/signatures.h | 4 +- src/program/tree_node.cpp | 42 ++++----- 6 files changed, 59 insertions(+), 193 deletions(-) diff --git a/src/program/functions.h b/src/program/functions.h index 66136b62..c334afe5 100644 --- a/src/program/functions.h +++ b/src/program/functions.h @@ -183,19 +183,26 @@ namespace Brush inline auto operator()(const TimeSeries& t) { return t.sum(); } }; - /* OffsetSum */ // TODO: IMPLEMENT + /* OffsetSum */ template<> struct Function { + // just add with a constant (definition is like identity) template - inline auto operator()(const T& t) { return t.rowwise().sum(); } - - inline auto operator()(ArrayXXb t) { - return (t.rowwise().count().cast ()); + inline auto operator()(const T& t) { + return t; } - template - inline auto operator()(const TimeSeries& t) { return t.sum(); } + // n-ary version + // template + // inline auto operator()(const T& t) { return t.rowwise().sum(); } + + // inline auto operator()(ArrayXXb t) { + // return (t.rowwise().count().cast ()); + // } + + // template + // inline auto operator()(const TimeSeries& t) { return t.sum(); } }; template<> diff --git a/src/program/node.cpp b/src/program/node.cpp index 93c44cc8..bb9dc351 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -83,8 +83,7 @@ string Node::get_model(const vector& children) const noexcept args += ","; } - // TODO: rename it to just Sum (the user doesnt need to know the offset is fixed) - return fmt::format("OffsetSum({})", args); + return fmt::format("Sum({})", args); } else{ string args = ""; @@ -155,6 +154,7 @@ void init_node_with_default_signature(Node& node) NT::Sqrtabs, NT::Square, NT::Logistic, + NT::OffsetSum, // unary version NT::CustomUnaryOp >(n)) { @@ -191,7 +191,7 @@ void init_node_with_default_signature(Node& node) NT::Mean, NT::Median, NT::Sum, - NT::OffsetSum, + // NT::OffsetSum, // n-ary version NT::Prod, NT::Softmax >(n)) diff --git a/src/program/operator.h b/src/program/operator.h index 054a18f5..2abceeba 100644 --- a/src/program/operator.h +++ b/src/program/operator.h @@ -225,6 +225,23 @@ struct Operator } return this->apply(inputs); }; + + // overloaded version for offset sum + template + requires is_in_v + RetType eval(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const + { + auto inputs = get_kids(d, tn, weights); + if constexpr (is_one_of_v) + { + if (tn.data.get_is_weighted()) + { + auto w = util::get_weight(tn, weights); + return this->apply(inputs) + w; + } + } + return this->apply(inputs); + }; }; ////////////////////////////////////////////////////////////////////////////////// @@ -336,164 +353,6 @@ struct Operator }; }; -//////////////////////////////////////////////////////////////////////////// -// OffsetSum overload -template -struct Operator>> -{ - /** - * @brief set argument types to those of the signature unless: - * - * a) the operator is unary and there are more than one arguments - * b) the operator is binary and associative - * - * In the case of a) or b), arguments to the operator are stacked into an - * array and the operator is applied to that array - */ - using ArgTypes = conditional_t< - ((UnaryOp || NaryOp) && S::ArgCount > 1), - Array, - typename S::ArgTypes>; - - /// @brief return type of the operator - using RetType = typename S::RetType; - - /// @brief stores the argument count of the operator - static constexpr size_t ArgCount = S::ArgCount; - - /// utility for returning the type of the Nth argument - template - using NthType = typename S::NthType; - - /// set weight type - using W = typename S::WeightType; - - /// @brief wrapper function for the node function - static constexpr auto F = [](const auto& ...args) { - Function f; - return f(args...); - }; - - Operator() = default; - //////////////////////////////////////////////////////////////////////////////// - // Utilities to grab child outputs. - - /// get a std::array or eigen array of kids - template requires(is_std_array_v || is_eigen_array_v) - T get_kids(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const - { - T child_outputs; - using arg_type = std::conditional_t, - typename T::value_type, Array>; - if constexpr (is_eigen_array_v) - child_outputs.resize(d.get_n_samples(), Eigen::NoChange); - - TreeNode* sib = tn.first_child; - for (int i = 0; i < ArgCount; ++i) - { - if (sib == nullptr) - HANDLE_ERROR_THROW("bad sibling ptr in get kids"); - if constexpr (Fit){ - if constexpr(is_std_array_v) - child_outputs.at(i) = sib->fit(d); - else - child_outputs.col(i) = sib->fit(d); - } - else{ - if constexpr(is_std_array_v) - child_outputs.at(i) = sib->predict(d, weights); - else - child_outputs.col(i) = sib->predict(d, weights); - } - sib = sib->next_sibling; - } - return child_outputs; - }; - - /// gets one kid for a tuple of kids - template - NthType get_kid(const Dataset& d,TreeNode& tn, const W** weights ) const - { - auto sib = tree::sibling_iterator(tn.first_child) ; - sib += I; - if constexpr(Fit) - return sib->fit>(d); - else - return sib->predict>(d,weights); - }; - - /** - * @brief Makes and returns a tuple of child outputs - * - * @tparam T a tuple - * @tparam Is integer sequence - * @param d dataset - * @param tn a tree node - * @return a tuple with elements corresponding to each child node - */ - template requires(is_tuple_v) - T get_kids_seq(const Dataset& d, TreeNode& tn, const W** weights, std::index_sequence) const - { - return std::make_tuple(get_kid(d,tn,weights)...); - }; - - /// @brief get a std::tuple of kids. Used when child arguments are different types. - /// @tparam T argument types - /// @param d the dataset - /// @param tn the tree node - /// @param weights option pointer to a weight array, used in place of node weight - /// @return a tuple of the child arguments - template requires(is_tuple_v) - T get_kids(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const - { - return get_kids_seq(d, tn, weights, std::make_index_sequence{}); - }; - - /////////////////////////////////////////////////////////////////////////// - - /// @brief Apply node function in a functional style - /// @tparam T argument types - /// @param inputs the child node outputs - /// @return return values applying F to the inputs - template requires ( is_std_array_v || is_tuple_v) - RetType apply(const T& inputs) const - { - return std::apply(F, inputs); - } - - /// @brief Apply the node function like a function - /// @tparam T argument types - /// @param inputs the child node outputs - /// @return return values applying F to the inputs - template requires ( is_eigen_array_v && !is_std_array_v) - RetType apply(const T& inputs) const - { - return F(inputs); - } - - /// @brief evaluate the operator on the data. main entry point. - /// @tparam T argument types - /// @tparam Scalar the underlying scalar type of the return type - /// @param d dataset - /// @param tn tree node - /// @param weights option pointer to a weight array, used in place of node weight - /// @return output values from applying operator function - template - RetType eval(const Dataset& d, TreeNode& tn, const W** weights=nullptr) const - { - auto inputs = get_kids(d, tn, weights); - if constexpr (is_one_of_v) - { - if (tn.data.get_is_weighted()) - { - auto w = util::get_weight(tn, weights); - return this->apply(inputs) + w; - } - } - return this->apply(inputs); - }; -}; - //////////////////////////////////////////////////////////////////////////// // Operator overloads // Split diff --git a/src/program/program.h b/src/program/program.h index d5a92677..33cca237 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -117,7 +117,7 @@ template struct Program acc += 2; if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) + && Isnt(node.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; @@ -155,7 +155,7 @@ template struct Program acc += 2; if ( (include_weight && it.node->data.get_is_weighted()==true) - && Isnt(it.node->data.node_type) ) + && Isnt(it.node->data.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; diff --git a/src/program/signatures.h b/src/program/signatures.h index b7d57474..12ff9319 100644 --- a/src/program/signatures.h +++ b/src/program/signatures.h @@ -260,7 +260,8 @@ struct Signatures>>{ // using type = std::tuple< // Signature, @@ -295,7 +296,6 @@ struct Signatures struct Signatures &t) unordered_map operator_complexities = { // Unary - {NodeType::Abs , 3}, - {NodeType::Acos , 5}, - {NodeType::Asin , 5}, - {NodeType::Atan , 5}, - {NodeType::Cos , 5}, - {NodeType::Cosh , 5}, - {NodeType::Sin , 5}, - {NodeType::Sinh , 5}, - {NodeType::Tan , 5}, - {NodeType::Tanh , 5}, - {NodeType::Ceil , 4}, - {NodeType::Floor , 4}, - {NodeType::Exp , 4}, - {NodeType::Log , 4}, - {NodeType::Logabs , 12}, - {NodeType::Log1p , 8}, - {NodeType::Sqrt , 4}, - {NodeType::Sqrtabs , 4}, - {NodeType::Square , 3}, - {NodeType::Logistic, 3}, + {NodeType::Abs , 3}, + {NodeType::Acos , 5}, + {NodeType::Asin , 5}, + {NodeType::Atan , 5}, + {NodeType::Cos , 5}, + {NodeType::Cosh , 5}, + {NodeType::Sin , 5}, + {NodeType::Sinh , 5}, + {NodeType::Tan , 5}, + {NodeType::Tanh , 5}, + {NodeType::Ceil , 4}, + {NodeType::Floor , 4}, + {NodeType::Exp , 4}, + {NodeType::Log , 4}, + {NodeType::Logabs , 12}, + {NodeType::Log1p , 8}, + {NodeType::Sqrt , 4}, + {NodeType::Sqrtabs , 4}, + {NodeType::Square , 3}, + {NodeType::Logistic, 3}, + {NodeType::OffsetSum, 2}, // timing masks {NodeType::Before, 3}, @@ -111,7 +112,6 @@ unordered_map operator_complexities = { {NodeType::Mean , 3}, {NodeType::Median , 3}, {NodeType::Sum , 2}, - {NodeType::OffsetSum, 2}, {NodeType::Prod , 3}, // Transformers From 4f077a741a90a8868519bdf7660b3ccfc2f20a76 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 16 Apr 2024 09:56:33 -0300 Subject: [PATCH 155/199] Logistic(Add(Const, <>)) --- src/search_space.cpp | 30 ++++++++++++++++--------- src/search_space.h | 53 ++++++++++++++++++++++++++++++++------------ src/variation.h | 2 +- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/src/search_space.cpp b/src/search_space.cpp index 6aaf45cb..a6b11e8b 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -201,6 +201,10 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user std::set unique_classes(vec.begin(), vec.end()); + // We need some ops in the search space so we can have the logit and offset + if (user_ops.find("OffsetSum") == user_ops.end()) + extended_user_ops.insert({"OffsetSum", 0.0f}); + if (unique_classes.size()==2 && (user_ops.find("Logistic") == user_ops.end())) { extended_user_ops.insert({"Logistic", 0.0f}); } @@ -249,13 +253,19 @@ std::optional> SearchSpace::sample_subtree(Node root, int max_d, int terminal_weights.at(root.ret_type).end())) ) return std::nullopt; + auto Tree = tree(); + auto spot = Tree.insert(Tree.begin(), root); + // we should notice the difference between size of a PROGRAM and a TREE. // program count weights in its size, while the TREE structure dont. Wenever // using size of a program/tree, make sure you use the function from the correct class - return PTC2(root, max_d, max_size); + PTC2(Tree, spot, max_d, max_size); + + return Tree; }; -tree SearchSpace::PTC2(Node root, int max_d, int max_size) const +tree& SearchSpace::PTC2(tree& Tree, + tree::iterator spot, int max_d, int max_size) const { // PTC2 is agnostic of program type @@ -265,23 +275,23 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const // parameters, the real maximum size that can occur is `max_size` plus the // highest operator arity, and the real maximum depth is `max_depth` plus one. - auto Tree = tree(); + // auto Tree = tree(); fmt::print("building program with max size {}, max depth {}",max_size,max_d); // Queue of nodes that need children vector> queue; - cout << "root " << root.name << endl; - // auto spot = Tree.set_head(n); - cout << "inserting...\n"; - auto spot = Tree.insert(Tree.begin(), root); - // node depth int d = 1; // current tree size int s = 1; + Node root = spot.node->data; + + cout << "root " << root.name << endl; + // auto spot = Tree.set_head(n); + // updating size accordingly to root node if (Is(root.node_type)) s += 3; @@ -289,7 +299,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const s += 2; if ( root.get_is_weighted()==true - && Isnt(root.node_type) ) + && Isnt(root.node_type) ) s += 2; //For each argument position a of n, Enqueue(a; g) @@ -386,7 +396,7 @@ tree SearchSpace::PTC2(Node root, int max_d, int max_size) const s += 2; if ( n.get_is_weighted()==true - && Isnt(n.node_type) ) + && Isnt(n.node_type) ) s += 2; cout << "current tree size: " << s << endl; diff --git a/src/search_space.h b/src/search_space.h index 83883360..ef0ea309 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -579,7 +579,7 @@ struct SearchSpace void print() const; private: - tree PTC2(Node root, int max_d, int max_size) const; + tree& PTC2(tree& Tree, tree::iterator root, int max_d, int max_size) const; template requires (!is_in_v) @@ -691,24 +691,46 @@ P SearchSpace::make_program(const Parameters& params, int max_d, int max_size) ProgramType program_type = P::program_type; // ProgramType program_type = ProgramTypeEnum::value; - // building the root node for each program case. We give the root, and it - // fills the rest of the tree - Node root; + // Tree is pre-filled with some fixed nodes depending on program type + auto Tree = tree(); + + // building the tree for each program case. Then, we give the spot to PTC2, + // and it will fill the rest of the tree + tree::iterator spot; // building the root node for each program case if (P::program_type == ProgramType::BinaryClassifier) { - root = get(NodeType::Logistic, DataType::ArrayF, Signature()); - root.set_prob_change(0.0); - root.fixed=true; + Node node_logit = get(NodeType::Logistic, DataType::ArrayF, Signature()); + node_logit.set_prob_change(0.0); + node_logit.fixed=true; + auto spot_logit = Tree.insert(Tree.begin(), node_logit); + + if (true) { // Logistic(Add(Constant, <>)). + Node node_offset = get(NodeType::OffsetSum, DataType::ArrayF, Signature()); + node_offset.set_prob_change(0.0); + node_offset.fixed=true; + + auto spot_offset = Tree.append_child(spot_logit); + + spot = Tree.replace(spot_offset, node_offset); + } + else { // If false, then model will be Logistic(<>) + spot = spot_logit; + } } else if (P::program_type == ProgramType::MulticlassClassifier) { - root = get(NodeType::Softmax, DataType::MatrixF, Signature()); - root.set_prob_change(0.0); - root.fixed=true; + Node node_softmax = get(NodeType::Softmax, DataType::MatrixF, Signature()); + node_softmax.set_prob_change(0.0); + node_softmax.fixed=true; + + spot = Tree.insert(Tree.begin(), node_softmax); } - else { + else // regression or representer --- sampling any candidate op or terminal + { + Node root; + std::optional opt=std::nullopt; if (max_size>1 && max_d>1) @@ -716,13 +738,16 @@ P SearchSpace::make_program(const Parameters& params, int max_d, int max_size) if (!opt) // if failed, then we dont have any operator to use as root... opt = sample_terminal(root_type, true); + root = opt.value(); - } + spot = Tree.insert(Tree.begin(), root); + } + // max_d-1 because we always pick the root before calling ptc2 - auto Tree = PTC2(root, max_d-1, max_size); + PTC2(Tree, spot, max_d-1, max_size); // change inplace - return P(*this,Tree); + return P(*this, Tree); }; extern SearchSpace SS; diff --git a/src/variation.h b/src/variation.h index bbf3af63..0bca47dc 100644 --- a/src/variation.h +++ b/src/variation.h @@ -83,7 +83,7 @@ class MutationBase { acc += 2; if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) + && Isnt(node.node_type) ) // Taking into account the weight and multiplication, if enabled. // weighted constants still count as 1 (simpler than constant terminals) acc += 2; From dad0ba82d9dc861027ef6c704a1ddeabc241a06d Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 16 Apr 2024 11:21:16 -0300 Subject: [PATCH 156/199] is_fitted_ flag to avoid re-fitting individuals --- src/engine.cpp | 1 - src/eval/evaluation.cpp | 8 +++++--- src/individual.h | 6 +++++- src/population.cpp | 1 - 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index ab815592..8c7d13c1 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -276,7 +276,6 @@ void Engine::run(Dataset &data) variator.vary(this->pop, island, island_parents.at(island)); //std::cout << "before update fitness" << std::endl; - evaluator.update_fitness(this->pop, island, data, params, true); // evaluator.validation(*this->pop, island_range, data, params); //std::cout << "before batch update" << std::endl; diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index c92cf84f..884c0789 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -14,7 +14,7 @@ void Evaluation::update_fitness(Population& pop, bool validation ) { - //TODO: it could use the validation_loss + //TODO: it could use the validation_loss auto idxs = pop.get_island_indexes(island); int counter = 0; @@ -35,13 +35,16 @@ void Evaluation::update_fitness(Population& pop, else { // assign weights to individual - if (fit) + if (fit && ind.get_is_fitted() == false) + { ind.program.fit(data); + } assign_fit(ind, data, params, validation); } ++counter; } + assert(counter > 0); } @@ -91,6 +94,5 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, ind.fitness.set_values(values); } - } // Pop } // Brush \ No newline at end of file diff --git a/src/individual.h b/src/individual.h index 237b0423..5a29ecfb 100644 --- a/src/individual.h +++ b/src/individual.h @@ -20,6 +20,8 @@ class Individual{ // error is the aggregation of error vector, and can be user sppecified + bool is_fitted_ = false; + VectorXf error; ///< training error (used in lexicase selectors) Fitness fitness; ///< aggregate fitness score @@ -46,7 +48,8 @@ class Individual{ // fitness, objetives, complexity, etc. TODO: create intermediate functions to interact with fitness and program? void fit(Dataset& data) { program.fit(data); - + // this flag is used to avoid re-fitting an individual. the program is_fitted_ flag is used to perform checks (like in predict with weights). They are two different things and I think I;ll keep this way (individual is just a container to keep program and fitness together) + this->is_fitted_ = true; }; auto predict(Dataset& data) { return program.predict(data); }; @@ -54,6 +57,7 @@ class Individual{ // TODO: This class should also have its own cpp wrapper. Update it into the deap api (the idea is that the user is still able to prototype with brush, I dont think we should disable that feature) // just getters (TODO: use the attributes ) + bool get_is_fitted() const { return this->is_fitted_; }; string get_model() const { return program.get_model(); }; size_t get_size() const { return program.size(); }; size_t get_depth() const { return program.depth(); }; diff --git a/src/population.cpp b/src/population.cpp index 3b5c6510..a4d44265 100644 --- a/src/population.cpp +++ b/src/population.cpp @@ -100,7 +100,6 @@ void Population::init(SearchSpace& ss, const Parameters& params) // second half is space to the offspring (but we dont initialize them) individuals.at(p+i) = nullptr; } - } template From 69d569edee048d14a3c9800682f9734373033478 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 16 Apr 2024 13:19:39 -0300 Subject: [PATCH 157/199] Fixed constants having higher complexity than they should --- src/program/tree_node.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index c73684e0..cd04fbee 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -161,7 +161,9 @@ int TreeNode::get_complexity() const // avoid multiplication by zero if the node is a terminal children_complexity_sum = max(children_complexity_sum, 1); - if (data.get_is_weighted()) // include the `w` and `*` if the node is weighted + // include the `w` and `*` if the node is weighted (and it is not a constant or mean label) + if (data.get_is_weighted() + && (Is(data.node_type) || Is(data.node_type)) ) return operator_complexities.at(NodeType::Mul)*( operator_complexities.at(NodeType::Constant) + node_complexity*(children_complexity_sum) From 7976fe24eb470ffa89dcc51e4eef6fc02b407f4f Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 16 Apr 2024 13:20:08 -0300 Subject: [PATCH 158/199] Improved dot model for OffsetSum nodes --- src/program/program.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/program/program.h b/src/program/program.h index 33cca237..b2cbf9a9 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -442,7 +442,7 @@ template struct Program node_label = fmt::format("{}>{:.2f}?", parent->data.get_feature(), parent->data.W); } if (Is(parent->data.node_type)){ - node_label = fmt::format("{:.2f} + Sum", parent->data.W); + node_label = fmt::format("Add"); } out += fmt::format("\"{}\" [label=\"{}\"];\n", parent_id, node_label); @@ -503,6 +503,22 @@ template struct Program } kid = kid->next_sibling; } + + // adding the offset as the last child + if (Is(parent->data.node_type)){ + // drawing the edge + out += fmt::format("\"{}\" -> \"{}\" [label=\"\"];\n", + parent_id, + parent_id+"Offset" + ); + + // drawing the node + out += fmt::format("\"{}\" [label=\"{}\"];\n", + parent_id+"Offset", + parent->data.W + ); + } + ++i; } out += "}\n"; From 658b68d70a64ed8058b3584355be44b868adc19d Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 16 Apr 2024 14:58:42 -0300 Subject: [PATCH 159/199] Fixed scorer_ not visible to python --- src/bindings/bind_params.cpp | 1 + src/params.h | 3 +++ src/program/node.cpp | 10 +++++++--- src/program/tree_node.cpp | 9 +++++---- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 77e67a30..8c0c080e 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -18,6 +18,7 @@ void bind_params(py::module& m) .def_property("max_stall", &Brush::Parameters::get_max_stall, &Brush::Parameters::set_max_stall) .def_property("max_time", &Brush::Parameters::get_max_time, &Brush::Parameters::set_max_time) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) + .def_property("scorer_", &Brush::Parameters::get_scorer_, &Brush::Parameters::set_scorer_) .def_property("load_population", &Brush::Parameters::get_load_population, &Brush::Parameters::set_load_population) .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) diff --git a/src/params.h b/src/params.h index f46c73a8..749df87b 100644 --- a/src/params.h +++ b/src/params.h @@ -96,6 +96,9 @@ struct Parameters void set_max_time(int new_max_time){ max_time = new_max_time; }; int get_max_time(){ return max_time; }; + void set_scorer_(string new_scorer_){ scorer_ = new_scorer_; }; + string get_scorer_(){ return scorer_; }; + void set_load_population(string new_load_population){ load_population = new_load_population; }; string get_load_population(){ return load_population; }; diff --git a/src/program/node.cpp b/src/program/node.cpp index bb9dc351..23a7be82 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -34,12 +34,16 @@ auto Node::get_name(bool include_weight) const noexcept -> std::string else if (Is(node_type)) { if (include_weight) - return fmt::format("{:.2f}", W); // Handle as if it was a constant - //explicitly print as a MeanLabel and include weight on label - return fmt::format("MeanLabel({:.2f})", W); + return fmt::format("{:.2f}*{}", W, feature); + + return feature; + } + else if (Is(node_type)){ + return fmt::format("{}+Sum", W); } else if (is_weighted && include_weight) return fmt::format("{:.2f}*{}",W,name); + return name; } diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index cd04fbee..c1695122 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -37,9 +37,7 @@ string TreeNode::get_tree_model(bool pretty, string offset) const if (sib != nullptr) child_outputs += "\n"; } - /* if (pretty) */ - /* return op_name + child_outputs; */ - /* else */ + return data.get_name() + child_outputs; }; //////////////////////////////////////////////////////////////////////////////// @@ -163,7 +161,10 @@ int TreeNode::get_complexity() const // include the `w` and `*` if the node is weighted (and it is not a constant or mean label) if (data.get_is_weighted() - && (Is(data.node_type) || Is(data.node_type)) ) + && (Is(data.node_type) + || Is(data.node_type) + || Is(data.node_type)) ) + return operator_complexities.at(NodeType::Mul)*( operator_complexities.at(NodeType::Constant) + node_complexity*(children_complexity_sum) From abd7841a6fc28c25eb2ef3f1e42bf45a9902e097 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Sat, 20 Apr 2024 21:42:49 -0300 Subject: [PATCH 160/199] Fixed bad logic when calculating complexity of constant terminals --- src/program/tree_node.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index c1695122..eacacff7 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -161,10 +161,10 @@ int TreeNode::get_complexity() const // include the `w` and `*` if the node is weighted (and it is not a constant or mean label) if (data.get_is_weighted() - && (Is(data.node_type) - || Is(data.node_type) - || Is(data.node_type)) ) - + && !(Is(data.node_type) + || Is(data.node_type) + || Is(data.node_type)) + ) return operator_complexities.at(NodeType::Mul)*( operator_complexities.at(NodeType::Constant) + node_complexity*(children_complexity_sum) From d3f20dd8420da56ff00ed67348574edd03ab670e Mon Sep 17 00:00:00 2001 From: gAldeia Date: Sat, 20 Apr 2024 21:43:56 -0300 Subject: [PATCH 161/199] Logfile and verbosity. Lots of new TODOs to fix! --- pybrush/BrushEstimator.py | 6 ++ src/bindings/bind_params.cpp | 2 + src/engine.cpp | 203 +++++++++++++++++++++++++++++++++-- src/engine.h | 16 ++- src/eval/evaluation.cpp | 18 ++-- src/individual.cpp | 4 +- src/individual.h | 10 +- src/params.h | 14 ++- src/util/utils.cpp | 16 +-- src/util/utils.h | 35 ++++-- src/variation.cpp | 3 + tests/cpp/test_brush.cpp | 7 +- 12 files changed, 294 insertions(+), 40 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 009c49f9..020d27ec 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -92,6 +92,8 @@ class BrushEstimator(BaseEstimator): Percentage of training data to sample every generation. If `1.0`, then all data is used. Very small values can improve execution time, but also lead to underfit. + logfile: str, optional (default: "") + If specified, spits statistics into a logfile. "" means don't log. random_state: int or None, default None If int, then the value is used to seed the c++ random generator; if None, then a seed will be generated using a non-deterministic generator. It is @@ -140,6 +142,7 @@ def __init__( algorithm="nsga2", objectives=["error", "size"], random_state=None, + logfile="", weights_init=True, validation_size: float = 0.0, batch_size: float = 1.0 @@ -158,6 +161,7 @@ def __init__( self.mig_prob=mig_prob self.n_jobs=n_jobs self.cx_prob=cx_prob + self.logfile=logfile self.mutation_probs=mutation_probs self.functions=functions self.objectives=objectives @@ -209,9 +213,11 @@ def fit(self, X, y): self.parameters_ = Parameters() self.parameters_.classification = self.mode == "classification" self.parameters_.n_classes = self.n_classes_ + self.parameters_.verbosity = self.verbosity self.parameters_.n_jobs = self.n_jobs self.parameters_.pop_size = self.pop_size self.parameters_.gens = self.gens + self.parameters_.logfile = self.logfile self.parameters_.max_stall = self.max_stall self.parameters_.max_time = self.max_time self.parameters_.num_islands = self.num_islands diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 8c0c080e..bd87d0ec 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -13,6 +13,7 @@ void bind_params(py::module& m) py::class_(m, "Parameters") .def(py::init([](){ Brush::Parameters p; return p; })) + .def_property("verbosity", &Brush::Parameters::get_verbosity, &Brush::Parameters::set_verbosity) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) .def_property("max_stall", &Brush::Parameters::get_max_stall, &Brush::Parameters::set_max_stall) @@ -21,6 +22,7 @@ void bind_params(py::module& m) .def_property("scorer_", &Brush::Parameters::get_scorer_, &Brush::Parameters::set_scorer_) .def_property("load_population", &Brush::Parameters::get_load_population, &Brush::Parameters::set_load_population) .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) + .def_property("logfile", &Brush::Parameters::get_logfile, &Brush::Parameters::set_logfile) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) diff --git a/src/engine.cpp b/src/engine.cpp index 8c7d13c1..5710d875 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -58,7 +58,7 @@ void Engine::init() this->survivor = Selection(params.surv, true); //std::cout << "created survivor" << std::endl; - this->best_loss = MAX_FLT; + this->best_score = MAX_FLT; this->best_complexity = MAX_FLT; // TODO getters and setters for the best solution found after evolution @@ -76,17 +76,160 @@ void Engine::init() // // signal handler // signal(SIGINT, my_handler); - // // reset statistics - // this->stats = Log_Stats(); + // reset statistics + this->stats = Log_Stats(); } +template +void Engine::print_progress(float percentage) +{ + int val = (int) (percentage * 100); + int lpad = (int) (percentage * PBWIDTH); + int rpad = PBWIDTH - lpad; + + printf ("\rCompleted %3d%% [%.*s%*s]", val, lpad, PBSTR.c_str(), rpad, ""); + + fflush (stdout); + + if(val == 100) + cout << "\n"; +} + + +template +void Engine::calculate_stats(const Dataset& d) +{ + size_t pop_size = this->pop.size(); + + ArrayXf scores(pop_size); + ArrayXf scores_v(pop_size); + // TODO: change all size_t to unsigned? + ArrayXi sizes(pop_size); + ArrayXi complexities(pop_size); + + float error_weight = Individual::weightsMap[params.scorer_]; + + int i=0; + for (int island=0; islandpop.individuals.at(idxs[i]); + + // Fitness class will store every information that can be used as + // fitness. you just need to access them. Multiplying by weight + // so we can find best score. From Fitness::dominates: + // the proper way of comparing weighted values is considering + // everything as a maximization problem + scores(i) = p->fitness.loss; + scores_v(i) = p->fitness.loss_v; + sizes(i) = p->fitness.size; + complexities(i) = p->fitness.complexity; + scores_v(i) = p->fitness.loss_v; + + ++i; + } + } + + assert(i == pop_size); + + // multiply by weight again to get rid of signal + float best_score = (scores*error_weight).maxCoeff()*error_weight; + float best_score_v = (scores_v*error_weight).maxCoeff()*error_weight; + float med_score = median(scores); + float med_score_v = median(scores_v); + unsigned med_size = median(sizes); + unsigned med_complexity = median(complexities); + unsigned max_size = sizes.maxCoeff(); + unsigned max_complexity = complexities.maxCoeff(); + + // update stats + stats.update(params.current_gen, + timer.Elapsed().count(), + best_score, + best_score_v, + med_score, + med_score_v, + med_size, + med_complexity, + max_size, + max_complexity); +} + + +template +void Engine::log_stats(std::ofstream& log) +{ + // print stats in tabular format + string sep = ","; + if (params.current_gen == 0) // print header + { + log << "generation" << sep + << "time" << sep + << "best_score" << sep + << "best_score_val" << sep + << "med_score" << sep + << "med_score_val" << sep + << "med_size" << sep + << "med_complexity" << sep + << "max_size" << sep + << "max_complexity" << "\n"; + } + log << params.current_gen << sep + << timer.Elapsed().count() << sep + << stats.best_score.back() << sep + << stats.best_score_v.back() << sep + << stats.med_score.back() << sep + << stats.med_score_v.back() << sep + << stats.med_size.back() << sep + << stats.med_complexity.back() << sep + << stats.max_size.back() << sep + << stats.max_complexity.back() << "\n"; +} + +template +void Engine::print_stats(std::ofstream& log, float fraction) +{ + // progress bar + string bar, space = ""; + for (unsigned int i = 0; i<50; ++i) + { + if (i <= 50*fraction) bar += "/"; + else space += " "; + } + + std::cout.precision(5); + std::cout << std::scientific; + + if(params.max_time == -1) + std::cout << "Generation " << params.current_gen+1 << "/" + << params.gens << " [" + bar + space + "]\n"; + else + std::cout << std::fixed << "Time elapsed "<< timer + << "/" << params.max_time + << " seconds (Generation "<< params.current_gen+1 + << ") [" + bar + space + "]\n"; + + std::cout << std::fixed + << "Train Loss (Med): " << stats.best_score.back() << " (" << stats.med_score.back() << ")\n" + << "Val Loss (Med): " << stats.best_score_v.back() << " (" << stats.med_score_v.back() << ")\n" + << "Median Size (Max): " << stats.med_size.back() << " (" << stats.max_size.back() << ")\n" + << "Time (s): " << timer + <<"\n\n"; +} + + template // TODO: use the dataset, or ignore it bool Engine::update_best(const Dataset& data, bool val) { //std::cout << "updating best" << std::endl; + float error_weight = Individual::weightsMap[params.scorer_]; + float bs; - bs = this->best_loss; + bs = this->best_score; float f; // TODO: archive here? @@ -112,9 +255,10 @@ bool Engine::update_best(const Dataset& data, bool val) else f = ind.fitness.loss; - if (f < bs - || (f == bs && ind.fitness.complexity < this->best_complexity) - ) + // TODO: fix this by multiplying by weight + if (f*error_weight > bs*error_weight + || (f == bs && ind.fitness.complexity < this->best_complexity) + ) { //std::cout << "updated" << std::endl; @@ -126,7 +270,7 @@ bool Engine::update_best(const Dataset& data, bool val) } } - this->best_loss = bs; + this->best_score = bs; return updated; } @@ -148,6 +292,11 @@ void Engine::run(Dataset &data) pop.init(this->ss, this->params); + // log file stream + std::ofstream log; + if (!params.logfile.empty()) + log.open(params.logfile, std::ofstream::app); + //std::cout << "pop initialized with size " << params.pop_size << " and " << params.num_islands << "islands" << std::endl; //std::cout << pop.print_models() << std::endl; @@ -314,6 +463,31 @@ void Engine::run(Dataset &data) auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); + // TODO: fix this code below (if needed. this is borrowed from feat) + // if ( (use_arch || params.verbosity>1) || !logfile.empty()) { + // // set objectives to make sure they are reported in log/verbose/arch + // #pragma omp parallel for + // for (unsigned int i=0; i1) + print_stats(log, fraction); + else if(params.verbosity == 1) + print_progress(fraction); + + if (!params.logfile.empty()) + log_stats(log); + if (generation == 0 || updated_best ) stall_count = 0; else @@ -336,6 +510,19 @@ void Engine::run(Dataset &data) this->pop.save(params.save_population); this->set_is_fitted(true); + + // TODO: make this work + // if (save_pop > 0) + // { + // pop.save(this->logfile+".pop.gen" + to_string(params.current_gen) + // + ".json"); + // this->best_ind.save(this->logfile+".best.json"); + // } + + // TODO: open, write, close? (to avoid breaking the file and allow some debugging if things dont work well) + if (log.is_open()) + log.close(); + } // work done, report last gen and stop ); // evolutionary loop diff --git a/src/engine.h b/src/engine.h index 4947ba85..33dbab4f 100644 --- a/src/engine.h +++ b/src/engine.h @@ -36,6 +36,12 @@ class Engine{ ~Engine(){}; + // outputs a progress bar, filled according to @param percentage. + void print_progress(float percentage); + void calculate_stats(const Dataset& d); + void print_stats(std::ofstream& log, float fraction); + void log_stats(std::ofstream& log); + // all hyperparameters are controlled by the parameter class. please refer to that to change something inline Parameters& get_params(){return params;} inline void set_params(Parameters& p){params=p;} @@ -47,8 +53,8 @@ class Engine{ // TODO: hyperparameter to set how the best is picked (MCDM, best on val, pareto front, etc). one of the options should be getting the pareto front // TODO: best fitness instead of these. use fitness comparison - float best_loss; - int best_complexity; + float best_score; + int best_complexity; // TODO: best complexity in log/print stats? Individual& get_best_ind(){return best_ind;}; /// train the model @@ -64,7 +70,9 @@ class Engine{ Variation variator; ///< variation operators Selection survivor; ///< survival algorithm - // TODO: MISSING CLASSES: timer, archive, logger + Log_Stats stats; ///< runtime stats + + // TODO: MISSING CLASSES: archive Timer timer; ///< start time of training Individual best_ind; @@ -74,8 +82,6 @@ class Engine{ /// set flag indicating whether fit has been called inline void set_is_fitted(bool f){is_fitted=f;} - - // TODO: calculate/print stats }; } // Brush diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index 884c0789..a6c6af34 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -22,9 +22,9 @@ void Evaluation::update_fitness(Population& pop, { Individual& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work - bool pass = true; + bool pass = false; - if (!pass) + if (pass) { // TODO: check if score was nan and assign the max float // TODO: better handling of nan or inf scores when doing selection and survival (and hall of fame and rank for migration) @@ -42,7 +42,7 @@ void Evaluation::update_fitness(Population& pop, assign_fit(ind, data, params, validation); } - ++counter; + ++counter;// TODO: get rid of this counter } assert(counter > 0); @@ -56,20 +56,20 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, VectorXf errors; using PT = ProgramType; + Dataset train = data.get_training_data(); + float f = S.score(ind, train, errors, params); + Dataset validation = data.get_validation_data(); float f_v = S.score(ind, validation, errors, params); // TODO: implement the class weights and use it here (and on errors) - Dataset train = data.get_training_data(); - float f = S.score(ind, train, errors, params); - ind.error = errors; ind.fitness.set_loss(f); ind.fitness.set_loss_v(f_v); - ind.fitness.size = ind.program.size(); - ind.fitness.complexity = ind.program.complexity(); - ind.fitness.depth = ind.program.depth(); + ind.fitness.size = ind.get_size(); + ind.fitness.complexity = ind.get_complexity(); + ind.fitness.depth = ind.get_depth(); ind.set_objectives(params.objectives); diff --git a/src/individual.cpp b/src/individual.cpp index 3a27b9a7..5c3f3ae1 100644 --- a/src/individual.cpp +++ b/src/individual.cpp @@ -39,8 +39,8 @@ void from_json(const json &j, Fitness& f) int Fitness::dominates(const Fitness& b) const { - int flag1 = 0, // to check if this has a smaller objective - flag2 = 0; // to check if b has a smaller objective + int flag1 = 0, // to check if this has a better objective + flag2 = 0; // to check if b has a better objective // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) for (int i=0; i::set_objectives(const vector& objectives) // Static map for weights associated with strings + // TODO: change this to an attribute instead of a function // TODO: weights for different values. loss should be calculated duing runtime, based on the metric inline static std::map weightsMap = []() { std::map map = { @@ -79,8 +80,15 @@ class Individual{ {"size", -1.0} // Add more key-value pairs as needed }; + + // TODO: move these key value initializations to line above // example on how to have weight based on templated class - map["error"] = (T == Brush::ProgramType::Regressor) ? -1.0 : -1.0; + map["error"] = (T == Brush::ProgramType::Regressor) ? -1.0 : +1.0; + + // TODO: eu deveria fazer um check para ver se a string eh error e qual o scorer_ nesse caso + map["mse"] = -1.0; + map["log"] = +1.0; + map["multi_log"] = +1.0; return map; }(); diff --git a/src/params.h b/src/params.h index 749df87b..a62f84f0 100644 --- a/src/params.h +++ b/src/params.h @@ -7,18 +7,20 @@ license: GNU/GPL v3 #define PARAMS_H #include "init.h" +#include "util/logger.h" namespace ns = nlohmann; namespace Brush { + struct Parameters { public: // TODO: make parameters private, and use the getters and setters in the code int random_state = 0; // by default, the rng generator will use any random seed if random_state is zero - //int verbosity = 0; // TODO: implement log and verbosity + int verbosity = 0; // Evolutionary stuff string mode="regression"; @@ -76,10 +78,17 @@ struct Parameters string load_population = ""; string save_population = ""; + string logfile = ""; + int n_jobs = 1; // -1; ///< number of parallel jobs -1 use all threads; 0 use same as number of islands; positive number specify the amouut of threads Parameters(){}; ~Parameters(){}; + + // TODO: use logger to log information + void set_verbosity(int new_verbosity){ Brush::Util::logger.set_log_level(new_verbosity); + verbosity = new_verbosity; }; + int get_verbosity(){ return verbosity; }; void set_random_state(int new_random_state){random_state = new_random_state; }; int get_random_state(){ return random_state; }; @@ -105,6 +114,9 @@ struct Parameters void set_save_population(string new_save_population){ save_population = new_save_population; }; string get_save_population(){ return save_population; }; + string get_logfile(){ return logfile; }; + void set_logfile(string s){ logfile=s; }; + void set_current_gen(unsigned int gen){ current_gen = gen; }; unsigned int get_current_gen(){ return current_gen; }; diff --git a/src/util/utils.cpp b/src/util/utils.cpp index 9f469618..6427cac2 100644 --- a/src/util/utils.cpp +++ b/src/util/utils.cpp @@ -229,22 +229,26 @@ void Log_Stats::update(int index, float bst_score, float bst_score_v, float md_score, - float md_loss_v, + float md_score_v, unsigned md_size, unsigned md_complexity, - unsigned md_num_params, - unsigned md_dim) + unsigned mx_size, + unsigned mx_complexity + ) { generation.push_back(index+1); time.push_back(timer_count); + best_score.push_back(bst_score); best_score_v.push_back(bst_score_v); med_score.push_back(md_score); - med_loss_v.push_back(md_loss_v); + med_score_v.push_back(md_score_v); + med_size.push_back(md_size); med_complexity.push_back(md_complexity); - med_num_params.push_back(md_num_params); - med_dim.push_back(md_dim); + + max_size.push_back(mx_size); + max_complexity.push_back(mx_complexity); } /* array split(ArrayXf& v, ArrayXb& mask) */ diff --git a/src/util/utils.h b/src/util/utils.h index 4cc9a35c..adb8eaee 100644 --- a/src/util/utils.h +++ b/src/util/utils.h @@ -350,29 +350,50 @@ struct Log_Stats { vector generation; vector time; + vector best_score; vector best_score_v; vector med_score; - vector med_loss_v; + vector med_score_v; + vector med_size; vector med_complexity; - vector med_num_params; - vector med_dim; - + vector max_size; + vector max_complexity; + void update(int index, float timer_count, + float bst_score, float bst_score_v, float md_score, - float md_loss_v, + float md_score_v, + unsigned md_size, unsigned md_complexity, - unsigned md_num_params, - unsigned md_dim); + unsigned mx_size, + unsigned mx_complexity + ); }; typedef struct Log_Stats Log_stats; +// TODO: change this to something more modern +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Log_Stats, + generation, + time, + + best_score, + best_score_v, + med_score, + med_score_v, + + med_size, + med_complexity, + max_size, + max_complexity + ); + /// limits the output to finite real numbers template std::enable_if_t, T> diff --git a/src/variation.cpp b/src/variation.cpp index ff8f268f..35819e97 100644 --- a/src/variation.cpp +++ b/src/variation.cpp @@ -650,12 +650,15 @@ void Variation::vary(Population& pop, int island, if (opt) // no optional value was returned { Individual ind = opt.value(); + ind.is_fitted_ = false; assert(ind.program.size()>0); pop.individuals.at(idxs.at(i)) = std::make_shared>(ind); } else { Individual new_ind; + new_ind.is_fitted_ = false; + new_ind.init(search_space, parameters); new_ind.set_objectives(mom.get_objectives()); // it will have an invalid fitness diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 946a467d..bfbd285e 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -21,6 +21,7 @@ #include "../../src/eval/evaluation.cpp" #include "../../src/population.cpp" +// TODO: test logger, verbose, print stats, etc. TEST(Engine, EngineWorks) { MatrixXf X(10,2); @@ -37,10 +38,12 @@ TEST(Engine, EngineWorks) Dataset data(X,y); Parameters params; - params.set_pop_size(10); + params.set_pop_size(100); params.set_gens(10); params.set_mig_prob(0.0); + params.set_verbosity(2); // TODO: verbosity tests + std::cout << "n jobs = 1" << std::endl; params.set_n_jobs(1); Brush::RegressorEngine est5(params); @@ -73,9 +76,11 @@ TEST(Engine, EngineWorks) est6.run(data); std::cout << "n jobs = 2" << std::endl; + params.set_logfile("./tests/cpp/__logfile.csv"); // TODO: test classification and regression and save log so we can inspect it params.set_n_jobs(2); Brush::RegressorEngine est7(params); est7.run(data); + params.set_logfile(""); std::cout << "n jobs = -1" << std::endl; params.set_n_jobs(-1); From 8c48695a28faaae2f66d2a58bb26da00fdf95b6a Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 22 Apr 2024 14:51:18 -0300 Subject: [PATCH 162/199] Starting to implement archive --- pybrush/BrushEstimator.py | 5 ++++ src/engine.cpp | 3 +- src/engine.h | 34 ++++++++++++++++++++- src/individual.h | 2 ++ src/pop/archive.cpp | 8 +++++ src/pop/archive.h | 63 +++++++++++++++++++++++++++++++++++++++ src/population.h | 2 ++ src/variation.h | 2 ++ tests/cpp/test_brush.cpp | 2 ++ 9 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 src/pop/archive.cpp create mode 100644 src/pop/archive.h diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 020d27ec..71bf4e1d 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -88,6 +88,9 @@ class BrushEstimator(BaseEstimator): to calculate statistics during evolution, but not used to train the models. The `best_estimator_` will be selected using this partition. If zero, then the same data used for training is used for validation. + val_from_arch: boolean, optional (default: True) + Validates the final model using the archive rather than the whole + population. batch_size : float, default 1.0 Percentage of training data to sample every generation. If `1.0`, then all data is used. Very small values can improve execution time, but @@ -144,6 +147,7 @@ def __init__( random_state=None, logfile="", weights_init=True, + val_from_arch=True, validation_size: float = 0.0, batch_size: float = 1.0 ): @@ -163,6 +167,7 @@ def __init__( self.cx_prob=cx_prob self.logfile=logfile self.mutation_probs=mutation_probs + self.val_from_arch=val_from_arch # TODO: val from arch self.functions=functions self.objectives=objectives self.initialization=initialization diff --git a/src/engine.cpp b/src/engine.cpp index 5710d875..4e7b359d 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -63,13 +63,14 @@ void Engine::init() // TODO getters and setters for the best solution found after evolution // predict, transform, predict_proba, etc. - // get statistics // load and save best individuals // logger, save to file // execution archive // score functions // fit methods (this will run the evolution) + this->archive.set_objectives(params.objectives); + // start the clock timer.Reset(); diff --git a/src/engine.h b/src/engine.h index 33dbab4f..9a3d694e 100644 --- a/src/engine.h +++ b/src/engine.h @@ -10,6 +10,7 @@ license: GNU/GPL v3 #include "init.h" #include "params.h" #include "population.h" +#include "pop/archive.h" #include "./eval/evaluation.h" #include "variation.h" #include "selection/selection.h" @@ -24,6 +25,7 @@ using namespace Pop; using namespace Sel; using namespace Eval; using namespace Var; +using namespace nlohmann; template class Engine{ @@ -57,6 +59,34 @@ class Engine{ int best_complexity; // TODO: best complexity in log/print stats? Individual& get_best_ind(){return best_ind;}; + // TODO: starting pop (just like feat) + + // TODO: make thesqe work + // /// predict on unseen data. + // VectorXf predict(MatrixXf& X, LongData& Z); + // VectorXf predict(MatrixXf& X); + + // /// predict on unseen data. return CLabels. + // shared_ptr predict_labels(MatrixXf& X, LongData Z = LongData()); + + // /// predict probabilities of each class. + // ArrayXXf predict_proba(MatrixXf& X, LongData& Z); + // ArrayXXf predict_proba(MatrixXf& X); + + // archive stuff + // TODO: make these work + ///return archive size + int get_archive_size(){ return this->archive.individuals.size(); }; + ///return population as string + vector get_archive(bool front); + + // /// predict on unseen data from the whole archive + // VectorXf predict_archive(int id, MatrixXf& X); + // VectorXf predict_archive(int id, MatrixXf& X, LongData& Z); + // ArrayXXf predict_proba_archive(int id, MatrixXf& X, LongData& Z); + // ArrayXXf predict_proba_archive(int id, MatrixXf& X); + + /// train the model void run(Dataset &d); @@ -72,8 +102,8 @@ class Engine{ Log_Stats stats; ///< runtime stats - // TODO: MISSING CLASSES: archive Timer timer; ///< start time of training + Archive archive; ///< pareto front archive Individual best_ind; bool is_fitted; ///< keeps track of whether fit was called. @@ -84,6 +114,8 @@ class Engine{ inline void set_is_fitted(bool f){is_fitted=f;} }; +// TODO: serialization for engine with NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE + } // Brush #endif diff --git a/src/individual.h b/src/individual.h index dc2662d1..e0ebe64b 100644 --- a/src/individual.h +++ b/src/individual.h @@ -11,6 +11,8 @@ using namespace nlohmann; namespace Brush{ namespace Pop{ +// TODO: folder for fitness and individual + template class Individual{ public: // TODO: make these private (and work with nlohman json) diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp new file mode 100644 index 00000000..7ea92e8d --- /dev/null +++ b/src/pop/archive.cpp @@ -0,0 +1,8 @@ +// TODO: implement archive functions +#include "archive.h" + +namespace Brush { +namespace Pop { + +} +} \ No newline at end of file diff --git a/src/pop/archive.h b/src/pop/archive.h new file mode 100644 index 00000000..db318049 --- /dev/null +++ b/src/pop/archive.h @@ -0,0 +1,63 @@ +#ifndef ARCHIVE_H +#define ARCHIVE_H + +//#include "node.h" // including node.h since definition of node is in the header +#include "../individual.h" + +///< nsga2 selection operator for getting the front +#include "../selection/nsga2.h" + +// TODO: do i really need these? +using std::vector; +using std::string; +using Eigen::Map; + +namespace Brush{ + +using namespace Sel; + +namespace Pop{ + +template +struct Archive +{ + // I dont need shared pointers here + vector> individuals; ///< individual programs in the archive + bool sort_complexity; ///< whether to sort archive by complexity + + NSGA2 selector; + + Archive(){}; + ~Archive(){}; + + void init(Population& pop){}; + + void update(const Population& pop, const Parameters& params){}; + + void set_objectives(vector objectives){}; + + /// Sort population in increasing complexity. + static bool sortComplexity(const Individual& lhs, + const Individual& rhs){ return false; }; + + /// Sort population by first objective. + static bool sortObj1(const Individual& lhs, + const Individual& rhs){ return false; }; + + /// check for repeats + static bool sameFitComplexity(const Individual& lhs, + const Individual& rhs){ return false; }; + static bool sameObjectives(const Individual& lhs, + const Individual& rhs){ return false; }; +}; + +//serialization +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); + +} // Pop +} // Brush + +#endif diff --git a/src/population.h b/src/population.h index cb0fe140..929785e5 100644 --- a/src/population.h +++ b/src/population.h @@ -4,10 +4,12 @@ #include "util/error.h" #include "individual.h" +// TODO: do i really need these? using std::vector; using std::string; using Eigen::Map; +// TODO: folder for population and archive // TODO: move this serialization elsewhere // serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 namespace nlohmann diff --git a/src/variation.h b/src/variation.h index 0bca47dc..8b00cea7 100644 --- a/src/variation.h +++ b/src/variation.h @@ -6,6 +6,8 @@ license: GNU/GPL v3 #ifndef VARIATION_H #define VARIATION_H + +// TODO: folder for variation. move search_space and variation to this folder // #include "util/error.h" // #include "util/utils.h" diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index bfbd285e..a6b7aad1 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -11,6 +11,7 @@ #include "../../src/selection/nsga2.h" #include "../../src/selection/lexicase.h" #include "../../src/eval/evaluation.h" +#include "../../src/pop/archive.h" #include "../../src/population.h" // TODO: omg i need to figure out why my code only works if i import basically the whole stuff @@ -19,6 +20,7 @@ #include "../../src/selection/nsga2.cpp" #include "../../src/selection/lexicase.cpp" #include "../../src/eval/evaluation.cpp" +#include "../../src/pop/archive.cpp" #include "../../src/population.cpp" // TODO: test logger, verbose, print stats, etc. From b6476dea5933e887fb58640c26919e77f9fb7d0e Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 23 Apr 2024 10:43:01 -0300 Subject: [PATCH 163/199] Notebook to test the island implementation --- test.ipynb | 410 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 test.ipynb diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 00000000..0250befd --- /dev/null +++ b/test.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "libcbrush.so: cannot open shared object file: No such file or directory", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mean_squared_error, r2_score\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m_brush\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindividual\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RegressorIndividual\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m_brush\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SearchSpace, Parameters, Dataset\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpybrush\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DeapRegressor\n", + "\u001b[0;31mImportError\u001b[0m: libcbrush.so: cannot open shared object file: No such file or directory" + ] + } + ], + "source": [ + "from sklearn import datasets\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "\n", + "from _brush.individual import RegressorIndividual\n", + "from _brush import SearchSpace, Parameters, Dataset\n", + "\n", + "from pybrush import DeapRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the diabetes dataset\n", + "diabetes = datasets.load_diabetes()\n", + "\n", + "# Use only one feature\n", + "X = diabetes.data[:, None, 2]\n", + "y = diabetes.target\n", + "\n", + "import pandas as pd\n", + "\n", + "# df = pd.read_csv(\"https://raw.githubusercontent.com/gAldeia/hashing-symbolic-expressions/master/data/lexicase_paper/d_airfoil.txt?token=GHSAT0AAAAAACPJ5UIOJY42GOUHC4GKZOBOZPS7BHA\")\n", + "# X = df.drop('label', axis=1)\n", + "# y = df['label']\n", + "\n", + "print(X.shape, y.shape)\n", + "# Split the data into training/testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pybrush import engine\n", + "print(\"imported\")\n", + "\n", + "# Validation is to hold some part of the data as the inner validation split\n", + "dataset = Dataset(X=X_train, y=y_train, validation_size=0.75)\n", + "print(\"dataset\")\n", + "\n", + "params = Parameters()\n", + "print(\"parameters\")\n", + "\n", + "brush_estimator = engine.RegressorEngine(params)\n", + "print(\"estimator\")\n", + "\n", + "print(brush_estimator.params.pop_size)\n", + "brush_estimator.params.pop_size = 100\n", + "brush_estimator.params.gens = 100\n", + "brush_estimator.params.num_islands = 5\n", + "brush_estimator.params.max_size = 2**6\n", + "brush_estimator.params.max_depth = 6\n", + "brush_estimator.params.n_jobs = 5\n", + "brush_estimator.params.objectives = [\"error\", \"size\"]\n", + "print(brush_estimator.params.pop_size)\n", + "\n", + "print(brush_estimator.is_fitted)\n", + "print(brush_estimator.best_ind.program.get_model())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reg = DeapRegressor(\n", + " gens=100, pop_size=100, max_size=2**6, max_depth=6,\n", + " num_islands=1,\n", + " n_jobs=1,\n", + " objectives=['error', 'size'], #, 'complexity'],\n", + " verbosity=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "print(\"starting to run\")\n", + "\n", + "brush_estimator.run(dataset)\n", + "print(\"done\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(brush_estimator.is_fitted)\n", + "print(brush_estimator.best_ind.program.get_model())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "\n", + "mean_squared_error(\n", + " brush_estimator.best_ind.program.predict(X_test), y_test, squared=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "brush_estimator.best_ind.fitness.values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "regr = RegressorIndividual()\n", + "print(dir(regr))\n", + "\n", + "# Validation is to hold some part of the data as the inner validation split\n", + "dataset = Dataset(X=X_train, y=y_train, validation_size=0.75)\n", + "ss = SearchSpace(dataset)\n", + "params = Parameters()\n", + "\n", + "regr.init(ss, params)\n", + "\n", + "# regr.fit(X_train, y_train)\n", + "regr.program.get_model()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pybrush import BrushRegressor\n", + "\n", + "reg2 = BrushRegressor(\n", + " gens=100, pop_size=100, max_size=2**6, max_depth=6,\n", + " num_islands=1,\n", + " n_jobs=3,\n", + " objectives=['error', 'size'], #, 'complexity'],\n", + " verbosity=1\n", + ").fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reg.best_estimator_.fitness.values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pybrush import RegressorEvaluator\n", + "\n", + "# RegressorEvaluator()\n", + "print( reg.best_estimator_.program.get_model() )\n", + "print( reg.best_estimator_.fitness.values )\n", + "\n", + "RegressorEvaluator().assign_fit(\n", + " reg.best_estimator_, reg.data_, reg.parameters_, True)\n", + "print( reg.best_estimator_.fitness.values )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pybrush import RegressorEvaluator\n", + "import numpy as np\n", + "print(regr.fitness.weights)\n", + "regr.objectives = ['error', 'size']\n", + "print(regr.fitness.weights)\n", + "\n", + "regr.fitness.values = [1, 2]\n", + "regr.init(reg.search_space_, reg.parameters_)\n", + "regr.program.fit(reg.data_)\n", + "\n", + "print(regr.program.get_model())\n", + "print(regr.fitness.wvalues)\n", + "print(regr.fitness.values)\n", + "\n", + "RegressorEvaluator().assign_fit(\n", + " regr, reg.data_, reg.parameters_, False)\n", + "print( regr.fitness.values )\n", + "\n", + "def _error(ind, data):\n", + " MSE = np.mean( (data.y-ind.program.predict(data))**2 )\n", + " if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf\n", + " MSE = np.inf\n", + "\n", + " return MSE\n", + "\n", + "def _fitness_validation(ind, data):\n", + " # Fitness without fitting the expression, used with validation data\n", + "\n", + " ind_objectives = {\n", + " \"error\" : _error(ind, data),\n", + " \"size\" : ind.program.size(),\n", + " \"complexity\": ind.program.complexity()\n", + " }\n", + " return [ ind_objectives[obj] for obj in reg.objectives ]\n", + "\n", + "def _fitness_function(ind, data):\n", + " return _fitness_validation(ind, data)\n", + "\n", + "print(_fitness_function(regr, reg.data_))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import log_loss\n", + "from sklearn.datasets import load_iris, load_breast_cancer\n", + "\n", + "from pybrush import individual\n", + "from pybrush import ClassifierEvaluator\n", + "\n", + "# Load the iris dataset\n", + "iris = load_breast_cancer()\n", + "X = iris.data\n", + "y = iris.target\n", + "print(np.unique(y))\n", + "\n", + "clf = individual.ClassifierIndividual()\n", + "print(dir(clf))\n", + "\n", + "# c=True will add logistic function into the search space\n", + "# Validation is to hold some part of the data as the inner validation split\n", + "dataset = Dataset(X=X, y=y, c=True, validation_size=0.0)\n", + "ss = SearchSpace(dataset)\n", + "params = Parameters()\n", + "\n", + "clf.init(ss, params)\n", + "\n", + "# clf.fit(X_train, y_train)\n", + "clf.program.get_model()\n", + "\n", + "clf.objectives = ['error', 'size']\n", + "print(clf.fitness.weights)\n", + "\n", + "clf.fitness.values = [1, 2]\n", + "clf.program.fit(dataset)\n", + "\n", + "print(clf.program.get_model())\n", + "print(clf.fitness.wvalues)\n", + "print(clf.fitness.values)\n", + "\n", + "ClassifierEvaluator().assign_fit(clf, dataset, params, False)\n", + "print( clf.fitness.values )\n", + "def _error(ind, data):\n", + " probas = ind.program.predict_proba(data)\n", + " print(probas[:3])\n", + " probas = np.array([probas, 1-probas]).T\n", + " print(probas.shape)\n", + " print(probas[:3, :])\n", + " ERR = log_loss(data.y, probas, labels=['a', 'b'])\n", + " if not np.isfinite(ERR): # numeric erros, np.nan, +-np.inf\n", + " ERR = np.inf\n", + "\n", + " return ERR\n", + "\n", + "def _fitness_validation(ind, data):\n", + " # Fitness without fitting the expression, used with validation data\n", + "\n", + " ind_objectives = {\n", + " \"error\" : _error(ind, data),\n", + " \"size\" : ind.program.size(),\n", + " \"complexity\": ind.program.complexity()\n", + " }\n", + " return [ ind_objectives[obj] for obj in clf.objectives ]\n", + "\n", + "def _fitness_function(ind, data):\n", + " return _fitness_validation(ind, data)\n", + "\n", + "print(_fitness_function(clf, dataset))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pybrush import BrushClassifier\n", + "\n", + "clf = BrushClassifier(\n", + " gens=10, pop_size=10, max_size=2**5, max_depth=5,\n", + " num_islands=1,\n", + " n_jobs=3,\n", + " objectives=['error', 'size'], #, 'complexity'],\n", + " verbosity=1,\n", + " functions={\"Add\":1.0,\"Logistic\":1.0},\n", + ").fit(X, y)\n", + "clf.best_estimator_.program.get_model()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clf_eval = ClassifierEvaluator()\n", + "clf_eval.scorer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#from pybrush import RegressorSelector\n", + "\n", + "from _brush import RegressorSelector\n", + "\n", + "# RegressorSelector().select([reg.best_estimator_], params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "brush", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e06cd7ccfbf5951f92e573c45f45e4f99c7e2186 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 23 Apr 2024 17:23:58 -0300 Subject: [PATCH 164/199] comments print statements --- pybrush/deap_api/nsga2.py | 49 ++++++++++++++++++++------------------- src/program/tree_node.cpp | 4 ++-- src/search_space.cpp | 36 ++++++++++++++-------------- 3 files changed, 45 insertions(+), 44 deletions(-) diff --git a/pybrush/deap_api/nsga2.py b/pybrush/deap_api/nsga2.py index 048cde56..378beb62 100644 --- a/pybrush/deap_api/nsga2.py +++ b/pybrush/deap_api/nsga2.py @@ -47,10 +47,10 @@ def calculate_statistics(ind): toolbox.update_current_gen(gen) # Vary the population - print("--"*20) - print("pop before select") - for p in pop: - print(p.program.get_model()) + # print("--"*20) + # print("pop before select") + # for p in pop: + # print(p.program.get_model()) # print(p.fitness.values) # print(p.fitness.weights) # print(p.fitness.wvalues) @@ -60,39 +60,40 @@ def calculate_statistics(ind): parents = toolbox.select(pop) # , len(pop) # select method from brush's cpp side will use the values in self.parameters_ to decide how many individuals it should select - print("--"*20) - print("pop after select") - for p in pop: - print(p.program.get_model()) + # print("--"*20) + # print("pop after select") + # for p in pop: + # print(p.program.get_model()) - print("--"*20) - print("selected parents") - for p in parents: - print(p.program.get_model()) + # print("--"*20) + # print("selected parents") + # for p in parents: + # print(p.program.get_model()) offspring = toolbox.vary_pop(parents) offspring = list(toolbox.map(toolbox.assign_fit, offspring)) - print("--"*20) - print("offspring") - for p in offspring: - print(p.program.get_model()) + # print("--"*20) + # print("offspring") + # for p in offspring: + # print(p.program.get_model()) # Select the next generation population (no sorting before this step, as # survive==offspring will cut it in half) pop = toolbox.survive(pop + offspring) - print("--"*20) - print("pop after survival") - for p in pop: - print(p.program.get_model()) + # print("--"*20) + # print("pop after survival") + # for p in pop: + # print(p.program.get_model()) pop = toolbox.migrate(pop) - print("--"*20) - print("pop after migration") - for p in pop: - print(p.program.get_model()) + # print("--"*20) + # print("pop after migration") + # for p in pop: + # print(p.program.get_model()) + pop.sort(key=lambda x: x.fitness, reverse=True) record = stats.compile(pop) diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index eacacff7..d67f3793 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -162,8 +162,8 @@ int TreeNode::get_complexity() const // include the `w` and `*` if the node is weighted (and it is not a constant or mean label) if (data.get_is_weighted() && !(Is(data.node_type) - || Is(data.node_type) - || Is(data.node_type)) + || (Is(data.node_type) + || Is(data.node_type)) ) ) return operator_complexities.at(NodeType::Mul)*( operator_complexities.at(NodeType::Constant) + diff --git a/src/search_space.cpp b/src/search_space.cpp index a6b11e8b..72f15ded 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -277,7 +277,7 @@ tree& SearchSpace::PTC2(tree& Tree, // auto Tree = tree(); - fmt::print("building program with max size {}, max depth {}",max_size,max_d); + // fmt::print("building program with max size {}, max depth {}",max_size,max_d); // Queue of nodes that need children vector> queue; @@ -289,7 +289,7 @@ tree& SearchSpace::PTC2(tree& Tree, Node root = spot.node->data; - cout << "root " << root.name << endl; + // cout << "root " << root.name << endl; // auto spot = Tree.set_head(n); // updating size accordingly to root node @@ -305,7 +305,7 @@ tree& SearchSpace::PTC2(tree& Tree, //For each argument position a of n, Enqueue(a; g) for (auto a : root.arg_types) { - cout << "queing a node of type " << DataTypeName[a] << endl; + // cout << "queing a node of type " << DataTypeName[a] << endl; auto child_spot = Tree.append_child(spot); queue.push_back(make_tuple(child_spot, a, d)); } @@ -314,8 +314,8 @@ tree& SearchSpace::PTC2(tree& Tree, Node n; // Now we actually start the PTC2 procedure to create the program tree - cout << "queue size: " << queue.size() << endl; - cout << "entering first while loop...\n"; + // cout << "queue size: " << queue.size() << endl; + // cout << "entering first while loop...\n"; while ( queue.size() + s < max_size && queue.size() > 0) { // including the queue size in the max_size, since each element in queue @@ -327,14 +327,14 @@ tree& SearchSpace::PTC2(tree& Tree, // always insert a non terminal (which by default has weights off). // this way, we can have PTC2 working properly. - cout << "queue size: " << queue.size() << endl; + // cout << "queue size: " << queue.size() << endl; auto [qspot, t, d] = RandomDequeue(queue); - cout << "current depth: " << d << endl; + // cout << "current depth: " << d << endl; if (d >= max_d || s >= max_size) { // choose terminal of matching type - cout << "getting " << DataTypeName[t] << " terminal\n"; + // cout << "getting " << DataTypeName[t] << " terminal\n"; // qspot = sample_terminal(t); // Tree.replace(qspot, sample_terminal(t)); // Tree.append_child(qspot, sample_terminal(t)); @@ -354,9 +354,9 @@ tree& SearchSpace::PTC2(tree& Tree, else { //choose a nonterminal of matching type - cout << "getting op of type " << DataTypeName[t] << endl; + // cout << "getting op of type " << DataTypeName[t] << endl; auto opt = sample_op(t); - cout << "chose " << n.name << endl; + // cout << "chose " << n.name << endl; // TreeIter new_spot = Tree.append_child(qspot, n); // qspot = n; @@ -379,7 +379,7 @@ tree& SearchSpace::PTC2(tree& Tree, // For each arg of n, add to queue for (auto a : n.arg_types) { - cout << "queing a node of type " << DataTypeName[a] << endl; + // cout << "queing a node of type " << DataTypeName[a] << endl; // queue.push_back(make_tuple(new_spot, a, d+1)); auto child_spot = Tree.append_child(newspot); @@ -399,20 +399,20 @@ tree& SearchSpace::PTC2(tree& Tree, && Isnt(n.node_type) ) s += 2; - cout << "current tree size: " << s << endl; + // cout << "current tree size: " << s << endl; } - cout << "entering second while loop...\n"; + // cout << "entering second while loop...\n"; while (queue.size() > 0) { if (queue.size() == 0) break; - cout << "queue size: " << queue.size() << endl; + // cout << "queue size: " << queue.size() << endl; auto [qspot, t, d] = RandomDequeue(queue); - cout << "getting " << DataTypeName[t] << " terminal\n"; + // cout << "getting " << DataTypeName[t] << " terminal\n"; // Tree.append_child(qspot, sample_terminal(t)); // qspot = sample_terminal(t); // auto newspot = Tree.replace(qspot, sample_terminal(t)); @@ -426,9 +426,9 @@ tree& SearchSpace::PTC2(tree& Tree, auto newspot = Tree.replace(qspot, n); } - cout << "final tree:\n" - << Tree.begin().node->get_model() << "\n" - << Tree.begin().node->get_tree_model(true) << endl; + // cout << "final tree:\n" + // << Tree.begin().node->get_model() << "\n" + // << Tree.begin().node->get_tree_model(true) << endl; return Tree; }; From be98ce5cae124079c4a1f20875f8f8b7933215d4 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Tue, 23 Apr 2024 21:00:25 -0300 Subject: [PATCH 165/199] Cleaning some TODOs. Organizing files --- src/bindings/bind_engines.h | 4 +- src/bindings/bind_fitness.cpp | 2 +- src/bindings/bind_individuals.h | 2 +- src/bindings/bind_search_space.cpp | 2 +- src/bindings/bind_selection.h | 4 +- src/bindings/bind_variation.h | 8 +- src/bindings/module.cpp | 3 +- src/engine.cpp | 33 +------ src/engine.h | 6 +- src/eval/evaluation.cpp | 6 -- src/eval/evaluation.h | 6 +- src/{eval => ind}/fitness.cpp | 0 src/{eval => ind}/fitness.h | 4 +- src/{ => ind}/individual.cpp | 0 src/{ => ind}/individual.h | 34 +++---- src/pop/archive.h | 7 +- src/{ => pop}/population.cpp | 149 +---------------------------- src/{ => pop}/population.h | 13 +-- src/program/nodetype.h | 14 ++- src/program/program.h | 2 +- src/selection/selection_operator.h | 2 +- src/util/utils.h | 2 +- src/{ => vary}/search_space.cpp | 5 +- src/{ => vary}/search_space.h | 16 ++-- src/{ => vary}/variation.cpp | 1 - src/{ => vary}/variation.h | 3 +- tests/cpp/test_brush.cpp | 6 +- tests/cpp/test_optimization.cpp | 2 +- tests/cpp/test_population.cpp | 6 +- tests/cpp/test_program.cpp | 2 +- tests/cpp/test_search_space.cpp | 2 +- tests/cpp/testsHeader.h | 8 +- 32 files changed, 87 insertions(+), 267 deletions(-) rename src/{eval => ind}/fitness.cpp (100%) rename src/{eval => ind}/fitness.h (96%) rename src/{ => ind}/individual.cpp (100%) rename src/{ => ind}/individual.h (80%) rename src/{ => pop}/population.cpp (68%) rename src/{ => pop}/population.h (94%) rename src/{ => vary}/search_space.cpp (98%) rename src/{ => vary}/search_space.h (99%) rename src/{ => vary}/variation.cpp (99%) rename src/{ => vary}/variation.h (97%) diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index c166c058..7033ba59 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -15,8 +15,8 @@ #include "../eval/evaluation.h" #include "../eval/evaluation.cpp" -#include "../population.cpp" -#include "../population.h" +#include "../pop/population.cpp" +#include "../pop/population.h" using Reg = Brush::RegressorEngine; using Cls = Brush::ClassifierEngine; diff --git a/src/bindings/bind_fitness.cpp b/src/bindings/bind_fitness.cpp index 25bbc81c..8b031b10 100644 --- a/src/bindings/bind_fitness.cpp +++ b/src/bindings/bind_fitness.cpp @@ -1,6 +1,6 @@ #include "module.h" -#include "../eval/fitness.h" +#include "../ind/fitness.h" namespace nl = nlohmann; namespace br = Brush; diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index 6074ca6f..1ee1dd68 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -1,6 +1,6 @@ #include "module.h" -#include "../individual.h" +#include "../ind/individual.h" namespace nl = nlohmann; namespace br = Brush; diff --git a/src/bindings/bind_search_space.cpp b/src/bindings/bind_search_space.cpp index c86e2fba..5bb2c795 100644 --- a/src/bindings/bind_search_space.cpp +++ b/src/bindings/bind_search_space.cpp @@ -1,5 +1,5 @@ #include "module.h" -#include "../search_space.h" +#include "../vary/search_space.h" #include "../program/program.h" namespace py = pybind11; namespace br = Brush; diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index 781f65d8..2d1ed49f 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -9,8 +9,8 @@ #include "../selection/lexicase.h" #include "../selection/lexicase.cpp" -#include "../population.cpp" -#include "../population.h" +#include "../pop/population.cpp" +#include "../pop/population.h" // #include "../individual.h" //#include "../selection/selection.cpp" diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 02ab18de..9647b61d 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,9 +1,9 @@ #include "module.h" -#include "../variation.h" -#include "../variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) +#include "../vary/variation.h" +#include "../vary/variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) -#include "../population.cpp" -#include "../population.h" +#include "../pop/population.cpp" +#include "../pop/population.h" namespace py = pybind11; namespace nl = nlohmann; diff --git a/src/bindings/module.cpp b/src/bindings/module.cpp index 3c705f30..6dcf0449 100644 --- a/src/bindings/module.cpp +++ b/src/bindings/module.cpp @@ -43,8 +43,7 @@ PYBIND11_MODULE(_brush, m) { bind_search_space(m); bind_fitness(m); - // TODO: get rid of deap wrapper? - // should these 4 below be exposed? should i add them to submodules? + // TODO: should these 4 below be exposed? should i add them to submodules? bind_variations(m); bind_selections(m); bind_evaluators(m); diff --git a/src/engine.cpp b/src/engine.cpp index 4e7b359d..1bfa1a25 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -17,11 +17,6 @@ using namespace Var; template void Engine::init() { - // std::cout << "inside init" << std::endl; - - // TODO: initialize (set operator) for survivor and selector - // initialize population with initial model and/or starting pop - // TODO: get rid of omp if (params.n_jobs!=0) omp_set_num_threads(params.get_n_jobs()); @@ -52,22 +47,14 @@ void Engine::init() this->variator.init(params, ss); //std::cout << "initialized variator" << std::endl; + // initializing survivor and selector based on params this->selector = Selection(params.sel, false); - //std::cout << "created selector" << std::endl; - this->survivor = Selection(params.surv, true); - //std::cout << "created survivor" << std::endl; this->best_score = MAX_FLT; this->best_complexity = MAX_FLT; - // TODO getters and setters for the best solution found after evolution - // predict, transform, predict_proba, etc. - // load and save best individuals - // logger, save to file - // execution archive - // score functions - // fit methods (this will run the evolution) + // TODO: predict, transform, predict_proba, fit (will run the engine) this->archive.set_objectives(params.objectives); @@ -232,8 +219,7 @@ bool Engine::update_best(const Dataset& data, bool val) float bs; bs = this->best_score; - float f; - // TODO: archive here? + float f; bool updated = false; @@ -256,7 +242,6 @@ bool Engine::update_best(const Dataset& data, bool val) else f = ind.fitness.loss; - // TODO: fix this by multiplying by weight if (f*error_weight > bs*error_weight || (f == bs && ind.fitness.complexity < this->best_complexity) ) @@ -330,11 +315,9 @@ void Engine::run(Dataset &data) float fraction = 0; auto stop = [&]() { - //std::cout << "inside stop " << std::endl; - // TODO: max time return ( (generation == params.gens) - && (params.max_stall == 0 || stall_count < params.max_stall) - && (params.max_time == -1 || params.max_time > timer.Elapsed().count()) + && ((params.max_stall == 0 || stall_count < params.max_stall) + && (params.max_time == -1 || params.max_time > timer.Elapsed().count()) ) ); }; @@ -362,8 +345,6 @@ void Engine::run(Dataset &data) island_parents.at(i).resize(delta); } - //std::cout << "vectors are created " << std::endl; - // TODO: progress bar? (it would be cool) // heavily inspired in https://github.com/heal-research/operon/blob/main/source/algorithms/nsga2.cpp auto [init, cond, body, back, done] = taskflow.emplace( [&]() { /* done nothing to do */ }, // init (entry point for taskflow) @@ -404,8 +385,6 @@ void Engine::run(Dataset &data) //std::cout << "inside select parents" << std::endl; evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data - // TODO: individuals should have a flag is_fitted so we avoid re-fitting them - // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk) // TODO: if using batch, fitness should be called before selection to set the batch if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) @@ -460,7 +439,6 @@ void Engine::run(Dataset &data) //std::cout << pop.print_models() << std::endl; }).name("update, migrate and disentangle indexes between islands"); - // TODO: update log and archive auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); @@ -480,7 +458,6 @@ void Engine::run(Dataset &data) // if (use_arch) // TODO: archive // archive.update(pop,params); - // TODO: make verbosity print progress and stats if(params.verbosity>1) print_stats(log, fraction); else if(params.verbosity == 1) diff --git a/src/engine.h b/src/engine.h index 9a3d694e..26dfba76 100644 --- a/src/engine.h +++ b/src/engine.h @@ -9,10 +9,10 @@ license: GNU/GPL v3 #include "./util/rnd.h" #include "init.h" #include "params.h" -#include "population.h" +#include "pop/population.h" #include "pop/archive.h" #include "./eval/evaluation.h" -#include "variation.h" +#include "vary/variation.h" #include "selection/selection.h" #include "taskflow/taskflow.hpp" @@ -56,7 +56,7 @@ class Engine{ // TODO: best fitness instead of these. use fitness comparison float best_score; - int best_complexity; // TODO: best complexity in log/print stats? + int best_complexity; Individual& get_best_ind(){return best_ind;}; // TODO: starting pop (just like feat) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index a6c6af34..faf913ec 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -17,7 +17,6 @@ void Evaluation::update_fitness(Population& pop, //TODO: it could use the validation_loss auto idxs = pop.get_island_indexes(island); - int counter = 0; for (unsigned i = 0; i& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work @@ -26,8 +25,6 @@ void Evaluation::update_fitness(Population& pop, if (pass) { - // TODO: check if score was nan and assign the max float - // TODO: better handling of nan or inf scores when doing selection and survival (and hall of fame and rank for migration) ind.fitness.loss = MAX_FLT; ind.fitness.loss_v = MAX_FLT; ind.error = MAX_FLT*VectorXf::Ones(data.y.size()); @@ -42,10 +39,7 @@ void Evaluation::update_fitness(Population& pop, assign_fit(ind, data, params, validation); } - ++counter;// TODO: get rid of this counter } - - assert(counter > 0); } // assign loss to program diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 81952938..91fd2c47 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -4,11 +4,11 @@ #include -#include "../search_space.h" -#include "../individual.h" +#include "../vary/search_space.h" +#include "../ind/individual.h" #include "../data/data.h" #include "scorer.h" -#include "../population.h" +#include "../pop/population.h" using std::string; diff --git a/src/eval/fitness.cpp b/src/ind/fitness.cpp similarity index 100% rename from src/eval/fitness.cpp rename to src/ind/fitness.cpp diff --git a/src/eval/fitness.h b/src/ind/fitness.h similarity index 96% rename from src/eval/fitness.h rename to src/ind/fitness.h index 48d09631..36bc2ba5 100644 --- a/src/eval/fitness.h +++ b/src/ind/fitness.h @@ -8,7 +8,7 @@ using namespace nlohmann; -template <> // this is intended to be used with DEAP. TODO: decide if im going to keep it +template <> // this is intended to be used with DEAP (so our brush individuals can be hashed and compared to each other in python side) struct std::hash> { std::size_t operator()(const std::vector& v) const { std::size_t seed = v.size(); @@ -162,7 +162,7 @@ struct Fitness { // Representation for debugging std::string repr() const { - return "Fitness(TODO: implement string representation)"; + return "TODO: implement string representation"; } diff --git a/src/individual.cpp b/src/ind/individual.cpp similarity index 100% rename from src/individual.cpp rename to src/ind/individual.cpp diff --git a/src/individual.h b/src/ind/individual.h similarity index 80% rename from src/individual.h rename to src/ind/individual.h index e0ebe64b..1885d43c 100644 --- a/src/individual.h +++ b/src/ind/individual.h @@ -1,8 +1,8 @@ #ifndef INDIVIDUAL_H #define INDIVIDUAL_H -#include "program/program.h" -#include "eval/fitness.h" +#include "../program/program.h" +#include "fitness.h" #include @@ -11,8 +11,6 @@ using namespace nlohmann; namespace Brush{ namespace Pop{ -// TODO: folder for fitness and individual - template class Individual{ public: // TODO: make these private (and work with nlohman json) @@ -56,9 +54,8 @@ class Individual{ auto predict(Dataset& data) { return program.predict(data); }; // TODO: predict proba and classification related methods. - // TODO: This class should also have its own cpp wrapper. Update it into the deap api (the idea is that the user is still able to prototype with brush, I dont think we should disable that feature) - // just getters (TODO: use the attributes ) + // just getters bool get_is_fitted() const { return this->is_fitted_; }; string get_model() const { return program.get_model(); }; size_t get_size() const { return program.size(); }; @@ -74,23 +71,20 @@ class Individual{ // void Individual::set_objectives(const vector& objectives) // Static map for weights associated with strings - // TODO: change this to an attribute instead of a function - // TODO: weights for different values. loss should be calculated duing runtime, based on the metric inline static std::map weightsMap = []() { std::map map = { + // this will determine each fitness metric to be a min/max problem {"complexity", -1.0}, - {"size", -1.0} - // Add more key-value pairs as needed - }; + {"size", -1.0}, + {"mse", -1.0}, + {"log", +1.0}, + {"multi_log", +1.0}, - // TODO: move these key value initializations to line above - // example on how to have weight based on templated class - map["error"] = (T == Brush::ProgramType::Regressor) ? -1.0 : +1.0; + // generic error metrics (will use default metrics for clf or reg) + {"error", (T == Brush::ProgramType::Regressor) ? -1.0 : +1.0} - // TODO: eu deveria fazer um check para ver se a string eh error e qual o scorer_ nesse caso - map["mse"] = -1.0; - map["log"] = +1.0; - map["multi_log"] = +1.0; + // Add more key-value pairs as needed + }; return map; }(); @@ -106,8 +100,8 @@ class Individual{ if (it != weightsMap.end()) { weights.push_back(it->second); } else { - // TODO: throw error here, unknown objective - std::cout << obj << " not found in the weight map." << std::endl; + throw std::runtime_error( + "Unknown metric used as fitness. Value was " + obj); } } diff --git a/src/pop/archive.h b/src/pop/archive.h index db318049..ff6692c8 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -2,16 +2,11 @@ #define ARCHIVE_H //#include "node.h" // including node.h since definition of node is in the header -#include "../individual.h" +#include "../ind/individual.h" ///< nsga2 selection operator for getting the front #include "../selection/nsga2.h" -// TODO: do i really need these? -using std::vector; -using std::string; -using Eigen::Map; - namespace Brush{ using namespace Sel; diff --git a/src/population.cpp b/src/pop/population.cpp similarity index 68% rename from src/population.cpp rename to src/pop/population.cpp index a4d44265..b8e9152a 100644 --- a/src/population.cpp +++ b/src/pop/population.cpp @@ -77,16 +77,14 @@ void Population::init(SearchSpace& ss, const Parameters& params) for (int i=0; i::save(string filename) template void Population::load(string filename) { - - // TODO: if initializing from a population file, then this is where we should load previous models. - // three behaviors: if we have only 1 ind, then replicate it trought the entire pop - // if n_ind is the same as pop_size, load all models. if n_ind != pop_size, throw error - - //TODO: replace with from_json(j, this) call std::ifstream indata; indata.open(filename); if (!indata.good()) @@ -275,8 +267,8 @@ vector> Population::sorted_front(unsigned rank) template vector Population::hall_of_fame(unsigned rank) { - // TODO: remove this ignore offspring (things should work without it) - // this is used to migration and update archive at the end of a generation. expect islands without offspring + // this is used to migration and update archive at the end of a generation. + // Thiis function expects islands without offspring vector pf(0); @@ -298,137 +290,6 @@ vector Population::hall_of_fame(unsigned rank) return pf; } - -// template -// void Population::migrate() -// { -// // changes where island points to based on HOF and pareto fronts - -// if (num_islands==1) -// return; // skipping. this only work because update is fixing island indexes - -// // we cant use more than half of population here -// // std::cout << "finding island sorted fronts" << std::endl; -// auto island_fronts = sorted_front(1); - -// // std::cout << "finding global hall of fame" << std::endl; -// auto global_hall_of_fame = hall_of_fame(1); - -// // This method is not thread safe (as it is now) -// vector> new_island_indexes; -// new_island_indexes.resize(num_islands); - -// // std::cout << "Looping" << std::endl; -// for (int island=0; island other_islands(num_islands-1); -// iota(other_islands.begin(), other_islands.end(), 0); - -// // skipping current island -// auto it = other_islands.begin(); -// std::advance(it, island); -// for (;it != other_islands.end(); ++it) { -// ++(*it); // TODO: is this really skipping the current island? -// } - -// // picking other island -// int other_island = *r.select_randomly( -// other_islands.begin(), -// other_islands.end()); - -// migrating_idx = *r.select_randomly( -// island_fronts.at(other_island).begin(), -// island_fronts.at(other_island).end()); -// // std::cout << "mig idx" << migrating_idx << std::endl; -// } - -// // std::cout << "index " << i << " of island " << island; -// // std::cout << " is now" << migrating_idx << std::endl; - -// new_island_indexes.at(island).push_back(migrating_idx); -// } -// else -// { -// new_island_indexes.at(island).push_back(idxs.at(i)); -// } -// } -// } -// // making hard copies (so the next generation starts with islands that does not share individuals -// // this is particularly important to avoid multiple threads assigning different rank/crowdist/dcounter -// // or different fitness) - -// // std::cout << "starting to consolidate pop" << std::endl; -// vector> new_pop; -// new_pop.resize(0); -// for (int j=0; jindividuals.resize(0); -// for (auto ind : new_pop) -// { -// // making hard copies of the individuals -// json ind_copy = ind; - -// // this will fill just half of the pop -// individuals.push_back( -// std::make_shared>(ind_copy) ); -// } -// for (int i=0; i< pop_size; ++i) -// { -// // second half is space to the offspring (but we dont initialize them) -// individuals.push_back(nullptr); -// } -// } - - - template void Population::migrate() { @@ -462,7 +323,7 @@ void Population::migrate() auto it = other_islands.begin(); std::advance(it, island); for (;it != other_islands.end(); ++it) { - ++(*it); // TODO: is this really skipping the current island? + ++(*it); } // picking other island diff --git a/src/population.h b/src/pop/population.h similarity index 94% rename from src/population.h rename to src/pop/population.h index 929785e5..4a028b86 100644 --- a/src/population.h +++ b/src/pop/population.h @@ -1,17 +1,12 @@ #ifndef POPULATION_H #define POPULATION_H -#include "util/error.h" -#include "individual.h" +#include "../util/error.h" +#include "../ind/individual.h" -// TODO: do i really need these? -using std::vector; -using std::string; -using Eigen::Map; - -// TODO: folder for population and archive // TODO: move this serialization elsewhere // serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 +// (this is used by population, which has a shared_ptr vector) namespace nlohmann { template @@ -43,7 +38,6 @@ struct adl_serializer> }; } - namespace Brush { namespace Pop { @@ -66,7 +60,6 @@ class Population{ // initialize based on list of individuals void init(vector>& individuals, const Parameters& params); - // TODO: init from file (like FEAT) // save serialized population void save(string filename); // load serialized population diff --git a/src/program/nodetype.h b/src/program/nodetype.h index 4cdc1db1..14b0d2e0 100644 --- a/src/program/nodetype.h +++ b/src/program/nodetype.h @@ -50,7 +50,7 @@ enum class NodeType : uint64_t { // Each node type must have a complexity Sqrt = 1UL << 16UL, Sqrtabs = 1UL << 17UL, Square = 1UL << 18UL, - Logistic = 1UL << 19UL, + Logistic = 1UL << 19UL, // used as root for classification trees // timing masks Before = 1UL << 20UL, @@ -67,7 +67,7 @@ enum class NodeType : uint64_t { // Each node type must have a complexity OffsetSum = 1UL << 29UL, // Sum with weight as one of its arguments // Transformers - Softmax = 1UL << 30UL, + Softmax = 1UL << 30UL, // used as root for multiclf trees // Binary Add = 1UL << 31UL, @@ -97,7 +97,9 @@ enum class NodeType : uint64_t { // Each node type must have a complexity MeanLabel = 1UL << 41UL, Constant = 1UL << 42UL, Terminal = 1UL << 43UL, - ArgMax = 1UL << 44UL, // TODO: move before leaves + + // TODO: implement operators below and move them before leaves + ArgMax = 1UL << 44UL, Count = 1UL << 45UL, // custom @@ -110,8 +112,12 @@ enum class NodeType : uint64_t { // Each node type must have a complexity using UnderlyingNodeType = std::underlying_type_t; struct NodeTypes { // magic number keeping track of the number of different node types + + // index of last available node visible to search_space static constexpr size_t Count = 44; - static constexpr size_t OpCount = Count-3; // subtracting leaves + + // subtracting leaves (leaving just the ops into this) + static constexpr size_t OpCount = Count-3; // returns the index of the given type in the NodeType enum static auto GetIndex(NodeType type) -> size_t diff --git a/src/program/program.h b/src/program/program.h index b2cbf9a9..238caa8f 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -18,7 +18,7 @@ license: GNU/GPL v3 #include "../init.h" #include "tree_node.h" #include "node.h" -#include "../search_space.h" +#include "../vary/search_space.h" #include "../params.h" #include "../util/utils.h" #include "functions.h" diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h index d4b3195a..bcfab9f6 100644 --- a/src/selection/selection_operator.h +++ b/src/selection/selection_operator.h @@ -7,7 +7,7 @@ // #include "../data/data.h" // #include "../types.h" // #include "../params.h" -#include "../population.h" +#include "../pop/population.h" namespace Brush { namespace Sel { diff --git a/src/util/utils.h b/src/util/utils.h index adb8eaee..55f78647 100644 --- a/src/util/utils.h +++ b/src/util/utils.h @@ -392,7 +392,7 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Log_Stats, med_complexity, max_size, max_complexity - ); +); /// limits the output to finite real numbers template diff --git a/src/search_space.cpp b/src/vary/search_space.cpp similarity index 98% rename from src/search_space.cpp rename to src/vary/search_space.cpp index 72f15ded..7c75ec14 100644 --- a/src/search_space.cpp +++ b/src/vary/search_space.cpp @@ -1,5 +1,5 @@ #include "search_space.h" -#include "program/program.h" +#include "../program/program.h" // TODO: dont import this header here namespace Brush{ @@ -10,7 +10,8 @@ float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) // weights are initialized as the slope of the z-score of x and y. - // If y has different length from X, we get a core dump here. + // If y has different length from X, we get a core dump in this function. + // That is why Dataset makes a check for this // TODO: need to make SS (or Datasaet) check for this when loading the data vector dtypes = {'f', 'f'}; diff --git a/src/search_space.h b/src/vary/search_space.h similarity index 99% rename from src/search_space.h rename to src/vary/search_space.h index ef0ea309..0697fbae 100644 --- a/src/search_space.h +++ b/src/vary/search_space.h @@ -5,15 +5,15 @@ license: GNU/GPL v3 #ifndef SEARCHSPACE_H #define SEARCHSPACE_H //internal includes -#include "init.h" -#include "program/node.h" -#include "program/nodetype.h" -#include "program/tree_node.h" +#include "../init.h" +#include "../program/node.h" +#include "../program/nodetype.h" +#include "../program/tree_node.h" // #include "program/program.h" -#include "util/error.h" -#include "util/utils.h" -#include "util/rnd.h" -#include "params.h" +#include "../util/error.h" +#include "../util/utils.h" +#include "../util/rnd.h" +#include "../params.h" #include #include #include diff --git a/src/variation.cpp b/src/vary/variation.cpp similarity index 99% rename from src/variation.cpp rename to src/vary/variation.cpp index 35819e97..603dbbce 100644 --- a/src/variation.cpp +++ b/src/vary/variation.cpp @@ -107,7 +107,6 @@ class InsertMutation : public MutationBase if (spot_filled) { // if spot is in its child position, append children. - // TODO: reminding that sample_terminal may fail as well auto opt = SS().sample_terminal(a); if (!opt) diff --git a/src/variation.h b/src/vary/variation.h similarity index 97% rename from src/variation.h rename to src/vary/variation.h index 8b00cea7..6c9994aa 100644 --- a/src/variation.h +++ b/src/vary/variation.h @@ -7,12 +7,11 @@ license: GNU/GPL v3 #define VARIATION_H -// TODO: folder for variation. move search_space and variation to this folder // #include "util/error.h" // #include "util/utils.h" //#include "search_space.h" -#include "population.h" +#include "../pop/population.h" #include #include diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index a6b7aad1..6956c673 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -1,6 +1,6 @@ #include "testsHeader.h" -#include "../../src/search_space.h" +#include "../../src/vary/search_space.h" #include "../../src/program/program.h" // #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" @@ -12,7 +12,7 @@ #include "../../src/selection/lexicase.h" #include "../../src/eval/evaluation.h" #include "../../src/pop/archive.h" -#include "../../src/population.h" +#include "../../src/pop/population.h" // TODO: omg i need to figure out why my code only works if i import basically the whole stuff #include "../../src/selection/selection.cpp" @@ -21,7 +21,7 @@ #include "../../src/selection/lexicase.cpp" #include "../../src/eval/evaluation.cpp" #include "../../src/pop/archive.cpp" -#include "../../src/population.cpp" +#include "../../src/pop/population.cpp" // TODO: test logger, verbose, print stats, etc. TEST(Engine, EngineWorks) diff --git a/tests/cpp/test_optimization.cpp b/tests/cpp/test_optimization.cpp index 2d45ff27..b7c6fdfd 100644 --- a/tests/cpp/test_optimization.cpp +++ b/tests/cpp/test_optimization.cpp @@ -1,5 +1,5 @@ #include "testsHeader.h" -#include "../../src/search_space.h" +#include "../../src/vary/search_space.h" #include "../../src/program/program.h" #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 59d581dd..8bd2937f 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -1,7 +1,7 @@ #include "testsHeader.h" -#include "../../src/individual.cpp" -#include "../../src/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers +#include "../../src/ind/individual.cpp" +#include "../../src/pop/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers #include "../../src/eval/evaluation.cpp" #include "../../src/selection/nsga2.cpp" #include "../../src/selection/lexicase.cpp" @@ -141,6 +141,8 @@ TEST(Population, PopulationTests) } } } + + // testing that we can save and load the population pop.save("./tests/cpp/__pop_save_100_gen.json"); pop.load("./tests/cpp/__pop_save_100_gen.json"); } diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index e838d60d..0feb25ce 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -1,5 +1,5 @@ #include "testsHeader.h" -#include "../../src/search_space.h" +#include "../../src/vary/search_space.h" #include "../../src/program/program.h" #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp index 8cebd9f0..7777e4cb 100644 --- a/tests/cpp/test_search_space.cpp +++ b/tests/cpp/test_search_space.cpp @@ -1,5 +1,5 @@ #include "testsHeader.h" -#include "../../src/search_space.h" +#include "../../src/vary/search_space.h" #include "../../src/program/program.h" #include "../../src/program/dispatch_table.h" diff --git a/tests/cpp/testsHeader.h b/tests/cpp/testsHeader.h index 10016765..63c9ea9b 100644 --- a/tests/cpp/testsHeader.h +++ b/tests/cpp/testsHeader.h @@ -31,10 +31,10 @@ using std::stof; #include "../../src/program/operator.h" #include "../../src/program/dispatch_table.h" #include "../../src/program/program.h" -#include "../../src/individual.h" -#include "../../src/search_space.h" +#include "../../src/ind/individual.h" +#include "../../src/vary/search_space.h" #include "../../src/params.h" -#include "../../src/variation.h" +#include "../../src/vary/variation.h" #include "../../src/selection/selection.h" #include "../../src/selection/selection_operator.h" #include "../../src/selection/nsga2.h" @@ -43,7 +43,7 @@ using std::stof; #include "../../src/eval/metrics.h" #include "../../src/eval/scorer.h" #include "../../src/engine.h" -#include "../../src/variation.cpp" // TODO: is this ok? (otherwise I would have to create a test separated file, or move the implementation to the header) +#include "../../src/vary/variation.cpp" // TODO: is this ok? (otherwise I would have to create a test separated file, or move the implementation to the header) using namespace Brush; using namespace Brush::Data; From 134118368baa26b7f521d90cd1d6e54a569bfa6f Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 24 Apr 2024 12:22:56 -0300 Subject: [PATCH 166/199] Fixed bad minimization of fitness. Classification working --- pybrush/BrushEstimator.py | 15 +++-- src/bindings/bind_params.cpp | 1 + src/engine.cpp | 32 +++++---- src/eval/evaluation.cpp | 17 +++-- src/ind/fitness.h | 20 ++++-- src/ind/individual.h | 21 +++--- src/selection/nsga2.h | 3 +- src/vary/search_space.cpp | 2 +- tests/cpp/test_brush.cpp | 23 +++++++ tests/python/test_deap_api.py | 122 ++++++++++++++++++++++++---------- 10 files changed, 181 insertions(+), 75 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 71bf4e1d..79f9cfd8 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -17,8 +17,6 @@ from pybrush import brush_rng -# TODO: fix deap estimator breaking with num_islands > 1. write a documentation -# on how to use brush with deap class BrushEstimator(BaseEstimator): """ This is the base class for Deap-based Brush estimators. @@ -176,7 +174,6 @@ def __init__( self.weights_init=weights_init self.validation_size=validation_size - def fit(self, X, y): """ Fit an estimator to X,y. @@ -239,7 +236,15 @@ def fit(self, X, y): self.parameters_.scorer_ = "log" if self.n_classes_ == 2 else "multi_log" if self.random_state is not None: - self.parameters_.random_state = self.random_state + seed = 0 + if isinstance(self.random_state, np.random.Generator): + seed = self.random_state.integers(10000) + elif isinstance(self.random_state, int): + seed = self.random_state + else: + raise ValueError("random_state must be either a numpy random generator or an integer") + + self.parameters_.random_state = seed self.engine_ = None if self.mode == 'classification': @@ -355,7 +360,7 @@ def predict_proba(self, X): prob = self.best_estimator_.program.predict_proba(data) - if self.n_classes_ <= 2: + if self.n_classes_ == 2: prob = np.hstack( (np.ones(X.shape[0]).reshape(-1,1), prob.reshape(-1,1)) ) prob[:, 0] -= prob[:, 1] diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index bd87d0ec..47c726bb 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -20,6 +20,7 @@ void bind_params(py::module& m) .def_property("max_time", &Brush::Parameters::get_max_time, &Brush::Parameters::set_max_time) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) .def_property("scorer_", &Brush::Parameters::get_scorer_, &Brush::Parameters::set_scorer_) + .def_property("random_state", &Brush::Parameters::get_random_state, &Brush::Parameters::set_random_state) .def_property("load_population", &Brush::Parameters::get_load_population, &Brush::Parameters::set_load_population) .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) .def_property("logfile", &Brush::Parameters::get_logfile, &Brush::Parameters::set_logfile) diff --git a/src/engine.cpp b/src/engine.cpp index 1bfa1a25..9e7915d9 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -87,20 +87,25 @@ void Engine::print_progress(float percentage) template void Engine::calculate_stats(const Dataset& d) { - size_t pop_size = this->pop.size(); + int pop_size = 0; + for (int island=0; island::weightsMap[params.scorer_]; - int i=0; + int index = 0; for (int island=0; island::calculate_stats(const Dataset& d) // so we can find best score. From Fitness::dominates: // the proper way of comparing weighted values is considering // everything as a maximization problem - scores(i) = p->fitness.loss; - scores_v(i) = p->fitness.loss_v; - sizes(i) = p->fitness.size; - complexities(i) = p->fitness.complexity; - scores_v(i) = p->fitness.loss_v; - - ++i; + scores(index) = p->fitness.get_loss(); + scores_v(index) = p->fitness.get_loss_v(); + sizes(index) = p->get_size(); + complexities(index) = p->get_complexity(); + ++index; } } - assert(i == pop_size); + assert (pop_size == this->params.pop_size); - // multiply by weight again to get rid of signal + // Multiply by weight to make it a maximization problem. + // Then, multiply again to get rid of signal float best_score = (scores*error_weight).maxCoeff()*error_weight; float best_score_v = (scores_v*error_weight).maxCoeff()*error_weight; float med_score = median(scores); @@ -204,6 +208,7 @@ void Engine::print_stats(std::ofstream& log, float fraction) << "Train Loss (Med): " << stats.best_score.back() << " (" << stats.med_score.back() << ")\n" << "Val Loss (Med): " << stats.best_score_v.back() << " (" << stats.med_score_v.back() << ")\n" << "Median Size (Max): " << stats.med_size.back() << " (" << stats.max_size.back() << ")\n" + << "Median complexity (Max): " << stats.med_complexity.back() << " (" << stats.max_complexity.back() << ")\n" << "Time (s): " << timer <<"\n\n"; } @@ -458,6 +463,9 @@ void Engine::run(Dataset &data) // if (use_arch) // TODO: archive // archive.update(pop,params); + fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.gens : + timer.Elapsed().count()/params.max_time; + if(params.verbosity>1) print_stats(log, fraction); else if(params.verbosity == 1) diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index faf913ec..d440989c 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -53,19 +53,22 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, Dataset train = data.get_training_data(); float f = S.score(ind, train, errors, params); - Dataset validation = data.get_validation_data(); - float f_v = S.score(ind, validation, errors, params); + float f_v = f; + if (data.use_validation) { + Dataset validation = data.get_validation_data(); + f_v = S.score(ind, validation, errors, params); + } // TODO: implement the class weights and use it here (and on errors) + ind.set_objectives(params.objectives); + ind.error = errors; ind.fitness.set_loss(f); ind.fitness.set_loss_v(f_v); - ind.fitness.size = ind.get_size(); - ind.fitness.complexity = ind.get_complexity(); - ind.fitness.depth = ind.get_depth(); - - ind.set_objectives(params.objectives); + ind.fitness.set_size(ind.get_size()); + ind.fitness.set_complexity(ind.get_complexity()); + ind.fitness.set_depth(ind.get_depth()); vector values; values.resize(0); diff --git a/src/ind/fitness.h b/src/ind/fitness.h index 36bc2ba5..ea38987a 100644 --- a/src/ind/fitness.h +++ b/src/ind/fitness.h @@ -26,9 +26,9 @@ struct Fitness { float loss; ///< aggregate loss score float loss_v; ///< aggregate validation loss score - size_t complexity; - size_t size; - size_t depth; + unsigned int complexity; + unsigned int size; + unsigned int depth; // these can be different depending on the island the individual is unsigned int dcounter; ///< number of individuals this dominates @@ -44,6 +44,15 @@ struct Fitness { void set_loss_v(float f_v){ loss_v=f_v; }; float get_loss_v() const { return loss_v; }; + + void set_size(unsigned int new_s){ size=new_s; }; + unsigned int get_size() const { return size; }; + + void set_complexity(unsigned int new_c){ complexity=new_c; }; + unsigned int get_complexity() const { return complexity; }; + + void set_depth(unsigned int new_d){ depth=new_d; }; + unsigned int get_depth() const { return depth; }; void set_dcounter(unsigned int d){ dcounter=d; }; unsigned int get_dcounter() const { return dcounter; }; @@ -100,6 +109,7 @@ struct Fitness { values.push_back(element); } + // Minimizing/maximizing problem: negative/positive weight, respectively. wvalues.resize(weights.size()); // Perform element-wise multiplication @@ -131,8 +141,8 @@ struct Fitness { // Less than comparison bool operator<(const Fitness& other) const { - // Minimizing/maximizing problem: negative/positive weight, respectively. - return std::lexicographical_compare(wvalues.begin(), wvalues.end(), + // because of the weights, every objective is a maximization problem + return !std::lexicographical_compare(wvalues.begin(), wvalues.end(), other.wvalues.begin(), other.wvalues.end()); } diff --git a/src/ind/individual.h b/src/ind/individual.h index 1885d43c..e68c0e49 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -58,9 +58,9 @@ class Individual{ // just getters bool get_is_fitted() const { return this->is_fitted_; }; string get_model() const { return program.get_model(); }; - size_t get_size() const { return program.size(); }; - size_t get_depth() const { return program.depth(); }; - size_t get_complexity() const { return program.complexity(); }; + unsigned int get_size() const { return program.size(); }; + unsigned int get_depth() const { return program.depth(); }; + unsigned int get_complexity() const { return program.complexity(); }; Program& get_program() { return program; }; void set_fitness(Fitness &f) { fitness=f; }; @@ -77,13 +77,18 @@ class Individual{ {"complexity", -1.0}, {"size", -1.0}, {"mse", -1.0}, - {"log", +1.0}, - {"multi_log", +1.0}, + {"log", -1.0}, + {"multi_log", -1.0}, - // generic error metrics (will use default metrics for clf or reg) - {"error", (T == Brush::ProgramType::Regressor) ? -1.0 : +1.0} + {"accuracy", +1.0}, - // Add more key-value pairs as needed + // generic error metrics (will use default metrics for clf or reg) + // by default we use log and multi_log if the user specifies error + // for a classification problem. However, other metrics (such as + // accuracy or precision or AUC) can be a maximization problem, + // so this map allow us to have flexibility when setting the + // objectives + {"error", (T == Brush::ProgramType::Regressor) ? -1.0 : -1.0} }; return map; diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index 8ba735b4..d38d4f23 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -69,8 +69,9 @@ class NSGA2 : public SelectionOperator comparator_obj(const Population& population, int index) : pop(population), m(index) {}; + // because of the weighted values, every objective is a maximization problem bool operator() (int i, int j) { - return pop[i].fitness.get_wvalues()[m] < pop[j].fitness.get_wvalues()[m]; }; + return pop[i].fitness.get_wvalues()[m] > pop[j].fitness.get_wvalues()[m]; }; }; size_t tournament(Population& pop, size_t i, size_t j) const; diff --git a/src/vary/search_space.cpp b/src/vary/search_space.cpp index 7c75ec14..bed6af65 100644 --- a/src/vary/search_space.cpp +++ b/src/vary/search_space.cpp @@ -210,7 +210,7 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user extended_user_ops.insert({"Logistic", 0.0f}); } else if (user_ops.find("Softmax") == user_ops.end()) { - // extended_user_ops.insert({"Softmax", 0.0f}); + extended_user_ops.insert({"Softmax", 0.0f}); } } diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 6956c673..421d2fff 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -112,5 +112,28 @@ TEST(Engine, EngineWorks) Brush::RegressorEngine est_not_div2(params); est_not_div2.run(data); + // TODO: test predict and predict proba + + // TODO: why isnt this working for classification + // TODO: validation loss +} + + +TEST(Engine, ClassificationEngineWorks) +{ // TODO: test classifier and multiclassifier + Dataset data = Data::read_csv("docs/examples/datasets/d_analcatdata_aids.csv", "target"); + + ASSERT_TRUE(data.classification); + + Parameters params; + params.set_pop_size(100); + params.set_gens(10); + params.set_mig_prob(0.0); + params.set_scorer_("log"); + + params.set_verbosity(2); // TODO: verbosity tests + + Brush::ClassifierEngine est(params); + est.run(data); } \ No newline at end of file diff --git a/tests/python/test_deap_api.py b/tests/python/test_deap_api.py index 3d44708c..8d310cc4 100644 --- a/tests/python/test_deap_api.py +++ b/tests/python/test_deap_api.py @@ -9,7 +9,7 @@ import traceback import logging -# TODO: get deap api back and implement it as deap_nsga2 (or something like that. the idea is that it can be used as a reference. I could even do a documentation prototyping_with_brush.ipynb) +# TODO: prototyping_with_brush.ipynb or something like that @pytest.fixture def brush_args(): return dict( @@ -18,12 +18,13 @@ def brush_args(): max_size=50, max_depth=6, cx_prob= 1/7, + num_islands=1, mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}, ) @pytest.fixture -def classification_setup(): +def DEAP_classification_setup(): df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') X = df.drop(columns='target') y = df['target'] @@ -31,7 +32,7 @@ def classification_setup(): return pybrush.DeapClassifier, X, y @pytest.fixture -def multiclass_classification_setup(): +def DEAP_multiclass_classification_setup(): df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') X = df.drop(columns='target') y = df['target'] @@ -39,22 +40,52 @@ def multiclass_classification_setup(): return pybrush.DeapClassifier, X, y @pytest.fixture -def regression_setup(): +def DEAP_regression_setup(): df = pd.read_csv('docs/examples/datasets/d_enc.csv') X = df.drop(columns='label') y = df['label'] return pybrush.DeapRegressor, X, y + +@pytest.fixture +def BRUSH_classification_setup(): + df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + X = df.drop(columns='target') + y = df['target'] + + return pybrush.BrushClassifier, X, y + +@pytest.fixture +def BRUSH_multiclass_classification_setup(): + df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') + X = df.drop(columns='target') + y = df['target'] + + return pybrush.BrushClassifier, X, y + +@pytest.fixture +def BRUSH_regression_setup(): + df = pd.read_csv('docs/examples/datasets/d_enc.csv') + X = df.drop(columns='label') + y = df['label'] + + return pybrush.BrushRegressor, X, y + + @pytest.mark.parametrize('setup,algorithm', - [('classification_setup', 'nsga2island'), - ('classification_setup', 'nsga2' ), - ('classification_setup', 'gaisland' ), - ('classification_setup', 'ga' ), - ('regression_setup', 'nsga2island'), - ('regression_setup', 'nsga2' ), - ('regression_setup', 'gaisland' ), - ('regression_setup', 'ga' )]) + [('DEAP_classification_setup', 'nsga2island'), + ('DEAP_classification_setup', 'nsga2' ), + ('DEAP_classification_setup', 'gaisland' ), + ('DEAP_classification_setup', 'ga' ), + ('DEAP_regression_setup', 'nsga2island'), + ('DEAP_regression_setup', 'nsga2' ), + ('DEAP_regression_setup', 'gaisland' ), + ('DEAP_regression_setup', 'ga' ), + + ('BRUSH_classification_setup', 'nsga2island'), + ('BRUSH_regression_setup', 'nsga2island') + ]) def test_fit(setup, algorithm, brush_args, request): """Testing common utilities related to fitting and generic brush estimator. """ @@ -72,9 +103,13 @@ def test_fit(setup, algorithm, brush_args, request): pytest.fail(f"Unexpected Exception caught: {e}") logging.error(traceback.format_exc()) + @pytest.mark.parametrize('setup', - [('classification_setup'), - ('multiclass_classification_setup')]) + [('DEAP_classification_setup'), + ('DEAP_multiclass_classification_setup'), + ('BRUSH_classification_setup'), + ('BRUSH_multiclass_classification_setup'), + ]) def test_predict_proba(setup, brush_args, request): Estimator, X, y = request.getfixturevalue(setup) @@ -83,34 +118,48 @@ def test_predict_proba(setup, brush_args, request): est.fit(X, y) y_prob = est.predict_proba(X) + assert len(y_prob.shape) == 2, "predict_proba should be 2-dimensional" assert y_prob.shape[1] >= 2, \ "every class should have its own column (even for binary clf)" - -# @pytest.mark.parametrize('setup', -# [('regression_setup')]) -# def test_brush_engine(setup, brush_args, request): +# @pytest.mark.parametrize('setup,num_islands', +# [('DEAP_classification_setup', 1), +# ('DEAP_regression_setup', 1), +# ('BRUSH_classification_setup', 1), +# ('BRUSH_regression_setup', 1), + +# ('DEAP_classification_setup', -1), +# ('DEAP_regression_setup', -1), +# ('BRUSH_classification_setup', -1), +# ('BRUSH_regression_setup', -1), + +# ('DEAP_classification_setup', 2), +# ('DEAP_regression_setup', 2), +# ('BRUSH_classification_setup', 2), +# ('BRUSH_regression_setup', 2)]) +# def test_num_islands(setup, num_islands, brush_args, request): # Estimator, X, y = request.getfixturevalue(setup) -# dataset = pybrush.Dataset(X=X, y=y) - -# # TODO: pybrush parameters could have named arguments -# params = pybrush.Parameters() -# params.pop_size = 10 -# params.gens = 10 -# params.num_islands = 1 - -# eng = pybrush.RegressorEngine(params) -# # eng.run(dataset) - +# brush_args["algorithm"] = 'nsga2island' +# brush_args["num_islands"] = num_islands +# try: +# est = Estimator(**brush_args) +# est.fit(X, y) + +# print('score:', est.score(X,y)) + +# except Exception as e: +# pytest.fail(f"Unexpected Exception caught: {e}") +# logging.error(traceback.format_exc()) +# TODO: make this test for BRUSH_classification (it does not use toolbox) @pytest.mark.parametrize('setup,fixed_node', [ - ('classification_setup', 'Logistic'), - # ('multiclass_classification_setup', 'Softmax') - ]) + ('DEAP_classification_setup', 'Logistic'), + # ('DEAP_multiclass_classification_setup', 'Softmax'), + ]) def test_fixed_nodes(setup, fixed_node, brush_args, request): # Classification has a fixed root that should not change after mutation or crossover @@ -163,13 +212,14 @@ def test_fixed_nodes(setup, fixed_node, brush_args, request): -# def test_random_state(): # TODO: make it work +# TODO: make this work (i need to make each island (thread) use its own random generator) +# def test_random_state(): # test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) # test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], # [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T -# est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) -# est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) +# est1 = pybrush.BrushRegressor(random_state=42).fit(test_X, test_y) +# est2 = pybrush.BrushRegressor(random_state=42).fit(test_X, test_y) -# assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \ +# assert est1.best_estimator_.program.get_model() == est2.best_estimator_.program.get_model(), \ # "random state failed to generate same results" \ No newline at end of file From 5ee93938dffc293be6b7f838bb368b1bef395096 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 24 Apr 2024 14:52:57 -0300 Subject: [PATCH 167/199] get_rank instead of just rank (was failing to install in older python versions) --- README.md | 2 ++ src/selection/nsga2.h | 4 ++-- tests/cpp/test_brush.cpp | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6d330399..744280ca 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,8 @@ That means it should be compatible with sklearn pipelines, wrappers, and so fort In addition, Brush provides functionality that allows you to feed in more complicated data types than just matrices of floating point values. + + ## Regression ```python diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index d38d4f23..f883d832 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -51,9 +51,9 @@ class NSGA2 : public SelectionOperator auto ind1 = pop.individuals[i]; auto ind2 = pop.individuals[j]; - if (ind1->fitness.rank < ind2->fitness.rank) + if (ind1->fitness.get_rank() < ind2->fitness.get_rank()) return true; - else if (ind1->fitness.rank == ind2->fitness.rank && + else if (ind1->fitness.get_rank() == ind2->fitness.get_rank() && ind1->fitness.crowding_dist > ind2->fitness.crowding_dist) return true; return false; diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 421d2fff..615c3512 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -113,8 +113,6 @@ TEST(Engine, EngineWorks) est_not_div2.run(data); // TODO: test predict and predict proba - - // TODO: why isnt this working for classification // TODO: validation loss } From c64fad7c762d7519118633c5f2655d3bcbd13581 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 24 Apr 2024 17:58:37 -0300 Subject: [PATCH 168/199] Cleaning more TODOs --- src/engine.cpp | 17 +++++------------ src/engine.h | 2 +- src/eval/evaluation.cpp | 3 +-- src/ind/fitness.h | 4 ++-- src/ind/individual.h | 4 ++-- 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index 9e7915d9..f22ee639 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -85,7 +85,7 @@ void Engine::print_progress(float percentage) template -void Engine::calculate_stats(const Dataset& d) +void Engine::calculate_stats() { int pop_size = 0; for (int island=0; island::run(Dataset &data) tf::Taskflow taskflow; - // TODO: get references to all classes ( so they can be captured by taskflow) (like some private getters and setters) - //std::cout << "stop criteria is ready " << std::endl; // stop criteria unsigned generation = 0; @@ -447,18 +445,13 @@ void Engine::run(Dataset &data) auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); - // TODO: fix this code below (if needed. this is borrowed from feat) - // if ( (use_arch || params.verbosity>1) || !logfile.empty()) { - // // set objectives to make sure they are reported in log/verbose/arch - // #pragma omp parallel for - // for (unsigned int i=0; i1 || !logfile.empty()) { + calculate_stats(); + } // TODO: logger working // logger.log("calculate stats...",2); - calculate_stats(data); // TODO: calculate stats only if archive, logstats, or verbosity (otherwise it is not used) - // TODO: calculate stats does not need dataset // if (use_arch) // TODO: archive // archive.update(pop,params); diff --git a/src/engine.h b/src/engine.h index 26dfba76..faadce49 100644 --- a/src/engine.h +++ b/src/engine.h @@ -40,7 +40,7 @@ class Engine{ // outputs a progress bar, filled according to @param percentage. void print_progress(float percentage); - void calculate_stats(const Dataset& d); + void calculate_stats(); void print_stats(std::ofstream& log, float fraction); void log_stats(std::ofstream& log); diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index d440989c..ea8d1e78 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -13,8 +13,7 @@ void Evaluation::update_fitness(Population& pop, bool fit, bool validation ) -{ - //TODO: it could use the validation_loss +{ auto idxs = pop.get_island_indexes(island); for (unsigned i = 0; i>{}(wvalues); return h; @@ -96,7 +97,6 @@ struct Fitness { return wvalues; } - // TODO: debug size, it is giving weird values // Method to set values void set_values(vector& v) { if (v.size() != weights.size()) { diff --git a/src/ind/individual.h b/src/ind/individual.h index e68c0e49..c90856a2 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -30,7 +30,6 @@ class Individual{ Individual() { - // TODO: better initialization of arguments objectives = {"error", "complexity"}; }; @@ -45,12 +44,13 @@ class Individual{ // program = SS.make_program(params, params.max_depth, params.max_size); }; - // fitness, objetives, complexity, etc. TODO: create intermediate functions to interact with fitness and program? + // fitness, objetives, complexity, etc. void fit(Dataset& data) { program.fit(data); // this flag is used to avoid re-fitting an individual. the program is_fitted_ flag is used to perform checks (like in predict with weights). They are two different things and I think I;ll keep this way (individual is just a container to keep program and fitness together) this->is_fitted_ = true; }; + auto predict(Dataset& data) { return program.predict(data); }; // TODO: predict proba and classification related methods. From ddeb6fd91cae1611c3f59ce8bc49cf30e833aa79 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 24 Apr 2024 22:30:41 -0300 Subject: [PATCH 169/199] Fit, predict and predict_proba for individual classes --- src/bindings/bind_individuals.h | 37 ++++++++++++++++++++++++++++----- src/bindings/bind_programs.h | 1 - src/engine.cpp | 2 +- src/ind/individual.h | 37 +++++++++++++++++++++++++++------ tests/cpp/test_individuals.cpp | 1 + 5 files changed, 65 insertions(+), 13 deletions(-) diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index 1ee1dd68..e1d2277f 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -5,6 +5,11 @@ namespace nl = nlohmann; namespace br = Brush; +using Reg = br::Pop::Individual; +using Cls = br::Pop::Individual; +using MCls = br::Pop::Individual; +using Rep = br::Pop::Individual; + using stream_redirect = py::call_guard; // TODO: unify PT or T @@ -12,6 +17,11 @@ template void bind_individual(py::module& m, string name) { using Class = br::Pop::Individual; + + using RetType = std::conditional_t< + std::is_same_v, ArrayXf, + std::conditional_t, ArrayXb, + std::conditional_t, ArrayXi, ArrayXXf>>>; py::class_ ind(m, name.data() ); ind.def(py::init<>()) @@ -26,7 +36,18 @@ void bind_individual(py::module& m, string name) .def_property("objectives", &Class::get_objectives, &Class::set_objectives) .def_property_readonly("program", &Class::get_program) .def_property_readonly("fitness", &Class::get_fitness) - // .def_property("complexity", &Class::get_complexity, &Class::set_complexity) + .def("fit", + static_cast(&Class::fit), + "fit from Dataset object") + .def("fit", + static_cast &X, const Ref &y)>(&Class::fit), + "fit from X,y data") + .def("predict", + static_cast(&Class::predict), + "predict from Dataset object") + .def("predict", + static_cast &X)>(&Class::predict), + "predict from X data") .def(py::pickle( [](const Class &p) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ @@ -42,9 +63,15 @@ void bind_individual(py::module& m, string name) ) ; - // if constexpr (std::is_same_v) - // { - - // } + if constexpr (std::is_same_v) + { + ind.def("predict_proba", + static_cast(&Class::predict_proba), + "predict from Dataset object") + .def("predict_proba", + static_cast &X)>(&Class::predict_proba), + "predict from X data") + ; + } } \ No newline at end of file diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h index 81d5b294..49ca8ff7 100644 --- a/src/bindings/bind_programs.h +++ b/src/bindings/bind_programs.h @@ -71,7 +71,6 @@ void bind_program(py::module& m, string name) ; if constexpr (std::is_same_v) { - // TODO: have these in individual and wrapper prog.def("predict_proba", static_cast(&T::predict_proba), "predict from Dataset object") diff --git a/src/engine.cpp b/src/engine.cpp index f22ee639..bc885e01 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -446,7 +446,7 @@ void Engine::run(Dataset &data) bool updated_best = this->update_best(data); // TODO: use_arch - if ( params.verbosity>1 || !logfile.empty()) { + if ( params.verbosity>1 || !params.logfile.empty()) { calculate_stats(); } diff --git a/src/ind/individual.h b/src/ind/individual.h index c90856a2..980b0b4b 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -20,6 +20,7 @@ class Individual{ // error is the aggregation of error vector, and can be user sppecified + // this flag is used to avoid re-fitting an individual. the program is_fitted_ flag is used to perform checks (like in predict with weights). They are two different things and I think I;ll keep this way (individual is just a container to keep program and fitness together) bool is_fitted_ = false; VectorXf error; ///< training error (used in lexicase selectors) @@ -44,16 +45,40 @@ class Individual{ // program = SS.make_program(params, params.max_depth, params.max_size); }; - // fitness, objetives, complexity, etc. - void fit(Dataset& data) { + // TODO: replace occurences of program.fit with these (also predict and predict_proba) + Individual &fit(const Dataset& data) { program.fit(data); - // this flag is used to avoid re-fitting an individual. the program is_fitted_ flag is used to perform checks (like in predict with weights). They are two different things and I think I;ll keep this way (individual is just a container to keep program and fitness together) this->is_fitted_ = true; + return *this; + }; + Individual &fit(const Ref& X, const Ref& y) + { + Dataset d(X,y); + return fit(d); + }; + + auto predict(const Dataset& data) { return program.predict(data); }; + auto predict(const Ref& X) + { + Dataset d(X); + return predict(d); + }; + + template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) + auto predict_proba(const Dataset &d) + { + return program.predict_proba(d); + }; + + template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) + auto predict_proba(const Ref& X) + { + Dataset d(X); + return predict_proba(d); }; - - auto predict(Dataset& data) { return program.predict(data); }; - // TODO: predict proba and classification related methods. // just getters bool get_is_fitted() const { return this->is_fitted_; }; diff --git a/tests/cpp/test_individuals.cpp b/tests/cpp/test_individuals.cpp index e69de29b..86cfb2c3 100644 --- a/tests/cpp/test_individuals.cpp +++ b/tests/cpp/test_individuals.cpp @@ -0,0 +1 @@ +// TODO: test predict, predict proba, fit. \ No newline at end of file From ee5379d05f7ba31356b0c3c6599842bee2bbfc5b Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 25 Apr 2024 08:13:07 -0300 Subject: [PATCH 170/199] Engine bindings for fit and predict. Engine serialization --- pybrush/BrushEstimator.py | 7 ++-- src/bindings/bind_engines.h | 36 ++++++++++++++++++-- src/bindings/bind_individuals.h | 8 ++--- src/bindings/bind_params.cpp | 16 ++++++++- src/engine.h | 41 ++++++++++++++++++++-- src/ind/individual.h | 7 +--- src/params.h | 60 ++++++++++++++++++++++++++------- src/pop/population.h | 1 - src/util/utils.h | 1 - 9 files changed, 145 insertions(+), 32 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 79f9cfd8..82af3e9e 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -230,7 +230,10 @@ def fit(self, X, y): self.parameters_.mig_prob = self.mig_prob self.parameters_.functions = self.functions self.parameters_.mutation_probs = self.mutation_probs - + self.parameters_.validation_size = self.validation_size + self.parameters_.batch_size = self.batch_size + self.parameters_.feature_names = self.feature_names_ + self.parameters_.scorer_ = "mse" if self.mode == "classification": self.parameters_.scorer_ = "log" if self.n_classes_ == 2 else "multi_log" @@ -254,7 +257,7 @@ def fit(self, X, y): else: self.engine_ = RegressorEngine(self.parameters_) - self.engine_.run(self.data_) + self.engine_.fit(self.data_) self.best_estimator_ = self.engine_.best_ind return self diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index 7033ba59..1ac661ca 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -44,12 +44,44 @@ void bind_engine(py::module& m, string name) .def_property("params", &T::get_params, &T::set_params) .def_property_readonly("is_fitted", &T::get_is_fitted) .def_property_readonly("best_ind", &T::get_best_ind) - .def("run", &T::run, py::call_guard(), "run from brush dataset") + // .def("run", &T::run, py::call_guard(), "run from brush dataset") + .def("fit", + static_cast(&T::fit), + py::call_guard(), + "fit from Dataset object") + .def("fit", + static_cast &X, const Ref &y)>(&T::fit), + py::call_guard(), + "fit from X,y data") + .def("predict", + static_cast(&T::predict), + "predict from Dataset object") + .def("predict", + static_cast &X)>(&T::predict), + "predict from X data") + .def(py::pickle( + [](const T &p) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = p; + return j; + }, + [](nl::json j) { // __setstate__ + T p = j; + return p; + }) + ) ; // specialization for subclasses if constexpr (std::is_same_v) { - + engine.def("predict_proba", + static_cast(&T::predict_proba), + "predict from Dataset object") + .def("predict_proba", + static_cast &X)>(&T::predict_proba), + "predict from X data") + ; } } \ No newline at end of file diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index e1d2277f..5c5b62a8 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -5,10 +5,10 @@ namespace nl = nlohmann; namespace br = Brush; -using Reg = br::Pop::Individual; -using Cls = br::Pop::Individual; -using MCls = br::Pop::Individual; -using Rep = br::Pop::Individual; +using Reg = Brush::RegressorIndividual; +using Cls = Brush::ClassifierIndividual; +using MCls = Brush::MulticlassClassifierIndividual; +using Rep = Brush::RepresenterIndividual; using stream_redirect = py::call_guard; diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 47c726bb..6c740a23 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -28,6 +28,9 @@ void bind_params(py::module& m) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) .def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification) + .def_property("validation_size", &Brush::Parameters::get_validation_size, &Brush::Parameters::set_validation_size) + .def_property("feature_names", &Brush::Parameters::get_feature_names, &Brush::Parameters::set_feature_names) + .def_property("batch_size", &Brush::Parameters::get_batch_size, &Brush::Parameters::set_batch_size) .def_property("max_depth", &Brush::Parameters::get_max_depth, &Brush::Parameters::set_max_depth) .def_property("max_size", &Brush::Parameters::get_max_size, &Brush::Parameters::set_max_size) .def_property("objectives", &Brush::Parameters::get_objectives, &Brush::Parameters::set_objectives) @@ -37,6 +40,17 @@ void bind_params(py::module& m) .def_property("mig_prob", &Brush::Parameters::get_mig_prob, &Brush::Parameters::set_mig_prob) .def_property("functions", &Brush::Parameters::get_functions, &Brush::Parameters::set_functions) .def_property("mutation_probs", &Brush::Parameters::get_mutation_probs, &Brush::Parameters::set_mutation_probs) - + .def(py::pickle( + [](const Brush::Parameters &p) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = p; + return j; + }, + [](nl::json j) { // __setstate__ + Brush::Parameters p = j; + return p; + }) + ) ; } \ No newline at end of file diff --git a/src/engine.h b/src/engine.h index faadce49..ad4878c6 100644 --- a/src/engine.h +++ b/src/engine.h @@ -59,6 +59,36 @@ class Engine{ int best_complexity; Individual& get_best_ind(){return best_ind;}; + Engine &fit(Dataset& data) { + run(data); + return *this; + }; + Engine &fit(const Ref& X, const Ref& y) + { + // Using constructor 2 to create the dataset + Dataset d(X,y,params.feature_names,{},params.classification, + params.validation_size, params.batch_size); + return fit(d); + }; + + auto predict(const Dataset& data) { return this->best_ind.predict(data); }; + auto predict(const Ref& X) + { + Dataset d(X); + return predict(d); + }; + + template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) + auto predict_proba(const Dataset &d) { return this->best_ind.predict_proba(d); }; + template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) + auto predict_proba(const Ref& X) + { + Dataset d(X); + return predict_proba(d); + }; + // TODO: starting pop (just like feat) // TODO: make thesqe work @@ -74,9 +104,11 @@ class Engine{ // ArrayXXf predict_proba(MatrixXf& X); // archive stuff + // TODO: make these work ///return archive size int get_archive_size(){ return this->archive.individuals.size(); }; + ///return population as string vector get_archive(bool front); @@ -86,11 +118,11 @@ class Engine{ // ArrayXXf predict_proba_archive(int id, MatrixXf& X, LongData& Z); // ArrayXXf predict_proba_archive(int id, MatrixXf& X); - /// train the model void run(Dataset &d); Parameters params; ///< hyperparameters of brush, which the user can interact + Individual best_ind; private: SearchSpace ss; @@ -105,7 +137,6 @@ class Engine{ Timer timer; ///< start time of training Archive archive; ///< pareto front archive - Individual best_ind; bool is_fitted; ///< keeps track of whether fit was called. void init(); @@ -114,7 +145,11 @@ class Engine{ inline void set_is_fitted(bool f){is_fitted=f;} }; -// TODO: serialization for engine with NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE +// Only stuff to make new predictions or call fit again +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind); } // Brush diff --git a/src/ind/individual.h b/src/ind/individual.h index 980b0b4b..ac6692ca 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -66,11 +66,7 @@ class Individual{ template requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) - auto predict_proba(const Dataset &d) - { - return program.predict_proba(d); - }; - + auto predict_proba(const Dataset &d) { return program.predict_proba(d); }; template requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) auto predict_proba(const Ref& X) @@ -79,7 +75,6 @@ class Individual{ return predict_proba(d); }; - // just getters bool get_is_fitted() const { return this->is_fitted_; }; string get_model() const { return program.get_model(); }; diff --git a/src/params.h b/src/params.h index a62f84f0..da61ae15 100644 --- a/src/params.h +++ b/src/params.h @@ -10,10 +10,10 @@ license: GNU/GPL v3 #include "util/logger.h" namespace ns = nlohmann; + namespace Brush { - struct Parameters { public: @@ -58,22 +58,18 @@ struct Parameters string scorer_="mse"; ///< actual loss function used, determined by error - // for classification (TODO: should I have these, or they could be just dataset arguments (except the ones needed to use in dataset constructor)) - - bool classification; - unsigned int n_classes; ///< number of classes for classification - // TODO: set these values when creating the parameters in python side - vector classes; ///< class labels + vector classes; ///< class labels vector class_weights; ///< weights for each class vector sample_weights; ///< weights for each sample - // for dataset. TODO: make it work - bool shuffle = true; ///< option to shuffle the data - float split = 0.75; ///< fraction of data to use for training - vector feature_names; ///< names of features + // for creating dataset from X and y in Engine::fit. Ignored if + // the uses uses an dataset + bool classification; + unsigned int n_classes; + float validation_size = 0.75; + vector feature_names = {}; float batch_size = 0.0; - bool use_batch = false; ///< whether to use mini batch for training string load_population = ""; string save_population = ""; @@ -153,6 +149,15 @@ struct Parameters void set_n_classes(unsigned int new_n_classes){ n_classes = new_n_classes; }; unsigned int get_n_classes(){ return n_classes; }; + void set_validation_size(float s){ validation_size = s; }; + float get_validation_size(){ return validation_size; }; + + void set_feature_names(vector vn){ feature_names = vn; }; + vector get_feature_names(){ return feature_names; }; + + void set_batch_size(float c){ batch_size = c; }; + float get_batch_size(){ return batch_size; }; + //TODO: unify unordered or ordered void set_mutation_probs(std::map new_mutation_probs){ mutation_probs = new_mutation_probs; }; std::map get_mutation_probs(){ return mutation_probs; }; @@ -160,6 +165,37 @@ struct Parameters void set_functions(std::unordered_map new_functions){ functions = new_functions; }; std::unordered_map get_functions(){ return functions; }; }; + +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Parameters, + verbosity, + random_state, + pop_size, + gens, + max_stall, + max_time, + scorer_, + load_population, + save_population, + logfile, + current_gen, + num_islands, + max_depth, + n_jobs, + max_size, + objectives, + sel, + surv, + cx_prob, + mig_prob, + classification, + n_classes, + validation_size, + feature_names, + batch_size, + mutation_probs, + functions +); + } // Brush #endif diff --git a/src/pop/population.h b/src/pop/population.h index 4a028b86..783c3011 100644 --- a/src/pop/population.h +++ b/src/pop/population.h @@ -124,7 +124,6 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( Population, individuals, island_indexes, pop_size, num_islands); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( Population, individuals, island_indexes, pop_size, num_islands); - }// Pop }// Brush diff --git a/src/util/utils.h b/src/util/utils.h index 55f78647..932a0cca 100644 --- a/src/util/utils.h +++ b/src/util/utils.h @@ -378,7 +378,6 @@ struct Log_Stats typedef struct Log_Stats Log_stats; -// TODO: change this to something more modern NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Log_Stats, generation, time, From bd6a5d45f754703d9ef03a606313be71f54b04f2 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 25 Apr 2024 10:28:28 -0300 Subject: [PATCH 171/199] Deleted test notebook --- test.ipynb | 410 ----------------------------------------------------- 1 file changed, 410 deletions(-) delete mode 100644 test.ipynb diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 0250befd..00000000 --- a/test.ipynb +++ /dev/null @@ -1,410 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "libcbrush.so: cannot open shared object file: No such file or directory", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mean_squared_error, r2_score\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m_brush\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindividual\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RegressorIndividual\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m_brush\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SearchSpace, Parameters, Dataset\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpybrush\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DeapRegressor\n", - "\u001b[0;31mImportError\u001b[0m: libcbrush.so: cannot open shared object file: No such file or directory" - ] - } - ], - "source": [ - "from sklearn import datasets\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import mean_squared_error, r2_score\n", - "\n", - "from _brush.individual import RegressorIndividual\n", - "from _brush import SearchSpace, Parameters, Dataset\n", - "\n", - "from pybrush import DeapRegressor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load the diabetes dataset\n", - "diabetes = datasets.load_diabetes()\n", - "\n", - "# Use only one feature\n", - "X = diabetes.data[:, None, 2]\n", - "y = diabetes.target\n", - "\n", - "import pandas as pd\n", - "\n", - "# df = pd.read_csv(\"https://raw.githubusercontent.com/gAldeia/hashing-symbolic-expressions/master/data/lexicase_paper/d_airfoil.txt?token=GHSAT0AAAAAACPJ5UIOJY42GOUHC4GKZOBOZPS7BHA\")\n", - "# X = df.drop('label', axis=1)\n", - "# y = df['label']\n", - "\n", - "print(X.shape, y.shape)\n", - "# Split the data into training/testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pybrush import engine\n", - "print(\"imported\")\n", - "\n", - "# Validation is to hold some part of the data as the inner validation split\n", - "dataset = Dataset(X=X_train, y=y_train, validation_size=0.75)\n", - "print(\"dataset\")\n", - "\n", - "params = Parameters()\n", - "print(\"parameters\")\n", - "\n", - "brush_estimator = engine.RegressorEngine(params)\n", - "print(\"estimator\")\n", - "\n", - "print(brush_estimator.params.pop_size)\n", - "brush_estimator.params.pop_size = 100\n", - "brush_estimator.params.gens = 100\n", - "brush_estimator.params.num_islands = 5\n", - "brush_estimator.params.max_size = 2**6\n", - "brush_estimator.params.max_depth = 6\n", - "brush_estimator.params.n_jobs = 5\n", - "brush_estimator.params.objectives = [\"error\", \"size\"]\n", - "print(brush_estimator.params.pop_size)\n", - "\n", - "print(brush_estimator.is_fitted)\n", - "print(brush_estimator.best_ind.program.get_model())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "reg = DeapRegressor(\n", - " gens=100, pop_size=100, max_size=2**6, max_depth=6,\n", - " num_islands=1,\n", - " n_jobs=1,\n", - " objectives=['error', 'size'], #, 'complexity'],\n", - " verbosity=1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "print(\"starting to run\")\n", - "\n", - "brush_estimator.run(dataset)\n", - "print(\"done\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(brush_estimator.is_fitted)\n", - "print(brush_estimator.best_ind.program.get_model())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import mean_squared_error\n", - "\n", - "mean_squared_error(\n", - " brush_estimator.best_ind.program.predict(X_test), y_test, squared=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "brush_estimator.best_ind.fitness.values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "regr = RegressorIndividual()\n", - "print(dir(regr))\n", - "\n", - "# Validation is to hold some part of the data as the inner validation split\n", - "dataset = Dataset(X=X_train, y=y_train, validation_size=0.75)\n", - "ss = SearchSpace(dataset)\n", - "params = Parameters()\n", - "\n", - "regr.init(ss, params)\n", - "\n", - "# regr.fit(X_train, y_train)\n", - "regr.program.get_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "reg.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pybrush import BrushRegressor\n", - "\n", - "reg2 = BrushRegressor(\n", - " gens=100, pop_size=100, max_size=2**6, max_depth=6,\n", - " num_islands=1,\n", - " n_jobs=3,\n", - " objectives=['error', 'size'], #, 'complexity'],\n", - " verbosity=1\n", - ").fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "reg.best_estimator_.fitness.values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pybrush import RegressorEvaluator\n", - "\n", - "# RegressorEvaluator()\n", - "print( reg.best_estimator_.program.get_model() )\n", - "print( reg.best_estimator_.fitness.values )\n", - "\n", - "RegressorEvaluator().assign_fit(\n", - " reg.best_estimator_, reg.data_, reg.parameters_, True)\n", - "print( reg.best_estimator_.fitness.values )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pybrush import RegressorEvaluator\n", - "import numpy as np\n", - "print(regr.fitness.weights)\n", - "regr.objectives = ['error', 'size']\n", - "print(regr.fitness.weights)\n", - "\n", - "regr.fitness.values = [1, 2]\n", - "regr.init(reg.search_space_, reg.parameters_)\n", - "regr.program.fit(reg.data_)\n", - "\n", - "print(regr.program.get_model())\n", - "print(regr.fitness.wvalues)\n", - "print(regr.fitness.values)\n", - "\n", - "RegressorEvaluator().assign_fit(\n", - " regr, reg.data_, reg.parameters_, False)\n", - "print( regr.fitness.values )\n", - "\n", - "def _error(ind, data):\n", - " MSE = np.mean( (data.y-ind.program.predict(data))**2 )\n", - " if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf\n", - " MSE = np.inf\n", - "\n", - " return MSE\n", - "\n", - "def _fitness_validation(ind, data):\n", - " # Fitness without fitting the expression, used with validation data\n", - "\n", - " ind_objectives = {\n", - " \"error\" : _error(ind, data),\n", - " \"size\" : ind.program.size(),\n", - " \"complexity\": ind.program.complexity()\n", - " }\n", - " return [ ind_objectives[obj] for obj in reg.objectives ]\n", - "\n", - "def _fitness_function(ind, data):\n", - " return _fitness_validation(ind, data)\n", - "\n", - "print(_fitness_function(regr, reg.data_))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import log_loss\n", - "from sklearn.datasets import load_iris, load_breast_cancer\n", - "\n", - "from pybrush import individual\n", - "from pybrush import ClassifierEvaluator\n", - "\n", - "# Load the iris dataset\n", - "iris = load_breast_cancer()\n", - "X = iris.data\n", - "y = iris.target\n", - "print(np.unique(y))\n", - "\n", - "clf = individual.ClassifierIndividual()\n", - "print(dir(clf))\n", - "\n", - "# c=True will add logistic function into the search space\n", - "# Validation is to hold some part of the data as the inner validation split\n", - "dataset = Dataset(X=X, y=y, c=True, validation_size=0.0)\n", - "ss = SearchSpace(dataset)\n", - "params = Parameters()\n", - "\n", - "clf.init(ss, params)\n", - "\n", - "# clf.fit(X_train, y_train)\n", - "clf.program.get_model()\n", - "\n", - "clf.objectives = ['error', 'size']\n", - "print(clf.fitness.weights)\n", - "\n", - "clf.fitness.values = [1, 2]\n", - "clf.program.fit(dataset)\n", - "\n", - "print(clf.program.get_model())\n", - "print(clf.fitness.wvalues)\n", - "print(clf.fitness.values)\n", - "\n", - "ClassifierEvaluator().assign_fit(clf, dataset, params, False)\n", - "print( clf.fitness.values )\n", - "def _error(ind, data):\n", - " probas = ind.program.predict_proba(data)\n", - " print(probas[:3])\n", - " probas = np.array([probas, 1-probas]).T\n", - " print(probas.shape)\n", - " print(probas[:3, :])\n", - " ERR = log_loss(data.y, probas, labels=['a', 'b'])\n", - " if not np.isfinite(ERR): # numeric erros, np.nan, +-np.inf\n", - " ERR = np.inf\n", - "\n", - " return ERR\n", - "\n", - "def _fitness_validation(ind, data):\n", - " # Fitness without fitting the expression, used with validation data\n", - "\n", - " ind_objectives = {\n", - " \"error\" : _error(ind, data),\n", - " \"size\" : ind.program.size(),\n", - " \"complexity\": ind.program.complexity()\n", - " }\n", - " return [ ind_objectives[obj] for obj in clf.objectives ]\n", - "\n", - "def _fitness_function(ind, data):\n", - " return _fitness_validation(ind, data)\n", - "\n", - "print(_fitness_function(clf, dataset))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pybrush import BrushClassifier\n", - "\n", - "clf = BrushClassifier(\n", - " gens=10, pop_size=10, max_size=2**5, max_depth=5,\n", - " num_islands=1,\n", - " n_jobs=3,\n", - " objectives=['error', 'size'], #, 'complexity'],\n", - " verbosity=1,\n", - " functions={\"Add\":1.0,\"Logistic\":1.0},\n", - ").fit(X, y)\n", - "clf.best_estimator_.program.get_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "clf_eval = ClassifierEvaluator()\n", - "clf_eval.scorer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#from pybrush import RegressorSelector\n", - "\n", - "from _brush import RegressorSelector\n", - "\n", - "# RegressorSelector().select([reg.best_estimator_], params)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "brush", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From d3538e3d44442175d1ae53803a4ff28e68911a9f Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 25 Apr 2024 10:48:40 -0300 Subject: [PATCH 172/199] Fixed missing taskflow. simple test to see if validation is being used --- environment.yml | 1 + src/engine.cpp | 7 +++---- tests/cpp/test_brush.cpp | 11 ++++++++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 0325ff03..ee0fe201 100644 --- a/environment.yml +++ b/environment.yml @@ -11,6 +11,7 @@ dependencies: - ninja - ceres-solver=2.1.0 - pybind11>=2.6.2 + - taskflow - pytest #=6.2.4 - pydot - scikit-learn diff --git a/src/engine.cpp b/src/engine.cpp index bc885e01..4bae6972 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -242,10 +242,9 @@ bool Engine::update_best(const Dataset& data, bool val) //std::cout << ind.program.get_model() << std::endl; //std::cout << "got individual of rank" << ind.fitness.rank << std::endl; - if (val) - f = ind.fitness.loss_v; - else - f = ind.fitness.loss; + + // if there is no validation, then loss_v == loss and this should work just fine + f = ind.fitness.loss_v; if (f*error_weight > bs*error_weight || (f == bs && ind.fitness.complexity < this->best_complexity) diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 615c3512..b265063d 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -46,10 +46,19 @@ TEST(Engine, EngineWorks) params.set_verbosity(2); // TODO: verbosity tests + // checking if validation size works + params.set_validation_size(0.2); + std::cout << "n jobs = 1" << std::endl; params.set_n_jobs(1); Brush::RegressorEngine est5(params); - est5.run(data); + est5.run(data); // this will not use validation size from parameters + std::cout << "best individual using run(data)" << std::endl; + std::cout << est5.best_ind.program.get_model() << std::endl; + + est5.fit(X, y); // this will use validation size from parameters + std::cout << "best individual using fit(X, y)" << std::endl; + std::cout << est5.best_ind.program.get_model() << std::endl; std::cout << "n jobs = 2" << std::endl; params.set_n_jobs(2); From fe84d8d2935f8311526cc3d6f3934e125c51de9d Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 2 May 2024 09:28:02 -0300 Subject: [PATCH 173/199] Working (but slightly incorrect) implementation of archive --- src/engine.cpp | 2 - src/eval/evaluation.cpp | 3 + src/pop/archive.cpp | 129 +++++++++++++++++++++++++++++++++++++++ src/pop/archive.h | 20 +++--- src/pop/population.cpp | 4 ++ src/vary/variation.cpp | 2 +- tests/cpp/test_brush.cpp | 3 + 7 files changed, 150 insertions(+), 13 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index 4bae6972..33825127 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -54,8 +54,6 @@ void Engine::init() this->best_score = MAX_FLT; this->best_complexity = MAX_FLT; - // TODO: predict, transform, predict_proba, fit (will run the engine) - this->archive.set_objectives(params.objectives); // start the clock diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index ea8d1e78..d892e5a0 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -62,6 +62,9 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, ind.set_objectives(params.objectives); + // we will always set all values for fitness (regardless of being used). + // this will make sure the information is calculated and ready to be used + // regardless of how the program is set to run. ind.error = errors; ind.fitness.set_loss(f); ind.fitness.set_loss_v(f_v); diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 7ea92e8d..7ae8df01 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -3,6 +3,135 @@ namespace Brush { namespace Pop { + +template +Archive::Archive(): selector(true) {}; + +template +void Archive::set_objectives(vector objectives) +{ + this->sort_complexity = in(objectives, std::string("complexity")); +} + +// sorting etc --- all done using fitness class (easier to compare regardless of obj func) +template +bool Archive::sortComplexity(const Individual& lhs, + const Individual& rhs) +{ + // TODO: use getters for all info in fitness (instead of directly accessing them?). + // other option would be having the getters and setters to use iin pybind11, but + // in cpp we do it directly (we know how to manipulate this thing, but users may not, + // so these setters could do some validation to justify its existence). + + return lhs.fitness.complexity < rhs.fitness.complexity; +} + +template +bool Archive::sortObj(const Individual& lhs, + const Individual& rhs, const int index) +{ + // sort based on index (we can have more than 2 obj in brush implementation) + // obs: because of the weights, every objective is a maximization problem + // when comparing weighted values (which should be the right way of doing it) + // the bigger the better. the weights allow us to use different min/max metrics + // without having to deal with this particular details + + return lhs.fitness.wvalues.at(index) > rhs.fitness.wvalues.at(index); +} + +template +bool Archive::sameFitComplexity(const Individual& lhs, + const Individual& rhs) +{ + // fitness' operator== is overloaded to compare wvalues. + // we also check complexity equality to avoid the case where the user + // did not specified complexity as one of the objectives + return (lhs.fitness == rhs.fitness && + lhs.fitness.complexity == rhs.fitness.complexity); +} + +template +bool Archive::sameObjectives(const Individual& lhs, + const Individual& rhs) +{ + for (const auto& o_lhs : lhs.fitness) + { + for (const auto& o_rhs : rhs.fitness) + { + if (o_lhs != o_rhs) + return false; + } + } + return true; +} + +template +void Archive::init(Population& pop) +{ + // TODO: copy the population to a new vector (instead of changing inplace). + // also, fix this in update function + + individuals.resize(0); + + // dealing with islands --> fast nds for each island + for (int island =0; island< pop.num_islands; ++island) { + selector.fast_nds(pop, island); + } + + // OBS: fast_nds will change all individual fitness inplace. + // It will update the values for dcounter, rank, and dominated individuals. + + // TODO: fix this way of getting pareto front (the pareto front of different islands combined will not necessarily be the final pareto front). Also fix this in update + + /* vector front = this->sorted_front(); */ + for (int island =0; island< pop.num_islands; ++island) { + auto idxs = pop.get_island_indexes(island); + + for (unsigned i = 0; isort_complexity) + std::sort(individuals.begin(),individuals.end(), &sortComplexity); + else + std::sort(individuals.begin(),individuals.end(), &sortObj); + +} + +template +void Archive::update(const Population& pop, const Parameters& params) +{ + individuals.resize(0); // clear archive + + // refill archive with new pareto fronts (one pareto front for each island!) + for (int island =0; island< pop.num_islands; ++island) { + auto front = selector.fast_nds(pop, island); + for (const auto& i : front) + { + individuals.push_back( pop.individuals.at(i) ); + } + } + if (this->sort_complexity) + std::sort(individuals.begin(),individuals.end(),&sortComplexity); + else + std::sort(individuals.begin(),individuals.end(), &sortObj); + + /* auto it = std::unique(individuals.begin(),individuals.end(), &sameFitComplexity); */ + auto it = std::unique(individuals.begin(),individuals.end(), + &sameObjectives); + + individuals.resize(std::distance(individuals.begin(),it)); +} + } } \ No newline at end of file diff --git a/src/pop/archive.h b/src/pop/archive.h index ff6692c8..8abbac9d 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -20,30 +20,30 @@ struct Archive vector> individuals; ///< individual programs in the archive bool sort_complexity; ///< whether to sort archive by complexity + // using NSGA2 in survival mode (nsga2 does not implement selection) NSGA2 selector; - Archive(){}; - ~Archive(){}; + Archive(); - void init(Population& pop){}; + void init(Population& pop); - void update(const Population& pop, const Parameters& params){}; + void update(const Population& pop, const Parameters& params); - void set_objectives(vector objectives){}; + void set_objectives(vector objectives); /// Sort population in increasing complexity. static bool sortComplexity(const Individual& lhs, - const Individual& rhs){ return false; }; + const Individual& rhs); /// Sort population by first objective. - static bool sortObj1(const Individual& lhs, - const Individual& rhs){ return false; }; + static bool sortObj(const Individual& lhs, + const Individual& rhs, const int index=0); /// check for repeats static bool sameFitComplexity(const Individual& lhs, - const Individual& rhs){ return false; }; + const Individual& rhs); static bool sameObjectives(const Individual& lhs, - const Individual& rhs){ return false; }; + const Individual& rhs); }; //serialization diff --git a/src/pop/population.cpp b/src/pop/population.cpp index b8e9152a..1ab1e48a 100644 --- a/src/pop/population.cpp +++ b/src/pop/population.cpp @@ -267,6 +267,10 @@ vector> Population::sorted_front(unsigned rank) template vector Population::hall_of_fame(unsigned rank) { + // TODO: hall of fame should unify all pareto fronts by doing a new fast_nds. + // TODO: use hall of fame instead of re-implmementing this feature in + // archive init and update functions + // this is used to migration and update archive at the end of a generation. // Thiis function expects islands without offspring diff --git a/src/vary/variation.cpp b/src/vary/variation.cpp index 603dbbce..213bbd7a 100644 --- a/src/vary/variation.cpp +++ b/src/vary/variation.cpp @@ -618,7 +618,7 @@ void Variation::vary(Population& pop, int island, { auto idxs = pop.get_island_indexes(island); - // TODO: fix pragma omp usage + // TODO: fix pragma omp usage (by fix I mean remove) //#pragma omp parallel for for (unsigned i = 0; i Date: Fri, 3 May 2024 18:53:00 -0300 Subject: [PATCH 174/199] Archive implementation. Individual ids. New TODOs to solve --- pybrush/BrushEstimator.py | 58 +++++++++++++++- src/bindings/bind_engines.h | 14 ++++ src/bindings/bind_params.cpp | 1 + src/bindings/bind_variation.h | 2 +- src/engine.cpp | 119 +++++++++++++++++++++++++++++++-- src/engine.h | 33 +++++---- src/ind/individual.h | 18 +++++ src/params.h | 11 +++ src/pop/archive.cpp | 45 +++++++------ src/pop/archive.h | 14 ++-- src/vary/variation.cpp | 24 +++++-- src/vary/variation.h | 5 +- tests/cpp/test_brush.cpp | 8 +++ tests/cpp/test_individuals.cpp | 4 +- tests/cpp/test_population.cpp | 2 +- 15 files changed, 301 insertions(+), 57 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 82af3e9e..21ad723c 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -89,6 +89,9 @@ class BrushEstimator(BaseEstimator): val_from_arch: boolean, optional (default: True) Validates the final model using the archive rather than the whole population. + use_arch: boolean, optional (default: False) + Determines if we should save pareto front of the entire evolution + (when set to True) or just the final population (False). batch_size : float, default 1.0 Percentage of training data to sample every generation. If `1.0`, then all data is used. Very small values can improve execution time, but @@ -146,6 +149,7 @@ def __init__( logfile="", weights_init=True, val_from_arch=True, + use_arch=False, validation_size: float = 0.0, batch_size: float = 1.0 ): @@ -165,7 +169,8 @@ def __init__( self.cx_prob=cx_prob self.logfile=logfile self.mutation_probs=mutation_probs - self.val_from_arch=val_from_arch # TODO: val from arch + self.val_from_arch=val_from_arch # TODO: val from arch implementation (in cpp side) + self.use_arch=use_arch self.functions=functions self.objectives=objectives self.initialization=initialization @@ -227,6 +232,8 @@ def fit(self, X, y): self.parameters_.max_size = self.max_size self.parameters_.objectives = self.objectives self.parameters_.cx_prob = self.cx_prob + self.parameters_.use_arch = self.use_arch + self.parameters_.val_from_arch = self.val_from_arch self.parameters_.mig_prob = self.mig_prob self.parameters_.functions = self.functions self.parameters_.mutation_probs = self.mutation_probs @@ -312,6 +319,30 @@ def get_params(self, deep=True): out[key] = value return out + def predict_archive(self, X): + """Returns a list of dictionary predictions for all models.""" + check_is_fitted(self) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = Dataset(X=X, ref_dataset=self.data_, c=self.mode == "classification", + feature_names=self.feature_names_) + + archive = self.engine_.get_archive() + + preds = [] + for ind in archive: + tmp = { + 'id' : ind['id'], + 'y_pred' : self.engine_.predict_archive(ind['id'], data) + } + preds.append(tmp) + + return preds + class BrushClassifier(BrushEstimator,ClassifierMixin): """Deap-based Brush for classification. @@ -368,6 +399,31 @@ def predict_proba(self, X): prob[:, 0] -= prob[:, 1] return prob + + + def predict_archive(self, X): + """Returns a list of dictionary predictions for all models.""" + check_is_fitted(self) + + if isinstance(X, pd.DataFrame): + X = X.values + + assert isinstance(X, np.ndarray) + + data = Dataset(X=X, ref_dataset=self.data_, c=True, + feature_names=self.feature_names_) + + archive = self.engine_.get_archive() + + preds = [] + for ind in archive: + tmp = { + 'id' : ind['id'], + 'y_pred' : self.engine_.predict_proba_archive(ind['id'], data) + } + preds.append(tmp) + + return preds class BrushRegressor(BrushEstimator, RegressorMixin): diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index 1ac661ca..24acd2c1 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -59,6 +59,13 @@ void bind_engine(py::module& m, string name) .def("predict", static_cast &X)>(&T::predict), "predict from X data") + .def("predict_archive", + static_cast(&T::predict_archive), + "predict from individual in archive") + .def("predict_archive", + static_cast &X)>(&T::predict_archive), + "predict from individual in archive") + .def("get_archive", &T::get_archive, py::arg("front") = false) .def(py::pickle( [](const T &p) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ @@ -82,6 +89,13 @@ void bind_engine(py::module& m, string name) .def("predict_proba", static_cast &X)>(&T::predict_proba), "predict from X data") + .def("predict_proba_archive", + static_cast(&T::predict_proba_archive), + "predict from individual in archive") + .def("predict_proba_archive", + static_cast &X)>(&T::predict_proba_archive), + "predict from individual in archive") + ; } } \ No newline at end of file diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 6c740a23..dbf3c316 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -25,6 +25,7 @@ void bind_params(py::module& m) .def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population) .def_property("logfile", &Brush::Parameters::get_logfile, &Brush::Parameters::set_logfile) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) + .def_property("use_arch", &Brush::Parameters::get_use_arch, &Brush::Parameters::set_use_arch) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) .def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification) diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 9647b61d..65613aab 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -43,7 +43,7 @@ void bind_variation(py::module& m, string name) // including offspring indexes (the vary method will store the offspring in the second half of the index vector) pop.add_offspring_indexes(island); - self.vary(pop, island, parents); + self.vary(pop, island, parents, params); // making copies of the second half of the island individuals vector idxs = pop.get_island_indexes(island); diff --git a/src/engine.cpp b/src/engine.cpp index 33825127..e872b7d6 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -211,6 +211,97 @@ void Engine::print_stats(std::ofstream& log, float fraction) <<"\n\n"; } +template +vector Engine::get_archive(bool front) +{ + json j; // TODO: use this front argument (or remove it). I think I can remove + for (const auto& ind : archive.individuals) { + to_json(j, ind); // Serialize each individual + } + return j; +} + +// TODO: private function called find_individual that searches for it based on id. Then, +// use this function in predict_archive and predict_proba_archive. +template +auto Engine::predict_archive(int id, const Dataset& data) +{ + if (id == best_ind.id) + return best_ind.predict(data); + + for (int i = 0; i < this->archive.individuals.size(); ++i) + { + Individual& ind = this->archive.individuals.at(i); + + if (id == ind.id) + return ind.predict(data); + } + for (int island=0; islandid) + return ind->predict(data); + } + } + + std::runtime_error("Could not find id = " + + to_string(id) + "in archive or population."); + + return best_ind.predict(data); +} + +template +auto Engine::predict_archive(int id, const Ref& X) +{ + Dataset d(X); + return predict_archive(id, d); +} + +template +template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) +auto Engine::predict_proba_archive(int id, const Dataset& data) +{ + if (id == best_ind.id) + return best_ind.predict_proba(data); + + for (int i = 0; i < this->archive.individuals.size(); ++i) + { + Individual& ind = this->archive.individuals.at(i); + + if (id == ind.id) + return ind.predict_proba(data); + } + for (int island=0; islandid) + return ind->predict_proba(data); + } + } + + std::runtime_error("Could not find id = " + + to_string(id) + "in archive or population."); + + return best_ind.predict_proba(data); +} + +template +template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) +auto Engine::predict_proba_archive(int id, const Ref& X) +{ + Dataset d(X); + return predict_proba_archive(id, d); +} template // TODO: use the dataset, or ignore it bool Engine::update_best(const Dataset& data, bool val) @@ -313,6 +404,7 @@ void Engine::run(Dataset &data) unsigned generation = 0; unsigned stall_count = 0; float fraction = 0; + bool use_arch; auto stop = [&]() { return ( (generation == params.gens) @@ -402,7 +494,7 @@ void Engine::run(Dataset &data) //std::cout << "before vary" << std::endl; // // variation to produce offspring - variator.vary(this->pop, island, island_parents.at(island)); + variator.vary(this->pop, island, island_parents.at(island), params); //std::cout << "before update fitness" << std::endl; evaluator.update_fitness(this->pop, island, data, params, true); @@ -442,16 +534,16 @@ void Engine::run(Dataset &data) auto finish_gen = subflow.emplace([&]() { bool updated_best = this->update_best(data); - // TODO: use_arch - if ( params.verbosity>1 || !params.logfile.empty()) { + if ( (params.verbosity>1 || !params.logfile.empty() ) + || params.use_arch ) { calculate_stats(); } // TODO: logger working // logger.log("calculate stats...",2); - // if (use_arch) // TODO: archive - // archive.update(pop,params); + if (params.use_arch) + archive.update(pop, params); fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.gens : timer.Elapsed().count()/params.max_time; @@ -498,6 +590,23 @@ void Engine::run(Dataset &data) // TODO: open, write, close? (to avoid breaking the file and allow some debugging if things dont work well) if (log.is_open()) log.close(); + + // if we're not using an archive, let's store the final population in the + // archive + if (!params.use_arch) + { + archive.individuals.resize(0); + for (int island =0; island< pop.num_islands; ++island) { + // cout << "island" << island << endl; + vector idxs = pop.get_island_indexes(island); + + for (unsigned i = 0; iarchive.individuals.size(); }; ///return population as string vector get_archive(bool front); - // /// predict on unseen data from the whole archive - // VectorXf predict_archive(int id, MatrixXf& X); - // VectorXf predict_archive(int id, MatrixXf& X, LongData& Z); - // ArrayXXf predict_proba_archive(int id, MatrixXf& X, LongData& Z); - // ArrayXXf predict_proba_archive(int id, MatrixXf& X); + /// predict on unseen data from the archive + auto predict_archive(int id, const Dataset& data); + auto predict_archive(int id, const Ref& X); + + template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) + auto predict_proba_archive(int id, const Dataset& data); + template + requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) + auto predict_proba_archive(int id, const Ref& X); + + // TODO: make these work + // VectorXf predict_archive(int id, const Ref& X, LongData& Z); + // ArrayXXf predict_proba_archive(int id, const Ref& X, LongData& Z); /// train the model void run(Dataset &d); Parameters params; ///< hyperparameters of brush, which the user can interact Individual best_ind; + + Archive archive; ///< pareto front archive private: SearchSpace ss; @@ -135,7 +145,6 @@ class Engine{ Log_Stats stats; ///< runtime stats Timer timer; ///< start time of training - Archive archive; ///< pareto front archive bool is_fitted; ///< keeps track of whether fit was called. @@ -146,10 +155,10 @@ class Engine{ }; // Only stuff to make new predictions or call fit again -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind); -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind); -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind); -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind, archive); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind, archive); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind, archive); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind, archive); } // Brush diff --git a/src/ind/individual.h b/src/ind/individual.h index ac6692ca..fb5de712 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -23,6 +23,11 @@ class Individual{ // this flag is used to avoid re-fitting an individual. the program is_fitted_ flag is used to perform checks (like in predict with weights). They are two different things and I think I;ll keep this way (individual is just a container to keep program and fitness together) bool is_fitted_ = false; + // archive utility (and also keep track of evolution) (this is meaningful only + // if variation is done using the vary() function) + unsigned id; ///< tracking id + vector parent_id; ///< ids of parents + VectorXf error; ///< training error (used in lexicase selectors) Fitness fitness; ///< aggregate fitness score @@ -32,6 +37,7 @@ class Individual{ Individual() { objectives = {"error", "complexity"}; + id = 0; // unsigned }; Individual(Program& prg) : Individual() { program = prg; }; @@ -85,6 +91,14 @@ class Individual{ void set_fitness(Fitness &f) { fitness=f; }; Fitness& get_fitness() { return fitness; }; + + void set_id(unsigned i){id = i;}; + void set_parents(const vector>& parents){ + parent_id.clear(); + for (const auto& p : parents) + parent_id.push_back(p.id); + }; /// set parent ids using parents + void set_parents(const vector& parents){ parent_id = parents; }; /// set parent ids using id values // TODO: USE setters and getters intead of accessing it directly // template @@ -142,6 +156,8 @@ void to_json(json &j, const Individual &p) j = json{ {"program", p.program}, {"fitness", p.fitness}, + {"id", p.id}, + {"parent_id", p.parent_id}, // {"loss", p.loss}, // {"loss_v", p.loss_v}, // {"complexity", p.complexity}, @@ -158,6 +174,8 @@ void from_json(const json &j, Individual& p) {// TODO: figure out if this works with private attributes and try to actually make them private (and use getters and setters) j.at("program").get_to( p.program ); j.at("fitness").get_to( p.fitness ); + j.at("id").get_to( p.id ); + j.at("parent_id").get_to( p.parent_id ); // j.at("loss").get_to( p.loss ); // j.at("loss_v").get_to( p.loss_v ); // j.at("complexity").get_to( p.complexity ); diff --git a/src/params.h b/src/params.h index da61ae15..20913e31 100644 --- a/src/params.h +++ b/src/params.h @@ -43,6 +43,11 @@ struct Parameters std::unordered_map functions; int num_islands=5; + // if we should save pareto front of the entire evolution (use_arch=true) + // or just the final population (use_arch=false) + bool use_arch=false; + bool val_from_arch=true; + // variation std::map mutation_probs = { {"point", 0.167}, @@ -143,6 +148,12 @@ struct Parameters void set_mig_prob(float new_mig_prob){ mig_prob = new_mig_prob; }; float get_mig_prob(){ return mig_prob; }; + void set_use_arch(bool new_use_arch){ use_arch = new_use_arch; }; + bool get_use_arch(){ return use_arch; }; + + void set_val_from_arch(bool new_val_from_arch){ val_from_arch = new_val_from_arch; }; + bool get_val_from_arch(){ return val_from_arch; }; + void set_classification(bool c){ classification = c; }; bool get_classification(){ return classification; }; diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 7ae8df01..4ffe975f 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -27,8 +27,8 @@ bool Archive::sortComplexity(const Individual& lhs, } template -bool Archive::sortObj(const Individual& lhs, - const Individual& rhs, const int index) +bool Archive::sortObj1(const Individual& lhs, + const Individual& rhs) { // sort based on index (we can have more than 2 obj in brush implementation) // obs: because of the weights, every objective is a maximization problem @@ -36,7 +36,10 @@ bool Archive::sortObj(const Individual& lhs, // the bigger the better. the weights allow us to use different min/max metrics // without having to deal with this particular details - return lhs.fitness.wvalues.at(index) > rhs.fitness.wvalues.at(index); + float lhs_obj1 = lhs.fitness.wvalues.at(0); + float rhs_obj1 = rhs.fitness.wvalues.at(0); + + return lhs_obj1 > rhs_obj1; } template @@ -54,15 +57,8 @@ template bool Archive::sameObjectives(const Individual& lhs, const Individual& rhs) { - for (const auto& o_lhs : lhs.fitness) - { - for (const auto& o_rhs : rhs.fitness) - { - if (o_lhs != o_rhs) - return false; - } - } - return true; + return (lhs.fitness == rhs.fitness); + } template @@ -75,7 +71,9 @@ void Archive::init(Population& pop) // dealing with islands --> fast nds for each island for (int island =0; island< pop.num_islands; ++island) { - selector.fast_nds(pop, island); + vector idxs = pop.get_island_indexes(island); + + selector.fast_nds(pop, idxs); } // OBS: fast_nds will change all individual fitness inplace. @@ -89,7 +87,7 @@ void Archive::init(Population& pop) for (unsigned i = 0; i::init(Population& pop) if (this->sort_complexity) std::sort(individuals.begin(),individuals.end(), &sortComplexity); else - std::sort(individuals.begin(),individuals.end(), &sortObj); + std::sort(individuals.begin(),individuals.end(), &sortObj1); } template -void Archive::update(const Population& pop, const Parameters& params) +void Archive::update(Population& pop, const Parameters& params) { individuals.resize(0); // clear archive // refill archive with new pareto fronts (one pareto front for each island!) for (int island =0; island< pop.num_islands; ++island) { - auto front = selector.fast_nds(pop, island); - for (const auto& i : front) + cout << "island" << island << endl; + vector idxs = pop.get_island_indexes(island); + + // TODO: can i just call fast nds with all indexes in idxs? + vector> front = selector.fast_nds(pop, idxs); + for (const auto& i : front[0]) { - individuals.push_back( pop.individuals.at(i) ); + individuals.push_back( *pop.individuals.at(i) ); + cout << "index" << i << endl; } } if (this->sort_complexity) - std::sort(individuals.begin(),individuals.end(),&sortComplexity); + std::sort(individuals.begin(), individuals.end(), &sortComplexity); else - std::sort(individuals.begin(),individuals.end(), &sortObj); + std::sort(individuals.begin(), individuals.end(), &sortObj1); /* auto it = std::unique(individuals.begin(),individuals.end(), &sameFitComplexity); */ auto it = std::unique(individuals.begin(),individuals.end(), diff --git a/src/pop/archive.h b/src/pop/archive.h index 8abbac9d..aef9165e 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -27,7 +27,7 @@ struct Archive void init(Population& pop); - void update(const Population& pop, const Parameters& params); + void update(Population& pop, const Parameters& params); void set_objectives(vector objectives); @@ -36,8 +36,8 @@ struct Archive const Individual& rhs); /// Sort population by first objective. - static bool sortObj(const Individual& lhs, - const Individual& rhs, const int index=0); + static bool sortObj1(const Individual& lhs, + const Individual& rhs); /// check for repeats static bool sameFitComplexity(const Individual& lhs, @@ -47,10 +47,10 @@ struct Archive }; //serialization -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity); } // Pop } // Brush diff --git a/src/vary/variation.cpp b/src/vary/variation.cpp index 213bbd7a..762a3dbc 100644 --- a/src/vary/variation.cpp +++ b/src/vary/variation.cpp @@ -614,7 +614,7 @@ std::optional> Variation::mutate(const Individual& parent) template void Variation::vary(Population& pop, int island, - const vector& parents) + const vector& parents, const Parameters& p) { auto idxs = pop.get_island_indexes(island); @@ -633,33 +633,45 @@ void Variation::vary(Population& pop, int island, const Individual& mom = pop[ *r.select_randomly(parents.begin(), parents.end())]; + vector> ind_parents; if ( r() < parameters.cx_prob) // crossover { const Individual& dad = pop[ *r.select_randomly(parents.begin(), parents.end())]; - opt = cross(mom, dad); + opt = cross(mom, dad); + ind_parents = {mom, dad}; } else // mutation { - opt = mutate(mom); + opt = mutate(mom); + ind_parents = {mom}; } + // this assumes that islands do not share indexes before doing variation + unsigned id = p.current_gen*p.pop_size+idxs.at(i); + // mutation and crossover will already perform 3 attempts. If it fails, we just fill with a random individual - if (opt) // no optional value was returned + if (opt) // variation worked, lets keep this { Individual ind = opt.value(); + ind.is_fitted_ = false; + ind.set_id(id); + ind.set_parents(ind_parents); assert(ind.program.size()>0); pop.individuals.at(idxs.at(i)) = std::make_shared>(ind); } - else { + else { // no optional value was returned Individual new_ind; - new_ind.is_fitted_ = false; + // creating a new random individual from nothing new_ind.init(search_space, parameters); + new_ind.set_objectives(mom.get_objectives()); // it will have an invalid fitness + new_ind.set_id(id); + new_ind.is_fitted_ = false; pop.individuals.at(idxs.at(i)) = std::make_shared>(new_ind); } diff --git a/src/vary/variation.h b/src/vary/variation.h index 6c9994aa..8868a53b 100644 --- a/src/vary/variation.h +++ b/src/vary/variation.h @@ -108,7 +108,7 @@ class Variation { private: SearchSpace search_space; - Parameters parameters; + Parameters parameters; // stop using this thing here and get parameter as argument public: Variation() = default; @@ -129,7 +129,8 @@ class Variation std::optional> mutate(const Individual& parent); /// method to handle variation of population - void vary(Population& pop, int island, const vector& parents); + void vary(Population& pop, int island, const vector& parents, + const Parameters& p); }; } //namespace Var diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 2211c4aa..5e46b324 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -23,6 +23,11 @@ #include "../../src/pop/archive.cpp" #include "../../src/pop/population.cpp" +// TODO: test predict from archive +// TODO: rename it to test_engine + +// TODO: test serialization of archive (get archive and save to json) + // TODO: test logger, verbose, print stats, etc. TEST(Engine, EngineWorks) { @@ -84,6 +89,9 @@ TEST(Engine, EngineWorks) params.set_gens(10); params.set_mig_prob(0.5); + // just to see if nothing breaks + params.set_use_arch(true); + std::cout << "n jobs = 1" << std::endl; params.set_n_jobs(1); Brush::RegressorEngine est6(params); diff --git a/tests/cpp/test_individuals.cpp b/tests/cpp/test_individuals.cpp index 86cfb2c3..5b3e5df6 100644 --- a/tests/cpp/test_individuals.cpp +++ b/tests/cpp/test_individuals.cpp @@ -1 +1,3 @@ -// TODO: test predict, predict proba, fit. \ No newline at end of file +// TODO: test predict, predict proba, fit. + +// TODO: test parent_id and id \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 8bd2937f..3a3e2b27 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -107,7 +107,7 @@ TEST(Population, PopulationTests) // variation applied to population fmt::print("Variations for island {}\n", j); - variator.vary(pop, j, parents); + variator.vary(pop, j, parents, params); fmt::print("fitting {}\n", j); // at this step, we know that theres only one pointer to each individual being fitted, so we can perform it in parallel evaluator.update_fitness(pop, j, data, params, true, true); From 78314fc160fd75bd07cb6f7c02ccc52dddb7d5fc Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sat, 4 May 2024 16:04:41 -0300 Subject: [PATCH 175/199] fixed missing val_from_arch parameter and pybrush breaking --- src/bindings/bind_engines.h | 3 +++ src/bindings/bind_params.cpp | 1 + 2 files changed, 4 insertions(+) diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index 24acd2c1..034aceb1 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -18,6 +18,9 @@ #include "../pop/population.cpp" #include "../pop/population.h" +#include "../pop/archive.cpp" +#include "../pop/archive.h" + using Reg = Brush::RegressorEngine; using Cls = Brush::ClassifierEngine; using Rep = Brush::RepresenterEngine; diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index dbf3c316..b79a12ee 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -26,6 +26,7 @@ void bind_params(py::module& m) .def_property("logfile", &Brush::Parameters::get_logfile, &Brush::Parameters::set_logfile) .def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands) .def_property("use_arch", &Brush::Parameters::get_use_arch, &Brush::Parameters::set_use_arch) + .def_property("val_from_arch", &Brush::Parameters::get_val_from_arch, &Brush::Parameters::set_val_from_arch) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) .def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification) From 7ea90c2bb7bee8bdebcad2d810bfac79d7757373 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 10:35:39 -0300 Subject: [PATCH 176/199] Updated examples in readme --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 71b66e3b..ceca5f01 100644 --- a/README.md +++ b/README.md @@ -93,20 +93,21 @@ That means it should be compatible with sklearn pipelines, wrappers, and so fort In addition, Brush provides functionality that allows you to feed in more complicated data types than just matrices of floating point values. - - ## Regression ```python # load data import pandas as pd + df = pd.read_csv('docs/examples/datasets/d_enc.csv') X = df.drop(columns='label') y = df['label'] # import and make a regressor -from brush import BrushRegressor -est = BrushRegressor() +from pybrush import BrushRegressor + +# you can set verbosity=1 to see the progress bar +est = BrushRegressor(verbosity=1) # use like you would a sklearn regressor est.fit(X,y) @@ -120,15 +121,18 @@ print('score:', est.score(X,y)) ```python # load data import pandas as pd + df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') X = df.drop(columns='target') y = df['target'] # import and make a classifier -from brush import BrushClassifier -est = BrushClassifier() +from pybrush import BrushClassifier +est = BrushClassifier(verbosity=1) + # use like you would a sklearn classifier est.fit(X,y) + y_pred = est.predict(X) y_pred_proba = est.predict_proba(X) From 4381134f11f8fffd0dcc260fbc23d2e8151e55ad Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 10:36:12 -0300 Subject: [PATCH 177/199] removing unused file --- .../datasets/d_analcatdata_happiness.csv | 61 ------------------- 1 file changed, 61 deletions(-) delete mode 100644 docs/examples/datasets/d_analcatdata_happiness.csv diff --git a/docs/examples/datasets/d_analcatdata_happiness.csv b/docs/examples/datasets/d_analcatdata_happiness.csv deleted file mode 100644 index b18d1319..00000000 --- a/docs/examples/datasets/d_analcatdata_happiness.csv +++ /dev/null @@ -1,61 +0,0 @@ -Years_of_schooling,Siblings,Count,target -0,0,15.0,0 -1,0,34.0,0 -2,0,36.0,0 -3,0,22.0,0 -0,1,61.0,0 -1,1,31.0,0 -2,1,60.0,0 -3,1,46.0,0 -0,2,25.0,0 -1,2,26.0,0 -2,2,35.0,0 -3,2,45.0,0 -0,3,30.0,0 -1,3,13.0,0 -2,3,8.0,0 -3,3,18.0,0 -0,4,14.0,0 -1,4,3.0,0 -2,4,3.0,0 -3,4,4.0,0 -0,0,17.0,1 -1,0,53.0,1 -2,0,70.0,1 -3,0,67.0,1 -0,1,79.0,1 -1,1,60.0,1 -2,1,96.0,1 -3,1,45.0,1 -0,2,40.0,1 -1,2,31.0,1 -2,2,63.0,1 -3,2,74.0,1 -0,3,39.0,1 -1,3,24.0,1 -2,3,7.0,1 -3,3,15.0,1 -0,4,15.0,1 -1,4,9.0,1 -2,4,2.0,1 -3,4,1.0,1 -0,0,7.0,2 -1,0,20.0,2 -2,0,23.0,2 -3,0,16.0,2 -0,1,36.0,2 -1,1,5.0,2 -2,1,12.0,2 -3,1,11.0,2 -0,2,12.0,2 -1,2,7.0,2 -2,2,5.0,2 -3,2,10.0,2 -0,3,4.0,2 -1,3,4.0,2 -2,3,3.0,2 -3,3,2.0,2 -0,4,1.0,2 -1,4,2.0,2 -2,4,0.0,2 -3,4,1.0,2 From cd9436cc73a382b6b0897dea7b7949030bd4deda Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 10:37:50 -0300 Subject: [PATCH 178/199] cleaning a lot of comments and TODOs --- pybrush/BrushEstimator.py | 207 +--------- pybrush/DeapEstimator.py | 169 +------- pybrush/EstimatorInterface.py | 198 ++++++++++ pybrush/__init__.py | 16 +- pybrush/_versionstr.py | 2 +- pybrush/deap_api/nsga2.py | 35 -- src/bindings/bind_evaluator.h | 2 +- src/bindings/bind_params.cpp | 2 +- src/bindings/bind_variation.h | 8 +- src/bindings/module.cpp | 6 +- src/brush/deap_api/nsga2.py | 105 ----- src/brush/estimator.py | 473 ----------------------- src/data/data.h | 3 +- src/engine.cpp | 142 ++----- src/engine.h | 5 +- src/eval/evaluation.cpp | 6 +- src/eval/metrics.cpp | 4 +- src/eval/scorer.h | 12 +- src/ind/fitness.h | 7 +- src/ind/individual.cpp | 2 +- src/ind/individual.h | 14 - src/params.cpp | 9 - src/params.h | 9 +- src/pop/archive.cpp | 21 +- src/pop/archive.h | 1 - src/pop/population.cpp | 36 +- src/pop/population.h | 5 +- src/program/optimizer/weight_optimizer.h | 1 + src/selection/lexicase.cpp | 22 +- src/selection/lexicase.h | 6 +- src/selection/nsga2.cpp | 71 +--- src/selection/selection.cpp | 6 +- src/selection/selection.h | 9 +- src/util/rnd.cpp | 2 +- src/vary/search_space.cpp | 3 - src/vary/variation.cpp | 32 +- tests/cpp/test_brush.cpp | 20 +- tests/cpp/test_population.cpp | 2 +- tests/python/test_deap_api.py | 2 +- 39 files changed, 358 insertions(+), 1317 deletions(-) create mode 100644 pybrush/EstimatorInterface.py delete mode 100644 src/brush/deap_api/nsga2.py delete mode 100644 src/brush/estimator.py delete mode 100644 src/params.cpp diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 21ad723c..53f02af4 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -1,111 +1,27 @@ """ sklearn-compatible wrapper for GP analyses. -TODO: update this docstring -See brushgp.cpp for Python (via pybind11) modules that give more fine-grained +See engine.cpp for Python (via pybind11) modules that give more fine-grained control of the underlying GP objects. """ -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin -from sklearn.utils.validation import check_is_fitted -# from sklearn.metrics import mean_squared_error + import numpy as np import pandas as pd -from _brush.individual import * # RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual -from _brush.engine import * # Regressor, Classifier, and MultiClassifier engines -from pybrush import Parameters, Dataset, SearchSpace -from pybrush import brush_rng +from sklearn.base import BaseEstimator, ClassifierMixin, \ + RegressorMixin, TransformerMixin + +from sklearn.utils.validation import check_is_fitted +from pybrush import Parameters, Dataset, SearchSpace, brush_rng +from pybrush.EstimatorInterface import EstimatorInterface +from pybrush import RegressorEngine, ClassifierEngine, MultiClassifierEngine -class BrushEstimator(BaseEstimator): +class BrushEstimator(EstimatorInterface, BaseEstimator): """ - This is the base class for Deap-based Brush estimators. - This class shouldn't be called directly; instead, call a child class like - :py:class:`DeapRegressor ` or :py:class:`DeapClassifier `. - All of the shared parameters are documented here. - - Parameters - ---------- - mode : str, default 'classification' - The mode of the estimator. Used by subclasses - pop_size : int, default 100 - Population size. - gens : int, default 100 - Maximum iterations of the algorithm. - max_time: int, optional (default: -1) - Maximum time terminational criterion in seconds. If -1, not used. - max_stall: int, optional (default: 0) - How many generations to continue after the validation loss has - stalled. If 0, not used. - verbosity : int, default 0 - Controls level of printouts. - max_depth : int, default 0 - Maximum depth of GP trees in the GP program. Use 0 for no limit. - max_size : int, default 0 - Maximum number of nodes in a tree. Use 0 for no limit. - num_islands : int, default 5 - Number of independent islands to use in evolutionary framework. - Ignored if `algorithm!="nsga2island"`. - mig_prob : float, default 0.05 - Probability of occuring a migration between two random islands at the - end of a generation, must be between 0 and 1. - cx_prob : float, default 1/7 - Probability of applying the crossover variation when generating the offspring, - must be between 0 and 1. - Given that there are `n` mutations, and either crossover or mutation is - used to generate each individual in the offspring (but not both at the - same time), we want to have by default an uniform probability between - crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and - `1/n` for each mutation, we can achieve an uniform distribution. - mutation_probs : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} - A dictionary with keys naming the types of mutation and floating point - values specifying the fraction of total mutations to do with that method. - The probability of having a mutation is `(1-cx_prob)` and, in case the mutation - is applied, then each mutation option is sampled based on the probabilities - defined in `mutation_probs`. The set of probabilities should add up to 1.0. - functions: dict[str,float] or list[str], default {} - A dictionary with keys naming the function set and values giving the probability - of sampling them, or a list of functions which will be weighted uniformly. - If empty, all available functions are included in the search space. - initialization : {"uniform", "max_size"}, default "uniform" - Distribution of sizes on the initial population. If `max_size`, then every - expression is created with `max_size` nodes. If `uniform`, size will be - uniformly distributed between 1 and `max_size`. - objectives : list[str], default ["error", "size"] - list with one or more objectives to use. Options are `"error", "size", "complexity"`. - If `"error"` is used, then it will be the mean squared error for regression, - and accuracy for classification. - algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2" - Which Evolutionary Algorithm framework to use to evolve the population. - weights_init : bool, default True - Whether the search space should initialize the sampling weights of terminal nodes - based on the correlation with the output y. If `False`, then all terminal nodes - will have the same probability of 1.0. - validation_size : float, default 0.0 - Percentage of samples to use as a hold-out partition. These samples are used - to calculate statistics during evolution, but not used to train the models. - The `best_estimator_` will be selected using this partition. If zero, then - the same data used for training is used for validation. - val_from_arch: boolean, optional (default: True) - Validates the final model using the archive rather than the whole - population. - use_arch: boolean, optional (default: False) - Determines if we should save pareto front of the entire evolution - (when set to True) or just the final population (False). - batch_size : float, default 1.0 - Percentage of training data to sample every generation. If `1.0`, then - all data is used. Very small values can improve execution time, but - also lead to underfit. - logfile: str, optional (default: "") - If specified, spits statistics into a logfile. "" means don't log. - random_state: int or None, default None - If int, then the value is used to seed the c++ random generator; if None, - then a seed will be generated using a non-deterministic generator. It is - important to notice that, even if the random state is fixed, it is - unlikely that running brush using multiple threads will have the same - results. This happens because the Operating System's scheduler is - responsible to choose which thread will run at any given time, thus - reproductibility is not guaranteed. + This is the base class for Brush estimators using the c++ engine. + + Parameters are defined and documented in pybrush.EstimatorInterface. Attributes ---------- @@ -125,59 +41,8 @@ class BrushEstimator(BaseEstimator): The toolbox used by DEAP for EA algorithm. """ - def __init__( - self, - mode='classification', - pop_size=100, - gens=100, - max_time=-1, - max_stall=0, - verbosity=0, - max_depth=3, - max_size=20, - num_islands=1, - n_jobs=1, - mig_prob=0.05, - cx_prob= 1/7, - mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, - "toggle_weight_on":1/6, "toggle_weight_off":1/6}, - functions: list[str]|dict[str,float] = {}, - initialization="uniform", - algorithm="nsga2", - objectives=["error", "size"], - random_state=None, - logfile="", - weights_init=True, - val_from_arch=True, - use_arch=False, - validation_size: float = 0.0, - batch_size: float = 1.0 - ): - - self.pop_size=pop_size - self.gens=gens - self.max_stall=max_stall - self.max_time=max_time - self.verbosity=verbosity - self.algorithm=algorithm - self.mode=mode - self.max_depth=max_depth - self.max_size=max_size - self.num_islands=num_islands - self.mig_prob=mig_prob - self.n_jobs=n_jobs - self.cx_prob=cx_prob - self.logfile=logfile - self.mutation_probs=mutation_probs - self.val_from_arch=val_from_arch # TODO: val from arch implementation (in cpp side) - self.use_arch=use_arch - self.functions=functions - self.objectives=objectives - self.initialization=initialization - self.random_state=random_state - self.batch_size=batch_size - self.weights_init=weights_init - self.validation_size=validation_size + def __init__(self, **kwargs): + EstimatorInterface.__init__(self, **kwargs) def fit(self, X, y): """ @@ -217,45 +82,7 @@ def fit(self, X, y): self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init) - self.parameters_ = Parameters() - self.parameters_.classification = self.mode == "classification" - self.parameters_.n_classes = self.n_classes_ - self.parameters_.verbosity = self.verbosity - self.parameters_.n_jobs = self.n_jobs - self.parameters_.pop_size = self.pop_size - self.parameters_.gens = self.gens - self.parameters_.logfile = self.logfile - self.parameters_.max_stall = self.max_stall - self.parameters_.max_time = self.max_time - self.parameters_.num_islands = self.num_islands - self.parameters_.max_depth = self.max_depth - self.parameters_.max_size = self.max_size - self.parameters_.objectives = self.objectives - self.parameters_.cx_prob = self.cx_prob - self.parameters_.use_arch = self.use_arch - self.parameters_.val_from_arch = self.val_from_arch - self.parameters_.mig_prob = self.mig_prob - self.parameters_.functions = self.functions - self.parameters_.mutation_probs = self.mutation_probs - self.parameters_.validation_size = self.validation_size - self.parameters_.batch_size = self.batch_size - self.parameters_.feature_names = self.feature_names_ - - self.parameters_.scorer_ = "mse" - if self.mode == "classification": - self.parameters_.scorer_ = "log" if self.n_classes_ == 2 else "multi_log" - - if self.random_state is not None: - seed = 0 - if isinstance(self.random_state, np.random.Generator): - seed = self.random_state.integers(10000) - elif isinstance(self.random_state, int): - seed = self.random_state - else: - raise ValueError("random_state must be either a numpy random generator or an integer") - - self.parameters_.random_state = seed - + self.parameters_ = self._wrap_parameters() self.engine_ = None if self.mode == 'classification': self.engine_ = ( ClassifierEngine @@ -344,7 +171,7 @@ def predict_archive(self, X): return preds -class BrushClassifier(BrushEstimator,ClassifierMixin): +class BrushClassifier(BrushEstimator, ClassifierMixin): """Deap-based Brush for classification. For options, see :py:class:`DeapEstimator `. diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index f51ac28f..bb162f8b 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -4,103 +4,34 @@ See brushgp.cpp for Python (via pybind11) modules that give more fine-grained control of the underlying GP objects. """ -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin -from sklearn.utils.validation import check_is_fitted -# from sklearn.metrics import mean_squared_error + +import functools + import numpy as np import pandas as pd -# import deap as dp + from deap import algorithms, base, creator, tools -# from tqdm import tqdm + from sklearn.metrics import average_precision_score from sklearn.preprocessing import MinMaxScaler -import functools + +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, \ + TransformerMixin + +from pybrush.EstimatorInterface import EstimatorInterface from pybrush.deap_api import nsga2 from pybrush import individual from pybrush import RegressorEvaluator, ClassifierEvaluator, MultiClassifierEvaluator from pybrush import RegressorSelector, ClassifierSelector, MultiClassifierSelector from pybrush import RegressorVariator, ClassifierVariator, MultiClassifierVariator -from pybrush import Parameters, Dataset, SearchSpace - -from pybrush import brush_rng +from pybrush import brush_rng, Parameters, Dataset, SearchSpace - -class DeapEstimator(BaseEstimator): +class DeapEstimator(EstimatorInterface, BaseEstimator): """ - This is the base class for Deap-based Brush estimators. - This class shouldn't be called directly; instead, call a child class like - :py:class:`DeapRegressor ` or :py:class:`DeapClassifier `. - All of the shared parameters are documented here. - - Parameters - ---------- - mode : str, default 'classification' - The mode of the estimator. Used by subclasses - pop_size : int, default 100 - Population size. - gens : int, default 100 - Maximum iterations of the algorithm. - verbosity : int, default 0 - Controls level of printouts. - max_depth : int, default 0 - Maximum depth of GP trees in the GP program. Use 0 for no limit. - max_size : int, default 0 - Maximum number of nodes in a tree. Use 0 for no limit. - num_islands : int, default 5 - Number of independent islands to use in evolutionary framework. - Ignored if `algorithm!="nsga2island"`. - mig_prob : float, default 0.05 - Probability of occuring a migration between two random islands at the - end of a generation, must be between 0 and 1. - cx_prob : float, default 1/7 - Probability of applying the crossover variation when generating the offspring, - must be between 0 and 1. - Given that there are `n` mutations, and either crossover or mutation is - used to generate each individual in the offspring (but not both at the - same time), we want to have by default an uniform probability between - crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and - `1/n` for each mutation, we can achieve an uniform distribution. - mutation_probs : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} - A dictionary with keys naming the types of mutation and floating point - values specifying the fraction of total mutations to do with that method. - The probability of having a mutation is `(1-cx_prob)` and, in case the mutation - is applied, then each mutation option is sampled based on the probabilities - defined in `mutation_probs`. The set of probabilities should add up to 1.0. - functions: dict[str,float] or list[str], default {} - A dictionary with keys naming the function set and values giving the probability - of sampling them, or a list of functions which will be weighted uniformly. - If empty, all available functions are included in the search space. - initialization : {"uniform", "max_size"}, default "uniform" - Distribution of sizes on the initial population. If `max_size`, then every - expression is created with `max_size` nodes. If `uniform`, size will be - uniformly distributed between 1 and `max_size`. - objectives : list[str], default ["error", "size"] - list with one or more objectives to use. Options are `"error", "size", "complexity"`. - If `"error"` is used, then it will be the mean squared error for regression, - and accuracy for classification. - algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2" - Which Evolutionary Algorithm framework to use to evolve the population. - weights_init : bool, default True - Whether the search space should initialize the sampling weights of terminal nodes - based on the correlation with the output y. If `False`, then all terminal nodes - will have the same probability of 1.0. - validation_size : float, default 0.0 - Percentage of samples to use as a hold-out partition. These samples are used - to calculate statistics during evolution, but not used to train the models. - The `best_estimator_` will be selected using this partition. If zero, then - the same data used for training is used for validation. - batch_size : float, default 1.0 - Percentage of training data to sample every generation. If `1.0`, then - all data is used. Very small values can improve execution time, but - also lead to underfit. - random_state: int or None, default None - If int, then the value is used to seed the c++ random generator; if None, - then a seed will be generated using a non-deterministic generator. It is - important to notice that, even if the random state is fixed, it is - unlikely that running brush using multiple threads will have the same - results. This happens because the Operating System's scheduler is - responsible to choose which thread will run at any given time, thus - reproductibility is not guaranteed. + This is the base class for Brush estimators in python. + + Parameters are defined and documented in pybrush.EstimatorInterface. Attributes ---------- @@ -120,50 +51,8 @@ class DeapEstimator(BaseEstimator): The toolbox used by DEAP for EA algorithm. """ - def __init__( - self, - mode='classification', - pop_size=100, - gens=100, - verbosity=0, - max_depth=3, - max_size=20, - num_islands=1, - n_jobs=1, - mig_prob=0.05, - cx_prob= 1/7, - mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, - "toggle_weight_on":1/6, "toggle_weight_off":1/6}, - functions: list[str]|dict[str,float] = {}, - initialization="uniform", - algorithm="nsga2", - objectives=["error", "size"], - random_state=None, - weights_init=True, - validation_size: float = 0.0, - batch_size: float = 1.0 - ): - - self.pop_size=pop_size - self.gens=gens - self.verbosity=verbosity - self.algorithm=algorithm - self.mode=mode - self.max_depth=max_depth - self.max_size=max_size - self.num_islands=num_islands - self.mig_prob=mig_prob - self.n_jobs=n_jobs - self.cx_prob=cx_prob - self.mutation_probs=mutation_probs - self.functions=functions - self.objectives=objectives - self.initialization=initialization - self.random_state=random_state - self.batch_size=batch_size - self.weights_init=weights_init - self.validation_size=validation_size - + def __init__(self, **kwargs): + EstimatorInterface.__init__(self, **kwargs) def _setup_toolbox(self): """Setup the deap toolbox""" @@ -264,29 +153,13 @@ def fit(self, X, y): # These have a default behavior to return something meaningfull if # no values are set self.train_ = self.data_.get_training_data() - self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation + self.train_.set_batch_size(self.batch_size) self.validation_ = self.data_.get_validation_data() self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init) - self.parameters_ = Parameters() - self.parameters_.classification = self.mode == "classification" - self.parameters_.n_classes = self.n_classes_ - self.parameters_.n_jobs = self.n_jobs - self.parameters_.pop_size = self.pop_size - self.parameters_.gens = self.gens - self.parameters_.num_islands = self.num_islands - self.parameters_.max_depth = self.max_depth - self.parameters_.max_size = self.max_size - self.parameters_.objectives = self.objectives - self.parameters_.cx_prob = self.cx_prob - self.parameters_.mig_prob = self.mig_prob - self.parameters_.functions = self.functions - self.parameters_.mutation_probs = self.mutation_probs - - if self.random_state is not None: - self.parameters_.random_state = self.random_state + self.parameters_ = self._wrap_parameters() if self.mode == "classification": self.variator_ = (ClassifierVariator @@ -308,7 +181,7 @@ def fit(self, X, y): # nsga2 and ga differ in the toolbox self.archive_, self.logbook_ = nsga2( - self.toolbox_, self.gens, self.pop_size, self.cx_prob, + self.toolbox_, self.max_gens, self.pop_size, self.cx_prob, (0.0 void bind_evaluator(py::module& m, string name) { using Class = br::Eval::Evaluation; - + // TODO: will this part of c++ be exposed? py::class_ eval(m, name.data() ); eval.def(py::init<>()) .def("assign_fit", &Class::assign_fit) diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index b79a12ee..1519bfe4 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -15,7 +15,7 @@ void bind_params(py::module& m) .def(py::init([](){ Brush::Parameters p; return p; })) .def_property("verbosity", &Brush::Parameters::get_verbosity, &Brush::Parameters::set_verbosity) .def_property("pop_size", &Brush::Parameters::get_pop_size, &Brush::Parameters::set_pop_size) - .def_property("gens", &Brush::Parameters::get_gens, &Brush::Parameters::set_gens) + .def_property("max_gens", &Brush::Parameters::get_max_gens, &Brush::Parameters::set_max_gens) .def_property("max_stall", &Brush::Parameters::get_max_stall, &Brush::Parameters::set_max_stall) .def_property("max_time", &Brush::Parameters::get_max_time, &Brush::Parameters::set_max_time) .def_property("current_gen", &Brush::Parameters::get_current_gen, &Brush::Parameters::set_current_gen) diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 65613aab..fe697c95 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -46,12 +46,12 @@ void bind_variation(py::module& m, string name) self.vary(pop, island, parents, params); // making copies of the second half of the island individuals - vector idxs = pop.get_island_indexes(island); - int start = idxs.size()/2; - for (unsigned i = start; i indices = pop.get_island_indexes(island); + int start = indices.size()/2; + for (unsigned i = start; i 0: - print(logbook.stream) - - # Begin the generational process - for gen in range(1, NGEN): - # The batch will be random only if it is not the size of the entire train set. - # In this case, we dont need to reevaluate the whole pop - if (use_batch): - batch = toolbox.getBatch() - fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop) - - for ind, fit in zip(pop, fitnesses): - ind.fitness.values = fit - - # Vary the population - # offspring = tools.selTournamentDCD(pop, len(pop)) - parents = toolbox.select(pop, len(pop)) - # offspring = [toolbox.clone(ind) for ind in offspring] - offspring = [] - - for ind1, ind2 in zip(parents[::2], parents[1::2]): - off1, off2 = None, None - if rnd_flt() < CXPB: - off1, off2 = toolbox.mate(ind1, ind2) - else: - off1 = toolbox.mutate(ind1) - off2 = toolbox.mutate(ind2) - - # avoid inserting empty solutions - if off1 is not None: offspring.extend([off1]) - if off2 is not None: offspring.extend([off2]) - - # archive.update(offspring) - # Evaluate the individuals with an invalid fitness - invalid_ind = [ind for ind in offspring if not ind.fitness.valid] - fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), invalid_ind) - for ind, fit in zip(invalid_ind, fitnesses): - ind.fitness.values = fit - - # Select the next generation population - pop = toolbox.survive(pop + offspring, MU) - record = stats.compile(pop) - logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record) - - if verbosity > 0: - print(logbook.stream) - - if verbosity > 0: - print("Final population hypervolume is %f" % hypervolume(pop, [1000.0, 50.0])) - - archive = tools.ParetoFront() - archive.update(pop) - - return archive, logbook \ No newline at end of file diff --git a/src/brush/estimator.py b/src/brush/estimator.py deleted file mode 100644 index fd4913af..00000000 --- a/src/brush/estimator.py +++ /dev/null @@ -1,473 +0,0 @@ -""" -sklearn-compatible wrapper for GP analyses. - -See brushgp.cpp for Python (via pybind11) modules that give more fine-grained -control of the underlying GP objects. -""" -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin -# from sklearn.metrics import mean_squared_error -import numpy as np -import pandas as pd -# import deap as dp -from deap import algorithms, base, creator, tools -# from tqdm import tqdm -from types import NoneType -import _brush -from .deap_api import nsga2, DeapIndividual -# from _brush import Dataset, SearchSpace - - -class BrushEstimator(BaseEstimator): - """ - This is the base class for Brush estimators. - This class shouldn't be called directly; instead, call a child class like - :py:class:`BrushRegressor ` or :py:class:`BrushClassifier `. - All of the shared parameters are documented here. - - Parameters - ---------- - mode : str, default 'classification' - The mode of the estimator. Used by subclasses - pop_size : int, default 100 - Population size. - max_gen : int, default 100 - Maximum iterations of the algorithm. - verbosity : int, default 0 - Controls level of printouts. - max_depth : int, default 0 - Maximum depth of GP trees in the GP program. Use 0 for no limit. - max_size : int, default 0 - Maximum number of nodes in a tree. Use 0 for no limit. - cx_prob : float, default 1/7 - Probability of applying the crossover variation when generating the offspring, - must be between 0 and 1. - Given that there are `n` mutations, and either crossover or mutation is - used to generate each individual in the offspring (but not both at the - same time), we want to have by default an uniform probability between - crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and - `1/n` for each mutation, we can achieve an uniform distribution. - mutation_options : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6} - A dictionary with keys naming the types of mutation and floating point - values specifying the fraction of total mutations to do with that method. - The probability of having a mutation is `(1-cx_prob)` and, in case the mutation - is applied, then each mutation option is sampled based on the probabilities - defined in `mutation_options`. The set of probabilities should add up to 1.0. - functions: dict[str,float] or list[str], default {} - A dictionary with keys naming the function set and values giving the probability - of sampling them, or a list of functions which will be weighted uniformly. - If empty, all available functions are included in the search space. - initialization : {"grow", "full"}, default "grow" - Strategy to create the initial population. If `full`, then every expression is created - with `max_size` nodes. If `grow`, size will be uniformly distributed. - validation_size : float, default 0.0 - Percentage of samples to use as a hold-out partition. These samples are used - to calculate statistics during evolution, but not used to train the models. - The `best_estimator_` will be selected using this partition. If zero, then - the same data used for training is used for validation. - batch_size : float, default 1.0 - Percentage of training data to sample every generation. If `1.0`, then - all data is used. Very small values can improve execution time, but - also lead to underfit. - random_state: int or None, default None - If int, then the value is used to seed the c++ random generator; if None, - then a seed will be generated using a non-deterministic generator. It is - important to notice that, even if the random state is fixed, it is - unlikely that running brush using multiple threads will have the same - results. This happens because the Operating System's scheduler is - responsible to choose which thread will run at any given time, thus - reproductibility is not guaranteed. - - Attributes - ---------- - best_estimator_ : _brush.Program - The final model picked from training. Used in subsequent calls to :func:`predict`. - archive_ : list[deap_api.DeapIndividual] - The final population from training. - data_ : _brush.Dataset - The complete data in Brush format. - train_ : _brush.Dataset - Partition of `data_` containing `(1-validation_size)`% of the data, in Brush format. - validation_ : _brush.Dataset - Partition of `data_` containing `(validation_size)`% of the data, in Brush format. - search_space_ : a Brush `SearchSpace` object. - Holds the operators and terminals and sampling utilities to update programs. - toolbox_ : deap.Toolbox - The toolbox used by DEAP for EA algorithm. - - """ - - def __init__( - self, - mode='classification', - pop_size=100, - max_gen=100, - verbosity=0, - max_depth=3, - max_size=20, - cx_prob= 1/7, - mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, - "toggle_weight_on":1/6, "toggle_weight_off":1/6}, - functions: list[str]|dict[str,float] = {}, - initialization="grow", - random_state=None, - validation_size: float = 0.0, - batch_size: float = 1.0 - ): - self.pop_size=pop_size - self.max_gen=max_gen - self.verbosity=verbosity - self.mode=mode - self.max_depth=max_depth - self.max_size=max_size - self.cx_prob=cx_prob - self.mutation_options=mutation_options - self.functions=functions - self.initialization=initialization - self.random_state=random_state - self.batch_size=batch_size - self.validation_size=validation_size - - - def _setup_toolbox(self, data_train, data_validation): - """Setup the deap toolbox""" - toolbox: base.Toolbox = base.Toolbox() - - # creator.create is used to "create new functions", and takes at least - # 2 arguments: the name of the newly created class and a base class - - # Minimizing/maximizing problem: negative/positive weight, respectively. - # Our classification is using the error as a metric - # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness - creator.create("FitnessMulti", base.Fitness, weights=self.weights) - - # create Individual class, inheriting from self.Individual with a fitness attribute - creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) - - toolbox.register("mate", self._crossover) - toolbox.register("mutate", self._mutate) - - # When solving multi-objective problems, selection and survival must - # support this feature. This means that these selection operators must - # accept a tuple of fitnesses as argument) - toolbox.register("select", tools.selTournamentDCD) - toolbox.register("survive", tools.selNSGA2) - - # toolbox.population will return a list of elements by calling toolbox.individual - toolbox.register("createRandom", self._make_individual) - toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) - - toolbox.register("getBatch", data_train.get_batch) - toolbox.register("evaluate", self._fitness_function, data=data_train) - toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation) - - return toolbox - - - def _crossover(self, ind1, ind2): - offspring = [] - - for i,j in [(ind1,ind2),(ind2,ind1)]: - child = i.prg.cross(j.prg) - if child: - offspring.append(creator.Individual(child)) - else: # so we'll always have two elements to unpack in `offspring` - offspring.append(None) - - return offspring[0], offspring[1] - - - def _mutate(self, ind1): - # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),) - offspring = ind1.prg.mutate() - - if offspring: - return creator.Individual(offspring) - - return None - - - def fit(self, X, y): - """ - Fit an estimator to X,y. - - Parameters - ---------- - X : np.ndarray - 2-d array of input data. - y : np.ndarray - 1-d array of (boolean) target values. - """ - _brush.set_params(self.get_params()) - - if self.random_state is not None: - _brush.set_random_state(self.random_state) - - self.data_ = self._make_data(X,y, validation_size=self.validation_size) - - # set n classes if relevant - if self.mode=="classification": - self.n_classes_ = len(np.unique(y)) - - # These have a default behavior to return something meaningfull if - # no values are set - self.train_ = self.data_.get_training_data() - self.train_.set_batch_size(self.batch_size) - self.validation_ = self.data_.get_validation_data() - - if isinstance(self.functions, list): - self.functions_ = {k:1.0 for k in self.functions} - else: - self.functions_ = self.functions - - self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) - self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) - - archive, logbook = nsga2( - self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, - (0.0 0: - print(f'best model {self.best_estimator_.get_model()}'+ - f' with size {self.best_estimator_.size()}, ' + - f' depth {self.best_estimator_.depth()}, ' + - f' and fitness {self.archive_[0].fitness}' ) - - return self - - def _make_data(self, X, y=None, validation_size=0.0): - # This function should not partition data (as it is used in predict). - # partitioning is done in fit(). - - if isinstance(y, pd.Series): - y = y.values - if isinstance(X, pd.DataFrame): - # self.data_ = _brush.Dataset(X.to_dict(orient='list'), y) - feature_names = X.columns.to_list() - X = X.values - if isinstance(y, NoneType): - return _brush.Dataset(X, - feature_names=feature_names, validation_size=validation_size) - else: - return _brush.Dataset(X, y, - feature_names=feature_names, validation_size=validation_size) - - assert isinstance(X, np.ndarray) - - # if there is no label, don't include it in library call to Dataset - if isinstance(y, NoneType): - return _brush.Dataset(X, validation_size=validation_size) - - return _brush.Dataset(X, y, validation_size=validation_size) - - - def predict(self, X): - """Predict using the best estimator in the archive. """ - data = self._make_data(X) - return self.best_estimator_.predict(data) - - # def _setup_population(self): - # """initialize programs""" - # if self.mode == 'classification': - # generate = self.search_space_.make_classifier - # else: - # generate = self.search_space_.make_regressor - - # programs = [ - # DeapIndividual(generate(self.max_depth, self.max_size)) - # for i in range(self.pop_size) - # ] - # # return [self._create_deap_individual_(p) for p in programs] - # return programs - - def get_params(self): - return {k:v for k,v in self.__dict__.items() if not k.endswith('_')} - - -class BrushClassifier(BrushEstimator,ClassifierMixin): - """Brush for classification. - - For options, see :py:class:`BrushEstimator `. - - Examples - -------- - >>> import pandas as pd - >>> df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') - >>> X = df.drop(columns='target') - >>> y = df['target'] - >>> from brush import BrushClassifier - >>> est = BrushClassifier() - >>> est.fit(X,y) - >>> print('score:', est.score(X,y)) - """ - def __init__( self, **kwargs): - super().__init__(mode='classification',**kwargs) - - # Weight of each objective (+ for maximization, - for minimization) - self.weights = (+1.0,-1.0) - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - return ( # (accuracy, size) - (data.y==ind.prg.predict(data)).sum() / data.y.shape[0], - ind.prg.size() - ) - - def _fitness_function(self, ind, data: _brush.Dataset): - ind.prg.fit(data) - return ( # (accuracy, size) - (data.y==ind.prg.predict(data)).sum() / data.y.shape[0], - ind.prg.size() - ) - - def _make_individual(self): - # C++'s PTC2-based `make_individual` will create a tree of at least - # the given size. By uniformly sampling the size, we can instantiate a - # population with more diversity - - if self.initialization not in ["grow", "full"]: - raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'full' or 'grow'. got {self.initialization}") - - return creator.Individual( - self.search_space_.make_classifier( - self.max_depth,(0 if self.initialization=='grow' else self.max_size)) - if self.n_classes_ == 2 else - self.search_space_.make_multiclass_classifier( - self.max_depth, (0 if self.initialization=='grow' else self.max_size)) - ) - - def predict_proba(self, X): - """Predict class probabilities for X. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, it will be converted to - ``dtype=np.float32``. - - Returns - ------- - p : ndarray of shape (n_samples, n_classes) - The class probabilities of the input samples. The order of the - classes corresponds to that in the attribute :term:`classes_`. - - """ - data = self._make_data(X) - return self.best_estimator_.predict_proba(data) - -class BrushRegressor(BrushEstimator, RegressorMixin): - """Brush for regression. - - For options, see :py:class:`BrushEstimator `. - - Examples - -------- - >>> import pandas as pd - >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') - >>> X = df.drop(columns='label') - >>> y = df['label'] - >>> from brush import BrushRegressor - >>> est = BrushRegressor() - >>> est.fit(X,y) - >>> print('score:', est.score(X,y)) - """ - def __init__(self, **kwargs): - super().__init__(mode='regressor',**kwargs) - - # Weight of each objective (+ for maximization, - for minimization) - self.weights = (-1.0,-1.0) - - def _fitness_validation(self, ind, data: _brush.Dataset): - # Fitness without fitting the expression, used with validation data - - MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) - if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf - MSE = np.inf - - return ( MSE, ind.prg.size() ) - - def _fitness_function(self, ind, data: _brush.Dataset): - ind.prg.fit(data) - - MSE = np.mean( (data.y-ind.prg.predict(data))**2 ) - if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf - MSE = np.inf - - return ( MSE, ind.prg.size() ) - - def _make_individual(self): - if self.initialization not in ["grow", "full"]: - raise ValueError(f"Invalid argument value for `initialization`. " - f"expected 'full' or 'grow'. got {self.initialization}") - - return creator.Individual( # No arguments (or zero): brush will use PARAMS passed in set_params. max_size is sampled between 1 and params['max_size'] if zero is provided - self.search_space_.make_regressor( - self.max_depth, (0 if self.initialization=='grow' else self.max_size)) - ) - -# Under development -# class BrushRepresenter(BrushEstimator, TransformerMixin): -# """Brush for representation learning. - -# For options, see :py:class:`BrushEstimator `. - -# Examples -# -------- -# >>> import pandas as pd -# >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') -# >>> X = df.drop(columns='label') -# >>> y = df['label'] -# >>> from brush import BrushRegressor -# >>> est = BrushRegressor() -# >>> est.fit(X,y) -# >>> print('score:', est.score(X,y)) -# """ -# def __init__(self, **kwargs): -# super().__init__(mode='regressor',**kwargs) - -# def _fitness_function(self, ind, data: _brush.Dataset): -# ind.prg.fit(data) -# return ( -# # todo: need to return a matrix from X for this -# np.sum((data.get_X()- ind.prg.predict(data))**2), -# ind.prg.size() -# ) - -# def _make_individual(self): -# return creator.Individual( -# self.search_space_.make_representer(self.max_depth, self.max_size) -# ) - -# def transform(self, X): -# """Transform X using the best estimator in the archive. """ -# return self.predict(X) \ No newline at end of file diff --git a/src/data/data.h b/src/data/data.h index 0f6ef69a..7dde291b 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -81,7 +81,7 @@ class Dataset /// @brief percentage of original data used for train. if 0.0, then all data is used for train and validation float validation_size; - bool use_validation; + bool use_validation; // TODO: shuffle before validation (this should be a parameter) /// @brief percentage of training data size to use in each batch. if 1.0, then all data is used float batch_size; @@ -247,6 +247,7 @@ template <> struct fmt::formatter: formatter { return formatter::format(Brush::DataTypeName.at(x), ctx); } }; + // TODO: fmt overload for Data // template <> struct fmt::formatter: formatter { // template diff --git a/src/engine.cpp b/src/engine.cpp index e872b7d6..b6623dc5 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -17,21 +17,11 @@ using namespace Var; template void Engine::init() { - // TODO: get rid of omp - if (params.n_jobs!=0) - omp_set_num_threads(params.get_n_jobs()); - - // std::cout << "set number of threads" << std::endl; - r.set_seed(params.get_random_state()); - // std::cout << "set random state" << std::endl; - // set up the pop, variator, etc set_is_fitted(false); - // std::cout << "is fitted is false" << std::endl; this->pop = Population(); - //std::cout << "created population" << std::endl; // TODO: load population into file // TODO: if initializing from a population file, then this is where we should load previous models. @@ -41,13 +31,10 @@ void Engine::init() this->pop.load(params.load_population); this->evaluator = Evaluation(); - //std::cout << "created evaluator" << std::endl; - // TOD: make these classes have a default constructor, and stop recreating instances + // TODO: make these classes have a default constructor, and stop recreating instances this->variator.init(params, ss); - //std::cout << "initialized variator" << std::endl; - // initializing survivor and selector based on params this->selector = Selection(params.sel, false); this->survivor = Selection(params.surv, true); @@ -56,12 +43,8 @@ void Engine::init() this->archive.set_objectives(params.objectives); - // start the clock timer.Reset(); - // // signal handler - // signal(SIGINT, my_handler); - // reset statistics this->stats = Log_Stats(); } @@ -88,8 +71,8 @@ void Engine::calculate_stats() int pop_size = 0; for (int island=0; island::calculate_stats() int index = 0; for (int island=0; islandpop.individuals.at(idxs[i]); + const auto& p = this->pop.individuals.at(indices[i]); // Fitness class will store every information that can be used as // fitness. you just need to access them. Multiplying by weight @@ -195,7 +178,7 @@ void Engine::print_stats(std::ofstream& log, float fraction) if(params.max_time == -1) std::cout << "Generation " << params.current_gen+1 << "/" - << params.gens << " [" + bar + space + "]\n"; + << params.max_gens << " [" + bar + space + "]\n"; else std::cout << std::fixed << "Time elapsed "<< timer << "/" << params.max_time @@ -237,11 +220,11 @@ auto Engine::predict_archive(int id, const Dataset& data) return ind.predict(data); } for (int island=0; islandid) return ind->predict(data); @@ -277,11 +260,11 @@ auto Engine::predict_proba_archive(int id, const Dataset& data) return ind.predict_proba(data); } for (int island=0; islandid) return ind->predict_proba(data); @@ -303,44 +286,28 @@ auto Engine::predict_proba_archive(int id, const Ref& X) return predict_proba_archive(id, d); } -template // TODO: use the dataset, or ignore it +template bool Engine::update_best(const Dataset& data, bool val) { - //std::cout << "updating best" << std::endl; - float error_weight = Individual::weightsMap[params.scorer_]; - - float bs; - bs = this->best_score; float f; - bool updated = false; - - //std::cout << "inside loop" << std::endl; + float bs = this->best_score; vector hof = this->pop.hall_of_fame(1); - //std::cout << "got hof" << std::endl; - for (int i=0; i < hof.size(); ++i) { - //std::cout << "index" << hof[i] << std::endl; const auto& ind = *pop.individuals.at(hof[i]); - - //std::cout << ind.program.get_model() << std::endl; - - //std::cout << "got individual of rank" << ind.fitness.rank << std::endl; - // if there is no validation, then loss_v == loss and this should work just fine + // TODO: dataset arg here with null default value. if the user provides a dataset, we use it to update + // if there is no validation, then loss_v==loss and this should work just fine f = ind.fitness.loss_v; if (f*error_weight > bs*error_weight - || (f == bs && ind.fitness.complexity < this->best_complexity) - ) + || (f == bs && ind.fitness.complexity < this->best_complexity) ) { - //std::cout << "updated" << std::endl; - bs = f; this->best_ind = ind; this->best_complexity = ind.fitness.complexity; @@ -358,16 +325,12 @@ bool Engine::update_best(const Dataset& data, bool val) template void Engine::run(Dataset &data) { - // It is up to the python side to create the dataset (we have a cool wrapper for that) - //std::cout << "starting to run" << std::endl; - //TODO: i need to make sure i initialize everything (pybind needs to have constructors // without arguments to work, and i need to handle correcting these values before running) this->ss = SearchSpace(data, params.functions); //std::cout << "search space was set" << std::endl; this->init(); - //std::cout << "Engine initialized" << std::endl; pop.init(this->ss, this->params); @@ -376,11 +339,7 @@ void Engine::run(Dataset &data) if (!params.logfile.empty()) log.open(params.logfile, std::ofstream::app); - //std::cout << "pop initialized with size " << params.pop_size << " and " << params.num_islands << "islands" << std::endl; - //std::cout << pop.print_models() << std::endl; - evaluator.set_scorer(params.scorer_); - //std::cout << "evaluator configured. starting to run " << std::endl; Dataset &batch = data; @@ -392,22 +351,21 @@ void Engine::run(Dataset &data) else threads = params.n_jobs; - tf::Executor executor(threads); // TODO: executor could be an attribute (so I can move a lot of stuff here to init) - //std::cout << "using n threads " << threads << std::endl; + tf::Executor executor(threads); assert( (executor.num_workers() > 0) && "Invalid number of workers"); tf::Taskflow taskflow; - //std::cout << "stop criteria is ready " << std::endl; // stop criteria unsigned generation = 0; unsigned stall_count = 0; float fraction = 0; + bool use_arch; auto stop = [&]() { - return ( (generation == params.gens) + return ( (generation == params.max_gens) && ((params.max_stall == 0 || stall_count < params.max_stall) && (params.max_time == -1 || params.max_time > timer.Elapsed().count()) ) ); @@ -427,7 +385,6 @@ void Engine::run(Dataset &data) size_t idx_start = std::floor(i*params.pop_size/params.num_islands); size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands); - // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start auto delta = idx_end - idx_start; survivors.at(i).clear(); @@ -450,27 +407,6 @@ void Engine::run(Dataset &data) //std::cout << " -------------------- generation " << generation << " -------------------- " << std::endl; params.set_current_gen(generation); batch = data.get_batch(); // will return the original dataset if it is set to dont use batch - - // island_parents.clear(); - // island_parents.resize(pop.num_islands); - - // survivors.clear(); - // survivors.resize(pop.num_islands); - - // for (int i=0; i< params.num_islands; i++){ - // size_t idx_start = std::floor(i*params.pop_size/params.num_islands); - // size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands); - - // // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start - // auto delta = idx_end - idx_start; - - // survivors.at(i).clear(); - // island_parents.at(i).clear(); - - // survivors.at(i).resize(delta); - // island_parents.at(i).resize(delta); - // } - }).name("prepare generation");// set generation in params, get batch auto run_generation = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { @@ -489,46 +425,24 @@ void Engine::run(Dataset &data) island_parents.at(island).at(i) = parents.at(i); } - //std::cout << "inside generate offspring" << std::endl; this->pop.add_offspring_indexes(island); - - //std::cout << "before vary" << std::endl; - // // variation to produce offspring variator.vary(this->pop, island, island_parents.at(island), params); - //std::cout << "before update fitness" << std::endl; - evaluator.update_fitness(this->pop, island, data, params, true); - // evaluator.validation(*this->pop, island_range, data, params); - //std::cout << "before batch update" << std::endl; if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) evaluator.update_fitness(this->pop, island, batch, params, false); - //std::cout << "before survive" << std::endl; // select survivors from combined pool of parents and offspring vector island_survivors = survivor.survive(this->pop, island, params); - //std::cout << "before assign to survivors array" << std::endl; for (int i=0; i< island_survivors.size(); i++){ - //std::cout << i << std::endl; survivors.at(island).at(i) = island_survivors.at(i); } }).name("runs one generation at each island in parallel"); auto update_pop = subflow.emplace([&]() { - //std::cout << "before updating survivors" << std::endl; - //std::cout << pop.print_models() << std::endl; this->pop.update(survivors); - - //std::cout << "after updating survivors" << std::endl; - //std::cout << pop.print_models() << std::endl; - - //std::cout << "before migrating" << std::endl; - //std::cout << pop.print_models() << std::endl; this->pop.migrate(); - - //std::cout << "after migrating" << std::endl; - //std::cout << pop.print_models() << std::endl; }).name("update, migrate and disentangle indexes between islands"); auto finish_gen = subflow.emplace([&]() { @@ -539,13 +453,10 @@ void Engine::run(Dataset &data) calculate_stats(); } - // TODO: logger working - // logger.log("calculate stats...",2); - if (params.use_arch) archive.update(pop, params); - fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.gens : + fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.max_gens : timer.Elapsed().count()/params.max_time; if(params.verbosity>1) @@ -598,11 +509,11 @@ void Engine::run(Dataset &data) archive.individuals.resize(0); for (int island =0; island< pop.num_islands; ++island) { // cout << "island" << island << endl; - vector idxs = pop.get_island_indexes(island); + vector indices = pop.get_island_indexes(island); - for (unsigned i = 0; i::run(Dataset &data) //When you have tasks that are created at runtime (e.g., subflow, // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph. + // printing the graph //std::cout << "dumping taskflow in json " << std::endl; - taskflow.dump(std::cout); + // taskflow.dump(std::cout); } } \ No newline at end of file diff --git a/src/engine.h b/src/engine.h index 93271382..c87d3b4f 100644 --- a/src/engine.h +++ b/src/engine.h @@ -54,7 +54,7 @@ class Engine{ bool update_best(const Dataset& data, bool val=false); // TODO: hyperparameter to set how the best is picked (MCDM, best on val, pareto front, etc). one of the options should be getting the pareto front - // TODO: best fitness instead of these. use fitness comparison + // TODO: best fitness (the class) instead of these. use fitness comparison float best_score; int best_complexity; Individual& get_best_ind(){return best_ind;}; @@ -103,7 +103,7 @@ class Engine{ // ArrayXXf predict_proba(MatrixXf& X, LongData& Z); // ArrayXXf predict_proba(MatrixXf& X); - // archive stuff + // archive stuff --- ///return archive size int get_archive_size(){ return this->archive.individuals.size(); }; @@ -161,5 +161,4 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine,params, best_ind, archive); } // Brush - #endif diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index d892e5a0..c365150d 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -14,11 +14,11 @@ void Evaluation::update_fitness(Population& pop, bool validation ) { - auto idxs = pop.get_island_indexes(island); + auto indices = pop.get_island_indexes(island); - for (unsigned i = 0; i& ind = *pop.individuals.at(idxs.at(i)).get(); // we are modifying it, so operator[] wont work + Individual& ind = *pop.individuals.at(indices.at(i)).get(); // we are modifying it, so operator[] wont work bool pass = false; diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index 96e3014c..ce7ea8c6 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -63,6 +63,7 @@ float mean_log_loss(const VectorXf& y, VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, const vector& class_weights) { + // TODO: fix softmax and multiclassification, then implement this VectorXf loss = VectorXf::Zero(y.rows()); // TODO: needs to be the index of unique elements @@ -122,8 +123,5 @@ float mean_multi_log_loss(const VectorXf& y, return loss.mean(); } - -// TODO: implement other metrics. Right know I have just the MSE - } // metrics } // Brush \ No newline at end of file diff --git a/src/eval/scorer.h b/src/eval/scorer.h index 5681da02..7f62638c 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -4,7 +4,6 @@ #include "metrics.h" #include "../util/error.h" #include "../types.h" -// #include "../individual.h" // code to evaluate GP programs. namespace Brush{ @@ -54,14 +53,11 @@ typedef float (*funcPointer)(const VectorXf&, if ( score_hash.find(this->scorer) == score_hash.end() ) { - // not found - HANDLE_ERROR_THROW("Scoring function '" + this->scorer - + "' not defined"); + HANDLE_ERROR_THROW("Scoring function '" + this->scorer + "' not defined"); return 0.0; } else { - // found return score_hash.at(this->scorer)(y_true, y_pred, loss, w); } }; @@ -122,8 +118,7 @@ typedef float (*funcPointer)(const VectorXf&, float score(Individual

& ind, Dataset& data, VectorXf& loss, const Parameters& params) { - // TODO: individual should have a wrapper to predict proba - RetType y_pred = ind.program.predict_proba(data); // .template cast(); + RetType y_pred = ind.predict_proba(data); // .template cast(); return score(data.y, y_pred, loss, params.class_weights); } }; @@ -178,8 +173,7 @@ typedef float (*funcPointer)(const VectorXf&, float score(Individual

& ind, Dataset& data, VectorXf& loss, const Parameters& params) { - // TODO: individual should have a wrapper to predict proba - RetType y_pred = ind.program.predict_proba(data); // .template cast(); + RetType y_pred = ind.predict_proba(data); // .template cast(); return score(data.y, y_pred, loss, params.class_weights); } }; diff --git a/src/ind/fitness.h b/src/ind/fitness.h index e986512e..abbcccf4 100644 --- a/src/ind/fitness.h +++ b/src/ind/fitness.h @@ -19,7 +19,6 @@ struct std::hash> { } }; -// TODO: separate declaration from implementation (for all classes. have a folder with headers and other with srcs, just like operon) namespace Brush{ struct Fitness { // the loss is used in evolutionary functions @@ -183,9 +182,5 @@ struct Fitness { void to_json(json &j, const Fitness &f); void from_json(const json &j, Fitness& f); - } -#endif - - - +#endif \ No newline at end of file diff --git a/src/ind/individual.cpp b/src/ind/individual.cpp index 5c3f3ae1..6d48b2a7 100644 --- a/src/ind/individual.cpp +++ b/src/ind/individual.cpp @@ -42,7 +42,7 @@ int Fitness::dominates(const Fitness& b) const int flag1 = 0, // to check if this has a better objective flag2 = 0; // to check if b has a better objective - // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) + // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) for (int i=0; i b.get_wvalues().at(i) || std::isnan(b.get_wvalues().at(i)) diff --git a/src/ind/individual.h b/src/ind/individual.h index fb5de712..f411bd75 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -158,13 +158,6 @@ void to_json(json &j, const Individual &p) {"fitness", p.fitness}, {"id", p.id}, {"parent_id", p.parent_id}, - // {"loss", p.loss}, - // {"loss_v", p.loss_v}, - // {"complexity", p.complexity}, - // {"size", p.size}, - // {"depth", p.depth}, - // {"rank", p.rank}, - // {"crowding_dist", p.crowding_dist}, {"objectives", p.objectives} }; } @@ -176,13 +169,6 @@ void from_json(const json &j, Individual& p) j.at("fitness").get_to( p.fitness ); j.at("id").get_to( p.id ); j.at("parent_id").get_to( p.parent_id ); - // j.at("loss").get_to( p.loss ); - // j.at("loss_v").get_to( p.loss_v ); - // j.at("complexity").get_to( p.complexity ); - // j.at("size").get_to( p.size ); - // j.at("depth").get_to( p.depth ); - // j.at("rank").get_to( p.rank ); - // j.at("crowding_dist").get_to( p.crowding_dist ); j.at("objectives").get_to( p.objectives ); } } // Pop diff --git a/src/params.cpp b/src/params.cpp deleted file mode 100644 index add4b4fd..00000000 --- a/src/params.cpp +++ /dev/null @@ -1,9 +0,0 @@ -/* Brush -copyright 2020 William La Cava -license: GNU/GPL v3 -*/ -#include "params.h" - -namespace Brush -{ -} // Brush diff --git a/src/params.h b/src/params.h index 20913e31..a7302461 100644 --- a/src/params.h +++ b/src/params.h @@ -29,7 +29,7 @@ struct Parameters // termination criteria int pop_size = 100; - int gens = 1000; // TODO: rename it to max_gens + int max_gens = 100; int max_stall = 0; int max_time = -1; @@ -97,8 +97,8 @@ struct Parameters void set_pop_size(int new_pop_size){ pop_size = new_pop_size; }; int get_pop_size(){ return pop_size; }; - void set_gens(int new_gens){ gens = new_gens; }; - int get_gens(){ return gens; }; + void set_max_gens(int new_max_gens){ max_gens = new_max_gens; }; + int get_max_gens(){ return max_gens; }; void set_max_stall(int new_max_stall){ max_stall = new_max_stall; }; int get_max_stall(){ return max_stall; }; @@ -169,7 +169,6 @@ struct Parameters void set_batch_size(float c){ batch_size = c; }; float get_batch_size(){ return batch_size; }; - //TODO: unify unordered or ordered void set_mutation_probs(std::map new_mutation_probs){ mutation_probs = new_mutation_probs; }; std::map get_mutation_probs(){ return mutation_probs; }; @@ -181,7 +180,7 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Parameters, verbosity, random_state, pop_size, - gens, + max_gens, max_stall, max_time, scorer_, diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 4ffe975f..80fdfe79 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -1,4 +1,3 @@ -// TODO: implement archive functions #include "archive.h" namespace Brush { @@ -71,9 +70,9 @@ void Archive::init(Population& pop) // dealing with islands --> fast nds for each island for (int island =0; island< pop.num_islands; ++island) { - vector idxs = pop.get_island_indexes(island); + vector indices = pop.get_island_indexes(island); - selector.fast_nds(pop, idxs); + selector.fast_nds(pop, indices); } // OBS: fast_nds will change all individual fitness inplace. @@ -83,11 +82,11 @@ void Archive::init(Population& pop) /* vector front = this->sorted_front(); */ for (int island =0; island< pop.num_islands; ++island) { - auto idxs = pop.get_island_indexes(island); + auto indices = pop.get_island_indexes(island); - for (unsigned i = 0; i::update(Population& pop, const Parameters& params) // refill archive with new pareto fronts (one pareto front for each island!) for (int island =0; island< pop.num_islands; ++island) { cout << "island" << island << endl; - vector idxs = pop.get_island_indexes(island); + vector indices = pop.get_island_indexes(island); - // TODO: can i just call fast nds with all indexes in idxs? - vector> front = selector.fast_nds(pop, idxs); + // TODO: can i just call fast nds with all indexes in indices? + vector> front = selector.fast_nds(pop, indices); for (const auto& i : front[0]) { individuals.push_back( *pop.individuals.at(i) ); @@ -136,5 +135,5 @@ void Archive::update(Population& pop, const Parameters& params) individuals.resize(std::distance(individuals.begin(),it)); } -} -} \ No newline at end of file +} // Pop +} // Brush \ No newline at end of file diff --git a/src/pop/archive.h b/src/pop/archive.h index aef9165e..87cdebdc 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -1,7 +1,6 @@ #ifndef ARCHIVE_H #define ARCHIVE_H -//#include "node.h" // including node.h since definition of node is in the header #include "../ind/individual.h" ///< nsga2 selection operator for getting the front diff --git a/src/pop/population.cpp b/src/pop/population.cpp index 1ab1e48a..45338f04 100644 --- a/src/pop/population.cpp +++ b/src/pop/population.cpp @@ -49,11 +49,6 @@ void Population::init(vector>& new_individuals, const Parameter island_indexes.at(i).begin() + delta, island_indexes.at(i).end(), p+idx_start); } - else - { - // // second half is space to the offspring (but we dont initialize them) - // individuals.at(i) = std::make_shared; - } }; for (int j=0; j< new_individuals.size(); j++) { @@ -177,7 +172,6 @@ void Population::update(vector> survivors) size_t idx_start = std::floor(j*pop_size/num_islands); size_t idx_end = std::floor((j+1)*pop_size/num_islands); - // auto delta = survivors.at(j).size(); // should have the same size as idx_end - idx_start auto delta = idx_end - idx_start; assert(delta == survivors.at(j).size() @@ -244,13 +238,13 @@ vector> Population::sorted_front(unsigned rank) for (int j=0;j pf; - for (int i=0; ifitness.rank == rank) + if (individuals.at(indices.at(i))->fitness.rank == rank) pf.push_back(i); } @@ -278,11 +272,11 @@ vector Population::hall_of_fame(unsigned rank) for (int j=0;jfitness.rank == rank) - pf.push_back(idxs.at(i)); + if (individuals.at(indices.at(i))->fitness.rank == rank) + pf.push_back(indices.at(i)); } } std::sort(pf.begin(),pf.end(),SortComplexity(*this)); @@ -311,13 +305,11 @@ void Population::migrate() { new_island_indexes.at(island).resize(0); - auto idxs = island_indexes.at(island); - for (unsigned int i=0; i other_islands(num_islands-1); @@ -338,20 +330,16 @@ void Population::migrate() migrating_idx = *r.select_randomly( island_indexes.at(other_island).begin(), island_indexes.at(other_island).end()); - // std::cout << "mig idx" << migrating_idx << std::endl; - - // std::cout << "index " << i << " of island " << island; - // std::cout << " is now" << migrating_idx << std::endl; - new_island_indexes.at(island).push_back(migrating_idx); } else { - new_island_indexes.at(island).push_back(idxs.at(i)); + new_island_indexes.at(island).push_back(indices.at(i)); } } } + // making hard copies (so the next generation starts with islands that does not share individuals // this is particularly important to avoid multiple threads assigning different rank/crowdist/dcounter // or different fitness) @@ -381,11 +369,9 @@ void Population::migrate() iota(island_indexes.at(j).begin(), island_indexes.at(j).end(), idx_start); } - // std::cout << "finished making copies" << std::endl; assert(new_pop.size() == pop_size && " migration ended up with a different popsize"); - // std::cout << "filling individuals" << std::endl; this->individuals.resize(0); for (auto ind : new_pop) { diff --git a/src/pop/population.h b/src/pop/population.h index 783c3011..98cf5d7f 100644 --- a/src/pop/population.h +++ b/src/pop/population.h @@ -46,8 +46,8 @@ class Population{ public: size_t pop_size; int num_islands; - float mig_prob; // TODO: mig_prob should not be part of population - + float mig_prob; + vector>> individuals; vector> island_indexes; @@ -124,6 +124,7 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( Population, individuals, island_indexes, pop_size, num_islands); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE( Population, individuals, island_indexes, pop_size, num_islands); + }// Pop }// Brush diff --git a/src/program/optimizer/weight_optimizer.h b/src/program/optimizer/weight_optimizer.h index 79a2d4fd..702f27bd 100644 --- a/src/program/optimizer/weight_optimizer.h +++ b/src/program/optimizer/weight_optimizer.h @@ -74,6 +74,7 @@ struct ResidualEvaluator { size_t numParameters_; // cache the number of parameters in the tree }; +// TODO: see this struct and try to understand how to make non-templated classes struct WeightOptimizer { diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index 46b8b420..30373412 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -18,16 +18,12 @@ template vector Lexicase::select(Population& pop, int island, const Parameters& params) { - // cout << "select lexicase island " << island << endl; - // this one can be executed in parallel because it is just reading the errors. This // method assumes that the expressions have been fitted previously, and their respective // error vectors are filled auto island_pool = pop.get_island_indexes(island); - // cout << "got indexes " << endl; - // if this is first generation, just return indices to pop if (params.current_gen==0) return island_pool; @@ -37,9 +33,6 @@ vector Lexicase::select(Population& pop, int island, //< number of individuals unsigned int P = island_pool.size(); - - // cout << "pool size is " << P << endl; - // cout << "epsilon size is " << N << endl; // define epsilon ArrayXf epsilon = ArrayXf::Zero(N); @@ -48,8 +41,6 @@ vector Lexicase::select(Population& pop, int island, if (!params.classification || params.scorer_.compare("log")==0 || params.scorer_.compare("multi_log")==0) { - // cout << "using lexicase for regression " << endl; - // for each sample, calculate epsilon for (int i = 0; i Lexicase::select(Population& pop, int island, vector selected(P,0); // selected individuals - // #pragma omp parallel for for (unsigned int i = 0; i cases; // cases (samples) if (params.classification && !params.class_weights.empty()) { - // cout << "using WEIGHTED for classification " << endl; - // for classification problems, weight case selection // by class weights vector choices(N); @@ -92,11 +78,11 @@ vector Lexicase::select(Population& pop, int island, for (unsigned i = 0; i choice_idxs(N-i); - std::iota(choice_idxs.begin(),choice_idxs.end(),0); + vector choice_indices(N-i); + std::iota(choice_indices.begin(),choice_indices.end(),0); size_t idx = *r.select_randomly( - choice_idxs.begin(), choice_idxs.end(), + choice_indices.begin(), choice_indices.end(), sample_weights.begin(), sample_weights.end()); cases.push_back(choices.at(idx)); @@ -176,7 +162,6 @@ vector Lexicase::select(Population& pop, int island, return selected; } - template vector Lexicase::survive(Population& pop, int island, const Parameters& params) @@ -186,6 +171,5 @@ vector Lexicase::survive(Population& pop, int island, return vector(); } - } } diff --git a/src/selection/lexicase.h b/src/selection/lexicase.h index 2f4365d5..a57f5286 100644 --- a/src/selection/lexicase.h +++ b/src/selection/lexicase.h @@ -36,8 +36,6 @@ class Lexicase : public SelectionOperator const Parameters& p); }; - -} -} - +} // Sel +} // Brush #endif \ No newline at end of file diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 25992cfc..50ca00f8 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -39,8 +39,6 @@ template vector NSGA2::select(Population& pop, int island, const Parameters& params) { - // cout << "select nsga island" << island << endl; - // tournament selection. TODO: move this to tournament selection file, and throw not implemented error in nsga. auto island_pool = pop.get_island_indexes(island); @@ -48,10 +46,6 @@ vector NSGA2::select(Population& pop, int island, if (params.current_gen==0) return island_pool; - // setting the objectives (evaluator should do it. TODO: make sure it does) - // for (unsigned int i=0; iset_obj(params.objectives); - // i am not sure if I need this update of rank and crowding distance (bc first generation is ignored by if above, and the other generations will always have individuals that went through survival, which already calculates this information. TODO: in the final algorithm, I need to make sure this is correct) auto front = fast_nds(pop, island_pool); for (size_t i = 0; i< front.size(); i++) @@ -75,62 +69,32 @@ template vector NSGA2::survive(Population& pop, int island, const Parameters& params) { - - // fmt::print("starting survive\n"); - - // cout << "survive nsga island " << island << endl; - size_t idx_start = std::floor(island*params.pop_size/params.num_islands); size_t idx_end = std::floor((island+1)*params.pop_size/params.num_islands); auto original_size = idx_end - idx_start; // original island size (survive must be called with an island with offfspring) - - // fmt::print("original size {}\n", original_size); auto island_pool = pop.get_island_indexes(island); - // fmt::print("island size {}\n", island_pool.size()); - - // set objectives (this is when the obj vector is updated.) - - // for loop below (originally performed in selection in FEAT) was moved to evaluation --- multiple islands may have the same individual - // for (unsigned int i=0; iset_obj(params.objectives); - // fast non-dominated sort - // fmt::print("fast nds for island {}\n", island); auto front = fast_nds(pop, island_pool); - // fmt::print("selecting...\n"); // Push back selected individuals until full vector selected; - // fmt::print("created array...\n"); selected.resize(0); - // fmt::print("resized...\n"); int i = 0; - - // fmt::print("starting loop...\n"); - // fmt::print("selected size {}...\n",selected.size()); - // fmt::print("first front size {}...\n", front.at(i).size()); - // fmt::print("goal is to select n individuals: {}...\n", original_size); - while ( i < front.size() && ( selected.size() + front.at(i).size() < original_size ) ) { - // fmt::print("1...\n"); std::vector& Fi = front.at(i); // indices in front i - // fmt::print("2...\n"); crowding_distance(pop, front, i); // calculate crowding in Fi - // fmt::print("3...\n"); for (int j = 0; j < Fi.size(); ++j) // Pt+1 = Pt+1 U Fi selected.push_back(Fi.at(j)); - - // fmt::print("4...\n"); ++i; } @@ -154,16 +118,12 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan // this will update pareto dominance attributes in fitness class // based on the population - // fmt::print("inside fast nds with island pool of size {} from pop of size {} and\n", island_pool.size(), pop.size()); - //< the Pareto fronts vector> front; front.resize(1); front.at(0).clear(); - // this pragma must go alongside with the inner pragma omp critical (to avoid racing conditions) - #pragma omp parallel for for (int i = 0; i < island_pool.size(); ++i) { std::vector dom; @@ -184,22 +144,17 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan dcount += 1; } } + p->fitness.dcounter = dcount; + p->fitness.dominated.clear(); + p->fitness.dominated = dom; // dom will have values already referring to island indexes - #pragma omp critical - { - p->fitness.dcounter = dcount; - p->fitness.dominated.clear(); - p->fitness.dominated = dom; // dom will have values already referring to island indexes - - if (p->fitness.dcounter == 0) { - // fmt::print("pushing {}...\n", island_pool[i]); - p->fitness.set_rank(1); - // front will have values already referring to island indexes - front.at(0).push_back(island_pool[i]); - } - - // fmt::print("... index {} dominates {} ({}) and was dominated by {} ({})\n", island_pool[i], dom.size(), p->fitness.get_dominated().size(), dcount, p->fitness.get_dcounter()); + if (p->fitness.dcounter == 0) { + // fmt::print("pushing {}...\n", island_pool[i]); + p->fitness.set_rank(1); + // front will have values already referring to island indexes + front.at(0).push_back(island_pool[i]); } + } // fmt::print("First front size {}...\n", front.at(0).size()); @@ -211,16 +166,12 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan int fi = 1; while (front.at(fi-1).size() > 0) { - // fmt::print("starting front {} with size {} \n", fi, front.at(fi-1).size()); - std::vector& fronti = front.at(fi-1); std::vector Q; for (int i = 0; i < fronti.size(); ++i) { const Individual& p = pop[fronti.at(i)]; - // fmt::print("ind {} dominated {} \n", fronti.at(i), p.fitness.dominated.size()); - // iterating over dominated individuals for (int j = 0; j < p.fitness.dominated.size() ; ++j) { // fmt::print("decreased counter of ind {} for {} to {} \n", j, p.fitness.dominated.at(j), pop.individuals.at(p.fitness.dominated.at(j))->fitness.dcounter); @@ -240,13 +191,9 @@ vector> NSGA2::fast_nds(Population& pop, vector& islan } front.push_back(Q); - // fmt::print("front {} ended with size {}...\n", fi, Q.size()); fi += 1; } - - // fmt::print("finished\n"); - return front; } diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index 3a7c01ba..f1097d02 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -1,13 +1,11 @@ #include "selection.h" -// TODO: organize all namespaces namespace Brush { namespace Sel { using namespace Brush; using namespace Pop; - template Selection::Selection() { @@ -64,5 +62,5 @@ vector Selection::survive(Population& pop, int island, return pselector->survive(pop, island, params); } -} // selection -} // Brush \ No newline at end of file +} // Sel +} // Brush diff --git a/src/selection/selection.h b/src/selection/selection.h index f90c6795..2ab6c344 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -16,10 +16,6 @@ namespace Sel { using namespace Brush; using namespace Pop; -// struct Parameters; // forward declaration of Parameters - -// TODO: it seems that the selection is doing a poor job with the size. investigate it. - /*! * @class Selection * @brief interfaces with selection operators. @@ -51,9 +47,6 @@ struct Selection const Parameters& params); }; -// TODO: MAKE THIS WORK -// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Selection, type, survival); - -} // selection +} // Sel } // Brush #endif \ No newline at end of file diff --git a/src/util/rnd.cpp b/src/util/rnd.cpp index ac95b699..bb8a9fa6 100644 --- a/src/util/rnd.cpp +++ b/src/util/rnd.cpp @@ -17,8 +17,8 @@ namespace Brush { namespace Util{ * the number of available cores. */ - //cout << "Max threads are " <(params, max_d, max_size); }; -// TODO: stop using params as a default argument and actually pass it (also update tests) ClassifierProgram SearchSpace::make_classifier(int max_d, int max_size, const Parameters& params) { return make_program(params, max_d, max_size); }; -// TODO: stop using params as a default argument and actually pass it (also update tests) MulticlassClassifierProgram SearchSpace::make_multiclass_classifier( int max_d, int max_size, const Parameters& params) { return make_program(params, max_d, max_size); }; -// TODO: stop using params as a default argument and actually pass it (also update tests) RepresenterProgram SearchSpace::make_representer(int max_d, int max_size, const Parameters& params) { return make_program(params, max_d, max_size); diff --git a/src/vary/variation.cpp b/src/vary/variation.cpp index 762a3dbc..cdf26811 100644 --- a/src/vary/variation.cpp +++ b/src/vary/variation.cpp @@ -516,14 +516,10 @@ std::optional> Variation::cross( template std::optional> Variation::mutate(const Individual& parent) { - // std::cout << "selecting options" << parameters.mutation_probs.size() << std::endl; auto options = parameters.mutation_probs; - // std::cout << "selecting options2" << options.size() << std::endl; - bool all_zero = true; for (auto &it : parameters.mutation_probs) { - // std::cout << it.first << it.second << std::endl; if (it.second > 0.0) { all_zero = false; break; @@ -532,18 +528,15 @@ std::optional> Variation::mutate(const Individual& parent) if (all_zero) { // No mutation can be successfully applied to this solution - // std::cout << "no viable one" << std::endl; return std::nullopt; } int attempts = 0; while(++attempts <= 3) { - // std::cout << "selecting (not all are zero)" << std::endl; // choose a valid mutation option string choice = r.random_choice(parameters.mutation_probs); - // std::cout << "picked mutation" << choice << std::endl; // TODO: this could be improved (specially with the Variation class) std::unique_ptr mutation; if (choice == "point") @@ -569,10 +562,8 @@ std::optional> Variation::mutate(const Individual& parent) HANDLE_ERROR_THROW(msg); } - // std::cout << "cloning parent" << std::endl; Program child(parent.program); - // std::cout << "findind spot" << std::endl; // choose location by weighted sampling of program auto weights = mutation->find_spots(child.Tree); @@ -580,16 +571,13 @@ std::optional> Variation::mutate(const Individual& parent) return w<=0.0; })) { // There is no spot that has a probability to be selected - // std::cout << "no spots" << std::endl; continue; } - // std::cout << "apickingt spot" << std::endl; // apply the mutation and check if it succeeded auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(), weights.begin(), weights.end()); - // std::cout << "mutating" << std::endl; // Every mutation here works inplace, so they return bool instead of // std::optional to indicare the result of their manipulation over the // program tree. Here we call the mutation function and return the result @@ -616,13 +604,11 @@ template void Variation::vary(Population& pop, int island, const vector& parents, const Parameters& p) { - auto idxs = pop.get_island_indexes(island); + auto indices = pop.get_island_indexes(island); - // TODO: fix pragma omp usage (by fix I mean remove) - //#pragma omp parallel for - for (unsigned i = 0; i::vary(Population& pop, int island, } // this assumes that islands do not share indexes before doing variation - unsigned id = p.current_gen*p.pop_size+idxs.at(i); + unsigned id = p.current_gen*p.pop_size+indices.at(i); - // mutation and crossover will already perform 3 attempts. If it fails, we just fill with a random individual + // mutation and crossover already perform 3 attempts. If it fails, we just fill with a random individual if (opt) // variation worked, lets keep this { Individual ind = opt.value(); @@ -661,23 +647,21 @@ void Variation::vary(Population& pop, int island, ind.set_parents(ind_parents); assert(ind.program.size()>0); - pop.individuals.at(idxs.at(i)) = std::make_shared>(ind); + pop.individuals.at(indices.at(i)) = std::make_shared>(ind); } else { // no optional value was returned Individual new_ind; - // creating a new random individual from nothing + // creating a new random individual new_ind.init(search_space, parameters); - new_ind.set_objectives(mom.get_objectives()); // it will have an invalid fitness new_ind.set_id(id); new_ind.is_fitted_ = false; - pop.individuals.at(idxs.at(i)) = std::make_shared>(new_ind); + pop.individuals.at(indices.at(i)) = std::make_shared>(new_ind); } } } } //namespace Var } //namespace Brush - diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 5e46b324..776f4af2 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -46,11 +46,10 @@ TEST(Engine, EngineWorks) Parameters params; params.set_pop_size(100); - params.set_gens(10); + params.set_max_gens(10); params.set_mig_prob(0.0); // TODO: archive tests - // TODO: solve issues from GH params.set_verbosity(2); // TODO: verbosity tests @@ -86,7 +85,7 @@ TEST(Engine, EngineWorks) std::cout << "testing migration" << std::endl; params.set_pop_size(10); - params.set_gens(10); + params.set_max_gens(10); params.set_mig_prob(0.5); // just to see if nothing breaks @@ -117,40 +116,39 @@ TEST(Engine, EngineWorks) // when popsize is not divisible by num_islands std::cout << "popsize not divisible by num_islands" << std::endl; params.set_pop_size(15); - params.set_gens(10); + params.set_max_gens(10); params.set_num_islands(4); // fewer individuals in one island params.set_n_jobs(1); Brush::RegressorEngine est_not_div1(params); est_not_div1.run(data); - // TODO: logger + // TODO: use logger in the tests std::cout << "popsize not divisible by num_islands" << std::endl; params.set_pop_size(10); - params.set_gens(10); + params.set_max_gens(10); params.set_num_islands(3); // extra individuals in one island params.set_n_jobs(1); Brush::RegressorEngine est_not_div2(params); est_not_div2.run(data); - // TODO: test predict and predict proba - // TODO: validation loss + // TODO: validation loss } TEST(Engine, ClassificationEngineWorks) { - // TODO: test classifier and multiclassifier + // TODO: test regression and multiclassifier . add some asserts here Dataset data = Data::read_csv("docs/examples/datasets/d_analcatdata_aids.csv", "target"); ASSERT_TRUE(data.classification); Parameters params; params.set_pop_size(100); - params.set_gens(10); + params.set_max_gens(10); params.set_mig_prob(0.0); params.set_scorer_("log"); - params.set_verbosity(2); // TODO: verbosity tests + params.set_verbosity(2); Brush::ClassifierEngine est(params); est.run(data); diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 3a3e2b27..b4a2dc43 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -132,7 +132,7 @@ TEST(Population, PopulationTests) for (int j=0; j Date: Wed, 5 Jun 2024 10:40:37 -0300 Subject: [PATCH 179/199] adding some new TODOs --- environment.yml | 4 +++- setup.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index ee0fe201..5d33457f 100644 --- a/environment.yml +++ b/environment.yml @@ -16,12 +16,14 @@ dependencies: - pydot - scikit-learn - pandas - # these are not required for install + # not required for install the c++ library (but used in the wrapper) - jupyter - ipython - pip - nlohmann_json - pybind11_json + # Building documentation + - doxygen - sphinx - pip: - graphviz diff --git a/setup.py b/setup.py index e9359ec5..5e8277fe 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ def build_extension(self, ext): "-DEXAMPLE_VERSION_INFO={}".format(self.distribution.get_version()), "-DCMAKE_BUILD_TYPE={}".format(cfg), # not used on MSVC, but no harm "-DGTEST=OFF", - "-DDOCS=OFF", + "-DDOCS=ON", "-DGTEST_INCLUDE_DIRS={}/include/".format(conda_prefix), "-DGTEST_LIBRARIES={}/lib/libgtest.so".format(conda_prefix), "-DEIGEN3_INCLUDE_DIR={}/include/eigen3/".format(conda_prefix), @@ -107,7 +107,7 @@ def build_extension(self, ext): setup( name="pybrush", - version="0.0.1", + version="0.0.1", # TODO: use versionstr here author="William La Cava, Joseph D. Romano", author_email="joseph.romano@pennmedicine.upenn.edu", # can change to Bill license="GNU General Public License v3.0", From 29afbe75df0915c03a7b6bea17ca782d99dccc96 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 11:06:12 -0300 Subject: [PATCH 180/199] Instructions to build docs locally --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index ceca5f01..d9e1e75b 100644 --- a/README.md +++ b/README.md @@ -243,4 +243,30 @@ If you are developing the cpp code and want to build the cpp tests, run the foll ./install tests ``` +## Building the docs locally + +To build the documentation you will need some additional requirements. +Before proceeding, make sure you have the python wrapper installed, as the documentation have some sample notebooks that will run the code. + +First go to the `docs` folder: + +```bash +cd docs/ +``` + +Then, install additional python packages in the same environemnt as brush is intalled with: + +```bash +conda activate brush +pip install -r requirements.txt +``` + +Now just run: + +```bash +make html +``` + +The static website is located in `-build/html` + From 62aff7a78bd19e6ef11e129c104bcf4d9ceae046 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 15:09:08 -0300 Subject: [PATCH 181/199] Updated example notebooks and started writing new ones --- docs/guide/archive.ipynb | 0 docs/guide/saving_loading_populations.ipynb | 0 docs/guide/search_space.ipynb | 26 +- docs/guide/working_with_programs.ipynb | 427 +++++++++++++++++++- 4 files changed, 430 insertions(+), 23 deletions(-) create mode 100644 docs/guide/archive.ipynb create mode 100644 docs/guide/saving_loading_populations.ipynb diff --git a/docs/guide/archive.ipynb b/docs/guide/archive.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/docs/guide/saving_loading_populations.ipynb b/docs/guide/saving_loading_populations.ipynb new file mode 100644 index 00000000..e69de29b diff --git a/docs/guide/search_space.ipynb b/docs/guide/search_space.ipynb index 7ab2edf4..fbe0e02c 100644 --- a/docs/guide/search_space.ipynb +++ b/docs/guide/search_space.ipynb @@ -29,13 +29,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "b667948a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "from brush import Dataset, SearchSpace\n", + "from pybrush import Dataset, SearchSpace\n", "\n", "df = pd.read_csv('../examples/datasets/d_enc.csv')\n", "X = df.drop(columns='label')\n", @@ -96,7 +96,25 @@ "execution_count": null, "id": "a2953719", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Search Space\n", + "===\n", + "terminal_map: {\"ArrayB\": [\"1.00\"], \"ArrayI\": [\"x_5\", \"x_7\", \"1.00\"], \"ArrayF\": [\"x_0\", \"x_1\", \"x_2\", \"x_3\", \"x_4\", \"x_6\", \"1.00\", \"1.00*MeanLabel\"]}\n", + "terminal_weights: {\"ArrayB\": [-nan], \"ArrayI\": [0.011619061, 0.03579926, 0.023709161], \"ArrayF\": [0.6343385, 0.67299956, 0.42711574, 0.8625447, 0.8957853, 0.20750472, 0.6167148, 0.6167148]}\n", + "node_map[ArrayI][[\"ArrayI\", \"ArrayI\"]][SplitBest] = SplitBest, weight = 0.2\n", + "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][SplitBest] = SplitBest, weight = 0.2\n", + "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Div] = Div, weight = 0.1\n", + "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Mul] = Mul, weight = 1\n", + "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Sub] = Sub, weight = 0.5\n", + "node_map[ArrayF][[\"ArrayF\", \"ArrayF\"]][Add] = Add, weight = 0.5\n", + "===\n" + ] + } + ], "source": [ "search_space.print()" ] @@ -138,7 +156,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/docs/guide/working_with_programs.ipynb b/docs/guide/working_with_programs.ipynb index a73b7b15..28257cdb 100644 --- a/docs/guide/working_with_programs.ipynb +++ b/docs/guide/working_with_programs.ipynb @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "102e3fcb", "metadata": { "tags": [ @@ -74,26 +74,34 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from brush import BrushRegressor\n", - "from pmlb import fetch_data\n", + "from pybrush import BrushRegressor\n", "\n", "# load data\n", "df = pd.read_csv('../examples/datasets/d_enc.csv')\n", "X = df.drop(columns='label')\n", - "y = df['label']\n", - "\n" + "y = df['label']" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "ac39c9ca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed 100% [====================]\n", + "score: 0.890070958724087\n" + ] + } + ], "source": [ "# import and make a regressor\n", "est = BrushRegressor(\n", - " functions=['SplitBest','Add','Mul','Sin','Cos','Exp','Logabs']\n", + " functions=['SplitBest','Add','Mul','Sin','Cos','Exp','Logabs'],\n", + " verbosity=1 # set verbosity==1 to see a progress bar\n", ")\n", "\n", "# use like you would a sklearn regressor\n", @@ -102,6 +110,96 @@ "print('score:', est.score(X,y))" ] }, + { + "cell_type": "markdown", + "id": "5bbd24cd", + "metadata": {}, + "source": [ + "You can see the fitness of the final individual by accessing the `fitness` attribute. Each fitness value corresponds to the objective of same index defined earlier for the `BrushRegressor` class. By default, it will try to minimize `\"error\"` and `\"size\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "166415c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitness(9.935950 18.000000 )\n", + "['error', 'size']\n" + ] + } + ], + "source": [ + "print(est.best_estimator_.fitness)\n", + "print(est.objectives)" + ] + }, + { + "cell_type": "markdown", + "id": "38b6364e", + "metadata": {}, + "source": [ + "A `fitness` in Brush is actually more than a tuple. It is a class that has all boolean comparison operators overloaded to allow an ease of use when prototyping with Brush.\n", + "\n", + "It also infers the weight of each objective to automatically handle minimization or maximization objetives.\n", + "\n", + "To see the weights, you can try:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "13d0ac5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-1.0, -1.0]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "est.best_estimator_.fitness.weights" + ] + }, + { + "cell_type": "markdown", + "id": "fe594691", + "metadata": {}, + "source": [ + "Brush let's you serialize the entire individual, or just the program or fitness it wraps. You can use pickle to save and load programs.\n", + "\n", + "This is all you need to save and load entire populations, and this feature allow us to create store the `Archive` after running the algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b4537631", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "import os, tempfile\n", + "\n", + "individual_file = os.path.join(tempfile.mkdtemp(), 'individual')\n", + "with open(individual_file, \"wb\") as f:\n", + " pickle.dump(est.best_estimator_, f)\n", + "\n", + "program_file = os.path.join(tempfile.mkdtemp(), 'program')\n", + "with open(program_file, \"wb\") as f:\n", + " pickle.dump(est.best_estimator_.program, f)" + ] + }, { "cell_type": "markdown", "id": "a355d8f3", @@ -115,10 +213,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "316964d5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.03*Add(If(x0>0.75,1.16*x1,191.72*x0),Add(x2,512.72*x6))\n" + ] + } + ], "source": [ "print(est.best_estimator_.get_model())" ] @@ -135,10 +241,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "dad68d01", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.03*Add\n", + "|-SplitBest\n", + " |-1.16*x1\n", + " |-191.72*x0\n", + "|-Add\n", + "| |-x2\n", + "| |-512.72*x6\n" + ] + } + ], "source": [ "print(est.best_estimator_.get_model(\"tree\"))" ] @@ -156,10 +276,132 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "3ef1a735", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "\n", + "\n", + "\n", + "7f6ad0015040\n", + "\n", + "Add\n", + "\n", + "\n", + "\n", + "y->7f6ad0015040\n", + "\n", + "\n", + "0.03\n", + "\n", + "\n", + "\n", + "7f6ad00811a0\n", + "\n", + "x0>0.75?\n", + "\n", + "\n", + "\n", + "7f6ad0015040->7f6ad00811a0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "7f6ad00810e0\n", + "\n", + "Add\n", + "\n", + "\n", + "\n", + "7f6ad0015040->7f6ad00810e0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "x1\n", + "\n", + "x1\n", + "\n", + "\n", + "\n", + "7f6ad00811a0->x1\n", + "\n", + "\n", + "1.16\n", + "Y\n", + "\n", + "\n", + "\n", + "x0\n", + "\n", + "x0\n", + "\n", + "\n", + "\n", + "7f6ad00811a0->x0\n", + "\n", + "\n", + "191.72\n", + "N\n", + "\n", + "\n", + "\n", + "x2\n", + "\n", + "x2\n", + "\n", + "\n", + "\n", + "7f6ad00810e0->x2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "x6\n", + "\n", + "x6\n", + "\n", + "\n", + "\n", + "7f6ad00810e0->x6\n", + "\n", + "\n", + "512.72\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import graphviz\n", "\n", @@ -177,10 +419,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "1f7e725e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "digraph G {\n", + "y [shape=box];\n", + "y -> \"7f6ad0015040\" [label=\"0.03\"];\n", + "\"7f6ad0015040\" [label=\"Add\"];\n", + "\"7f6ad0015040\" -> \"7f6ad00811a0\" [label=\"\"];\n", + "\"7f6ad0015040\" -> \"7f6ad00810e0\" [label=\"\"];\n", + "\"7f6ad00811a0\" [label=\"x0>0.75?\"];\n", + "\"7f6ad00811a0\" -> \"x1\" [headlabel=\"1.16\",taillabel=\"Y\"];\n", + "\"7f6ad00811a0\" -> \"x0\" [headlabel=\"191.72\",taillabel=\"N\"];\n", + "\"x1\" [label=\"x1\"];\n", + "\"x0\" [label=\"x0\"];\n", + "\"7f6ad00810e0\" [label=\"Add\"];\n", + "\"7f6ad00810e0\" -> \"x2\" [label=\"\"];\n", + "\"7f6ad00810e0\" -> \"x6\" [label=\"512.72\"];\n", + "\"x2\" [label=\"x2\"];\n", + "\"x6\" [label=\"x6\"];\n", + "}\n", + "\n" + ] + } + ], "source": [ "print(model)" ] @@ -200,10 +467,132 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "f35b1e05", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "\n", + "\n", + "\n", + "7f6ad0015040\n", + "\n", + "Add\n", + "\n", + "\n", + "\n", + "y->7f6ad0015040\n", + "\n", + "\n", + "0.03\n", + "\n", + "\n", + "\n", + "7f6ad00811a0\n", + "\n", + "x0>0.75?\n", + "\n", + "\n", + "\n", + "7f6ad0015040->7f6ad00811a0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "7f6ad00810e0\n", + "\n", + "Add\n", + "\n", + "\n", + "\n", + "7f6ad0015040->7f6ad00810e0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "x1\n", + "\n", + "x1\n", + "\n", + "\n", + "\n", + "7f6ad00811a0->x1\n", + "\n", + "\n", + "1.16\n", + "Y\n", + "\n", + "\n", + "\n", + "x0\n", + "\n", + "x0\n", + "\n", + "\n", + "\n", + "7f6ad00811a0->x0\n", + "\n", + "\n", + "191.72\n", + "N\n", + "\n", + "\n", + "\n", + "x2\n", + "\n", + "x2\n", + "\n", + "\n", + "\n", + "7f6ad00810e0->x2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "x6\n", + "\n", + "x6\n", + "\n", + "\n", + "\n", + "7f6ad00810e0->x6\n", + "\n", + "\n", + "512.72\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model = est.best_estimator_.get_dot_model(\"rankdir=LR;\")\n", "graphviz.Source(model)" @@ -226,7 +615,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.12.2" } }, "nbformat": 4, From f036a2dc33bbc438fcb744cc30cc27e6b6432dcd Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 15:09:27 -0300 Subject: [PATCH 182/199] Fixed bug when passing a list of functions --- pybrush/BrushEstimator.py | 10 +++------- pybrush/DeapEstimator.py | 8 +------- pybrush/EstimatorInterface.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 53f02af4..790aed7e 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -64,11 +64,6 @@ def fit(self, X, y): feature_names=self.feature_names_, validation_size=self.validation_size) - if isinstance(self.functions, list): - self.functions_ = {k:1.0 for k in self.functions} - else: - self.functions_ = self.functions - # set n classes if relevant self.n_classes_ = 0 if self.mode=="classification": @@ -80,9 +75,10 @@ def fit(self, X, y): self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation self.validation_ = self.data_.get_validation_data() - self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init) - self.parameters_ = self._wrap_parameters() + + self.search_space_ = SearchSpace(self.data_, self.parameters_.functions, self.weights_init) + self.engine_ = None if self.mode == 'classification': self.engine_ = ( ClassifierEngine diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index bb162f8b..6ef02d5e 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -140,11 +140,6 @@ def fit(self, X, y): feature_names=self.feature_names_, validation_size=self.validation_size) - if isinstance(self.functions, list): - self.functions_ = {k:1.0 for k in self.functions} - else: - self.functions_ = self.functions - # set n classes if relevant self.n_classes_ = 0 if self.mode=="classification": @@ -157,9 +152,8 @@ def fit(self, X, y): self.validation_ = self.data_.get_validation_data() - self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init) - self.parameters_ = self._wrap_parameters() + self.search_space_ = SearchSpace(self.data_, self.parameters_.functions, self.weights_init) if self.mode == "classification": self.variator_ = (ClassifierVariator diff --git a/pybrush/EstimatorInterface.py b/pybrush/EstimatorInterface.py index c4277e53..d160eb1a 100644 --- a/pybrush/EstimatorInterface.py +++ b/pybrush/EstimatorInterface.py @@ -5,6 +5,7 @@ provides documentation for the hyperparameters. """ +import numpy as np from pybrush import Parameters class EstimatorInterface(): @@ -155,6 +156,11 @@ def _wrap_parameters(self): the algorithm to use. """ + if isinstance(self.functions, list): + self.functions_ = {k:1.0 for k in self.functions} + else: + self.functions_ = self.functions + params = Parameters() params.classification = self.mode == "classification" @@ -174,7 +180,7 @@ def _wrap_parameters(self): params.use_arch = self.use_arch params.val_from_arch = self.val_from_arch params.mig_prob = self.mig_prob - params.functions = self.functions + params.functions = self.functions_ params.mutation_probs = self.mutation_probs params.validation_size = self.validation_size params.batch_size = self.batch_size @@ -187,7 +193,7 @@ def _wrap_parameters(self): if self.random_state is not None: seed = 0 if isinstance(self.random_state, np.random.Generator): - seed = self.random_state.integers(10000) + seed = self.random_state.integers(1_000_000) elif isinstance(self.random_state, int): seed = self.random_state else: From 9375c8844ade7066437fc16620b4d3cbeefa0002 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 15:09:59 -0300 Subject: [PATCH 183/199] more cleaning. Implemented string representation for fitness --- src/bindings/bind_fitness.cpp | 32 +++++++------- src/bindings/bind_individuals.h | 5 +++ src/bindings/bind_selection.cpp | 6 --- src/bindings/bind_selection.h | 16 ++++--- src/bindings/bind_variation.cpp | 7 +--- src/bindings/bind_variation.h | 31 ++++++++------ src/data/data.h | 2 + src/eval/metrics.h | 2 - src/ind/fitness.cpp | 74 ++++++++++++++++++++++++++++++++- src/ind/fitness.h | 33 +++++++-------- src/ind/individual.cpp | 69 ------------------------------ src/ind/individual.h | 6 ++- src/pop/population.h | 35 +--------------- src/program/node.h | 1 - src/util/utils.h | 48 +++++++++++++++++++++ src/vary/search_space.cpp | 36 ---------------- 16 files changed, 193 insertions(+), 210 deletions(-) diff --git a/src/bindings/bind_fitness.cpp b/src/bindings/bind_fitness.cpp index 8b031b10..c483acfc 100644 --- a/src/bindings/bind_fitness.cpp +++ b/src/bindings/bind_fitness.cpp @@ -30,21 +30,21 @@ void bind_fitness(py::module& m) .def("__gt__", &br::Fitness::operator>, py::is_operator()) .def("__le__", &br::Fitness::operator<=, py::is_operator()) .def("__ge__", &br::Fitness::operator>=, py::is_operator()) - // .def("__str__", &br::Fitness::toString, "String representation of the Fitness object") - // .def("__repr__", &br::Fitness::repr, "Representation for debugging the Fitness object") - .def(py::pickle( - [](const br::Fitness &f) { // __getstate__ - /* Return a tuple that fully encodes the state of the object */ - // return py::make_tuple(p.value(), p.extra()); - nl::json j = f; - return j; - }, - [](nl::json j) { // __setstate__ - br::Fitness f = j; - return f; - } - ) - ) - ; + .def("__str__", &br::Fitness::toString, "String representation of the Fitness object") + .def("__repr__", &br::Fitness::repr, "Representation for debugging the Fitness object") + .def(py::pickle( + [](const br::Fitness &f) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = f; + return j; + }, + [](nl::json j) { // __setstate__ + br::Fitness f = j; + return f; + } + ) + ) + ; } \ No newline at end of file diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index 5c5b62a8..d8808bea 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -36,6 +36,11 @@ void bind_individual(py::module& m, string name) .def_property("objectives", &Class::get_objectives, &Class::set_objectives) .def_property_readonly("program", &Class::get_program) .def_property_readonly("fitness", &Class::get_fitness) + .def("get_model", &Class::get_model, + py::arg("fmt") = "compact", + py::arg("pretty") = false) + .def("get_dot_model", &Class::get_dot_model, + py::arg("extras") = "") .def("fit", static_cast(&Class::fit), "fit from Dataset object") diff --git a/src/bindings/bind_selection.cpp b/src/bindings/bind_selection.cpp index f8a8641c..427ead9e 100644 --- a/src/bindings/bind_selection.cpp +++ b/src/bindings/bind_selection.cpp @@ -5,14 +5,8 @@ namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; -// using Reg = br::Program; -// using Cls = br::Program; -// using Rep = br::Program; -// using MCls = br::Program; - void bind_selections(py::module& m) { - // TODO: make them a single class bind_selection(m, "RegressorSelector"); bind_selection(m, "ClassifierSelector"); diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index 2d1ed49f..b8c9d45b 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -1,4 +1,5 @@ #include "module.h" + // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) #include "../selection/selection.h" #include "../selection/selection.cpp" @@ -12,9 +13,6 @@ #include "../pop/population.cpp" #include "../pop/population.h" -// #include "../individual.h" -//#include "../selection/selection.cpp" - namespace py = pybind11; namespace nl = nlohmann; namespace br = Brush; @@ -31,7 +29,8 @@ void bind_selection(py::module& m, string name) .def(py::init( [](string type, bool survival){ Class s(type, survival); return s; }) ) - .def("select", [](Class &self, std::vector>& individuals, + .def("select", [](Class &self, + std::vector>& individuals, const Parameters& params) { // auto sel = Class("nsga2", false); @@ -53,10 +52,10 @@ void bind_selection(py::module& m, string name) } } - // returns references return pool; }) - .def("survive", [](Class &self, std::vector>& individuals, + .def("survive", [](Class &self, + std::vector>& individuals, const Parameters& params) { // auto sel = Class("nsga2", false); @@ -76,10 +75,10 @@ void bind_selection(py::module& m, string name) } } - // returns references return pool; }) - .def("migrate", [](Class &self, std::vector>& individuals, + .def("migrate", [](Class &self, + std::vector>& individuals, const Parameters& params) { auto pop = br::Pop::Population(); @@ -98,7 +97,6 @@ void bind_selection(py::module& m, string name) pool.push_back(pop[idx]); } } - // returns references return pool; }) ; diff --git a/src/bindings/bind_variation.cpp b/src/bindings/bind_variation.cpp index 0a772c7c..739d115e 100644 --- a/src/bindings/bind_variation.cpp +++ b/src/bindings/bind_variation.cpp @@ -5,14 +5,9 @@ namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; -// using Reg = br::Program; -// using Cls = br::Program; -// using Rep = br::Program; -// using MCls = br::Program; - void bind_variations(py::module& m) { - bind_variation(m, "RegressorVariator"); + bind_variation(m,"RegressorVariator"); bind_variation(m, "ClassifierVariator"); bind_variation(m, "MultiClassifierVariator"); bind_variation(m, "RepresenterVariator"); diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index fe697c95..88cacc3a 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,9 +1,9 @@ #include "module.h" -#include "../vary/variation.h" -#include "../vary/variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) -#include "../pop/population.cpp" +#include "../vary/variation.h" +#include "../vary/variation.cpp" #include "../pop/population.h" +#include "../pop/population.cpp" namespace py = pybind11; namespace nl = nlohmann; @@ -22,10 +22,17 @@ void bind_variation(py::module& m, string name) return variation; })) .def("mutate", &Class::mutate, py::return_value_policy::automatic) .def("cross", &Class::cross, py::return_value_policy::automatic) - .def("vary_pop", [](Class &self, std::vector>& individuals, const Parameters& params) { - + .def("vary_pop", [](Class &self, + std::vector>& individuals, + const Parameters& params) { if (individuals.size() != params.pop_size) { - throw std::runtime_error("Individual vector has different number of individuals than pop_size. When calling variation, they should be the same. popsize is "+to_string(params.pop_size)+", number of individuals is " + to_string(individuals.size())); + string msg = "Individual vector has different number of " + "individuals than pop_size. When calling " + "variation, they should be the same. popsize is "+ + to_string(params.pop_size)+", number of " + "individuals is "+to_string(individuals.size()); + + throw std::runtime_error(msg); } auto pop = br::Pop::Population(); @@ -37,10 +44,12 @@ void bind_variation(py::module& m, string name) for (int island = 0; island < params.num_islands; ++island) { - // I am assuming the individual vector passed as argument will contain the selected parents already + // I am assuming the individual vector passed as argument + // will contain the selected parents already vector parents = pop.get_island_indexes(island); - // including offspring indexes (the vary method will store the offspring in the second half of the index vector) + // including offspring indexes (the vary method will store the + // offspring in the second half of the index vector) pop.add_offspring_indexes(island); self.vary(pop, island, parents, params); @@ -53,10 +62,8 @@ void bind_variation(py::module& m, string name) // this is where the offspring is saved pool.push_back(pop[indices.at(i)]); } - } - - // returns references + } return pool; }) ; -} \ No newline at end of file +} diff --git a/src/data/data.h b/src/data/data.h index 7dde291b..e145957d 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -98,6 +98,8 @@ class Dataset const vector& vn = {} ); + // TODO: let the user specify the datatypes + /// turns input into a feature map, with feature types copied from a reference map copy_and_make_features(const ArrayXXf& X, const Dataset& ref_dataset, diff --git a/src/eval/metrics.h b/src/eval/metrics.h index fab075d1..5f4439f9 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -12,8 +12,6 @@ namespace Eval { float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, const vector& class_weights=vector() ); -// TODO: test cases for the metrics - /// log loss (2 methods below) VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, const vector& class_weights=vector()); diff --git a/src/ind/fitness.cpp b/src/ind/fitness.cpp index 9e2de1ca..e3bd2d59 100644 --- a/src/ind/fitness.cpp +++ b/src/ind/fitness.cpp @@ -1 +1,73 @@ -#include "fitness.h" \ No newline at end of file +#include "fitness.h" + +namespace Brush +{ + +void to_json(json &j, const Fitness &f) +{ + j = json{ + {"values", f.values}, + {"weights", f.weights}, + {"wvalues", f.wvalues}, + {"loss", f.loss}, + {"loss_v", f.loss_v}, + {"complexity", f.complexity}, + {"size", f.size}, + {"depth", f.depth}, + {"dcounter", f.dcounter}, + {"dominated", f.dominated}, + {"rank", f.rank}, + {"crowding_dist", f.crowding_dist} + }; +} + +void from_json(const json &j, Fitness& f) +{ + j.at("values").get_to( f.values ); + j.at("weights").get_to( f.weights ); + j.at("wvalues").get_to( f.wvalues ); + j.at("loss").get_to( f.loss ); + j.at("loss_v").get_to( f.loss_v ); + j.at("complexity").get_to( f.complexity ); + j.at("size").get_to( f.size ); + j.at("depth").get_to( f.depth ); + j.at("dcounter").get_to( f.dcounter ); + j.at("dominated").get_to( f.dominated ); + j.at("rank").get_to( f.rank ); + j.at("crowding_dist").get_to( f.crowding_dist ); +} + + +int Fitness::dominates(const Fitness& b) const +{ + int flag1 = 0, // to check if this has a better objective + flag2 = 0; // to check if b has a better objective + + // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) + for (int i=0; i b.get_wvalues().at(i) + || std::isnan(b.get_wvalues().at(i)) + ) + flag1 = 1; + if (get_wvalues().at(i) < b.get_wvalues().at(i) + || std::isnan(get_wvalues().at(i)) + ) + flag2 = 1; + } + + // the proper way of comparing weighted values is considering everything as a maximization problem + // (this is like deap does, and our fitness is inspired by them) + if (flag1==1 && flag2==0) + // there is at least one smaller objective for this and none + // for b + return 1; + else if (flag1==0 && flag2==1) + // there is at least one smaller objective for b and none + // for this + return -1; + else + // no smaller objective or both have one smaller + return 0; +} + +} // Brush \ No newline at end of file diff --git a/src/ind/fitness.h b/src/ind/fitness.h index abbcccf4..db0885e8 100644 --- a/src/ind/fitness.h +++ b/src/ind/fitness.h @@ -3,25 +3,15 @@ #include #include "../init.h" - +#include "../util/utils.h" using namespace nlohmann; - -template <> // this is intended to be used with DEAP (so our brush individuals can be hashed and compared to each other in python side) -struct std::hash> { - std::size_t operator()(const std::vector& v) const { - std::size_t seed = v.size(); - for (const auto& elem : v) { - seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; - } -}; - namespace Brush{ + struct Fitness { // the loss is used in evolutionary functions + float loss; ///< aggregate loss score float loss_v; ///< aggregate validation loss score @@ -163,18 +153,27 @@ struct Fitness { // String representation std::string toString() const { if (valid()) { - return "TODO: implement string representation"; //std::to_string(wvalues); + string s = "Fitness("; + for (auto& v : values) + s += to_string(v) + " "; + return s+")"; } else { - return "Tuple()"; + return "Fitness()"; } } // Representation for debugging std::string repr() const { - return "TODO: implement string representation"; + if (valid()) { + string s = "Fitness("; + for (auto& v : values) + s += to_string(v) + " "; + return s+")"; + } else { + return "Fitness()"; + } } - /// set obj vector given a string of objective names int dominates(const Fitness& b) const; }; diff --git a/src/ind/individual.cpp b/src/ind/individual.cpp index 6d48b2a7..a08668c0 100644 --- a/src/ind/individual.cpp +++ b/src/ind/individual.cpp @@ -1,75 +1,6 @@ #include "individual.h" namespace Brush{ - -void to_json(json &j, const Fitness &f) -{ - j = json{ - {"values", f.values}, - {"weights", f.weights}, - {"wvalues", f.wvalues}, - {"loss", f.loss}, - {"loss_v", f.loss_v}, - {"complexity", f.complexity}, - {"size", f.size}, - {"depth", f.depth}, - {"dcounter", f.dcounter}, - {"dominated", f.dominated}, - {"rank", f.rank}, - {"crowding_dist", f.crowding_dist} - }; -} - -void from_json(const json &j, Fitness& f) -{ - j.at("values").get_to( f.values ); - j.at("weights").get_to( f.weights ); - j.at("wvalues").get_to( f.wvalues ); - j.at("loss").get_to( f.loss ); - j.at("loss_v").get_to( f.loss_v ); - j.at("complexity").get_to( f.complexity ); - j.at("size").get_to( f.size ); - j.at("depth").get_to( f.depth ); - j.at("dcounter").get_to( f.dcounter ); - j.at("dominated").get_to( f.dominated ); - j.at("rank").get_to( f.rank ); - j.at("crowding_dist").get_to( f.crowding_dist ); -} - - -int Fitness::dominates(const Fitness& b) const -{ - int flag1 = 0, // to check if this has a better objective - flag2 = 0; // to check if b has a better objective - - // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) - for (int i=0; i b.get_wvalues().at(i) - || std::isnan(b.get_wvalues().at(i)) - ) - flag1 = 1; - if (get_wvalues().at(i) < b.get_wvalues().at(i) - || std::isnan(get_wvalues().at(i)) - ) - flag2 = 1; - } - - // the proper way of comparing weighted values is considering everything as a maximization problem - // (this is like deap does, and our fitness is inspired by them) - if (flag1==1 && flag2==0) - // there is at least one smaller objective for this and none - // for b - return 1; - else if (flag1==0 && flag2==1) - // there is at least one smaller objective for b and none - // for this - return -1; - else - // no smaller objective or both have one smaller - return 0; -} - - namespace Pop{ diff --git a/src/ind/individual.h b/src/ind/individual.h index f411bd75..472b9f05 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -83,11 +83,15 @@ class Individual{ // just getters bool get_is_fitted() const { return this->is_fitted_; }; - string get_model() const { return program.get_model(); }; unsigned int get_size() const { return program.size(); }; unsigned int get_depth() const { return program.depth(); }; unsigned int get_complexity() const { return program.complexity(); }; Program& get_program() { return program; }; + + string get_model(string fmt="compact", bool pretty=false) { + return program.get_model(fmt, pretty); }; + string get_dot_model(string extras="") { + return program.get_dot_model(extras); }; void set_fitness(Fitness &f) { fitness=f; }; Fitness& get_fitness() { return fitness; }; diff --git a/src/pop/population.h b/src/pop/population.h index 98cf5d7f..6871c6e8 100644 --- a/src/pop/population.h +++ b/src/pop/population.h @@ -1,43 +1,10 @@ #ifndef POPULATION_H #define POPULATION_H +#include "../util/utils.h" #include "../util/error.h" #include "../ind/individual.h" -// TODO: move this serialization elsewhere -// serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 -// (this is used by population, which has a shared_ptr vector) -namespace nlohmann -{ -template -struct adl_serializer> -{ - static void to_json(json& j, const std::shared_ptr& opt) - { - if (opt) - { - j = *opt; - } - else - { - j = nullptr; - } - } - - static void from_json(const json& j, std::shared_ptr& opt) - { - if (j.is_null()) - { - opt = nullptr; - } - else - { - opt.reset(new T(j.get())); - } - } -}; -} - namespace Brush { namespace Pop { diff --git a/src/program/node.h b/src/program/node.h index 8092664b..a6265f31 100644 --- a/src/program/node.h +++ b/src/program/node.h @@ -39,7 +39,6 @@ using Brush::Data::Dataset; namespace Brush{ -// TODO: should I move this declaration to another place? template inline auto Isnt(DataType dt) -> bool { return !((dt == T) || ...); } diff --git a/src/util/utils.h b/src/util/utils.h index 932a0cca..f767e653 100644 --- a/src/util/utils.h +++ b/src/util/utils.h @@ -27,6 +27,54 @@ using namespace std; * @brief namespace containing various utility functions */ +// serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 +// (used in population.h, which has a shared_ptr vector) +namespace nlohmann +{ +template +struct adl_serializer> +{ + static void to_json(json& j, const std::shared_ptr& opt) + { + if (opt) + { + j = *opt; + } + else + { + j = nullptr; + } + } + + static void from_json(const json& j, std::shared_ptr& opt) + { + if (j.is_null()) + { + opt = nullptr; + } + else + { + opt.reset(new T(j.get())); + } + } +}; +} + +// to overload operators and compare our individuals, we need to be able to +// serialize vectors. +// this is intended to be used with DEAP (so our brush individuals +// can be hashed and compared to each other in python side) +template <> +struct std::hash> { + std::size_t operator()(const std::vector& v) const { + std::size_t seed = v.size(); + for (const auto& elem : v) { + seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + // namespace std // { diff --git a/src/vary/search_space.cpp b/src/vary/search_space.cpp index 943f3821..95a9cf0b 100644 --- a/src/vary/search_space.cpp +++ b/src/vary/search_space.cpp @@ -276,10 +276,6 @@ tree& SearchSpace::PTC2(tree& Tree, // parameters, the real maximum size that can occur is `max_size` plus the // highest operator arity, and the real maximum depth is `max_depth` plus one. - // auto Tree = tree(); - - // fmt::print("building program with max size {}, max depth {}",max_size,max_d); - // Queue of nodes that need children vector> queue; @@ -290,9 +286,6 @@ tree& SearchSpace::PTC2(tree& Tree, Node root = spot.node->data; - // cout << "root " << root.name << endl; - // auto spot = Tree.set_head(n); - // updating size accordingly to root node if (Is(root.node_type)) s += 3; @@ -315,8 +308,6 @@ tree& SearchSpace::PTC2(tree& Tree, Node n; // Now we actually start the PTC2 procedure to create the program tree - // cout << "queue size: " << queue.size() << endl; - // cout << "entering first while loop...\n"; while ( queue.size() + s < max_size && queue.size() > 0) { // including the queue size in the max_size, since each element in queue @@ -334,12 +325,6 @@ tree& SearchSpace::PTC2(tree& Tree, // cout << "current depth: " << d << endl; if (d >= max_d || s >= max_size) { - // choose terminal of matching type - // cout << "getting " << DataTypeName[t] << " terminal\n"; - // qspot = sample_terminal(t); - // Tree.replace(qspot, sample_terminal(t)); - // Tree.append_child(qspot, sample_terminal(t)); - auto opt = sample_terminal(t); // if it returned optional, then there's nothing to sample based on weights. @@ -355,11 +340,7 @@ tree& SearchSpace::PTC2(tree& Tree, else { //choose a nonterminal of matching type - // cout << "getting op of type " << DataTypeName[t] << endl; auto opt = sample_op(t); - // cout << "chose " << n.name << endl; - // TreeIter new_spot = Tree.append_child(qspot, n); - // qspot = n; if (!opt) { // there is no operator for this node. sample a terminal instead opt = sample_terminal(t); @@ -380,8 +361,6 @@ tree& SearchSpace::PTC2(tree& Tree, // For each arg of n, add to queue for (auto a : n.arg_types) { - // cout << "queing a node of type " << DataTypeName[a] << endl; - // queue.push_back(make_tuple(new_spot, a, d+1)); auto child_spot = Tree.append_child(newspot); queue.push_back(make_tuple(child_spot, a, d+1)); @@ -399,25 +378,15 @@ tree& SearchSpace::PTC2(tree& Tree, if ( n.get_is_weighted()==true && Isnt(n.node_type) ) s += 2; - - // cout << "current tree size: " << s << endl; } - // cout << "entering second while loop...\n"; while (queue.size() > 0) { if (queue.size() == 0) break; - // cout << "queue size: " << queue.size() << endl; - auto [qspot, t, d] = RandomDequeue(queue); - // cout << "getting " << DataTypeName[t] << " terminal\n"; - // Tree.append_child(qspot, sample_terminal(t)); - // qspot = sample_terminal(t); - // auto newspot = Tree.replace(qspot, sample_terminal(t)); - auto opt = sample_terminal(t); if (!opt) opt = sample_terminal(t, true); @@ -426,11 +395,6 @@ tree& SearchSpace::PTC2(tree& Tree, auto newspot = Tree.replace(qspot, n); } - - // cout << "final tree:\n" - // << Tree.begin().node->get_model() << "\n" - // << Tree.begin().node->get_tree_model(true) << endl; - return Tree; }; From cbcfcbee9ed2d1626b14b30b365106dae37b3ffc Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 08:44:13 -0300 Subject: [PATCH 184/199] Final adjustments to save/load pop and use archive --- pybrush/BrushEstimator.py | 4 ++- pybrush/EstimatorInterface.py | 19 +++++++++++ src/bindings/bind_individuals.h | 9 +++-- src/bindings/bind_params.cpp | 1 + src/data/data.h | 2 +- src/engine.cpp | 42 +++++++----------------- src/engine.h | 22 ++----------- src/params.h | 8 ++++- src/pop/archive.cpp | 2 -- src/pop/archive.h | 3 +- src/pop/population.cpp | 2 +- src/program/optimizer/weight_optimizer.h | 1 - 12 files changed, 52 insertions(+), 63 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 790aed7e..6f149f94 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -88,6 +88,8 @@ def fit(self, X, y): self.engine_ = RegressorEngine(self.parameters_) self.engine_.fit(self.data_) + + self.archive_ = self.engine_.get_archive() self.best_estimator_ = self.engine_.best_ind return self @@ -224,7 +226,7 @@ def predict_proba(self, X): return prob - def predict_archive(self, X): + def predict_proba_archive(self, X): """Returns a list of dictionary predictions for all models.""" check_is_fitted(self) diff --git a/pybrush/EstimatorInterface.py b/pybrush/EstimatorInterface.py index d160eb1a..e29d1817 100644 --- a/pybrush/EstimatorInterface.py +++ b/pybrush/EstimatorInterface.py @@ -86,6 +86,16 @@ class EstimatorInterface(): Percentage of training data to sample every generation. If `1.0`, then all data is used. Very small values can improve execution time, but also lead to underfit. + save_population: str, optional (default "") + string containing the path to save the final population. Ignored if + not provided. + load_population: str, optional (default "") + string containing the path to load the initial population. Ignored + if not provided. + shuffle_split: boolean, optional (default False) + whether if the engine should shuffle the data before splitting it + into train and validation partitions. Ignored if `validation_size` + is set to zero. logfile: str, optional (default: "") If specified, spits statistics into a logfile. "" means don't log. random_state: int or None, default None @@ -119,6 +129,9 @@ def __init__(self, objectives=["error", "size"], random_state=None, logfile="", + save_population="", + load_population="", + shuffle_split=False, weights_init=True, val_from_arch=True, use_arch=False, @@ -139,11 +152,14 @@ def __init__(self, self.n_jobs=n_jobs self.cx_prob=cx_prob self.logfile=logfile + self.save_population=save_population + self.load_population=load_population self.mutation_probs=mutation_probs self.val_from_arch=val_from_arch # TODO: val from arch implementation (in cpp side) self.use_arch=use_arch self.functions=functions self.objectives=objectives + self.shuffle_split=shuffle_split self.initialization=initialization self.random_state=random_state self.batch_size=batch_size @@ -170,12 +186,15 @@ def _wrap_parameters(self): params.pop_size = self.pop_size params.max_gens = self.max_gens params.logfile = self.logfile + params.save_population = self.save_population + params.load_population = self.load_population params.max_stall = self.max_stall params.max_time = self.max_time params.num_islands = self.num_islands params.max_depth = self.max_depth params.max_size = self.max_size params.objectives = self.objectives + params.shuffle_split = self.shuffle_split params.cx_prob = self.cx_prob params.use_arch = self.use_arch params.val_from_arch = self.val_from_arch diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index d8808bea..5777c5e2 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -12,11 +12,10 @@ using Rep = Brush::RepresenterIndividual; using stream_redirect = py::call_guard; -// TODO: unify PT or T -template +template void bind_individual(py::module& m, string name) { - using Class = br::Pop::Individual; + using Class = br::Pop::Individual; using RetType = std::conditional_t< std::is_same_v, ArrayXf, @@ -25,10 +24,10 @@ void bind_individual(py::module& m, string name) py::class_ ind(m, name.data() ); ind.def(py::init<>()) - .def(py::init([](br::Program& prg){ Class i(prg); + .def(py::init([](br::Program& prg){ Class i(prg); return i; }) ) - .def(py::init([](const json& j){ br::Program prg = j; + .def(py::init([](const json& j){ br::Program prg = j; Class i(prg); return i; }) ) diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp index 1519bfe4..a4db4ae6 100644 --- a/src/bindings/bind_params.cpp +++ b/src/bindings/bind_params.cpp @@ -30,6 +30,7 @@ void bind_params(py::module& m) .def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes) .def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes) .def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification) + .def_property("shuffle_split", &Brush::Parameters::get_shuffle_split, &Brush::Parameters::set_shuffle_split) .def_property("validation_size", &Brush::Parameters::get_validation_size, &Brush::Parameters::set_validation_size) .def_property("feature_names", &Brush::Parameters::get_feature_names, &Brush::Parameters::set_feature_names) .def_property("batch_size", &Brush::Parameters::get_batch_size, &Brush::Parameters::set_batch_size) diff --git a/src/data/data.h b/src/data/data.h index e145957d..a5d8ee26 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -205,7 +205,7 @@ class Dataset // if split is not set, then training = validation. Dataset get_training_data() const; Dataset get_validation_data() const; - + // TODO: shuffle split inline int get_n_samples() const { return std::visit( [&](auto&& arg) -> int { return int(arg.size());}, diff --git a/src/engine.cpp b/src/engine.cpp index b6623dc5..fb3218cc 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -23,13 +23,6 @@ void Engine::init() this->pop = Population(); - // TODO: load population into file - // TODO: if initializing from a population file, then this is where we should load previous models. - // three behaviors: if we have only 1 ind, then replicate it trought the entire pop - // if n_ind is the same as pop_size, load all models. if n_ind != pop_size, throw error - if (params.load_population != "") - this->pop.load(params.load_population); - this->evaluator = Evaluation(); // TODO: make these classes have a default constructor, and stop recreating instances @@ -197,11 +190,16 @@ void Engine::print_stats(std::ofstream& log, float fraction) template vector Engine::get_archive(bool front) { - json j; // TODO: use this front argument (or remove it). I think I can remove + vector archive_vector; // Use a vector to store serialized individuals + + // TODO: use this front argument (or remove it). I think I can remove for (const auto& ind : archive.individuals) { - to_json(j, ind); // Serialize each individual + json j; // Serialize each individual + to_json(j, ind); + archive_vector.push_back(j); } - return j; + + return archive_vector; } // TODO: private function called find_individual that searches for it based on id. Then, @@ -332,7 +330,10 @@ void Engine::run(Dataset &data) this->init(); - pop.init(this->ss, this->params); + if (params.load_population != "") + this->pop.load(params.load_population); + else + this->pop.init(this->ss, this->params); // log file stream std::ofstream log; @@ -490,14 +491,6 @@ void Engine::run(Dataset &data) this->set_is_fitted(true); - // TODO: make this work - // if (save_pop > 0) - // { - // pop.save(this->logfile+".pop.gen" + to_string(params.current_gen) - // + ".json"); - // this->best_ind.save(this->logfile+".best.json"); - // } - // TODO: open, write, close? (to avoid breaking the file and allow some debugging if things dont work well) if (log.is_open()) log.close(); @@ -508,13 +501,11 @@ void Engine::run(Dataset &data) { archive.individuals.resize(0); for (int island =0; island< pop.num_islands; ++island) { - // cout << "island" << island << endl; vector indices = pop.get_island_indexes(island); for (unsigned i = 0; i::run(Dataset &data) body.precede(back); back.precede(cond); - //std::cout << "taskflow configured " << std::endl; executor.run(taskflow); - - //std::cout << "submitted jobs " << std::endl; - executor.wait_for_all(); - //std::cout << "finished " << std::endl; //When you have tasks that are created at runtime (e.g., subflow, // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph. - - // printing the graph - //std::cout << "dumping taskflow in json " << std::endl; - // taskflow.dump(std::cout); } } \ No newline at end of file diff --git a/src/engine.h b/src/engine.h index c87d3b4f..7b02b411 100644 --- a/src/engine.h +++ b/src/engine.h @@ -51,7 +51,7 @@ class Engine{ inline bool get_is_fitted(){return is_fitted;} /// updates best score by searching in the population for the individual that best fits the given data - bool update_best(const Dataset& data, bool val=false); + bool update_best(const Dataset& data, bool val=false); // TODO: hyperparameter to set how the best is picked (MCDM, best on val, pareto front, etc). one of the options should be getting the pareto front // TODO: best fitness (the class) instead of these. use fitness comparison @@ -89,22 +89,6 @@ class Engine{ return predict_proba(d); }; - // TODO: starting pop (just like feat) - - // TODO: make these work - // /// predict on unseen data. - // VectorXf predict(MatrixXf& X, LongData& Z); - // VectorXf predict(MatrixXf& X); - - // /// predict on unseen data. return CLabels. - // shared_ptr predict_labels(MatrixXf& X, LongData Z = LongData()); - - // /// predict probabilities of each class. - // ArrayXXf predict_proba(MatrixXf& X, LongData& Z); - // ArrayXXf predict_proba(MatrixXf& X); - - // archive stuff --- - ///return archive size int get_archive_size(){ return this->archive.individuals.size(); }; @@ -122,9 +106,7 @@ class Engine{ requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier)) auto predict_proba_archive(int id, const Ref& X); - // TODO: make these work - // VectorXf predict_archive(int id, const Ref& X, LongData& Z); - // ArrayXXf predict_proba_archive(int id, const Ref& X, LongData& Z); + // TODO: predict/predict_proba/archive with longitudinal data /// train the model void run(Dataset &d); diff --git a/src/params.h b/src/params.h index a7302461..b2ae656d 100644 --- a/src/params.h +++ b/src/params.h @@ -33,7 +33,7 @@ struct Parameters int max_stall = 0; int max_time = -1; - unsigned int max_depth = 6; // TODO: make all tests be based on these values for max depth and size + unsigned int max_depth = 6; unsigned int max_size = 50; vector objectives{"error","complexity"}; // error should be generic and deducted based on mode @@ -72,6 +72,9 @@ struct Parameters // the uses uses an dataset bool classification; unsigned int n_classes; + + // validation partition + bool shuffle_split = false; float validation_size = 0.75; vector feature_names = {}; float batch_size = 0.0; @@ -157,6 +160,9 @@ struct Parameters void set_classification(bool c){ classification = c; }; bool get_classification(){ return classification; }; + void set_shuffle_split(bool shuff){ shuffle_split = shuff; }; + bool get_shuffle_split(){ return shuffle_split; }; + void set_n_classes(unsigned int new_n_classes){ n_classes = new_n_classes; }; unsigned int get_n_classes(){ return n_classes; }; diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 80fdfe79..4dda26c0 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -111,7 +111,6 @@ void Archive::update(Population& pop, const Parameters& params) // refill archive with new pareto fronts (one pareto front for each island!) for (int island =0; island< pop.num_islands; ++island) { - cout << "island" << island << endl; vector indices = pop.get_island_indexes(island); // TODO: can i just call fast nds with all indexes in indices? @@ -119,7 +118,6 @@ void Archive::update(Population& pop, const Parameters& params) for (const auto& i : front[0]) { individuals.push_back( *pop.individuals.at(i) ); - cout << "index" << i << endl; } } diff --git a/src/pop/archive.h b/src/pop/archive.h index 87cdebdc..49404e31 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -15,7 +15,8 @@ namespace Pop{ template struct Archive { - // I dont need shared pointers here + // I dont need shared pointers here (this is not suposed to be operated + // by several threads) vector> individuals; ///< individual programs in the archive bool sort_complexity; ///< whether to sort archive by complexity diff --git a/src/pop/population.cpp b/src/pop/population.cpp index 45338f04..300e5e78 100644 --- a/src/pop/population.cpp +++ b/src/pop/population.cpp @@ -102,7 +102,7 @@ void Population::save(string filename) if (!filename.empty()) out.open(filename); else - out.open("pop.json"); + out.open("population.json"); json j; to_json(j, *this); diff --git a/src/program/optimizer/weight_optimizer.h b/src/program/optimizer/weight_optimizer.h index 702f27bd..e7afbd35 100644 --- a/src/program/optimizer/weight_optimizer.h +++ b/src/program/optimizer/weight_optimizer.h @@ -77,7 +77,6 @@ struct ResidualEvaluator { // TODO: see this struct and try to understand how to make non-templated classes struct WeightOptimizer { - /// @brief Update program weights using non-linear least squares. /// @tparam PT the program type /// @param program the program From e021f13769d62fa18b2eec63076660f96f260bcc Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 09:14:22 -0300 Subject: [PATCH 185/199] Fix print statement when using archive --- src/engine.cpp | 6 ------ src/pop/archive.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index fb3218cc..e6909406 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -326,7 +326,6 @@ void Engine::run(Dataset &data) //TODO: i need to make sure i initialize everything (pybind needs to have constructors // without arguments to work, and i need to handle correcting these values before running) this->ss = SearchSpace(data, params.functions); - //std::cout << "search space was set" << std::endl; this->init(); @@ -402,16 +401,12 @@ void Engine::run(Dataset &data) stop, // loop condition [&](tf::Subflow& subflow) { // loop body (evolutionary main loop) - //std::cout << "inside body" << std::endl; auto prepare_gen = subflow.emplace([&]() { - //std::cout << "inside prepare gen" << std::endl; - //std::cout << " -------------------- generation " << generation << " -------------------- " << std::endl; params.set_current_gen(generation); batch = data.get_batch(); // will return the original dataset if it is set to dont use batch }).name("prepare generation");// set generation in params, get batch auto run_generation = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) { - //std::cout << "inside select parents" << std::endl; evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk) @@ -422,7 +417,6 @@ void Engine::run(Dataset &data) vector parents = selector.select(this->pop, island, params); for (int i=0; i< parents.size(); i++){ - //std::cout << i << std::endl; island_parents.at(island).at(i) = parents.at(i); } diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 4dda26c0..4eb5ebf9 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -89,14 +89,14 @@ void Archive::init(Population& pop) const auto& t = *pop.individuals.at(indices.at(i)); if (t.fitness.rank ==1){ - // TODO: check if this is creating a copy + // we can store a reference for the original ind, since + // variation operators does not change inplace. Ideally, the + // original individual is modified inplace just by fit(), which + // is a side effect that is OK to have here individuals.push_back(t); } } } - - // TODO: use this cout in the logger - cout << "intializing archive with " << individuals.size() << " inds\n"; if (this->sort_complexity) std::sort(individuals.begin(),individuals.end(), &sortComplexity); else From 17de2b0c54c84df4724887377f5b44e20062f354 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 11:42:17 -0300 Subject: [PATCH 186/199] Removed useless lambda function --- src/ind/individual.h | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/src/ind/individual.h b/src/ind/individual.h index 472b9f05..01731503 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -108,29 +108,20 @@ class Individual{ // template // void Individual::set_objectives(const vector& objectives) - // Static map for weights associated with strings - inline static std::map weightsMap = []() { - std::map map = { - // this will determine each fitness metric to be a min/max problem - {"complexity", -1.0}, - {"size", -1.0}, - {"mse", -1.0}, - {"log", -1.0}, - {"multi_log", -1.0}, - - {"accuracy", +1.0}, - - // generic error metrics (will use default metrics for clf or reg) - // by default we use log and multi_log if the user specifies error - // for a classification problem. However, other metrics (such as - // accuracy or precision or AUC) can be a maximization problem, - // so this map allow us to have flexibility when setting the - // objectives - {"error", (T == Brush::ProgramType::Regressor) ? -1.0 : -1.0} - }; - - return map; - }(); + // Static map for weights associated with strings. + // this will determine each fitness metric to be a min/max problem. + // generic error metric: by default log and multi_log if it is a + // classification problem, and MSE if it is a regression (so its always + // a minimization by default, thus "error" has weight -1.0) + inline static std::map weightsMap = { + {"complexity", -1.0}, + {"size", -1.0}, + {"mse", -1.0}, + {"log", -1.0}, + {"multi_log", -1.0}, + {"accuracy", +1.0}, + {"error", -1.0} + }; vector get_objectives() const { return objectives; }; void set_objectives(vector objs){ From 6fd411d055bef4340da0a05549c37eb0ee6296f2 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 11:42:52 -0300 Subject: [PATCH 187/199] Spacing --- tests/cpp/test_brush.cpp | 2 ++ tests/cpp/test_params.cpp | 13 +------------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 776f4af2..da489231 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -50,6 +50,8 @@ TEST(Engine, EngineWorks) params.set_mig_prob(0.0); // TODO: archive tests + + // TODO: test termination criterion --- max stall, generations, time params.set_verbosity(2); // TODO: verbosity tests diff --git a/tests/cpp/test_params.cpp b/tests/cpp/test_params.cpp index dc88d633..f1a07f32 100644 --- a/tests/cpp/test_params.cpp +++ b/tests/cpp/test_params.cpp @@ -1,14 +1,5 @@ #include "testsHeader.h" -// -// #include "../../src/individual.cpp" -// #include "../../src/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers -// #include "../../src/eval/evaluation.cpp" -// #include "../../src/selection/nsga2.cpp" -// #include "../../src/selection/lexicase.cpp" -// #include "../../src/selection/selection_operator.cpp" -// #include "../../src/selection/selection.cpp" - using namespace Brush::Pop; using namespace Brush::Sel; using namespace Brush::Eval; @@ -49,7 +40,5 @@ TEST(Params, ParamsTests) // ft.params.set_verbosity(2); // ASSERT_EQ(ft.params.verbosity, 2); - // ASSERT_STREQ("", logger.log("Hello", 3).c_str()); - - // TODO: test termination criterion --- max stall, generations, time + // ASSERT_STREQ("", logger.log("Hello", 3).c_str()); } From 3112a181f3148b289e90fc7d832d193d44416285 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 11:43:46 -0300 Subject: [PATCH 188/199] new example notebooks!! --- docs/guide/archive.ipynb | 365 ++++++++++++++++++ docs/guide/index.md | 2 + docs/guide/saving_loading_populations.ipynb | 281 ++++++++++++++ docs/guide/search_space.ipynb | 4 +- docs/guide/working_with_programs.ipynb | 401 +++++++++++--------- 5 files changed, 869 insertions(+), 184 deletions(-) diff --git a/docs/guide/archive.ipynb b/docs/guide/archive.ipynb index e69de29b..81fc4fb9 100644 --- a/docs/guide/archive.ipynb +++ b/docs/guide/archive.ipynb @@ -0,0 +1,365 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The archive\n", + "\n", + "When you fit a brush estimator, two new attributes are created: `best_estimator_` and `archive_`.\n", + "\n", + "If you set `use_arch` to `True` when instantiating the estimator, then it will store the pareto front as a list in `archive_`. This pareto front is always created with individuals from the final population that are not dominated in objectives **error** and **complexity**.\n", + "\n", + "In case you need more flexibility, the archive will contain the entire final population if `use_arch` is `False`, and you can iterate through this list to select individuals with different criteria. It is also good to remind that Brush supports different optimization objectives using the argument `objectives`.\n", + "\n", + "Each element from the archive is a serialized individual (JSON object)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pybrush import BrushClassifier\n", + "\n", + "# load data\n", + "df = pd.read_csv('../examples/datasets/d_analcatdata_aids.csv')\n", + "X = df.drop(columns='target')\n", + "y = df['target']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Completed 100% [====================]\n", + "score: 0.7\n" + ] + } + ], + "source": [ + "est = BrushClassifier(\n", + " functions=['SplitBest','Add','Mul','Sin','Cos','Exp','Logabs'],\n", + " use_arch=True,\n", + " max_gens=100,\n", + " verbosity=1\n", + ")\n", + "\n", + "est.fit(X,y)\n", + "y_pred = est.predict(X)\n", + "print('score:', est.score(X,y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see individuals from archive using the index:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n" + ] + }, + { + "data": { + "text/plain": [ + "{'fitness': {'complexity': 80,\n", + " 'crowding_dist': 0.0,\n", + " 'dcounter': 0,\n", + " 'depth': 3,\n", + " 'dominated': [],\n", + " 'loss': 0.5091069936752319,\n", + " 'loss_v': 0.5091069936752319,\n", + " 'rank': 1,\n", + " 'size': 12,\n", + " 'values': [0.5091069936752319, 12.0],\n", + " 'weights': [-1.0, -1.0],\n", + " 'wvalues': [-0.5091069936752319, -12.0]},\n", + " 'id': 10060,\n", + " 'objectives': ['error', 'size'],\n", + " 'parent_id': [9628],\n", + " 'program': {'Tree': [{'W': 15890.5,\n", + " 'arg_types': ['ArrayF', 'ArrayF'],\n", + " 'center_op': True,\n", + " 'feature': 'AIDS',\n", + " 'fixed': False,\n", + " 'is_weighted': False,\n", + " 'name': 'SplitBest',\n", + " 'node_type': 'SplitBest',\n", + " 'prob_change': 1.0,\n", + " 'ret_type': 'ArrayF',\n", + " 'sig_dual_hash': 9996486434638833164,\n", + " 'sig_hash': 10001460114883919497},\n", + " {'W': 1.0,\n", + " 'arg_types': ['ArrayF'],\n", + " 'center_op': True,\n", + " 'feature': '',\n", + " 'fixed': False,\n", + " 'is_weighted': False,\n", + " 'name': 'Logabs',\n", + " 'node_type': 'Logabs',\n", + " 'prob_change': 1.0,\n", + " 'ret_type': 'ArrayF',\n", + " 'sig_dual_hash': 10617925524997611780,\n", + " 'sig_hash': 13326223354425868050},\n", + " {'W': 2.7182815074920654,\n", + " 'arg_types': [],\n", + " 'center_op': True,\n", + " 'feature': 'Cf',\n", + " 'fixed': False,\n", + " 'is_weighted': False,\n", + " 'name': 'Constant',\n", + " 'node_type': 'Constant',\n", + " 'prob_change': 1.0,\n", + " 'ret_type': 'ArrayF',\n", + " 'sig_dual_hash': 509529941281334733,\n", + " 'sig_hash': 17717457037689164349},\n", + " {'W': 1572255.5,\n", + " 'arg_types': ['ArrayF', 'ArrayF'],\n", + " 'center_op': True,\n", + " 'feature': 'Total',\n", + " 'fixed': False,\n", + " 'is_weighted': False,\n", + " 'name': 'SplitBest',\n", + " 'node_type': 'SplitBest',\n", + " 'prob_change': 1.0,\n", + " 'ret_type': 'ArrayF',\n", + " 'sig_dual_hash': 9996486434638833164,\n", + " 'sig_hash': 10001460114883919497},\n", + " {'W': 0.2222222238779068,\n", + " 'arg_types': [],\n", + " 'center_op': True,\n", + " 'feature': 'MeanLabel',\n", + " 'fixed': False,\n", + " 'is_weighted': True,\n", + " 'name': 'MeanLabel',\n", + " 'node_type': 'MeanLabel',\n", + " 'prob_change': 1.0,\n", + " 'ret_type': 'ArrayF',\n", + " 'sig_dual_hash': 509529941281334733,\n", + " 'sig_hash': 17717457037689164349},\n", + " {'W': 0.5217871069908142,\n", + " 'arg_types': [],\n", + " 'center_op': True,\n", + " 'feature': 'Cf',\n", + " 'fixed': False,\n", + " 'is_weighted': False,\n", + " 'name': 'Constant',\n", + " 'node_type': 'Constant',\n", + " 'prob_change': 1.0,\n", + " 'ret_type': 'ArrayF',\n", + " 'sig_dual_hash': 509529941281334733,\n", + " 'sig_hash': 17717457037689164349}],\n", + " 'is_fitted_': True}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(len(est.archive_[0]))\n", + "\n", + "est.archive_[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you can call `predict` (or `predict_proba`, if your `est` is an instance of `BrushClassifier`) with the entire archive:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 10060,\n", + " 'y_pred': array([False, True, True, True, True, False, True, True, True,\n", + " False, True, True, True, True, False, True, True, True,\n", + " True, True, True, True, True, True, True, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, True, False, True, True, True, True, True,\n", + " True, True, True, True, True])},\n", + " {'id': 9789,\n", + " 'y_pred': array([False, True, True, True, True, False, True, True, True,\n", + " False, True, True, True, True, False, True, True, True,\n", + " True, True, True, True, True, True, True, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, True, False, True, True, True, True, True,\n", + " True, True, True, True, True])},\n", + " {'id': 10049,\n", + " 'y_pred': array([False, True, True, True, True, False, True, True, True,\n", + " False, False, True, True, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False])},\n", + " {'id': 4384,\n", + " 'y_pred': array([False, True, True, True, True, False, True, True, True,\n", + " False, False, True, True, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False])},\n", + " {'id': 9692,\n", + " 'y_pred': array([ True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True])},\n", + " {'id': 9552,\n", + " 'y_pred': array([False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False])}]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "est.predict_archive(X)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 10060,\n", + " 'y_pred': array([0.22222222, 0.9999999 , 0.9999999 , 0.9999999 , 0.9999999 ,\n", + " 0.22222222, 0.9999999 , 0.9999999 , 0.9999999 , 0.22222222,\n", + " 0.5217871 , 0.9999999 , 0.9999999 , 0.5217871 , 0.22222222,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ,\n", + " 0.22222222, 0.22222222, 0.22222222, 0.22222222, 0.22222222,\n", + " 0.22222222, 0.22222222, 0.22222222, 0.22222222, 0.22222222,\n", + " 0.22222222, 0.22222222, 0.22222222, 0.5217871 , 0.22222222,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ],\n", + " dtype=float32)},\n", + " {'id': 9789,\n", + " 'y_pred': array([0.22222222, 0.99994993, 0.99994993, 0.99994993, 0.99994993,\n", + " 0.22222222, 0.99994993, 0.99994993, 0.99994993, 0.22222222,\n", + " 0.5217871 , 0.99994993, 0.99994993, 0.5217871 , 0.22222222,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ,\n", + " 0.22222222, 0.22222222, 0.22222222, 0.22222222, 0.22222222,\n", + " 0.22222222, 0.22222222, 0.22222222, 0.22222222, 0.22222222,\n", + " 0.22222222, 0.22222222, 0.22222222, 0.5217871 , 0.22222222,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ,\n", + " 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 , 0.5217871 ],\n", + " dtype=float32)},\n", + " {'id': 10049,\n", + " 'y_pred': array([0.39024392, 0.9999999 , 0.9999999 , 0.9999999 , 0.9999999 ,\n", + " 0.39024392, 0.9999999 , 0.9999999 , 0.9999999 , 0.39024392,\n", + " 0.39024392, 0.9999999 , 0.9999999 , 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392],\n", + " dtype=float32)},\n", + " {'id': 4384,\n", + " 'y_pred': array([0.39024392, 0.9999522 , 0.9999522 , 0.9999522 , 0.9999522 ,\n", + " 0.39024392, 0.9999522 , 0.9999522 , 0.9999522 , 0.39024392,\n", + " 0.39024392, 0.9999522 , 0.9999522 , 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392,\n", + " 0.39024392, 0.39024392, 0.39024392, 0.39024392, 0.39024392],\n", + " dtype=float32)},\n", + " {'id': 9692,\n", + " 'y_pred': array([0.5317098 , 0.93985564, 0.9835824 , 0.8686745 , 0.68970597,\n", + " 0.53089285, 0.8455727 , 0.9291562 , 0.7663612 , 0.6237519 ,\n", + " 0.5169323 , 0.7368382 , 0.794476 , 0.63628834, 0.5578266 ,\n", + " 0.50047225, 0.50908357, 0.51443684, 0.506959 , 0.50320625,\n", + " 0.5003231 , 0.50484663, 0.5051821 , 0.50173986, 0.5005965 ,\n", + " 0.5060892 , 0.5592239 , 0.56642807, 0.5267187 , 0.5222307 ,\n", + " 0.5185086 , 0.64804167, 0.68591666, 0.5714386 , 0.5314499 ,\n", + " 0.50612646, 0.5576549 , 0.5636914 , 0.5241404 , 0.5113072 ,\n", + " 0.50007457, 0.5010315 , 0.5013173 , 0.50085753, 0.50068355,\n", + " 0.5000373 , 0.50096935, 0.50095695, 0.5003852 , 0.500174 ],\n", + " dtype=float32)},\n", + " {'id': 9552,\n", + " 'y_pred': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,\n", + " 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,\n", + " 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,\n", + " 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],\n", + " dtype=float32)}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "est.predict_proba_archive(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "brush", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/guide/index.md b/docs/guide/index.md index ccd1c6a8..eb71a290 100644 --- a/docs/guide/index.md +++ b/docs/guide/index.md @@ -13,5 +13,7 @@ data search_space working_with_programs json +saving_loading_populations +archive deap ``` \ No newline at end of file diff --git a/docs/guide/saving_loading_populations.ipynb b/docs/guide/saving_loading_populations.ipynb index e69de29b..cd65da2f 100644 --- a/docs/guide/saving_loading_populations.ipynb +++ b/docs/guide/saving_loading_populations.ipynb @@ -0,0 +1,281 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Saving and loading populations\n", + "\n", + "Another feature Brush implements is the ability to save and load entire populations.\n", + "We use JSON notation to store the population into a file that is human readable. The same way, we can feed an estimator a previous population file to serve as starting point for the evolution.\n", + "\n", + "In this notebook, we will walk through how to use the `save_population` and `load_population` parameters. \n", + "\n", + "We start by getting a sample dataset and splitting it into `X` and `y`:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pybrush import BrushRegressor\n", + "\n", + "# load data\n", + "df = pd.read_csv('../examples/datasets/d_enc.csv')\n", + "X = df.drop(columns='label')\n", + "y = df['label']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save the population after finishing the evolution, you nee to set `save_population` parameter to a value different than an empty string. Then, the final population is going to be stored in that specific file.\n", + "\n", + "In this example, we create a temporary file." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generation 1/10 [////// ]\n", + "Train Loss (Med): 10.64540 (90.38514)\n", + "Val Loss (Med): 10.64540 (90.38514)\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (648)\n", + "Time (s): 0.08182\n", + "\n", + "Generation 2/10 [/////////// ]\n", + "Train Loss (Med): 10.64540 (60.79966)\n", + "Val Loss (Med): 10.64540 (60.79966)\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (588)\n", + "Time (s): 0.15738\n", + "\n", + "Generation 3/10 [//////////////// ]\n", + "Train Loss (Med): 10.64540 (41.44810)\n", + "Val Loss (Med): 10.64540 (41.44810)\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (747)\n", + "Time (s): 0.25071\n", + "\n", + "Generation 4/10 [///////////////////// ]\n", + "Train Loss (Med): 10.64540 (17.94969)\n", + "Val Loss (Med): 10.64540 (17.94969)\n", + "Median Size (Max): 4 (19)\n", + "Median complexity (Max): 20 (1425)\n", + "Time (s): 0.36906\n", + "\n", + "Generation 5/10 [////////////////////////// ]\n", + "Train Loss (Med): 10.64540 (17.94969)\n", + "Val Loss (Med): 10.64540 (17.94969)\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (692)\n", + "Time (s): 0.45359\n", + "\n", + "Generation 6/10 [/////////////////////////////// ]\n", + "Train Loss (Med): 10.43983 (17.94969)\n", + "Val Loss (Med): 10.43983 (17.94969)\n", + "Median Size (Max): 3 (17)\n", + "Median complexity (Max): 9 (324)\n", + "Time (s): 0.55083\n", + "\n", + "Generation 7/10 [//////////////////////////////////// ]\n", + "Train Loss (Med): 10.43983 (17.90349)\n", + "Val Loss (Med): 10.43983 (17.90349)\n", + "Median Size (Max): 4 (19)\n", + "Median complexity (Max): 14 (975)\n", + "Time (s): 0.64643\n", + "\n", + "Generation 8/10 [///////////////////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.67499)\n", + "Val Loss (Med): 10.26326 (17.67499)\n", + "Median Size (Max): 5 (17)\n", + "Median complexity (Max): 21 (324)\n", + "Time (s): 0.74136\n", + "\n", + "Generation 9/10 [////////////////////////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (17)\n", + "Median complexity (Max): 9 (324)\n", + "Time (s): 0.83278\n", + "\n", + "Generation 10/10 [//////////////////////////////////////////////////]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (17)\n", + "Median complexity (Max): 9 (324)\n", + "Time (s): 0.93411\n", + "\n", + "Saved population to file /tmp/tmpl0n47qch/population.json\n", + "score: 0.8864496494920485\n" + ] + } + ], + "source": [ + "import pickle\n", + "import os, tempfile\n", + "\n", + "pop_file = os.path.join(tempfile.mkdtemp(), 'population.json')\n", + "\n", + "# set verbosity==2 to see the full report\n", + "est = BrushRegressor(\n", + " functions=['SplitBest','Add','Mul','Sin','Cos','Exp','Logabs'],\n", + " max_gens=10,\n", + " save_population=pop_file,\n", + " verbosity=2\n", + ")\n", + "\n", + "est.fit(X,y)\n", + "y_pred = est.predict(X)\n", + "print('score:', est.score(X,y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading a previous population is done providing `load_population` a string value corresponding to a JSON file generated by Brush. In our case, we will use the same file from the previous code block.\n", + "\n", + "After loading the population, we run the evolution for 10 more generations, and we can see that the first generation started from the previous population. This means that the population was successfully saved and loaded." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded population from /tmp/tmpl0n47qch/population.json of size = 200\n", + "Generation 1/10 [////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (17)\n", + "Median complexity (Max): 9 (324)\n", + "Time (s): 0.08078\n", + "\n", + "Generation 2/10 [/////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (17)\n", + "Median complexity (Max): 9 (324)\n", + "Time (s): 0.15177\n", + "\n", + "Generation 3/10 [//////////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (15)\n", + "Median complexity (Max): 9 (196)\n", + "Time (s): 0.23618\n", + "\n", + "Generation 4/10 [///////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (15)\n", + "Median complexity (Max): 9 (196)\n", + "Time (s): 0.29749\n", + "\n", + "Generation 5/10 [////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (15)\n", + "Median complexity (Max): 9 (196)\n", + "Time (s): 0.38480\n", + "\n", + "Generation 6/10 [/////////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (15)\n", + "Median complexity (Max): 9 (196)\n", + "Time (s): 0.47692\n", + "\n", + "Generation 7/10 [//////////////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.94969)\n", + "Val Loss (Med): 10.26326 (17.94969)\n", + "Median Size (Max): 3 (15)\n", + "Median complexity (Max): 9 (196)\n", + "Time (s): 0.54522\n", + "\n", + "Generation 8/10 [///////////////////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.49268)\n", + "Val Loss (Med): 10.26326 (17.49268)\n", + "Median Size (Max): 5 (15)\n", + "Median complexity (Max): 22 (196)\n", + "Time (s): 0.61691\n", + "\n", + "Generation 9/10 [////////////////////////////////////////////// ]\n", + "Train Loss (Med): 10.26326 (17.49268)\n", + "Val Loss (Med): 10.26326 (17.49268)\n", + "Median Size (Max): 5 (15)\n", + "Median complexity (Max): 22 (196)\n", + "Time (s): 0.69888\n", + "\n", + "Generation 10/10 [//////////////////////////////////////////////////]\n", + "Train Loss (Med): 10.26326 (15.05905)\n", + "Val Loss (Med): 10.26326 (15.05905)\n", + "Median Size (Max): 7 (15)\n", + "Median complexity (Max): 36 (196)\n", + "Time (s): 0.80282\n", + "\n", + "score: 0.8864496494920485\n" + ] + } + ], + "source": [ + "est = BrushRegressor(\n", + " functions=['SplitBest','Add','Mul','Sin','Cos','Exp','Logabs'],\n", + " load_population=pop_file,\n", + " max_gens=10,\n", + " verbosity=2\n", + ")\n", + "\n", + "est.fit(X,y)\n", + "y_pred = est.predict(X)\n", + "print('score:', est.score(X,y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can open the serialized file and change individuals' programs manually.\n", + "\n", + "This also allow us to have checkpoints in the execution." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "brush", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/guide/search_space.ipynb b/docs/guide/search_space.ipynb index fbe0e02c..69faab2c 100644 --- a/docs/guide/search_space.ipynb +++ b/docs/guide/search_space.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "23d6f552", "metadata": {}, "outputs": [], @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a2953719", "metadata": {}, "outputs": [ diff --git a/docs/guide/working_with_programs.ipynb b/docs/guide/working_with_programs.ipynb index 28257cdb..0769c286 100644 --- a/docs/guide/working_with_programs.ipynb +++ b/docs/guide/working_with_programs.ipynb @@ -93,7 +93,7 @@ "output_type": "stream", "text": [ "Completed 100% [====================]\n", - "score: 0.890070958724087\n" + "score: 0.8972961690538603\n" ] } ], @@ -128,7 +128,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fitness(9.935950 18.000000 )\n", + "Fitness(9.282899 19.000000 )\n", "['error', 'size']\n" ] } @@ -176,14 +176,47 @@ "id": "fe594691", "metadata": {}, "source": [ - "Brush let's you serialize the entire individual, or just the program or fitness it wraps. You can use pickle to save and load programs.\n", + "## Serialization \n", "\n", - "This is all you need to save and load entire populations, and this feature allow us to create store the `Archive` after running the algorithm." + "Brush let's you serialize the entire individual, or just the program or fitness it wraps. It uses JSON to serialize the objects, and this is implemented with the get and set states of an object:" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 14, + "id": "b01ab1fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "fitness {'complexity': 304, 'crowding_dist': 3.4028234663852886e+38, 'dcounter': 0, 'depth': 3, 'dominated': [0, 2, 29, 62, 80, 127, 146], 'loss': 9.282898902893066, 'loss_v': 9.282898902893066, 'rank': 1, 'size': 19, 'values': [9.282898902893066, 19.0], 'weights': [-1.0, -1.0], 'wvalues': [-9.282898902893066, -19.0]}\n", + "id 1910\n", + "objectives ['error', 'size']\n", + "parent_id [1858]\n", + "program {'Tree': [{'W': 0.75, 'arg_types': ['ArrayF', 'ArrayF'], 'center_op': True, 'feature': 'x0', 'fixed': False, 'is_weighted': False, 'name': 'SplitBest', 'node_type': 'SplitBest', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 9996486434638833164, 'sig_hash': 10001460114883919497}, {'W': 0.8050000071525574, 'arg_types': ['ArrayF', 'ArrayF'], 'center_op': True, 'feature': 'x0', 'fixed': False, 'is_weighted': False, 'name': 'SplitBest', 'node_type': 'SplitBest', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 9996486434638833164, 'sig_hash': 10001460114883919497}, {'W': 30.494491577148438, 'arg_types': [], 'center_op': True, 'feature': 'MeanLabel', 'fixed': False, 'is_weighted': True, 'name': 'MeanLabel', 'node_type': 'MeanLabel', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 509529941281334733, 'sig_hash': 17717457037689164349}, {'W': 49.47871017456055, 'arg_types': [], 'center_op': True, 'feature': 'x0', 'fixed': False, 'is_weighted': True, 'name': 'Terminal', 'node_type': 'Terminal', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 509529941281334733, 'sig_hash': 17717457037689164349}, {'W': 1.0, 'arg_types': ['ArrayF', 'ArrayF'], 'center_op': True, 'feature': '', 'fixed': False, 'is_weighted': False, 'name': 'Add', 'node_type': 'Add', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 9996486434638833164, 'sig_hash': 10001460114883919497}, {'W': 0.018234524875879288, 'arg_types': [], 'center_op': True, 'feature': 'x1', 'fixed': False, 'is_weighted': True, 'name': 'Terminal', 'node_type': 'Terminal', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 509529941281334733, 'sig_hash': 17717457037689164349}, {'W': 10.46687126159668, 'arg_types': [], 'center_op': True, 'feature': 'x6', 'fixed': False, 'is_weighted': True, 'name': 'Terminal', 'node_type': 'Terminal', 'prob_change': 1.0, 'ret_type': 'ArrayF', 'sig_dual_hash': 509529941281334733, 'sig_hash': 17717457037689164349}], 'is_fitted_': True}\n" + ] + } + ], + "source": [ + "estimator_dict = est.best_estimator_.__getstate__()\n", + "\n", + "for k, v in estimator_dict.items():\n", + " print(k, v)" + ] + }, + { + "cell_type": "markdown", + "id": "6bcb071b", + "metadata": {}, + "source": [ + "With serialization, you can use pickle to save and load just programs or even the entire individual." + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "b4537631", "metadata": {}, "outputs": [], @@ -191,15 +224,43 @@ "import pickle\n", "import os, tempfile\n", "\n", - "individual_file = os.path.join(tempfile.mkdtemp(), 'individual')\n", + "individual_file = os.path.join(tempfile.mkdtemp(), 'individual.json')\n", "with open(individual_file, \"wb\") as f:\n", " pickle.dump(est.best_estimator_, f)\n", "\n", - "program_file = os.path.join(tempfile.mkdtemp(), 'program')\n", + "program_file = os.path.join(tempfile.mkdtemp(), 'program.json')\n", "with open(program_file, \"wb\") as f:\n", " pickle.dump(est.best_estimator_.program, f)" ] }, + { + "cell_type": "markdown", + "id": "fff5693d", + "metadata": {}, + "source": [ + "Then we can load it later with:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ee7a20c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "If(x0>0.75,If(x0>0.81,30.49*MeanLabel,49.48*x0),Add(0.02*x1,10.47*x6))\n" + ] + } + ], + "source": [ + "with open(individual_file, \"rb\") as f:\n", + " loaded_estimator = pickle.load(f)\n", + " print(loaded_estimator.get_model())" + ] + }, { "cell_type": "markdown", "id": "a355d8f3", @@ -213,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "316964d5", "metadata": {}, "outputs": [ @@ -221,7 +282,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.03*Add(If(x0>0.75,1.16*x1,191.72*x0),Add(x2,512.72*x6))\n" + "If(x0>0.75,If(x0>0.81,30.49*MeanLabel,49.48*x0),Add(0.02*x1,10.47*x6))\n" ] } ], @@ -241,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "dad68d01", "metadata": {}, "outputs": [ @@ -249,13 +310,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.03*Add\n", + "SplitBest\n", "|-SplitBest\n", - " |-1.16*x1\n", - " |-191.72*x0\n", + " |-30.49*MeanLabel\n", + " |-49.48*x0\n", "|-Add\n", - "| |-x2\n", - "| |-512.72*x6\n" + "| |-0.02*x1\n", + "| |-10.47*x6\n" ] } ], @@ -276,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "3ef1a735", "metadata": {}, "outputs": [ @@ -289,115 +350,104 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", - "\n", + "\n", + "\n", "\n", - "y\n", - "\n", - "y\n", + "7f370003ebc0\n", + "\n", + "x0>0.75?\n", "\n", - "\n", + "\n", "\n", - "7f6ad0015040\n", - "\n", - "Add\n", + "7f37000b5410\n", + "\n", + "x0>0.81?\n", "\n", - "\n", + "\n", "\n", - "y->7f6ad0015040\n", - "\n", - "\n", - "0.03\n", + "7f370003ebc0->7f37000b5410\n", + "\n", + "\n", + "Y\n", "\n", - "\n", + "\n", "\n", - "7f6ad00811a0\n", - "\n", - "x0>0.75?\n", + "7f370003f120\n", + "\n", + "Add\n", "\n", - "\n", + "\n", "\n", - "7f6ad0015040->7f6ad00811a0\n", - "\n", - "\n", + "7f370003ebc0->7f370003f120\n", + "\n", + "\n", + "N\n", "\n", - "\n", + "\n", "\n", - "7f6ad00810e0\n", - "\n", - "Add\n", + "7f370003ef80\n", + "\n", + "30.49*MeanLabel\n", "\n", - "\n", + "\n", "\n", - "7f6ad0015040->7f6ad00810e0\n", - "\n", - "\n", + "7f37000b5410->7f370003ef80\n", + "\n", + "\n", + "Y\n", "\n", - "\n", + "\n", "\n", - "x1\n", - "\n", - "x1\n", + "x0\n", + "\n", + "x0\n", "\n", - "\n", + "\n", "\n", - "7f6ad00811a0->x1\n", - "\n", - "\n", - "1.16\n", - "Y\n", + "7f37000b5410->x0\n", + "\n", + "\n", + "49.48\n", + "N\n", "\n", - "\n", + "\n", "\n", - "x0\n", - "\n", - "x0\n", + "x1\n", + "\n", + "x1\n", "\n", - "\n", + "\n", "\n", - "7f6ad00811a0->x0\n", - "\n", - "\n", - "191.72\n", - "N\n", - "\n", - "\n", - "\n", - "x2\n", - "\n", - "x2\n", - "\n", - "\n", - "\n", - "7f6ad00810e0->x2\n", - "\n", - "\n", + "7f370003f120->x1\n", + "\n", + "\n", + "0.02\n", "\n", "\n", - "\n", + "\n", "x6\n", - "\n", - "x6\n", + "\n", + "x6\n", "\n", - "\n", - "\n", - "7f6ad00810e0->x6\n", - "\n", - "\n", - "512.72\n", + "\n", + "\n", + "7f370003f120->x6\n", + "\n", + "\n", + "10.47\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -419,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "1f7e725e", "metadata": {}, "outputs": [ @@ -428,20 +478,18 @@ "output_type": "stream", "text": [ "digraph G {\n", - "y [shape=box];\n", - "y -> \"7f6ad0015040\" [label=\"0.03\"];\n", - "\"7f6ad0015040\" [label=\"Add\"];\n", - "\"7f6ad0015040\" -> \"7f6ad00811a0\" [label=\"\"];\n", - "\"7f6ad0015040\" -> \"7f6ad00810e0\" [label=\"\"];\n", - "\"7f6ad00811a0\" [label=\"x0>0.75?\"];\n", - "\"7f6ad00811a0\" -> \"x1\" [headlabel=\"1.16\",taillabel=\"Y\"];\n", - "\"7f6ad00811a0\" -> \"x0\" [headlabel=\"191.72\",taillabel=\"N\"];\n", - "\"x1\" [label=\"x1\"];\n", + "\"7f370003ebc0\" [label=\"x0>0.75?\"];\n", + "\"7f370003ebc0\" -> \"7f37000b5410\" [headlabel=\"\",taillabel=\"Y\"];\n", + "\"7f370003ebc0\" -> \"7f370003f120\" [headlabel=\"\",taillabel=\"N\"];\n", + "\"7f37000b5410\" [label=\"x0>0.81?\"];\n", + "\"7f37000b5410\" -> \"7f370003ef80\" [headlabel=\"\",taillabel=\"Y\"];\n", + "\"7f37000b5410\" -> \"x0\" [headlabel=\"49.48\",taillabel=\"N\"];\n", + "\"7f370003ef80\" [label=\"30.49*MeanLabel\"];\n", "\"x0\" [label=\"x0\"];\n", - "\"7f6ad00810e0\" [label=\"Add\"];\n", - "\"7f6ad00810e0\" -> \"x2\" [label=\"\"];\n", - "\"7f6ad00810e0\" -> \"x6\" [label=\"512.72\"];\n", - "\"x2\" [label=\"x2\"];\n", + "\"7f370003f120\" [label=\"Add\"];\n", + "\"7f370003f120\" -> \"x1\" [label=\"0.02\"];\n", + "\"7f370003f120\" -> \"x6\" [label=\"10.47\"];\n", + "\"x1\" [label=\"x1\"];\n", "\"x6\" [label=\"x6\"];\n", "}\n", "\n" @@ -467,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "f35b1e05", "metadata": {}, "outputs": [ @@ -480,115 +528,104 @@ "\n", "\n", - "\n", + "\n", "\n", "G\n", - "\n", - "\n", + "\n", + "\n", "\n", - "y\n", - "\n", - "y\n", + "7f370003ebc0\n", + "\n", + "x0>0.75?\n", "\n", - "\n", + "\n", "\n", - "7f6ad0015040\n", - "\n", - "Add\n", + "7f37000b5410\n", + "\n", + "x0>0.81?\n", "\n", - "\n", + "\n", "\n", - "y->7f6ad0015040\n", - "\n", - "\n", - "0.03\n", + "7f370003ebc0->7f37000b5410\n", + "\n", + "\n", + "Y\n", "\n", - "\n", + "\n", "\n", - "7f6ad00811a0\n", - "\n", - "x0>0.75?\n", + "7f370003f120\n", + "\n", + "Add\n", "\n", - "\n", + "\n", "\n", - "7f6ad0015040->7f6ad00811a0\n", - "\n", - "\n", + "7f370003ebc0->7f370003f120\n", + "\n", + "\n", + "N\n", "\n", - "\n", + "\n", "\n", - "7f6ad00810e0\n", - "\n", - "Add\n", + "7f370003ef80\n", + "\n", + "30.49*MeanLabel\n", "\n", - "\n", + "\n", "\n", - "7f6ad0015040->7f6ad00810e0\n", - "\n", - "\n", + "7f37000b5410->7f370003ef80\n", + "\n", + "\n", + "Y\n", "\n", - "\n", + "\n", "\n", - "x1\n", - "\n", - "x1\n", + "x0\n", + "\n", + "x0\n", "\n", - "\n", + "\n", "\n", - "7f6ad00811a0->x1\n", - "\n", - "\n", - "1.16\n", - "Y\n", + "7f37000b5410->x0\n", + "\n", + "\n", + "49.48\n", + "N\n", "\n", - "\n", + "\n", "\n", - "x0\n", - "\n", - "x0\n", + "x1\n", + "\n", + "x1\n", "\n", - "\n", + "\n", "\n", - "7f6ad00811a0->x0\n", - "\n", - "\n", - "191.72\n", - "N\n", - "\n", - "\n", - "\n", - "x2\n", - "\n", - "x2\n", - "\n", - "\n", - "\n", - "7f6ad00810e0->x2\n", - "\n", - "\n", + "7f370003f120->x1\n", + "\n", + "\n", + "0.02\n", "\n", "\n", - "\n", + "\n", "x6\n", - "\n", - "x6\n", + "\n", + "x6\n", "\n", - "\n", - "\n", - "7f6ad00810e0->x6\n", - "\n", - "\n", - "512.72\n", + "\n", + "\n", + "7f370003f120->x6\n", + "\n", + "\n", + "10.47\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } From 6be28decf1bf1a5641e0f7dc51bbfec18e752533 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 11:44:13 -0300 Subject: [PATCH 189/199] Added new files. Now it is time to write documentation --- docs/cpp_api/archive.rst | 5 +++++ docs/cpp_api/engine.rst | 9 +++++++++ docs/cpp_api/evaluation.rst | 8 ++++++++ docs/cpp_api/index.md | 6 ++++++ docs/cpp_api/individual.rst | 8 ++++++++ docs/cpp_api/population.rst | 5 +++++ docs/cpp_api/selection.rst | 14 ++++++++++++++ docs/cpp_api/variation.rst | 6 +++++- docs/python_api/classifier.rst | 3 +-- docs/python_api/estimator.rst | 2 +- docs/python_api/index.md | 1 + docs/python_api/python_api.rst | 2 +- docs/python_api/regressor.rst | 2 +- 13 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 docs/cpp_api/archive.rst create mode 100644 docs/cpp_api/engine.rst create mode 100644 docs/cpp_api/evaluation.rst create mode 100644 docs/cpp_api/individual.rst create mode 100644 docs/cpp_api/population.rst create mode 100644 docs/cpp_api/selection.rst diff --git a/docs/cpp_api/archive.rst b/docs/cpp_api/archive.rst new file mode 100644 index 00000000..aa80c161 --- /dev/null +++ b/docs/cpp_api/archive.rst @@ -0,0 +1,5 @@ +Archive +======= + +.. doxygenclass:: Brush::Pop::Archive + :members: diff --git a/docs/cpp_api/engine.rst b/docs/cpp_api/engine.rst new file mode 100644 index 00000000..60274e88 --- /dev/null +++ b/docs/cpp_api/engine.rst @@ -0,0 +1,9 @@ +Engine (and parameters) +======================= + +.. doxygenclass:: Brush::Parameters + :members: + +.. doxygenclass:: Brush::Engine + :members: + diff --git a/docs/cpp_api/evaluation.rst b/docs/cpp_api/evaluation.rst new file mode 100644 index 00000000..e3aaf620 --- /dev/null +++ b/docs/cpp_api/evaluation.rst @@ -0,0 +1,8 @@ +Evaluation +========== + +.. doxygenclass:: Brush::Eval::Evaluation + :members: + +.. doxygenclass:: Brush::Scorer + :members: diff --git a/docs/cpp_api/index.md b/docs/cpp_api/index.md index 226702d5..5d6ba358 100644 --- a/docs/cpp_api/index.md +++ b/docs/cpp_api/index.md @@ -13,5 +13,11 @@ search_space program node nodetypes +individual +evaluation +population variation +selection +archive +engine ``` \ No newline at end of file diff --git a/docs/cpp_api/individual.rst b/docs/cpp_api/individual.rst new file mode 100644 index 00000000..e781b865 --- /dev/null +++ b/docs/cpp_api/individual.rst @@ -0,0 +1,8 @@ +Individual and Fitness +====================== + +.. doxygenclass:: Brush::Pop::Individual + :members: + +.. doxygenclass:: Brush::Fitness + :members: \ No newline at end of file diff --git a/docs/cpp_api/population.rst b/docs/cpp_api/population.rst new file mode 100644 index 00000000..d8616e56 --- /dev/null +++ b/docs/cpp_api/population.rst @@ -0,0 +1,5 @@ +Population +========== + +.. doxygenclass:: Brush::Pop::Population + :members: \ No newline at end of file diff --git a/docs/cpp_api/selection.rst b/docs/cpp_api/selection.rst new file mode 100644 index 00000000..b9fa1429 --- /dev/null +++ b/docs/cpp_api/selection.rst @@ -0,0 +1,14 @@ +Selection +========= + +.. doxygenclass:: Brush::Sel::Selection + :members: + +.. doxygenclass:: Brush::Sel::SelectionOperator + :members: + +.. doxygenclass:: Brush::Sel::NSGA2 + :members: + +.. doxygenclass:: Brush::Sel::Lexicase + :members: diff --git a/docs/cpp_api/variation.rst b/docs/cpp_api/variation.rst index f92847f5..295f2972 100644 --- a/docs/cpp_api/variation.rst +++ b/docs/cpp_api/variation.rst @@ -1,4 +1,8 @@ Variation (Crossover/Mutation) ============================== -.. doxygenfile:: variation.h \ No newline at end of file +.. doxygenclass:: Brush::Vary::MutationBase + :members: + +.. doxygenclass:: Brush::Vary::Variation + :members: diff --git a/docs/python_api/classifier.rst b/docs/python_api/classifier.rst index c0317657..789af014 100644 --- a/docs/python_api/classifier.rst +++ b/docs/python_api/classifier.rst @@ -1,7 +1,6 @@ BrushClassifier =============== - -.. autoclass:: brush.estimator.BrushClassifier +.. autoclass:: pybrush.BrushClassifier :members: :undoc-members: \ No newline at end of file diff --git a/docs/python_api/estimator.rst b/docs/python_api/estimator.rst index 7ed540ed..48a2e1de 100644 --- a/docs/python_api/estimator.rst +++ b/docs/python_api/estimator.rst @@ -1,6 +1,6 @@ BrushEstimator ============== -.. autoclass:: brush.estimator.BrushEstimator +.. autoclass:: pybrush.BrushEstimator :members: :undoc-members: \ No newline at end of file diff --git a/docs/python_api/index.md b/docs/python_api/index.md index 74c1d0f2..cff7926b 100644 --- a/docs/python_api/index.md +++ b/docs/python_api/index.md @@ -4,4 +4,5 @@ estimator regressor classifier +python_api ``` \ No newline at end of file diff --git a/docs/python_api/python_api.rst b/docs/python_api/python_api.rst index 49c1e879..701cd786 100644 --- a/docs/python_api/python_api.rst +++ b/docs/python_api/python_api.rst @@ -3,7 +3,7 @@ Python API .. With doxygennamespace: -.. .. doxygennamespace:: brush +.. .. doxygennamespace:: pybrush .. :members: diff --git a/docs/python_api/regressor.rst b/docs/python_api/regressor.rst index 9289f85d..7fd3b9a8 100644 --- a/docs/python_api/regressor.rst +++ b/docs/python_api/regressor.rst @@ -1,6 +1,6 @@ BrushRegressor ============== -.. autoclass:: brush.estimator.BrushRegressor +.. autoclass:: pybrush.BrushRegressor :members: :undoc-members: \ No newline at end of file From 452ad67c4b6e3453a908ef62126f38b84a0c45f0 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 11:45:20 -0300 Subject: [PATCH 190/199] Todo: version for docs should be set automatically --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index e2124945..9ba7aeba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,7 +58,7 @@ def configureDoxyfile(input_dir, output_dir): author = 'William La Cava and Joseph D. Romano' # The full version, including alpha/beta/rc tags -release = '0.1a' +release = '0.1a' # TODO: use versionstr here # -- General configuration --------------------------------------------------- From 3506898625bb67dbb2bca33551771d5cbeab1273 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Thu, 6 Jun 2024 21:34:02 -0300 Subject: [PATCH 191/199] Implemented average_precision_score metric in cpp --- src/eval/metrics.cpp | 60 +++++++++++++++++++++++++++++++++++++++++--- src/eval/metrics.h | 9 ++++++- src/eval/scorer.h | 4 +-- src/ind/individual.h | 15 +++++------ 4 files changed, 75 insertions(+), 13 deletions(-) diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index ce7ea8c6..60de8c8a 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -52,12 +52,67 @@ float mean_log_loss(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, const vector& class_weights) { - - /* std::cout << "loss: " << loss.transpose() << "\n"; */ loss = log_loss(y,predict_proba,class_weights); return loss.mean(); } +float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, + VectorXf& loss, + const vector& class_weights) { + + // get argsort of predict proba + vector argsort(predict_proba.size()); + iota(argsort.begin(), argsort.end(), 0); + sort(argsort.begin(), argsort.end(), [&](int i, int j) { + return predict_proba[i] > predict_proba[j]; + }); + + float ysum = 0; + if (!class_weights.empty()) + for (int i = 0; i < class_weights.size(); i++) { + ysum += y(i) * class_weights.at(y(i)); + } + else + ysum = y.sum(); + + // Calculate the precision and recall values + VectorXf precision(predict_proba.size()); + VectorXf recall(predict_proba.size()); + + float true_positives = 0; + float false_positives = 0; + float positives = 0; + + for (int i = 0; i < predict_proba.size(); i++) { + if (predict_proba[argsort[i]] >= 0.5 && y[argsort[i]] == 1) { + true_positives += 1; + } + else { + if (!class_weights.empty()) + false_positives = class_weights[y(argsort[i])]; + else + false_positives += 1; + } + positives = true_positives + false_positives; + + precision[i] = true_positives / (positives + 1); + recall[i] = ysum==0.0 ? 1.0 : true_positives/ysum; + } + + // Calculate the average precision score + float average_precision = 0; + float last_recall = 0; + + for (int i = 0; i < predict_proba.size(); i++) { + if (recall[i] != last_recall) { + loss[i] = precision[i] * (recall[i] - last_recall); + average_precision += loss[i]; + last_recall = recall[i]; + } + } + + return average_precision; +} // multinomial log loss VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, @@ -111,7 +166,6 @@ VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, return loss; } - float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, VectorXf& loss, const vector& class_weights) diff --git a/src/eval/metrics.h b/src/eval/metrics.h index 5f4439f9..00ebce37 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -8,10 +8,12 @@ namespace Eval { /* Scoring functions */ +// regression ------------------------------------------------------------------ /// mean squared error float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, const vector& class_weights=vector() ); +// binary classification ------------------------------------------------------- /// log loss (2 methods below) VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, const vector& class_weights=vector()); @@ -19,6 +21,11 @@ VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, float mean_log_loss(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, const vector& class_weights = vector()); +float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, + VectorXf& loss, + const vector& class_weights=vector()); + +// multiclass classification --------------------------------------------------- /// multinomial log loss (2 methods below) VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, const vector& class_weights=vector()); @@ -27,7 +34,7 @@ float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, VectorXf& loss, const vector& class_weights=vector()); -// TODO: average_precision_score for classification + } // metrics } // Brush diff --git a/src/eval/scorer.h b/src/eval/scorer.h index 7f62638c..a10c5df4 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -13,7 +13,7 @@ using namespace Pop; namespace Eval{ -template // requires(P == PT::Regressor || P == PT::BinaryClassifier) +template class Scorer { @@ -35,7 +35,6 @@ typedef float (*funcPointer)(const VectorXf&, Scorer(string scorer="mse") { // TODO: use this idea of map functpointer to do the mutations score_hash["mse"] = &mse; - // score_hash["multi_log"] = &mean_multi_log_loss; this->set_scorer(scorer); }; @@ -90,6 +89,7 @@ typedef float (*funcPointer)(const VectorXf&, Scorer(string scorer="log") { score_hash["log"] = &mean_log_loss; + score_hash["average_precision_score"] = &average_precision_score; this->set_scorer(scorer); }; diff --git a/src/ind/individual.h b/src/ind/individual.h index 01731503..aa030c58 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -114,13 +114,14 @@ class Individual{ // classification problem, and MSE if it is a regression (so its always // a minimization by default, thus "error" has weight -1.0) inline static std::map weightsMap = { - {"complexity", -1.0}, - {"size", -1.0}, - {"mse", -1.0}, - {"log", -1.0}, - {"multi_log", -1.0}, - {"accuracy", +1.0}, - {"error", -1.0} + {"complexity", -1.0}, + {"size", -1.0}, + {"mse", -1.0}, + {"log", -1.0}, + {"multi_log", -1.0}, + {"average_precision_score", +1.0}, + {"accuracy", +1.0}, + {"error", -1.0} }; vector get_objectives() const { return objectives; }; From 6af5ce0b1847c2b1481f97fe7b580388715368d7 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Fri, 7 Jun 2024 07:24:58 -0300 Subject: [PATCH 192/199] Fixed python docs not being found --- docs/conf.py | 4 ++-- docs/python_api/estimator.rst | 2 +- docs/python_api/index.md | 1 + docs/python_api/interface.rst | 6 ++++++ docs/python_api/python_api.rst | 12 +++++------ docs/python_api/regressor.rst | 2 +- pybrush/BrushEstimator.py | 37 +++++++++++++++++++++++++++------- pybrush/DeapEstimator.py | 2 +- 8 files changed, 48 insertions(+), 18 deletions(-) create mode 100644 docs/python_api/interface.rst diff --git a/docs/conf.py b/docs/conf.py index 9ba7aeba..0163b624 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -37,7 +37,6 @@ def configureDoxyfile(input_dir, output_dir): with open('Doxyfile', 'w') as fp2: fp2.write(filedata) - ## Only trigger readthedocs build if running on readthedocs servers: # read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' @@ -112,7 +111,8 @@ def configureDoxyfile(input_dir, output_dir): breathe_default_project = "brush" breathe_default_members = ('members', 'undoc-members') breathe_projects_source = { - "brush": ("../src/", list(glob('../src/', recursive=True))) + "brush" : ("../src/", list(glob('../src/', recursive=True)) ), + "pybrush": ("../pybrush/", list(glob('../pybrush/', recursive=True)) ), } html_theme_options = { diff --git a/docs/python_api/estimator.rst b/docs/python_api/estimator.rst index 48a2e1de..73b4d865 100644 --- a/docs/python_api/estimator.rst +++ b/docs/python_api/estimator.rst @@ -1,6 +1,6 @@ BrushEstimator ============== -.. autoclass:: pybrush.BrushEstimator +.. autoclass:: pybrush.BrushEstimator.BrushEstimator :members: :undoc-members: \ No newline at end of file diff --git a/docs/python_api/index.md b/docs/python_api/index.md index cff7926b..7463ff63 100644 --- a/docs/python_api/index.md +++ b/docs/python_api/index.md @@ -2,6 +2,7 @@ ```{toctree} estimator +interface regressor classifier python_api diff --git a/docs/python_api/interface.rst b/docs/python_api/interface.rst new file mode 100644 index 00000000..e35c8cb6 --- /dev/null +++ b/docs/python_api/interface.rst @@ -0,0 +1,6 @@ +EstimatorInterface +================== + +.. autoclass:: pybrush.EstimatorInterface.EstimatorInterface + :members: + :undoc-members: \ No newline at end of file diff --git a/docs/python_api/python_api.rst b/docs/python_api/python_api.rst index 701cd786..91f6317d 100644 --- a/docs/python_api/python_api.rst +++ b/docs/python_api/python_api.rst @@ -1,9 +1,9 @@ Python API ========== -.. With doxygennamespace: - -.. .. doxygennamespace:: pybrush -.. :members: - - +.. automodule:: pybrush + :members: + :undoc-members: + :show-inheritance: + :noindex: + :exclude-members: __init__, __module__, __doc__, __weakref__, __dict__ \ No newline at end of file diff --git a/docs/python_api/regressor.rst b/docs/python_api/regressor.rst index 7fd3b9a8..6191bcef 100644 --- a/docs/python_api/regressor.rst +++ b/docs/python_api/regressor.rst @@ -1,6 +1,6 @@ BrushRegressor ============== -.. autoclass:: pybrush.BrushRegressor +.. autoclass:: pybrush.BrushEstimator.BrushRegressor :members: :undoc-members: \ No newline at end of file diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 6f149f94..1519e85b 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -21,7 +21,8 @@ class BrushEstimator(EstimatorInterface, BaseEstimator): """ This is the base class for Brush estimators using the c++ engine. - Parameters are defined and documented in pybrush.EstimatorInterface. + Parameters are defined and documented in + :py:class:`EstimatorInterface ` Attributes ---------- @@ -37,8 +38,6 @@ class BrushEstimator(EstimatorInterface, BaseEstimator): Partition of `data_` containing `(validation_size)`% of the data, in Brush format. search_space_ : a Brush `SearchSpace` object. Holds the operators and terminals and sampling utilities to update programs. - toolbox_ : deap.Toolbox - The toolbox used by DEAP for EA algorithm. """ def __init__(self, **kwargs): @@ -170,9 +169,13 @@ def predict_archive(self, X): class BrushClassifier(BrushEstimator, ClassifierMixin): - """Deap-based Brush for classification. + """Brush with c++ engine for classification. - For options, see :py:class:`DeapEstimator `. + Parameters are defined and documented in + :py:class:`EstimatorInterface ` + + This class inherits from :py:class:`BrushEstimator `. + A full documentation of the methods and attributes can be found there. Examples -------- @@ -180,8 +183,8 @@ class BrushClassifier(BrushEstimator, ClassifierMixin): >>> df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv') >>> X = df.drop(columns='target') >>> y = df['target'] - >>> from pybrush import DeapClassifier - >>> est = DeapClassifier() + >>> from pybrush import BrushClassifier + >>> est = BrushClassifier() >>> est.fit(X,y) >>> # print('score:', est.score(X,y)) """ @@ -252,5 +255,25 @@ def predict_proba_archive(self, X): class BrushRegressor(BrushEstimator, RegressorMixin): + """Brush with c++ engine for regression. + + Parameters are defined and documented in + :py:class:`EstimatorInterface ` + + This class inherits from :py:class:`BrushEstimator `. + A full documentation of the methods and attributes can be found there. + + Examples + -------- + >>> import pandas as pd + >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv') + >>> X = df.drop(columns='label') + >>> y = df['label'] + >>> from pybrush import BrushRegressor + >>> est = BrushRegressor() + >>> est.fit(X,y) + >>> # print('score:', est.score(X,y)) + """ + def __init__(self, **kwargs): super().__init__(mode='regressor',**kwargs) \ No newline at end of file diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 6ef02d5e..8e4385a5 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -31,7 +31,7 @@ class DeapEstimator(EstimatorInterface, BaseEstimator): """ This is the base class for Brush estimators in python. - Parameters are defined and documented in pybrush.EstimatorInterface. + Parameters are defined and documented in pybrush.EstimatorInterface.EstimatorInterface Attributes ---------- From d67570723f754e6b2180a24fc8a930a072497a60 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 21:45:05 -0300 Subject: [PATCH 193/199] Documentation. setting n_classes in cpp side --- pybrush/BrushEstimator.py | 31 +++++++++++++++++++++++++------ pybrush/DeapEstimator.py | 2 +- pybrush/EstimatorInterface.py | 5 ++++- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 1519e85b..8daf0b38 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -74,7 +74,7 @@ def fit(self, X, y): self.train_.set_batch_size(self.batch_size) # TODO: update batch indexes at the beggining of every generation self.validation_ = self.data_.get_validation_data() - self.parameters_ = self._wrap_parameters() + self.parameters_ = self._wrap_parameters(n_classes=self.n_classes_) self.search_space_ = SearchSpace(self.data_, self.parameters_.functions, self.weights_init) @@ -94,6 +94,25 @@ def fit(self, X, y): return self def _make_data(self, X, y=None, feature_names=[], validation_size=0.0): + """ + Prepare the data for training or prediction. + + Parameters: + - X: array-like or pandas DataFrame, shape (n_samples, n_features) + The input features. + - y: array-like or pandas Series, shape (n_samples,), optional (default=None) + The target variable. + - feature_names: list, optional (default=[]) + The names of the features. + - validation_size: float, optional (default=0.0) + The proportion of the data to be used for validation. + + Returns: + - dataset: Dataset + The prepared dataset object containing the input features, target variable, + feature names, and validation size. + """ + # This function should not partition data (since it may be used in `predict`). # partitioning is done by `fit`. Feature names should be inferred # before calling _make_data (so predict can be made with np arrays or @@ -145,6 +164,7 @@ def get_params(self, deep=True): def predict_archive(self, X): """Returns a list of dictionary predictions for all models.""" + check_is_fitted(self) if isinstance(X, pd.DataFrame): @@ -196,15 +216,13 @@ def predict_proba(self, X): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, it will be converted to - ``dtype=np.float32``. + X : {array-like} of shape (n_samples, n_features) + The input samples. Returns ------- p : ndarray of shape (n_samples, n_classes) - The class probabilities of the input samples. The order of the - classes corresponds to that in the attribute :term:`classes_`. + The class probabilities of the input samples. """ @@ -231,6 +249,7 @@ def predict_proba(self, X): def predict_proba_archive(self, X): """Returns a list of dictionary predictions for all models.""" + check_is_fitted(self) if isinstance(X, pd.DataFrame): diff --git a/pybrush/DeapEstimator.py b/pybrush/DeapEstimator.py index 8e4385a5..eca83ccc 100644 --- a/pybrush/DeapEstimator.py +++ b/pybrush/DeapEstimator.py @@ -152,7 +152,7 @@ def fit(self, X, y): self.validation_ = self.data_.get_validation_data() - self.parameters_ = self._wrap_parameters() + self.parameters_ = self._wrap_parameters(n_classes=self.n_classes_) self.search_space_ = SearchSpace(self.data_, self.parameters_.functions, self.weights_init) if self.mode == "classification": diff --git a/pybrush/EstimatorInterface.py b/pybrush/EstimatorInterface.py index e29d1817..0b94a439 100644 --- a/pybrush/EstimatorInterface.py +++ b/pybrush/EstimatorInterface.py @@ -166,7 +166,7 @@ def __init__(self, self.weights_init=weights_init self.validation_size=validation_size - def _wrap_parameters(self): + def _wrap_parameters(self, **extra_kwargs): """ Creates a `Parameters` class to send to c++ backend the settings for the algorithm to use. @@ -220,4 +220,7 @@ def _wrap_parameters(self): params.random_state = seed + for k, v in extra_kwargs.items(): + setattr(params, k, v) + return params \ No newline at end of file From ae0a88635779e2bab31656dc8495789255d0dde5 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 21:45:56 -0300 Subject: [PATCH 194/199] Documentation --- src/util/logger.h | 77 ++++++++++++++++++++++++++++------------------- src/util/rnd.h | 2 -- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/src/util/logger.h b/src/util/logger.h index 4351d36d..ae04c794 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -10,40 +10,55 @@ license: GNU/GPL v3 using namespace std; namespace Brush { +namespace Util{ + +/*! + * @class Logger + * @brief Defines a multi level static logger. + */ +class Logger +{ +public: + + /*! + * @brief Initializes the logger instance. + * @return A pointer to the logger instance. + */ + static Logger* initLogger(); + + /*! + * @brief Destroys the logger instance. + */ + static void destroy(); - namespace Util{ + /*! + * @brief Sets the log level. + * @param verbosity The log level to be set. + */ + void set_log_level(int& verbosity); - ////////////////////////////////////////////////////////////////////////////////// Declarations - - /*! - * @class Logger - * @brief Defines a multi level static logger. - */ + /*! + * @brief Gets the current log level. + * @return The current log level. + */ + int get_log_level(); + + /*! + * @brief Prints a log message with verbosity control. + * @param m The log message to be printed. + * @param v The verbosity level of the log message. + * @param sep The separator to be used between log messages. + * @return The formatted log message. + */ + string log(string m, int v, string sep="\n") const; + +private: + int verbosity; //!< The current log level. + static Logger* instance; //!< The singleton instance of the logger. +}; - class Logger - { - public: - - static Logger* initLogger(); - - static void destroy(); +static Logger &logger = *Logger::initLogger(); - void set_log_level(int& verbosity); - - int get_log_level(); - - /// print message with verbosity control. - string log(string m, int v, string sep="\n") const; - - private: - - int verbosity; - - static Logger* instance; - - }; - - static Logger &logger = *Logger::initLogger(); - } +} } #endif diff --git a/src/util/rnd.h b/src/util/rnd.h index 21e90ed1..99c96afe 100644 --- a/src/util/rnd.h +++ b/src/util/rnd.h @@ -25,7 +25,6 @@ namespace Brush { namespace Util{ * @class Rnd * @brief Defines a multi-core random number generator and its operators. */ - class Rnd { public: @@ -66,7 +65,6 @@ namespace Brush { namespace Util{ return start; } - // TODO: write doxygen documentation for this source code. /// select randomly with weighted distribution. // The probability of picking the i-th element is w_i/S, with S // being the sum of all weights. select_randomly works even if the From e1b2f193bbc7a22a9e124bd6d7c6fdebca4276cf Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 21:46:18 -0300 Subject: [PATCH 195/199] Improved variation. Fixed lots of TODOs. implemented get_size in trees, so we dont have several reimplementations of same function --- src/bindings/bind_variation.h | 2 +- src/engine.cpp | 2 +- src/engine.h | 5 +- src/eval/scorer.h | 1 - src/params.h | 20 ++-- src/program/program.h | 59 +----------- src/program/tree_node.cpp | 36 ++++++- src/program/tree_node.h | 1 + src/vary/variation.cpp | 172 +++++++++++++++------------------- src/vary/variation.h | 138 ++++++++++++--------------- 10 files changed, 187 insertions(+), 249 deletions(-) diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 88cacc3a..c08032f2 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -52,7 +52,7 @@ void bind_variation(py::module& m, string name) // offspring in the second half of the index vector) pop.add_offspring_indexes(island); - self.vary(pop, island, parents, params); + self.vary(pop, island, parents); // making copies of the second half of the island individuals vector indices = pop.get_island_indexes(island); diff --git a/src/engine.cpp b/src/engine.cpp index e6909406..3550b2f9 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -421,7 +421,7 @@ void Engine::run(Dataset &data) } this->pop.add_offspring_indexes(island); - variator.vary(this->pop, island, island_parents.at(island), params); + variator.vary(this->pop, island, island_parents.at(island)); evaluator.update_fitness(this->pop, island, data, params, true); if (data.use_batch) // assign the batch error as fitness (but fit was done with training data) diff --git a/src/engine.h b/src/engine.h index 7b02b411..f637aef0 100644 --- a/src/engine.h +++ b/src/engine.h @@ -27,8 +27,8 @@ using namespace Eval; using namespace Var; using namespace nlohmann; -template -class Engine{ + +template class Engine{ public: Engine(const Parameters& p=Parameters()) : params(p) @@ -52,6 +52,7 @@ class Engine{ /// updates best score by searching in the population for the individual that best fits the given data bool update_best(const Dataset& data, bool val=false); + // TODO: hyperparameter to set how the best is picked (MCDM, best on val, pareto front, etc). one of the options should be getting the pareto front // TODO: best fitness (the class) instead of these. use fitness comparison diff --git a/src/eval/scorer.h b/src/eval/scorer.h index a10c5df4..a47e4c9f 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -33,7 +33,6 @@ typedef float (*funcPointer)(const VectorXf&, // TODO: add more scores, include them here, add to score_hash Scorer(string scorer="mse") { - // TODO: use this idea of map functpointer to do the mutations score_hash["mse"] = &mse; this->set_scorer(scorer); diff --git a/src/params.h b/src/params.h index b2ae656d..2ac70594 100644 --- a/src/params.h +++ b/src/params.h @@ -17,19 +17,18 @@ namespace Brush struct Parameters { public: - // TODO: make parameters private, and use the getters and setters in the code - - int random_state = 0; // by default, the rng generator will use any random seed if random_state is zero + // by default, the rng generator will use any random seed if random_state is zero + int random_state = 0; int verbosity = 0; - // Evolutionary stuff + // Evolutionary algorithm settings string mode="regression"; unsigned int current_gen = 1; // termination criteria int pop_size = 100; - int max_gens = 100; + int max_gens = 100; int max_stall = 0; int max_time = -1; @@ -58,12 +57,11 @@ struct Parameters {"toggle_weight_off", 0.167} }; - float cx_prob=0.2; ///< cross rate for variation + float cx_prob=0.2; ///< cross rate for variation float mig_prob = 0.05; string scorer_="mse"; ///< actual loss function used, determined by error - // TODO: set these values when creating the parameters in python side vector classes; ///< class labels vector class_weights; ///< weights for each class vector sample_weights; ///< weights for each sample @@ -84,12 +82,12 @@ struct Parameters string logfile = ""; - int n_jobs = 1; // -1; ///< number of parallel jobs -1 use all threads; 0 use same as number of islands; positive number specify the amouut of threads + int n_jobs = 1; ///< number of parallel jobs -1 use all threads; 0 use same as number of islands; positive number specify the amouut of threads Parameters(){}; ~Parameters(){}; - // TODO: use logger to log information + // TODO: use logger to log information. Make getters const void set_verbosity(int new_verbosity){ Brush::Util::logger.set_log_level(new_verbosity); verbosity = new_verbosity; }; int get_verbosity(){ return verbosity; }; @@ -128,13 +126,13 @@ struct Parameters int get_num_islands(){ return num_islands; }; void set_max_depth(unsigned new_max_depth){ max_depth = new_max_depth; }; - unsigned get_max_depth(){ return max_depth; }; + unsigned get_max_depth() const { return max_depth; }; void set_n_jobs(int new_n_jobs){ n_jobs = new_n_jobs; }; int get_n_jobs(){ return n_jobs; }; void set_max_size(unsigned new_max_size){ max_size = new_max_size; }; - unsigned get_max_size(){ return max_size; }; + unsigned get_max_size() const { return max_size; }; void set_objectives(vector new_objectives){ objectives = new_objectives; }; vector get_objectives(){ return objectives; }; diff --git a/src/program/program.h b/src/program/program.h index 238caa8f..a311603d 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -100,30 +100,9 @@ template struct Program /// @param include_weight whether to include the node's weight in the count. /// @return int number of nodes. int size(bool include_weight=true) const{ - int acc = 0; - - std::for_each(Tree.begin(), Tree.end(), - [include_weight, &acc](auto& node){ - ++acc; // the node operator or terminal - - // SplitBest has an optimizable decision tree consisting of 3 nodes - // (terminal, arithmetic comparison, value) that needs to be taken - // into account. Split on will have an random decision tree that can - // have different sizes, but will also have the arithmetic comparison - // and a value. - if (Is(node.node_type)) - acc += 3; - else if (Is(node.node_type)) - acc += 2; - - if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) - // Taking into account the weight and multiplication, if enabled. - // weighted constants still count as 1 (simpler than constant terminals) - acc += 2; - }); - - return acc; + auto head = Tree.begin(); + + return head.node->get_size(include_weight); } /// @brief count the size of a given subtree, optionally including the @@ -133,37 +112,7 @@ template struct Program /// @return int number of nodes. int size_at(Iter& top, bool include_weight=true) const{ - int acc = 0; - - // inspired in tree.hh size. First create two identical iterators - Iter it=top, eit=top; - - // Then make the second one point to the next sibling - eit.skip_children(); - ++eit; - - // calculate tree size for each node until reach next sibling - while(it!=eit) { - ++acc; // counting the node operator/terminal - - // SplitBest has an optimizable decision tree consisting of 3 nodes - // (terminal, arithmetic comparison, value) that needs to be taken - // into account - if (Is(it.node->data.node_type)) - acc += 3; - else if (Is(it.node->data.node_type)) - acc += 2; - - if ( (include_weight && it.node->data.get_is_weighted()==true) - && Isnt(it.node->data.node_type) ) - // Taking into account the weight and multiplication, if enabled. - // weighted constants still count as 1 (simpler than constant terminals) - acc += 2; - - ++it; - } - - return acc; + return top.node->get_size(include_weight); } /// @brief count the tree depth of the program. The depth is not influenced by weighted nodes. diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp index d67f3793..2a186418 100644 --- a/src/program/tree_node.cpp +++ b/src/program/tree_node.cpp @@ -162,8 +162,8 @@ int TreeNode::get_complexity() const // include the `w` and `*` if the node is weighted (and it is not a constant or mean label) if (data.get_is_weighted() && !(Is(data.node_type) - || (Is(data.node_type) - || Is(data.node_type)) ) + || (Is(data.node_type) + || Is(data.node_type)) ) ) return operator_complexities.at(NodeType::Mul)*( operator_complexities.at(NodeType::Constant) + @@ -171,4 +171,34 @@ int TreeNode::get_complexity() const ); return node_complexity*(children_complexity_sum); -} \ No newline at end of file +}; + +int TreeNode::get_size(bool include_weight) const +{ + int acc = 1; // the node operator or terminal + + // SplitBest has an optimizable decision tree consisting of 3 nodes + // (terminal, arithmetic comparison, value) that needs to be taken + // into account. Split on will have an random decision tree that can + // have different sizes, but will also have the arithmetic comparison + // and a value. + if (Is(data.node_type)) + acc += 3; + else if (Is(data.node_type)) + acc += 2; + + if ( (include_weight && data.get_is_weighted()==true) + && Isnt(data.node_type) ) + // Taking into account the weight and multiplication, if enabled. + // weighted constants still count as 1 (simpler than constant terminals) + acc += 2; + + auto child = first_child; + for(int i = 0; i < data.get_arg_count(); ++i) + { + acc += child->get_size(include_weight); + child = child->next_sibling; + } + + return acc; +}; diff --git a/src/program/tree_node.h b/src/program/tree_node.h index 64b54149..dc50f00a 100644 --- a/src/program/tree_node.h +++ b/src/program/tree_node.h @@ -51,6 +51,7 @@ class tree_node_ { // size: 5*4=20 bytes (on 32 bit arch), can be reduced string get_tree_model(bool pretty=false, string offset="") const; int get_complexity() const; + int get_size(bool include_weight=true) const; }; using TreeNode = class tree_node_; diff --git a/src/vary/variation.cpp b/src/vary/variation.cpp index cdf26811..3e75182b 100644 --- a/src/vary/variation.cpp +++ b/src/vary/variation.cpp @@ -12,18 +12,12 @@ namespace Var { class PointMutation : public MutationBase { public: - explicit PointMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params) { - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "point mutation\n"; - // get_node_like will sample a similar node based on node_map_weights or // terminal_weights, and maybe will return a Node. - optional newNode = SS().get_node_like(spot.node->data); + optional newNode = SS.get_node_like(spot.node->data); if (!newNode) // overload to check if newNode == nullopt return false; @@ -44,16 +38,12 @@ class PointMutation : public MutationBase class InsertMutation : public MutationBase { public: - explicit InsertMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto find_spots(tree& Tree) const -> vector override + static auto find_spots(tree& Tree, const SearchSpace& SS, + const Parameters& params) { vector weights; - if (size_with_weights(Tree) < max_size()) { + if (Tree.size() < params.get_max_size()) { Iter iter = Tree.begin(); std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), [&](const auto& n){ @@ -61,8 +51,8 @@ class InsertMutation : public MutationBase std::advance(iter, 1); // check if SS holds an operator to avoid failing `check` in sample_op_with_arg - if ((d >= max_depth()) - || (SS().node_map.find(n.ret_type) == SS().node_map.end())) { + if ((d >= params.get_max_depth()) + || (SS.node_map.find(n.ret_type) == SS.node_map.end())) { return 0.0f; } else { @@ -79,9 +69,9 @@ class InsertMutation : public MutationBase return weights; } - auto operator()(tree& Tree, Iter spot) const -> bool override + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params) { - // cout << "insert mutation\n"; auto spot_type = spot.node->data.ret_type; // pick a random compatible node to insert (with probabilities given by @@ -91,8 +81,8 @@ class InsertMutation : public MutationBase // size restriction, which will be relaxed here (just as it is in the PTC2 // algorithm). This mutation can create a new expression that exceeds the // maximum size by the highest arity among the operators. - std::optional n = SS().sample_op_with_arg(spot_type, spot_type, true, - max_size()-Tree.size()-1); + std::optional n = SS.sample_op_with_arg( + spot_type, spot_type, true, params.max_size-Tree.size()-1); if (!n) // there is no operator with compatible arguments return false; @@ -107,7 +97,7 @@ class InsertMutation : public MutationBase if (spot_filled) { // if spot is in its child position, append children. - auto opt = SS().sample_terminal(a); + auto opt = SS.sample_terminal(a); if (!opt) return false; @@ -119,7 +109,7 @@ class InsertMutation : public MutationBase spot_filled = true; // otherwise, add siblings before spot node else { - auto opt = SS().sample_terminal(a); + auto opt = SS.sample_terminal(a); if (!opt) return false; @@ -141,18 +131,12 @@ class InsertMutation : public MutationBase class DeleteMutation : public MutationBase { public: - explicit DeleteMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params) { - } - - auto operator()(tree& Tree, Iter spot) const -> bool override - { - // cout << "delete mutation\n"; - // sample_terminal will sample based on terminal_weights. If it succeeds, // then the new terminal will be in `opt.value()` - auto opt = SS().sample_terminal(spot.node->data.ret_type); + auto opt = SS.sample_terminal(spot.node->data.ret_type); if (!opt) // there is no terminal with compatible arguments return false; @@ -174,16 +158,12 @@ class DeleteMutation : public MutationBase class ToggleWeightOnMutation : public MutationBase { public: - explicit ToggleWeightOnMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto find_spots(tree& Tree) const -> vector override + static auto find_spots(tree& Tree, const SearchSpace& SS, + const Parameters& params) { vector weights(Tree.size()); - if (size_with_weights(Tree) < max_size()) { + if (Tree.size() < params.max_size) { std::transform(Tree.begin(), Tree.end(), weights.begin(), [&](const auto& n){ // some nodetypes must always have a weight @@ -208,10 +188,9 @@ class ToggleWeightOnMutation : public MutationBase return weights; } - auto operator()(tree& Tree, Iter spot) const -> bool override + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params) { - // cout << "toggle_weight_on mutation\n"; - if (spot.node->data.get_is_weighted()==true // cant turn on whats already on || !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean) return false; // false indicates that mutation failed and should return std::nullopt @@ -230,12 +209,8 @@ class ToggleWeightOnMutation : public MutationBase class ToggleWeightOffMutation : public MutationBase { public: - explicit ToggleWeightOffMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) - { - } - - auto find_spots(tree& Tree) const -> vector override + static auto find_spots(tree& Tree, const SearchSpace& SS, + const Parameters& params) { vector weights(Tree.size()); @@ -255,7 +230,8 @@ class ToggleWeightOffMutation : public MutationBase return weights; } - auto operator()(tree& Tree, Iter spot) const -> bool override + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params) { // cout << "toggle_weight_off mutation\n"; @@ -276,19 +252,14 @@ class ToggleWeightOffMutation : public MutationBase class SubtreeMutation : public MutationBase { public: - explicit SubtreeMutation(const SearchSpace& SS, size_t max_size=0, size_t max_depth=0) - : MutationBase(SS, max_size, max_depth) // TODO: change order size and depth - { - } - - // TODO: make different private functions to find spots and use them. theres too much copy and paste here - auto find_spots(tree& Tree) const -> vector override + static auto find_spots(tree& Tree, const SearchSpace& SS, + const Parameters& params) { vector weights; - auto node_map = SS().node_map; + auto node_map = SS.node_map; - if (size_with_weights(Tree) < max_size()) { + if (Tree.size() < params.max_size) { Iter iter = Tree.begin(); std::transform(Tree.begin(), Tree.end(), std::back_inserter(weights), [&](const auto& n){ @@ -296,9 +267,9 @@ class SubtreeMutation : public MutationBase std::advance(iter, 1); // we need to make sure there's some node to start the subtree - if ((d >= max_depth()) - || (SS().node_map.find(n.ret_type) == SS().node_map.end()) - || (SS().node_map.find(n.ret_type) == SS().node_map.end()) ) + if ((d >= params.max_depth) + || (SS.node_map.find(n.ret_type) == SS.node_map.end()) + || (SS.node_map.find(n.ret_type) == SS.node_map.end()) ) return 0.0f; else return n.get_prob_change(); @@ -312,30 +283,29 @@ class SubtreeMutation : public MutationBase return weights; } - auto operator()(tree& Tree, Iter spot) const -> bool override + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params) { - // cout << "subtree mutation\n"; - // check if we exceeded the size/depth constrains (without subtracting, // to avoid overflow cases if the user sets max_size smaller than arity // of smallest operator. The overflow would happen when calculating d and // s in the following lines, to choose the PTC2 limits) - if ( max_size() <= (Tree.size() - Tree.size(spot)) - || max_depth() <= Tree.depth(spot) ) + if ( params.max_size <= (Tree.size() - Tree.size(spot)) + || params.max_depth <= Tree.depth(spot) ) return false; auto spot_type = spot.node->data.ret_type; // d and s must be compatible with PTC2 --- they should be based on // tree structure, not program structure - size_t d = max_depth() - Tree.depth(spot); - size_t s = max_size() - (Tree.size() - Tree.size(spot)); + size_t d = params.max_depth - Tree.depth(spot); + size_t s = params.max_size - (Tree.size() - Tree.size(spot)); s = r.rnd_int(1, s); // sample subtree uses PTC2, which operates on depth and size of the tree // (and not on the program!). we shoudn't care for weights here - auto subtree = SS().sample_subtree(spot.node->data, d, s); + auto subtree = SS.sample_subtree(spot.node->data, d, s); if (!subtree) // there is no terminal with compatible arguments return false; @@ -437,7 +407,6 @@ std::optional> Variation::cross( return (s <= allowed_size) && (d <= allowed_depth); }; - // TODO: something like `is_valid_program` in FEAT std::transform(other.Tree.begin(), other.Tree.end(), other_weights.begin(), [child_ret_type, check_and_incrm](const auto& n){ @@ -531,42 +500,34 @@ std::optional> Variation::mutate(const Individual& parent) return std::nullopt; } + Program child(parent.program); + int attempts = 0; while(++attempts <= 3) { // choose a valid mutation option string choice = r.random_choice(parameters.mutation_probs); + + vector weights; - // TODO: this could be improved (specially with the Variation class) - std::unique_ptr mutation; - if (choice == "point") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "insert") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "delete") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "toggle_weight_on") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "toggle_weight_off") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); - else if (choice == "subtree") - mutation = std::make_unique( - search_space,parameters.max_size, parameters.max_depth); + // choose location by weighted sampling of program + if (choice == "point") + weights = PointMutation::find_spots(child.Tree, search_space, parameters); + else if (choice == "insert") + weights = InsertMutation::find_spots(child.Tree, search_space, parameters); + else if (choice == "delete") + weights = DeleteMutation::find_spots(child.Tree, search_space, parameters); + else if (choice == "subtree") + weights = SubtreeMutation::find_spots(child.Tree, search_space, parameters); + else if (choice == "toggle_weight_on") + weights = ToggleWeightOnMutation::find_spots(child.Tree, search_space, parameters); + else if (choice == "toggle_weight_off") + weights = ToggleWeightOffMutation::find_spots(child.Tree, search_space, parameters); else { std::string msg = fmt::format("{} not a valid mutation choice", choice); HANDLE_ERROR_THROW(msg); } - Program child(parent.program); - - // choose location by weighted sampling of program - auto weights = mutation->find_spots(child.Tree); - if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { return w<=0.0; })) @@ -581,7 +542,20 @@ std::optional> Variation::mutate(const Individual& parent) // Every mutation here works inplace, so they return bool instead of // std::optional to indicare the result of their manipulation over the // program tree. Here we call the mutation function and return the result - bool success = (*mutation)(child.Tree, spot); + + bool success; + if (choice == "point") + success = PointMutation::mutate(child.Tree, spot, search_space, parameters); + else if (choice == "insert") + success = InsertMutation::mutate(child.Tree, spot, search_space, parameters); + else if (choice == "delete") + success = DeleteMutation::mutate(child.Tree, spot, search_space, parameters); + else if (choice == "subtree") + success = SubtreeMutation::mutate(child.Tree, spot, search_space, parameters); + else if (choice == "toggle_weight_on") + success = ToggleWeightOnMutation::mutate(child.Tree, spot, search_space, parameters); + else // it must be"toggle_weight_off" + success = ToggleWeightOffMutation::mutate(child.Tree, spot, search_space, parameters); // std::cout << "returning" << std::endl; if (success @@ -602,7 +576,7 @@ std::optional> Variation::mutate(const Individual& parent) template void Variation::vary(Population& pop, int island, - const vector& parents, const Parameters& p) + const vector& parents) { auto indices = pop.get_island_indexes(island); @@ -635,7 +609,7 @@ void Variation::vary(Population& pop, int island, } // this assumes that islands do not share indexes before doing variation - unsigned id = p.current_gen*p.pop_size+indices.at(i); + unsigned id = parameters.current_gen*parameters.pop_size+indices.at(i); // mutation and crossover already perform 3 attempts. If it fails, we just fill with a random individual if (opt) // variation worked, lets keep this diff --git a/src/vary/variation.h b/src/vary/variation.h index 8868a53b..2f3bced4 100644 --- a/src/vary/variation.h +++ b/src/vary/variation.h @@ -6,23 +6,11 @@ license: GNU/GPL v3 #ifndef VARIATION_H #define VARIATION_H - -// #include "util/error.h" -// #include "util/utils.h" - -//#include "search_space.h" #include "../pop/population.h" #include #include -// namespace Brush{ - -// typedef tree::pre_order_iterator Iter; - -//////////////////////////////////////////////////////////////////////////// -// Mutation & Crossover - using namespace Brush::Pop; /** @@ -32,19 +20,12 @@ using namespace Brush::Pop; namespace Brush { namespace Var { -// base for MUTATION variators class MutationBase { public: using Iter = tree::pre_order_iterator; - // TODO: static methods, without storing information, and using just SS and params as arguments - MutationBase(const SearchSpace& SS, size_t max_size, size_t max_depth) - : SS_(SS) - , max_size_(max_size) - , max_depth_(max_depth) - {} - - virtual auto find_spots(tree& Tree) const -> vector + static auto find_spots(tree& Tree, const SearchSpace& SS, + const Parameters& params) { vector weights(Tree.size()); @@ -56,81 +37,86 @@ class MutationBase { return weights; } - virtual auto operator()(tree& Tree, Iter spot) const -> bool = 0; - - auto SS() const -> SearchSpace { return SS_; } - auto max_size() const -> size_t { return max_size_; } - auto max_depth() const -> size_t{ return max_depth_; } -protected: - static size_t size_with_weights(tree& Tree, bool include_weight=true) - { - // re-implementation of int Node::size(bool include_weight=true) meant - // to work with the tree instead of brush's programs. - // TODO: find a better way to have this function available to mutations - // and avoid repeated functions - size_t acc = 0; - - std::for_each(Tree.begin(), Tree.end(), - [include_weight, &acc](auto& node){ - ++acc; // the node operator or terminal - - // TODO: the same size check occurs in search_space.cpp and program.h. Make a function (stop doing hardcoded) - // SplitBest has an optimizable decision tree consisting of 3 nodes - // (terminal, arithmetic comparison, value) that needs to be taken - // into account - if (Is(node.node_type)) - acc += 3; - else if (Is(node.node_type)) - acc += 2; - - if ( (include_weight && node.get_is_weighted()==true) - && Isnt(node.node_type) ) - // Taking into account the weight and multiplication, if enabled. - // weighted constants still count as 1 (simpler than constant terminals) - acc += 2; - }); - - return acc; - } - -private: - SearchSpace SS_; // where to sample nodes to change the program - - // constrains TODO: use params to get this values, stop storing it - size_t max_size_; - size_t max_depth_; + static auto mutate(tree& Tree, Iter spot, const SearchSpace& SS, + const Parameters& params); }; -// TODO: make sure every method doesnt store information, instead they retrieve it from parameters (so there's no side effect) - +/*! + * @class Variation + * @brief Class representing the variation operators in Brush. + * + * The Variation class is responsible for performing individual-level variations + * and handling the variation of a population in Brush. It contains methods for + * crossing individuals, mutating individuals, and varying a population. + */ template -class Variation -{ -private: - SearchSpace search_space; - Parameters parameters; // stop using this thing here and get parameter as argument +class Variation { public: + /** + * @brief Default constructor. + */ Variation() = default; + /** + * @brief Constructor that initializes the Variation object with parameters and search space. + * + * @param params The parameters for the variation operator. + * @param ss The search space for the variation operator. + */ Variation(Parameters& params, SearchSpace& ss) : parameters(params) , search_space(ss) {}; + /** + * @brief Destructor. + */ ~Variation() {}; + /** + * @brief Initializes the Variation object with parameters and search space. + * + * @param params The parameters for the variation operator. + * @param ss The search space for the variation operator. + */ void init(Parameters& params, SearchSpace& ss){ this->parameters = params; this->search_space = ss; }; - // individual-level variations - std::optional> cross(const Individual& mom, const Individual& dad); + /** + * @brief Performs crossover operation on two individuals. + * + * @param mom The first parent individual. + * @param dad The second parent individual. + * @return An optional containing the offspring individual if the crossover + * is successful, or an empty optional otherwise. + */ + std::optional> cross(const Individual& mom, + const Individual& dad); + + /** + * @brief Performs mutation operation on an individual. + * + * @param parent The parent individual. + * @return An optional containing the mutated individual if the mutation is + * successful, or an empty optional otherwise. + */ std::optional> mutate(const Individual& parent); - /// method to handle variation of population - void vary(Population& pop, int island, const vector& parents, - const Parameters& p); + /** + * @brief Handles variation of a population. + * + * @param pop The population to be varied. + * @param island The island index. + * @param parents The indices of the parent individuals. + * @param p The parameters for the variation operator. + */ + void vary(Population& pop, int island, const vector& parents); + +private: + SearchSpace search_space; // The search space for the variation operator. + Parameters parameters; // The parameters for the variation operator }; } //namespace Var From dd1eb30289fe8aa7f63e918f3c064a6518e121a0 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 21:46:59 -0300 Subject: [PATCH 196/199] Updated tedts to work with new variation --- tests/cpp/test_evaluation.cpp | 1 + tests/cpp/test_population.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/cpp/test_evaluation.cpp b/tests/cpp/test_evaluation.cpp index e69de29b..db71c641 100644 --- a/tests/cpp/test_evaluation.cpp +++ b/tests/cpp/test_evaluation.cpp @@ -0,0 +1 @@ +// write a test for different metrics \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index b4a2dc43..7a78d3f1 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -107,7 +107,7 @@ TEST(Population, PopulationTests) // variation applied to population fmt::print("Variations for island {}\n", j); - variator.vary(pop, j, parents, params); + variator.vary(pop, j, parents); fmt::print("fitting {}\n", j); // at this step, we know that theres only one pointer to each individual being fitted, so we can perform it in parallel evaluator.update_fitness(pop, j, data, params, true, true); From f5d26eb1dcbd1d2363c231db4a9baeb5b1ff190b Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 21:47:47 -0300 Subject: [PATCH 197/199] Fixed bad doxygen instructions --- docs/cpp_api/archive.rst | 2 +- docs/cpp_api/engine.rst | 2 +- docs/cpp_api/evaluation.rst | 2 +- docs/cpp_api/individual.rst | 2 +- docs/cpp_api/variation.rst | 4 +- docs/guide/saving_loading_populations.ipynb | 196 ++++++++++---------- docs/python_api/python_api.rst | 12 +- 7 files changed, 110 insertions(+), 110 deletions(-) diff --git a/docs/cpp_api/archive.rst b/docs/cpp_api/archive.rst index aa80c161..02810868 100644 --- a/docs/cpp_api/archive.rst +++ b/docs/cpp_api/archive.rst @@ -1,5 +1,5 @@ Archive ======= -.. doxygenclass:: Brush::Pop::Archive +.. doxygenstruct:: Brush::Pop::Archive :members: diff --git a/docs/cpp_api/engine.rst b/docs/cpp_api/engine.rst index 60274e88..9129dfa3 100644 --- a/docs/cpp_api/engine.rst +++ b/docs/cpp_api/engine.rst @@ -1,7 +1,7 @@ Engine (and parameters) ======================= -.. doxygenclass:: Brush::Parameters +.. doxygenstruct:: Brush::Parameters :members: .. doxygenclass:: Brush::Engine diff --git a/docs/cpp_api/evaluation.rst b/docs/cpp_api/evaluation.rst index e3aaf620..8803dc2d 100644 --- a/docs/cpp_api/evaluation.rst +++ b/docs/cpp_api/evaluation.rst @@ -4,5 +4,5 @@ Evaluation .. doxygenclass:: Brush::Eval::Evaluation :members: -.. doxygenclass:: Brush::Scorer +.. doxygenclass:: Brush::Eval::Scorer :members: diff --git a/docs/cpp_api/individual.rst b/docs/cpp_api/individual.rst index e781b865..155097ec 100644 --- a/docs/cpp_api/individual.rst +++ b/docs/cpp_api/individual.rst @@ -4,5 +4,5 @@ Individual and Fitness .. doxygenclass:: Brush::Pop::Individual :members: -.. doxygenclass:: Brush::Fitness +.. doxygenstruct:: Brush::Fitness :members: \ No newline at end of file diff --git a/docs/cpp_api/variation.rst b/docs/cpp_api/variation.rst index 295f2972..55959d79 100644 --- a/docs/cpp_api/variation.rst +++ b/docs/cpp_api/variation.rst @@ -1,8 +1,8 @@ Variation (Crossover/Mutation) ============================== -.. doxygenclass:: Brush::Vary::MutationBase +.. doxygenclass:: Brush::Var::MutationBase :members: -.. doxygenclass:: Brush::Vary::Variation +.. doxygenclass:: Brush::Var::Variation :members: diff --git a/docs/guide/saving_loading_populations.ipynb b/docs/guide/saving_loading_populations.ipynb index cd65da2f..af6fd4bd 100644 --- a/docs/guide/saving_loading_populations.ipynb +++ b/docs/guide/saving_loading_populations.ipynb @@ -48,77 +48,77 @@ "output_type": "stream", "text": [ "Generation 1/10 [////// ]\n", - "Train Loss (Med): 10.64540 (90.38514)\n", - "Val Loss (Med): 10.64540 (90.38514)\n", + "Train Loss (Med): 11.75939 (74.37032)\n", + "Val Loss (Med): 11.75939 (74.37032)\n", "Median Size (Max): 3 (19)\n", - "Median complexity (Max): 9 (648)\n", - "Time (s): 0.08182\n", + "Median complexity (Max): 9 (432)\n", + "Time (s): 0.12205\n", "\n", "Generation 2/10 [/////////// ]\n", - "Train Loss (Med): 10.64540 (60.79966)\n", - "Val Loss (Med): 10.64540 (60.79966)\n", + "Train Loss (Med): 11.58283 (17.94969)\n", + "Val Loss (Med): 11.58283 (17.94969)\n", "Median Size (Max): 3 (19)\n", - "Median complexity (Max): 9 (588)\n", - "Time (s): 0.15738\n", + "Median complexity (Max): 9 (368)\n", + "Time (s): 0.27800\n", "\n", "Generation 3/10 [//////////////// ]\n", - "Train Loss (Med): 10.64540 (41.44810)\n", - "Val Loss (Med): 10.64540 (41.44810)\n", - "Median Size (Max): 3 (19)\n", - "Median complexity (Max): 9 (747)\n", - "Time (s): 0.25071\n", + "Train Loss (Med): 11.15674 (17.94969)\n", + "Val Loss (Med): 11.15674 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 10 (915)\n", + "Time (s): 0.41845\n", "\n", "Generation 4/10 [///////////////////// ]\n", - "Train Loss (Med): 10.64540 (17.94969)\n", - "Val Loss (Med): 10.64540 (17.94969)\n", - "Median Size (Max): 4 (19)\n", - "Median complexity (Max): 20 (1425)\n", - "Time (s): 0.36906\n", + "Train Loss (Med): 10.62121 (17.94969)\n", + "Val Loss (Med): 10.62121 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (381)\n", + "Time (s): 0.56585\n", "\n", "Generation 5/10 [////////////////////////// ]\n", - "Train Loss (Med): 10.64540 (17.94969)\n", - "Val Loss (Med): 10.64540 (17.94969)\n", - "Median Size (Max): 3 (19)\n", - "Median complexity (Max): 9 (692)\n", - "Time (s): 0.45359\n", + "Train Loss (Med): 10.51181 (17.94969)\n", + "Val Loss (Med): 10.51181 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (412)\n", + "Time (s): 0.73561\n", "\n", "Generation 6/10 [/////////////////////////////// ]\n", - "Train Loss (Med): 10.43983 (17.94969)\n", - "Val Loss (Med): 10.43983 (17.94969)\n", - "Median Size (Max): 3 (17)\n", - "Median complexity (Max): 9 (324)\n", - "Time (s): 0.55083\n", + "Train Loss (Med): 10.51181 (17.94969)\n", + "Val Loss (Med): 10.51181 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (412)\n", + "Time (s): 0.89526\n", "\n", "Generation 7/10 [//////////////////////////////////// ]\n", - "Train Loss (Med): 10.43983 (17.90349)\n", - "Val Loss (Med): 10.43983 (17.90349)\n", - "Median Size (Max): 4 (19)\n", - "Median complexity (Max): 14 (975)\n", - "Time (s): 0.64643\n", + "Train Loss (Med): 10.51181 (17.94969)\n", + "Val Loss (Med): 10.51181 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (412)\n", + "Time (s): 1.03213\n", "\n", "Generation 8/10 [///////////////////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.67499)\n", - "Val Loss (Med): 10.26326 (17.67499)\n", - "Median Size (Max): 5 (17)\n", - "Median complexity (Max): 21 (324)\n", - "Time (s): 0.74136\n", + "Train Loss (Med): 10.43982 (17.94969)\n", + "Val Loss (Med): 10.43982 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (412)\n", + "Time (s): 1.19282\n", "\n", "Generation 9/10 [////////////////////////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (17)\n", - "Median complexity (Max): 9 (324)\n", - "Time (s): 0.83278\n", + "Train Loss (Med): 10.33524 (17.94969)\n", + "Val Loss (Med): 10.33524 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (368)\n", + "Time (s): 1.33781\n", "\n", "Generation 10/10 [//////////////////////////////////////////////////]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (17)\n", - "Median complexity (Max): 9 (324)\n", - "Time (s): 0.93411\n", + "Train Loss (Med): 10.33524 (17.94969)\n", + "Val Loss (Med): 10.33524 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (368)\n", + "Time (s): 1.50192\n", "\n", - "Saved population to file /tmp/tmpl0n47qch/population.json\n", - "score: 0.8864496494920485\n" + "Saved population to file /tmp/tmpw7jkwa5m/population.json\n", + "score: 0.8856532915521027\n" ] } ], @@ -159,78 +159,78 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loaded population from /tmp/tmpl0n47qch/population.json of size = 200\n", + "Loaded population from /tmp/tmpw7jkwa5m/population.json of size = 200\n", "Generation 1/10 [////// ]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (17)\n", - "Median complexity (Max): 9 (324)\n", - "Time (s): 0.08078\n", + "Train Loss (Med): 10.33524 (17.94969)\n", + "Val Loss (Med): 10.33524 (17.94969)\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (368)\n", + "Time (s): 0.16596\n", "\n", "Generation 2/10 [/////////// ]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (17)\n", - "Median complexity (Max): 9 (324)\n", - "Time (s): 0.15177\n", + "Train Loss (Med): 10.33524 (17.94969)\n", + "Val Loss (Med): 10.33524 (17.94969)\n", + "Median Size (Max): 3 (18)\n", + "Median complexity (Max): 9 (240)\n", + "Time (s): 0.31669\n", "\n", "Generation 3/10 [//////////////// ]\n", "Train Loss (Med): 10.26326 (17.94969)\n", "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (15)\n", - "Median complexity (Max): 9 (196)\n", - "Time (s): 0.23618\n", + "Median Size (Max): 3 (20)\n", + "Median complexity (Max): 9 (368)\n", + "Time (s): 0.45045\n", "\n", "Generation 4/10 [///////////////////// ]\n", "Train Loss (Med): 10.26326 (17.94969)\n", "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (15)\n", - "Median complexity (Max): 9 (196)\n", - "Time (s): 0.29749\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (368)\n", + "Time (s): 0.63331\n", "\n", "Generation 5/10 [////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (15)\n", - "Median complexity (Max): 9 (196)\n", - "Time (s): 0.38480\n", + "Train Loss (Med): 10.26326 (16.41696)\n", + "Val Loss (Med): 10.26326 (16.41696)\n", + "Median Size (Max): 5 (17)\n", + "Median complexity (Max): 33 (330)\n", + "Time (s): 0.78002\n", "\n", "Generation 6/10 [/////////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (15)\n", - "Median complexity (Max): 9 (196)\n", - "Time (s): 0.47692\n", + "Train Loss (Med): 9.70269 (17.94969)\n", + "Val Loss (Med): 9.70269 (17.94969)\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (330)\n", + "Time (s): 0.91656\n", "\n", "Generation 7/10 [//////////////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.94969)\n", - "Val Loss (Med): 10.26326 (17.94969)\n", - "Median Size (Max): 3 (15)\n", - "Median complexity (Max): 9 (196)\n", - "Time (s): 0.54522\n", + "Train Loss (Med): 9.67577 (17.94969)\n", + "Val Loss (Med): 9.67577 (17.94969)\n", + "Median Size (Max): 3 (19)\n", + "Median complexity (Max): 9 (330)\n", + "Time (s): 1.10225\n", "\n", "Generation 8/10 [///////////////////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.49268)\n", - "Val Loss (Med): 10.26326 (17.49268)\n", - "Median Size (Max): 5 (15)\n", - "Median complexity (Max): 22 (196)\n", - "Time (s): 0.61691\n", + "Train Loss (Med): 9.67577 (16.41696)\n", + "Val Loss (Med): 9.67577 (16.41696)\n", + "Median Size (Max): 5 (19)\n", + "Median complexity (Max): 33 (330)\n", + "Time (s): 1.30773\n", "\n", "Generation 9/10 [////////////////////////////////////////////// ]\n", - "Train Loss (Med): 10.26326 (17.49268)\n", - "Val Loss (Med): 10.26326 (17.49268)\n", - "Median Size (Max): 5 (15)\n", - "Median complexity (Max): 22 (196)\n", - "Time (s): 0.69888\n", + "Train Loss (Med): 9.67577 (16.41696)\n", + "Val Loss (Med): 9.67577 (16.41696)\n", + "Median Size (Max): 5 (19)\n", + "Median complexity (Max): 33 (330)\n", + "Time (s): 1.44840\n", "\n", "Generation 10/10 [//////////////////////////////////////////////////]\n", - "Train Loss (Med): 10.26326 (15.05905)\n", - "Val Loss (Med): 10.26326 (15.05905)\n", - "Median Size (Max): 7 (15)\n", - "Median complexity (Max): 36 (196)\n", - "Time (s): 0.80282\n", + "Train Loss (Med): 9.67577 (15.67545)\n", + "Val Loss (Med): 9.67577 (15.67545)\n", + "Median Size (Max): 6 (19)\n", + "Median complexity (Max): 36 (723)\n", + "Time (s): 1.65144\n", "\n", - "score: 0.8864496494920485\n" + "score: 0.892949582824199\n" ] } ], diff --git a/docs/python_api/python_api.rst b/docs/python_api/python_api.rst index 91f6317d..701cd786 100644 --- a/docs/python_api/python_api.rst +++ b/docs/python_api/python_api.rst @@ -1,9 +1,9 @@ Python API ========== -.. automodule:: pybrush - :members: - :undoc-members: - :show-inheritance: - :noindex: - :exclude-members: __init__, __module__, __doc__, __weakref__, __dict__ \ No newline at end of file +.. With doxygennamespace: + +.. .. doxygennamespace:: pybrush +.. :members: + + From 5210dd7e7d218642485e624534faec1cb5b91eaa Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 22:19:44 -0300 Subject: [PATCH 198/199] lots of docs --- src/engine.h | 4 +- src/eval/evaluation.h | 39 ++++++++++++++++-- src/eval/metrics.h | 57 +++++++++++++++++++++++-- src/ind/fitness.h | 28 +++++++++---- src/pop/archive.h | 89 ++++++++++++++++++++++++++++++++-------- src/selection/lexicase.h | 3 -- 6 files changed, 182 insertions(+), 38 deletions(-) diff --git a/src/engine.h b/src/engine.h index f637aef0..437a71e0 100644 --- a/src/engine.h +++ b/src/engine.h @@ -27,8 +27,8 @@ using namespace Eval; using namespace Var; using namespace nlohmann; - -template class Engine{ +template +class Engine{ public: Engine(const Parameters& p=Parameters()) : params(p) diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 91fd2c47..e03dc9f5 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -19,12 +19,20 @@ using namespace Pop; namespace Eval { template +/** + * @class Evaluation + * @brief Class for evaluating the fitness of individuals in a population. + */ class Evaluation { public: Scorer S; - - // TODO: make eval update loss_v accordingly, and set to th same as train loss if there is no batch or no validation + /** + * @brief Constructor for Evaluation class. + * @details Initializes the scorer based on the program type. + */ Evaluation(){ + // TODO: make eval update loss_v accordingly, and set to th same as train loss if there is no batch or no validation + string scorer; if ( (T == Brush::ProgramType::MulticlassClassifier) || (T == Brush::ProgramType::Representer) ) @@ -38,10 +46,27 @@ class Evaluation { }; ~Evaluation(){}; + /** + * @brief Set the scorer for evaluation. + * @param scorer The scorer to be set. + */ void set_scorer(string scorer){this->S.set_scorer(scorer);}; + + /** + * @brief Get the current scorer. + * @return The current scorer. + */ string get_scorer(){return this->S.get_scorer();}; - /// fitness of population. + /** + * @brief Update the fitness of individuals in a population. + * @param pop The population to update. + * @param island The island index. + * @param data The dataset for evaluation. + * @param params The parameters for evaluation. + * @param fit Flag indicating whether to update fitness. + * @param validation Flag indicating whether to perform validation. + */ void update_fitness(Population& pop, int island, const Dataset& data, @@ -50,7 +75,13 @@ class Evaluation { bool validation=false ); - /// assign fitness to an individual. + /** + * @brief Assign fitness to an individual. + * @param ind The individual to assign fitness to. + * @param data The dataset for evaluation. + * @param params The parameters for evaluation. + * @param val Flag indicating whether it is validation fitness. + */ void assign_fit(Individual& ind, const Dataset& data, const Parameters& params, bool val=false); diff --git a/src/eval/metrics.h b/src/eval/metrics.h index 00ebce37..7a66f8e5 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -4,37 +4,86 @@ #include "../data/data.h" namespace Brush { +/** + * @namespace Eval + * @brief Namespace containing scoring functions for evaluation metrics. + */ namespace Eval { /* Scoring functions */ // regression ------------------------------------------------------------------ -/// mean squared error + +/** + * @brief Calculates the mean squared error between the predicted values and the true values. + * @param y The true values. + * @param yhat The predicted values. + * @param loss Reference to store the calculated losses for each sample. + * @param class_weights The optional class weights (not used for MSE). + * @return The mean squared error. + */ float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, const vector& class_weights=vector() ); // binary classification ------------------------------------------------------- -/// log loss (2 methods below) + +/** + * @brief Calculates the log loss between the predicted probabilities and the true labels. + * @param y The true labels. + * @param predict_proba The predicted probabilities. + * @param class_weights The optional class weights. + * @return The log loss. + */ VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, const vector& class_weights=vector()); +/** + * @brief Calculates the mean log loss between the predicted probabilities and the true labels. + * @param y The true labels. + * @param predict_proba The predicted probabilities. + * @param loss Reference to store the calculated losses for each sample. + * @param class_weights The optional class weights. + * @return The mean log loss. + */ float mean_log_loss(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, const vector& class_weights = vector()); +/** + * @brief Calculates the average precision score between the predicted probabilities and the true labels. + * @param y The true labels. + * @param predict_proba The predicted probabilities. + * @param loss Reference to store the calculated losses for each sample. + * @param class_weights The optional class weights. + * @return The average precision score. + */ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, const vector& class_weights=vector()); // multiclass classification --------------------------------------------------- -/// multinomial log loss (2 methods below) + +/** + * @brief Calculates the multinomial log loss between the predicted probabilities and the true labels. + * @param y The true labels. + * @param predict_proba The predicted probabilities. + * @param class_weights The optional class weights. + * @return The multinomial log loss. + */ VectorXf multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, const vector& class_weights=vector()); +/** + * @brief Calculates the mean multinomial log loss between the predicted probabilities and the true labels. + * @param y The true labels. + * @param predict_proba The predicted probabilities. + * @param loss Reference to store the calculated losses for each sample. + * @param class_weights The optional class weights. + * @return The mean multinomial log loss. + */ float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, VectorXf& loss, const vector& class_weights=vector()); - } // metrics } // Brush diff --git a/src/ind/fitness.h b/src/ind/fitness.h index db0885e8..6cabcf97 100644 --- a/src/ind/fitness.h +++ b/src/ind/fitness.h @@ -8,7 +8,20 @@ using namespace nlohmann; namespace Brush{ - + +/** + * @brief Represents the fitness of an individual in the Brush namespace. + * + * The `Fitness` struct stores various attributes related to the fitness of an individual in the Brush namespace. + * It includes the aggregate loss score, aggregate validation loss score, complexity, size, depth, dominance counter, + * dominated individuals, Pareto front rank, crowding distance on the Pareto front, weighted values, and weights. + * + * The struct provides getter and setter methods for accessing and modifying these attributes. + * It also includes methods for calculating the hash value, setting values, clearing values, checking validity, + * and performing comparison operations. + * + * Additionally, there are methods for converting the `Fitness` object to JSON format and vice versa. + */ struct Fitness { // the loss is used in evolutionary functions @@ -25,6 +38,12 @@ struct Fitness { unsigned int rank; ///< pareto front rank float crowding_dist; ///< crowding distance on the Pareto front + vector values; + vector weights; + + // weighted values + vector wvalues; + void set_dominated(vector& dom){ dominated=dom; }; vector get_dominated() const { return dominated; }; @@ -52,12 +71,6 @@ struct Fitness { void set_crowding_dist(float cd){ crowding_dist=cd; }; float get_crowding_dist() const { return crowding_dist; }; - vector values; - vector weights; - - // weighted values - vector wvalues; - // Constructor with initializer list for weights Fitness(const vector& w={}) : values(), wvalues(), weights(w) { dcounter = 0; @@ -91,7 +104,6 @@ struct Fitness { if (v.size() != weights.size()) { throw std::length_error("Assigned values have not the same length than current values"); } - // fmt::print("updated values\n"); values.resize(0); for (const auto& element : v) { diff --git a/src/pop/archive.h b/src/pop/archive.h index 49404e31..a4105ede 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -12,38 +12,93 @@ using namespace Sel; namespace Pop{ +/** + * @brief The Archive struct represents a collection of individual programs. + * + * The Archive struct is used to store individual programs in a collection. It provides + * functionality for initializing, updating, and sorting the archive based on complexity + * or objectives. The archive can be operated on by a single thread. + * + * @tparam T The program type. + */ template struct Archive { - // I dont need shared pointers here (this is not suposed to be operated - // by several threads) vector> individuals; ///< individual programs in the archive bool sort_complexity; ///< whether to sort archive by complexity + NSGA2 selector; ///< using NSGA2 in survival mode (nsga2 does not implement selection) - // using NSGA2 in survival mode (nsga2 does not implement selection) - NSGA2 selector; - + /** + * @brief Default constructor for the Archive struct. + */ Archive(); + /** + * @brief Initializes the archive with individuals from a population. + * @param pop The population from which to initialize the archive. + */ void init(Population& pop); + /** + * @brief Updates the archive with individuals from a population. + * @param pop The population from which to update the archive. + * @param params The parameters for the update. + */ void update(Population& pop, const Parameters& params); - + + /** + * @brief Sets the objectives for the archive. + * + * This function sets the objectives for the archive. The objectives are used for + * sorting the archive. + * + * @param objectives The objectives to set for the archive. + */ void set_objectives(vector objectives); - /// Sort population in increasing complexity. - static bool sortComplexity(const Individual& lhs, - const Individual& rhs); + /** + * @brief Sorts the population in increasing complexity. + * + * This static function is used to sort the population in increasing complexity. + * It is used as a comparison function for sorting algorithms. + * + * @param lhs The left-hand side individual to compare. + * @param rhs The right-hand side individual to compare. + */ + static bool sortComplexity(const Individual& lhs, const Individual& rhs); + + /** + * @brief Sorts the population by the first objective. + * + * This static function is used to sort the population by the first objective. + * It is used as a comparison function for sorting algorithms. + * + * @param lhs The left-hand side individual to compare. + * @param rhs The right-hand side individual to compare. + */ + static bool sortObj1(const Individual& lhs, const Individual& rhs); - /// Sort population by first objective. - static bool sortObj1(const Individual& lhs, - const Individual& rhs); + /** + * @brief Checks if two individuals have the same fitness complexity. + * + * This static function is used to check if two individuals have the same fitness complexity. + * It is used as a comparison function for finding duplicates in the population. + * + * @param lhs The left-hand side individual to compare. + * @param rhs The right-hand side individual to compare. + */ + static bool sameFitComplexity(const Individual& lhs, const Individual& rhs); - /// check for repeats - static bool sameFitComplexity(const Individual& lhs, - const Individual& rhs); - static bool sameObjectives(const Individual& lhs, - const Individual& rhs); + /** + * @brief Checks if two individuals have the same objectives. + * + * This static function is used to check if two individuals have the same objectives. + * It is used as a comparison function for finding duplicates in the population. + * + * @param lhs The left-hand side individual to compare. + * @param rhs The right-hand side individual to compare. + */ + static bool sameObjectives(const Individual& lhs, const Individual& rhs); }; //serialization diff --git a/src/selection/lexicase.h b/src/selection/lexicase.h index a57f5286..9613bfcb 100644 --- a/src/selection/lexicase.h +++ b/src/selection/lexicase.h @@ -4,17 +4,14 @@ #include "selection_operator.h" #include "../util/utils.h" - namespace Brush { namespace Sel { - using namespace Brush; using namespace Pop; using namespace Sel; -////////////////////////////////////////////////////////////// Declarations /*! * @class Lexicase * @brief Lexicase selection operator. From f70d32e1c9e0b64f1867a2adfe825a466c9f0e2c Mon Sep 17 00:00:00 2001 From: gAldeia Date: Mon, 10 Jun 2024 22:21:57 -0300 Subject: [PATCH 199/199] Documentation for Engine class (just the class definition, not its methods) --- src/engine.h | 12 +++++++++ src/selection/selection_operator.h | 41 ++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/engine.h b/src/engine.h index 437a71e0..abf38570 100644 --- a/src/engine.h +++ b/src/engine.h @@ -28,6 +28,18 @@ using namespace Var; using namespace nlohmann; template +/** + * @brief The `Engine` class represents the core engine of the brush library. + * + * It encapsulates the functionality for training and predicting with programs + * in a genetic programming framework. The `Engine` class manages the population + * of programs, selection algorithms, evaluation code, variation operators, and + * survival algorithms. It also provides methods for training the model, making + * predictions, and accessing runtime statistics. + * + * The `Engine` class is parameterized by the program type `T`, which determines + * the type of programs that can be evolved and evaluated by the engine. + */ class Engine{ public: Engine(const Parameters& p=Parameters()) diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h index bcfab9f6..6bf824b0 100644 --- a/src/selection/selection_operator.h +++ b/src/selection/selection_operator.h @@ -20,22 +20,41 @@ using namespace Pop; * @brief base class for selection operators. */ template -class SelectionOperator +/** + * @brief The SelectionOperator class represents a base class for selection operators in a genetic algorithm. + * + * This class provides common functionality and interface for selection operators. + */ +class SelectionOperator { public: - bool survival; - string name; - - // shoudn't have a constructor - // SelectionOperator(){}; + bool survival; /**< Flag indicating whether the selection operator is used for survival selection. */ + string name; /**< The name of the selection operator. */ + /** + * @brief Destructor for the SelectionOperator class. + */ virtual ~SelectionOperator(); - - virtual vector select(Population& pop, int island, - const Parameters& p); + + /** + * @brief Selects individuals from the population based on the selection operator's strategy. + * + * @param pop The population from which to select individuals. + * @param island The index of the island in a parallel genetic algorithm. + * @param p The parameters for the selection operator. + * @return A vector of indices representing the selected individuals. + */ + virtual vector select(Population& pop, int island, const Parameters& p); - virtual vector survive(Population& pop, int island, - const Parameters& p); + /** + * @brief Applies the selection operator to determine which individuals survive in the population. + * + * @param pop The population in which to apply the survival selection. + * @param island The index of the island in a parallel genetic algorithm. + * @param p The parameters for the selection operator. + * @return A vector of indices representing the surviving individuals. + */ + virtual vector survive(Population& pop, int island, const Parameters& p); }; } // selection