diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp
index 3c405f8f..872750d5 100644
--- a/src/bindings/bind_dataset.cpp
+++ b/src/bindings/bind_dataset.cpp
@@ -9,29 +9,90 @@ namespace nl = nlohmann;
 void bind_dataset(py::module & m)
 {
     py::class_<br::Data::Dataset>(m, "Dataset")
+        // construct from X
-        .def(py::init<const Ref<const ArrayXXf>&>())
+        // .def(py::init<const Ref<const ArrayXXf>&>())
+        // construct from X (and optional validation and batch sizes) with constructor 3.
+        .def(py::init([](const Ref<const ArrayXXf>& X,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+                return br::Data::Dataset(
+                    X, {}, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
+        )
         // construct from X, feature names
-        .def(py::init<
-            const Ref<const ArrayXXf>&,
-            const vector<string>&
-        >()
+        // .def(py::init<
+        //     const Ref<const ArrayXXf>&,
+        //     const vector<string>&
+        // >()
+        // )
+        // construct from X, feature names (and optional validation and batch sizes) with constructor 3.
+        .def(py::init([](const Ref<const ArrayXXf>& X,
+                         const vector<string>& feature_names,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+                return br::Data::Dataset(
+                    X, feature_names, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("feature_names"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
         )
-        // construct from X, y arrays
-        .def(py::init<Ref<const ArrayXXf>&, Ref<const ArrayXf>&>())
+
+        // construct from X, y arrays
+        // .def(py::init<Ref<const ArrayXXf>&, Ref<const ArrayXf>&>())
+        // construct from X, y arrays (and optional validation and batch sizes) with constructor 2.
+        .def(py::init([](const Ref<const ArrayXXf>& X,
+                         const Ref<const ArrayXf>& y,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+                return br::Data::Dataset(
+                    X, y, {}, {}, false, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("y"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
+        )
+
         // construct from X, y, feature names
-        .def(py::init<
-            const Ref<const ArrayXXf>&,
-            const Ref<const ArrayXf>&,
-            const vector<string>&
-        >()
+        // .def(py::init<
+        //     const Ref<const ArrayXXf>&,
+        //     const Ref<const ArrayXf>&,
+        //     const vector<string>&
+        // >()
+        // )
+        // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2.
+        .def(py::init([](const Ref<const ArrayXXf>& X,
+                         const Ref<const ArrayXf>& y,
+                         const vector<string>& feature_names,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+                return br::Data::Dataset(
+                    X, y, feature_names, {}, false, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("y"),
+            py::arg("feature_names"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
         )
+
         .def_readwrite("y", &br::Data::Dataset::y)
-        // .def_readwrite("features", &br::Data::Dataset::features)
+        // .def_readwrite("features", &br::Data::Dataset::features)
         .def("get_n_samples", &br::Data::Dataset::get_n_samples)
         .def("get_n_features", &br::Data::Dataset::get_n_features)
         .def("print", &br::Data::Dataset::print)
         .def("get_batch", &br::Data::Dataset::get_batch)
+        .def("get_training_data", &br::Data::Dataset::get_training_data)
+        .def("get_validation_data", &br::Data::Dataset::get_validation_data)
+        .def("get_batch_size", &br::Data::Dataset::get_batch_size)
+        .def("set_batch_size", &br::Data::Dataset::set_batch_size)
+        .def("split", &br::Data::Dataset::split)
         .def("get_X", &br::Data::Dataset::get_X)
         ;
diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp
index 8c054f0f..75521ab3 100644
--- a/src/bindings/bind_params.cpp
+++ b/src/bindings/bind_params.cpp
@@ -1,5 +1,8 @@
 #include "module.h"
 #include "../params.h"
+#include "../util/rnd.h"
+
+namespace br = Brush;
 
 void bind_params(py::module& m)
 {
@@ -9,5 +12,10 @@ void bind_params(py::module& m)
     //     py::class_(m, "Params", py::dynamic_attr())
     //     .def(py::init<>())
 
-    m.def("set_params", &Brush::set_params);
+    m.def("set_params", &br::set_params);
+    m.def("get_params", &br::get_params);
+    m.def("set_random_state", [](unsigned int seed)
+        { br::Util::r = *br::Util::Rnd::initRand();
+          br::Util::r.set_seed(seed); });
+    m.def("rnd_flt", [](){ return br::Util::r.rnd_flt(); });
 }
\ No newline at end of file
diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h
index 35fd1c0c..96a36b71 100644
--- a/src/bindings/bind_programs.h
+++ b/src/bindings/bind_programs.h
@@ -45,9 +45,12 @@ void bind_program(py::module& m, string name)
          )
         .def("get_dot_model", &T::get_dot_model, py::arg("extras")="")
         .def("get_weights", &T::get_weights)
-        .def("size", &T::size)
-        .def("cross", &T::cross)
-        .def("mutate", &T::mutate) // static_cast(&T::mutate))
+        .def("size", &T::size, py::arg("include_weight")=true)
+        .def("depth", &T::depth)
+        .def("cross", &T::cross, py::return_value_policy::automatic,
+             "Performs one attempt to stochastically swap subtrees between two programs and generate a child")
+        .def("mutate", &T::mutate, py::return_value_policy::automatic,
+             "Performs one attempt to stochastically mutate the program and generate a child")
         .def("set_search_space", &T::set_search_space)
         .def(py::pickle(
             [](const T &p) { // __getstate__
diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py
index 29a275d3..e45d011b 100644
--- a/src/brush/deap_api/nsga2.py
+++ b/src/brush/deap_api/nsga2.py
@@ -1,29 +1,45 @@
 from deap import tools
 from deap.benchmarks.tools import diversity, convergence, hypervolume
 import numpy as np
-import random
+import functools
 
-def nsga2(toolbox, NGEN, MU, CXPB, verbosity):
+
+def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt):
     # NGEN = 250
-    # MU = 100
+    # MU = 100
     # CXPB = 0.9
+    # rnd_flt: random number generator to sample crossover prob
+
+    def calculate_statistics(ind):
+        on_train = ind.fitness.values
+        on_val = toolbox.evaluateValidation(ind)
+
+        return (*on_train, *on_val)
 
-    stats = tools.Statistics(lambda ind: ind.fitness.values)
-    stats.register("ave", np.mean, axis=0)
+    stats = tools.Statistics(calculate_statistics)
+
+    stats.register("avg", np.mean, axis=0)
+    stats.register("med", np.median, axis=0)
     stats.register("std", np.std, axis=0)
     stats.register("min", np.min, axis=0)
-    # stats.register("max", np.max, axis=0)
+    stats.register("max", np.max, axis=0)
 
     logbook = tools.Logbook()
-    logbook.header = "gen", "evals", "ave", "std", "min"
+    logbook.header = "gen", "evals", "avg (O1 train, O2 train, O1 val, O2 val)", \
+                     "med (O1 train, O2 train, O1 val, O2 val)", \
+                     "std (O1 train, O2 train, O1 val, O2 val)", \
+                     "min (O1 train, O2 train, O1 val, O2 val)", \
+                     "max (O1 train, O2 train, O1 val, O2 val)"
 
     pop = toolbox.population(n=MU)
-
-    # Evaluate the individuals with an invalid fitness
-    invalid_ind = [ind for ind in pop if not ind.fitness.valid]
-    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
-    for ind, fit in zip(invalid_ind, fitnesses):
+    batch = toolbox.getBatch() # every time this function is called, a new random batch is generated
+
+    # OBS: evaluate calls fit on the individual, which is different from using it
+    # only to predict. The evaluateValidation function does not call fit
+    fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop)
+
+    for ind, fit in zip(pop, fitnesses):
         ind.fitness.values = fit
 
     # This is just to assign the crowding distance to the individuals
@@ -31,12 +47,22 @@ def nsga2(toolbox, NGEN, MU, CXPB, verbosity):
    pop = toolbox.survive(pop, len(pop))
 
     record = stats.compile(pop)
-    logbook.record(gen=0, evals=len(invalid_ind), **record)
+    logbook.record(gen=0, evals=len(pop), **record)
+
     if verbosity > 0:
         print(logbook.stream)
 
     # Begin the generational process
     for gen in range(1, NGEN):
+        # The batch will be random only if it is not the size of the entire train set.
+        # In this case, we don't need to reevaluate the whole pop
+        if (use_batch):
+            batch = toolbox.getBatch()
+            fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop)
+
+            for ind, fit in zip(pop, fitnesses):
+                ind.fitness.values = fit
+
         # Vary the population
         # offspring = tools.selTournamentDCD(pop, len(pop))
         parents = toolbox.select(pop, len(pop))
@@ -44,25 +70,29 @@ def nsga2(toolbox, NGEN, MU, CXPB, verbosity):
         offspring = []
         for ind1, ind2 in zip(parents[::2], parents[1::2]):
-            if random.random() <= CXPB:
-                ind1, ind2 = toolbox.mate(ind1, ind2)
-
-            off1 = toolbox.mutate(ind1)
-            off2 = toolbox.mutate(ind2)
-            # del ind1.fitness.values, ind2.fitness.values
-            offspring.extend([off2, off2])
+            off1, off2 = None, None
+            if rnd_flt() < CXPB:
+                off1, off2 = toolbox.mate(ind1, ind2)
+            else:
+                off1 = toolbox.mutate(ind1)
+                off2 = toolbox.mutate(ind2)
+
+            # avoid inserting empty solutions
+            if off1 is not None: offspring.extend([off1])
+            if off2 is not None: offspring.extend([off2])
 
         # archive.update(offspring)
         # Evaluate the individuals with an invalid fitness
         invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
-        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
+        fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), invalid_ind)
         for ind, fit in zip(invalid_ind, fitnesses):
             ind.fitness.values = fit
 
         # Select the next generation population
         pop = toolbox.survive(pop + offspring, MU)
         record = stats.compile(pop)
-        logbook.record(gen=gen, evals=len(invalid_ind), **record)
+        logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record)
+
         if verbosity > 0:
             print(logbook.stream)
diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index f1ff642b..fd4913af 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -23,7 +23,6 @@ class BrushEstimator(BaseEstimator):
     This class shouldn't be called directly; instead, call a child class like
     :py:class:`BrushRegressor <brush.estimator.BrushRegressor>` or :py:class:`BrushClassifier <brush.estimator.BrushClassifier>`.
     All of the shared parameters are documented here.
-
     Parameters
     ----------
@@ -39,12 +38,44 @@ class BrushEstimator(BaseEstimator):
         Maximum depth of GP trees in the GP program. Use 0 for no limit.
     max_size : int, default 0
         Maximum number of nodes in a tree. Use 0 for no limit.
-    mutation_options : dict, default {"point":0.5, "insert": 0.25, "delete": 0.25}
+    cx_prob : float, default 1/7
+        Probability of applying the crossover variation when generating the offspring;
+        must be between 0 and 1.
+        Given that there are `n` mutations, and either crossover or mutation is
+        used to generate each individual in the offspring (but not both at the
+        same time), we want to have, by default, a uniform probability between
+        crossover and every possible mutation. By setting `cx_prob=1/(n+1)`, and
+        `1/n` for each mutation, we can achieve a uniform distribution.
+    mutation_options : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}
         A dictionary with keys naming the types of mutation and floating point
-        values specifying the fraction of total mutations to do with that method.
+        values specifying the fraction of total mutations to do with that method.
+        The probability of having a mutation is `(1-cx_prob)` and, in case a mutation
+        is applied, each mutation option is sampled based on the probabilities
+        defined in `mutation_options`. The set of probabilities should add up to 1.0.
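+        As a worked example of this default: with the six default mutation
+        options and `cx_prob=1/7`, every variation event is equally likely,
+        since the chance of a given mutation being applied is
+        (1 - 1/7) * (1/6) = 1/7, the same as the chance of crossover.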
     functions: dict[str,float] or list[str], default {}
-        A dictionary with keys naming the function set and values giving the probability of sampling them, or a list of functions which will be weighted uniformly.
+        A dictionary with keys naming the function set and values giving the probability
+        of sampling them, or a list of functions which will be weighted uniformly.
         If empty, all available functions are included in the search space.
+    initialization : {"grow", "full"}, default "grow"
+        Strategy to create the initial population. If `full`, then every expression is created
+        with `max_size` nodes. If `grow`, size will be uniformly distributed.
+    validation_size : float, default 0.0
+        Fraction of samples to use as a hold-out partition. These samples are used
+        to calculate statistics during evolution, but not used to train the models.
+        The `best_estimator_` will be selected using this partition. If zero, then
+        the same data used for training is used for validation.
+    batch_size : float, default 1.0
+        Fraction of training data to sample every generation. If `1.0`, then
+        all data is used. Very small values can improve execution time, but
+        may also lead to underfitting.
+    random_state : int or None, default None
+        If int, the value is used to seed the C++ random generator; if None,
+        a seed will be generated using a non-deterministic generator. It is
+        important to notice that, even if the random state is fixed, running
+        brush with multiple threads is unlikely to produce the same results
+        across runs. This happens because the Operating System's scheduler is
+        responsible for choosing which thread runs at any given time, so
+        reproducibility is not guaranteed.
 
     Attributes
     ----------
@@ -53,7 +84,11 @@ class BrushEstimator(BaseEstimator):
     archive_ : list[deap_api.DeapIndividual]
         The final population from training.
     data_ : _brush.Dataset
-        The training data in Brush format.
+        The complete data in Brush format.
+    train_ : _brush.Dataset
+        Partition of `data_` containing a `(1-validation_size)` fraction of the data, in Brush format.
+    validation_ : _brush.Dataset
+        Partition of `data_` containing a `(validation_size)` fraction of the data, in Brush format.
     search_space_ : a Brush `SearchSpace` object.
         Holds the operators and terminals and sampling utilities to update programs.
     toolbox_ : deap.Toolbox
@@ -69,9 +104,14 @@ def __init__(
         self,
         verbosity=0,
         max_depth=3,
         max_size=20,
-        mutation_options = {"point":0.4, "insert": 0.25, "delete": 0.25, "toggle_weight": 0.1},
+        cx_prob= 1/7,
+        mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6,
+                            "toggle_weight_on":1/6, "toggle_weight_off":1/6},
         functions: list[str]|dict[str,float] = {},
-        batch_size: int = 0
+        initialization="grow",
+        random_state=None,
+        validation_size: float = 0.0,
+        batch_size: float = 1.0
         ):
         self.pop_size=pop_size
         self.max_gen=max_gen
@@ -79,17 +119,26 @@ def __init__(
         self.mode=mode
         self.max_depth=max_depth
         self.max_size=max_size
+        self.cx_prob=cx_prob
         self.mutation_options=mutation_options
         self.functions=functions
+        self.initialization=initialization
+        self.random_state=random_state
         self.batch_size=batch_size
+        self.validation_size=validation_size
 
-    def _setup_toolbox(self, data):
+    def _setup_toolbox(self, data_train, data_validation):
         """Setup the deap toolbox"""
         toolbox: base.Toolbox = base.Toolbox()
 
-        # minimize MAE, minimize size
-        creator.create("FitnessMulti", base.Fitness, weights=(-1.0,-1.0))
+        # creator.create is used to "create new functions", and takes at least
+        # 2 arguments: the name of the newly created class and a base class
+
+        # Minimizing/maximizing problem: negative/positive weight, respectively.
+        # Each child class defines its own metrics and directions in `self.weights`
+        # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness
+        creator.create("FitnessMulti", base.Fitness, weights=self.weights)
 
         # create Individual class, inheriting from self.Individual with a fitness attribute
         creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti)
@@ -97,29 +146,45 @@ def _setup_toolbox(self, data):
         toolbox.register("mate", self._crossover)
         toolbox.register("mutate", self._mutate)
 
+        # When solving multi-objective problems, selection and survival must
+        # support this feature. This means that these selection operators must
+        # accept a tuple of fitnesses as argument.
         toolbox.register("select", tools.selTournamentDCD)
         toolbox.register("survive", tools.selNSGA2)
 
         # toolbox.population will return a list of elements by calling toolbox.individual
-        toolbox.register("population", tools.initRepeat, list, self._make_individual)
-        toolbox.register( "evaluate", self._fitness_function, data=data)
+        toolbox.register("createRandom", self._make_individual)
+        toolbox.register("population", tools.initRepeat, list, toolbox.createRandom)
+
+        toolbox.register("getBatch", data_train.get_batch)
+        toolbox.register("evaluate", self._fitness_function, data=data_train)
+        toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation)
 
         return toolbox
 
+
     def _crossover(self, ind1, ind2):
         offspring = []
 
         for i,j in [(ind1,ind2),(ind2,ind1)]:
-            off = creator.Individual(i.prg.cross(j.prg))
-            # off.fitness.valid = False
-            offspring.append(off)
+            child = i.prg.cross(j.prg)
+            if child:
+                offspring.append(creator.Individual(child))
+            else: # so we'll always have two elements to unpack in `offspring`
+                offspring.append(None)
 
         return offspring[0], offspring[1]
 
+
     def _mutate(self, ind1):
         # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),)
-        offspring = creator.Individual(ind1.prg.mutate())
-        return offspring
+        offspring = ind1.prg.mutate()
+
+        if offspring:
+            return creator.Individual(offspring)
+
+        return None
+
 
     def fit(self, X, y):
         """
         Fit an estimator to X,y.
 
         Parameters
         ----------
         X : np.ndarray
             2-d array of input data.
         y : np.ndarray
             1-d array of (boolean) target values.
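+
+        Examples
+        --------
+        An illustrative sketch (assumes `X` and `y` are numpy arrays with a
+        matching number of samples):
+
+        >>> est = BrushRegressor(validation_size=0.2, batch_size=0.5)
+        >>> est.fit(X, y)                       # doctest: +SKIP
+        >>> y_pred = est.predict(X)             # doctest: +SKIP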
         """
         _brush.set_params(self.get_params())
-        self.data_ = self._make_data(X,y)
+
+        if self.random_state is not None:
+            _brush.set_random_state(self.random_state)
+
+        self.data_ = self._make_data(X,y, validation_size=self.validation_size)
 
         # set n classes if relevant
         if self.mode=="classification":
             self.n_classes_ = len(np.unique(y))
 
+        # These have a default behavior to return something meaningful if
+        # no values are set
+        self.train_ = self.data_.get_training_data()
+        self.train_.set_batch_size(self.batch_size)
+        self.validation_ = self.data_.get_validation_data()
+
         if isinstance(self.functions, list):
             self.functions_ = {k:1.0 for k in self.functions}
         else:
             self.functions_ = self.functions
 
-        self.search_space_ = _brush.SearchSpace(self.data_, self.functions_)
-        self.toolbox_ = self._setup_toolbox(data=self.data_)
+        self.search_space_ = _brush.SearchSpace(self.train_, self.functions_)
+        self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_)
 
-        archive, logbook = nsga2(self.toolbox_, self.max_gen, self.pop_size, 0.9, self.verbosity)
+        archive, logbook = nsga2(
+            self.toolbox_, self.max_gen, self.pop_size, self.cx_prob,
+            (0.0 < self.batch_size < 1.0), self.verbosity, _brush.rnd_flt)
 
         self.archive_ = archive
         self.logbook_ = logbook
         self.best_estimator_ = self.archive_[0].prg
 
+        if self.verbosity > 0:
+            print(f'best model {self.best_estimator_.get_model()}'+
+                  f' with size {self.best_estimator_.size()}, ' +
+                  f' depth {self.best_estimator_.depth()}, ' +
+                  f' and fitness {self.archive_[0].fitness}' )
 
-        print('best model:',self.best_estimator_.get_model())
         return self
 
-    def _make_data(self, X, y=None):
+    def _make_data(self, X, y=None, validation_size=0.0):
+        # This function should not partition data (as it is used in predict).
+        # Partitioning is done in fit().
+
         if isinstance(y, pd.Series):
             y = y.values
         if isinstance(X, pd.DataFrame):
@@ -162,16 +275,20 @@ def _make_data(self, X, y=None):
             feature_names = X.columns.to_list()
             X = X.values
             if isinstance(y, NoneType):
-                return _brush.Dataset(X, feature_names)
+                return _brush.Dataset(X,
+                    feature_names=feature_names, validation_size=validation_size)
             else:
-                return _brush.Dataset(X, y, feature_names)
+                return _brush.Dataset(X, y,
+                    feature_names=feature_names, validation_size=validation_size)
 
         assert isinstance(X, np.ndarray)
 
+        # if there is no label, don't include it in the library call to Dataset
         if isinstance(y, NoneType):
-            return _brush.Dataset(X)
+            return _brush.Dataset(X, validation_size=validation_size)
+
+        return _brush.Dataset(X, y, validation_size=validation_size)
 
-        return _brush.Dataset(X,y)
 
     def predict(self, X):
         """Predict using the best estimator in the archive. """
@@ -194,6 +311,7 @@ def predict(self, X):
     def get_params(self):
         return {k:v for k,v in self.__dict__.items() if not k.endswith('_')}
 
+
 class BrushClassifier(BrushEstimator,ClassifierMixin):
     """Brush for classification.
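A minimal usage sketch of the keyword-based Dataset construction bound above
(illustrative only; it assumes a locally built `_brush` module and mirrors what
`fit()` does with the training/validation split):

    import numpy as np
    import _brush

    X = np.random.rand(100, 3).astype(np.float32)
    y = np.random.rand(100).astype(np.float32)

    # hold out 20% of samples for validation
    data = _brush.Dataset(X, y, validation_size=0.2)

    train = data.get_training_data()
    train.set_batch_size(0.5)       # sample 50% of the training split
    batch = train.get_batch()       # a new random subset on every call

    validation = data.get_validation_data()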
@@ -214,19 +332,39 @@ class BrushClassifier(BrushEstimator,ClassifierMixin):
     def __init__( self, **kwargs):
         super().__init__(mode='classification',**kwargs)
 
+        # Weight of each objective (+ for maximization, - for minimization)
+        self.weights = (+1.0,-1.0)
+
+    def _fitness_validation(self, ind, data: _brush.Dataset):
+        # Fitness without fitting the expression, used with validation data
+        return ( # (accuracy, size)
+            (data.y==ind.prg.predict(data)).sum() / data.y.shape[0],
+            ind.prg.size()
+        )
+
     def _fitness_function(self, ind, data: _brush.Dataset):
         ind.prg.fit(data)
-        return (
-            np.abs(data.y-ind.prg.predict(data)).sum(),
+        return ( # (accuracy, size)
+            (data.y==ind.prg.predict(data)).sum() / data.y.shape[0],
             ind.prg.size()
         )
 
     def _make_individual(self):
+        # C++'s PTC2-based `make_individual` will create a tree of at least
+        # the given size. By uniformly sampling the size, we can instantiate a
+        # population with more diversity
+
+        if self.initialization not in ["grow", "full"]:
+            raise ValueError(f"Invalid argument value for `initialization`. "
+                             f"Expected 'full' or 'grow'. Got {self.initialization}")
+
         return creator.Individual(
-            self.search_space_.make_classifier(self.max_depth, self.max_size)
-            if self.n_classes_ == 2 else
-            self.search_space_.make_multiclass_classifier(self.max_depth, self.max_size)
-        )
+            self.search_space_.make_classifier(
+                self.max_depth,(0 if self.initialization=='grow' else self.max_size))
+            if self.n_classes_ == 2 else
+            self.search_space_.make_multiclass_classifier(
+                self.max_depth, (0 if self.initialization=='grow' else self.max_size))
+            )
 
     def predict_proba(self, X):
         """Predict class probabilities for X.
@@ -266,16 +404,35 @@ class BrushRegressor(BrushEstimator, RegressorMixin):
     def __init__(self, **kwargs):
         super().__init__(mode='regressor',**kwargs)
 
+        # Weight of each objective (+ for maximization, - for minimization)
+        self.weights = (-1.0,-1.0)
+
+    def _fitness_validation(self, ind, data: _brush.Dataset):
+        # Fitness without fitting the expression, used with validation data
+
+        MSE = np.mean( (data.y-ind.prg.predict(data))**2 )
+        if not np.isfinite(MSE): # numeric errors, np.nan, +-np.inf
+            MSE = np.inf
+
+        return ( MSE, ind.prg.size() )
+
     def _fitness_function(self, ind, data: _brush.Dataset):
         ind.prg.fit(data)
-        return (
-            np.sum((data.y- ind.prg.predict(data))**2),
-            ind.prg.size()
-        )
+
+        MSE = np.mean( (data.y-ind.prg.predict(data))**2 )
+        if not np.isfinite(MSE): # numeric errors, np.nan, +-np.inf
+            MSE = np.inf
+
+        return ( MSE, ind.prg.size() )
 
     def _make_individual(self):
-        return creator.Individual(
-            self.search_space_.make_regressor(self.max_depth, self.max_size)
+        if self.initialization not in ["grow", "full"]:
+            raise ValueError(f"Invalid argument value for `initialization`. "
+                             f"Expected 'full' or 'grow'. Got {self.initialization}")
+
+        return creator.Individual( # No arguments (or zero): brush will use PARAMS passed in set_params.
+                                   # max_size is sampled between 1 and params['max_size'] if zero is provided
+            self.search_space_.make_regressor(
+                self.max_depth, (0 if self.initialization=='grow' else self.max_size))
         )
 
 # Under development
diff --git a/src/data/data.cpp b/src/data/data.cpp
index 610293e9..b80668df 100644
--- a/src/data/data.cpp
+++ b/src/data/data.cpp
@@ -90,7 +90,7 @@ State check_type(const ArrayXf& x)
     }
     else
     {
-        if(isCategorical && uniqueMap.size() < 10)
+        if(isCategorical && uniqueMap.size() <= 10)
         {
             tmp = ArrayXi(x.cast<int>());
         }
@@ -130,20 +130,36 @@ Dataset Dataset::operator()(const vector<size_t>& idx) const
     return Dataset(new_features, new_y, this->classification);
 }
 
-Dataset Dataset::get_batch(int batch_size) const
+Dataset Dataset::get_batch() const
 {
+    // will always return a new dataset, even when use_batch is false (in this case, it returns itself)
 
-    batch_size =  std::min(batch_size,int(this->get_n_samples()));
-    return (*this)(r.shuffled_index(get_n_samples()));
+    if (!use_batch)
+        return (*this);
+
+    auto n_samples = int(this->get_n_samples());
+    // guarantee that at least one sample is going to be returned, since
+    // use_batch is true only if batch_size is in (0, 1), and ceil will round
+    // up
+    n_samples = int(ceil(n_samples*batch_size));
+
+    return (*this)(r.shuffled_index(n_samples));
 }
 
 array<Dataset, 2> Dataset::split(const ArrayXb& mask) const
 {
+    // TODO: assert that mask is not filled with zeros or ones (that would create
+    // one empty partition)
+
     // split data into two based on mask.
     auto idx1 = Util::mask_to_index(mask);
     auto idx2 = Util::mask_to_index((!mask));
     return std::array<Dataset, 2>{ (*this)(idx1), (*this)(idx2) };
 }
+
+Dataset Dataset::get_training_data() const { return (*this)(training_data_idx); }
+Dataset Dataset::get_validation_data() const { return (*this)(validation_data_idx); }
+
 /// call init at the end of constructors
 /// to define metafeatures of the data.
 void Dataset::init()
@@ -153,6 +169,13 @@ void Dataset::init()
     // note this will have to change in unsupervised settings
     // n_samples = this->y.size();
 
+    if (this->features.size() == 0){
+        HANDLE_ERROR_THROW(
+            fmt::format("Error during the initialization of the dataset. It "
It " + "does not contain any data\n") + ); + } + // fmt::print("Dataset::init()\n"); for (const auto& [name, value]: this->features) { @@ -165,6 +188,36 @@ void Dataset::init() // add feature to appropriate map list this->features_of_type[feature_type].push_back(name); } + + // setting the training and validation data indexes + auto n_samples = int(this->get_n_samples()); + auto idx = r.shuffled_index(n_samples); + + // garantee that at least one sample is going to be returned, since + // use_batch is true only if batch_size is (0, 1), and ceil will round + // up + auto n_train_samples = int(ceil(n_samples*(1-validation_size))); + + training_data_idx.resize(0); + std::transform(idx.begin(), idx.begin() + n_train_samples, + back_inserter(training_data_idx), + [&](int element) { return element; }); + + if ( use_validation && (n_samples - n_train_samples != 0) ) { + validation_data_idx.resize(0); + std::transform(idx.begin() + n_train_samples, idx.end(), + back_inserter(validation_data_idx), + [&](int element) { return element; }); + } + else { + validation_data_idx = training_data_idx; + } +} + +float Dataset::get_batch_size() { return batch_size; } +void Dataset::set_batch_size(float new_size) { + batch_size = new_size; + use_batch = batch_size > 0.0 && batch_size < 1.0; } /// turns input data into a feature map diff --git a/src/data/data.h b/src/data/data.h index 52a37f20..629c02b5 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -50,27 +50,39 @@ class Dataset //Dataset(ArrayXXf& X, ArrayXf& y, std::map, vector>>& Z): X(X), y(y), Z(Z){} private: + vector training_data_idx; + vector validation_data_idx; + public: /// @brief keeps track of the unique data types in the dataset. std::vector unique_data_types; + /// @brief types of data in the features. std::vector feature_types; + /// @brief map from data types to features having that type. std::unordered_map> features_of_type; - /// @brief dataset features, as key value pairs std::map features; // TODO: this should probably be a more complex type to include feature type // and potentially other info, like arbitrary relations between features - /// @brief length N array, the target label ArrayXf y; + /// @brief whether this is a classification problem bool classification; std::optional> Xref; + /// @brief percentage of original data used for train. if 0.0, then all data is used for train and validation + float validation_size; + bool use_validation; + + /// @brief percentage of training data size to use in each batch. if 1.0, then all data is used + float batch_size; + bool use_batch; + Dataset operator()(const vector& idx) const; /// call init at the end of constructors /// to define metafeatures of the data. @@ -82,34 +94,53 @@ class Dataset const vector& vn = {} ); - /// initialize data from a map. + /// 1. initialize data from a map. Dataset(std::map& d, const Ref& y_ = ArrayXf(), - bool c = false + bool c = false, + float validation_size = 0.0, + float batch_size = 1.0 ) : features(d) , y(y_) , classification(c) + , validation_size(validation_size) + , use_validation(validation_size > 0.0 && validation_size < 1.0) + , batch_size(batch_size) + , use_batch(batch_size > 0.0 && batch_size < 1.0) {init();}; - /// initialize data from a matrix with feature columns. + /// 2. initialize data from a matrix with feature columns. 
        Dataset(const ArrayXXf& X,
                const Ref<const ArrayXf>& y_ = ArrayXf(),
                const vector<string>& vn = {},
                const map<string, State>& Z = {},
-                bool c = false
+                bool c = false,
+                float validation_size = 0.0,
+                float batch_size = 1.0
                )
            : features(make_features(X,Z,vn))
            , y(y_)
            , classification(c)
+            , validation_size(validation_size)
+            , use_validation(validation_size > 0.0 && validation_size < 1.0)
+            , batch_size(batch_size)
+            , use_batch(batch_size > 0.0 && batch_size < 1.0)
        {
            init();
            Xref = optional<reference_wrapper<const ArrayXXf>>{X};
        }
 
-        Dataset(const ArrayXXf& X, const vector<string>& vn)
+        /// 3. initialize data from X and feature names
+        Dataset(const ArrayXXf& X, const vector<string>& vn,
+                float validation_size = 0.0,
+                float batch_size = 1.0)
            : classification(false)
            , features(make_features(X,map<string,State>{},vn))
+            , validation_size(validation_size)
+            , use_validation(validation_size > 0.0 && validation_size < 1.0)
+            , batch_size(batch_size)
+            , use_batch(batch_size > 0.0 && batch_size < 1.0)
        {
            init();
            Xref = optional<reference_wrapper<const ArrayXXf>>{X};
@@ -123,22 +154,26 @@ class Dataset
            for (auto& [key, value] : this->features)
            {
                if (std::holds_alternative<ArrayXf>(value))
-                    fmt::print("{}: {}\n", key, std::get<ArrayXf>(value));
+                    fmt::print("{} : {}\n", key, std::get<ArrayXf>(value));
                else if (std::holds_alternative<ArrayXi>(value))
-                    fmt::print("{}: {}\n", key, std::get<ArrayXi>(value));
+                    fmt::print("{} : {}\n", key, std::get<ArrayXi>(value));
                else if (std::holds_alternative<ArrayXb>(value))
-                    fmt::print("{}: {}\n", key, std::get<ArrayXb>(value));
+                    fmt::print("{} : {}\n", key, std::get<ArrayXb>(value));
            }
        };
 
        auto get_X() const
        {
-            if (Xref.has_value())
-                return this->Xref.value().get();
-            else
+            if (!Xref.has_value())
                HANDLE_ERROR_THROW("Dataset does not hold a reference to X.");
+            return this->Xref.value().get();
        }
-        void set_validation(bool v=true);
+
+        // inner partition of original dataset for train and validation.
+        // if split is not set, then training = validation.
+        Dataset get_training_data() const;
+        Dataset get_validation_data() const;
+
        inline int get_n_samples() const {
            return std::visit(
                [&](auto&& arg) -> int { return int(arg.size());},
@@ -147,7 +182,10 @@ class Dataset
        };
        inline int get_n_features() const { return this->features.size(); };
        /// select random subset of data for training weights.
-        Dataset get_batch(int batch_size) const;
+        Dataset get_batch() const;
+
+        float get_batch_size();
+        void set_batch_size(float new_size);
 
        std::array<Dataset, 2> split(const ArrayXb& mask) const;
diff --git a/src/program/dispatch_table.h b/src/program/dispatch_table.h
index cd7e9cd0..f4217057 100644
--- a/src/program/dispatch_table.h
+++ b/src/program/dispatch_table.h
@@ -216,7 +216,7 @@ extern DispatchTable<false> dtable_predict;
//                     ArgsName[args_type],
//                     node_type,
//                     node,
-//                     SS.weight_map.at(ret_type).at(args_type).at(node_type)
+//                     SS.node_map_weights.at(ret_type).at(args_type).at(node_type)
//                 );
//             }
//         }
diff --git a/src/program/node.cpp b/src/program/node.cpp
index 68becad2..2c632249 100644
--- a/src/program/node.cpp
+++ b/src/program/node.cpp
@@ -183,8 +183,10 @@ void from_json(const json &j, Node& p)
     if (j.contains("center_op"))
         j.at("center_op").get_to(p.center_op);
+
     if (j.contains("fixed"))
         j.at("fixed").get_to(p.fixed);
+
     if (j.contains("feature"))
     {
         // j.at("feature").get_to(p.feature);
@@ -195,6 +197,12 @@ void from_json(const json &j, Node& p)
     else
         p.is_weighted=false;
 
+    if (j.contains("prob_change"))
+        j.at("prob_change").get_to(p.prob_change);
+    else
+        p.prob_change=1.0;
+
+
     // if the node has a ret_type and arg_types, get them. If not, we need to make
     // a signature
     bool make_signature=false;
diff --git a/src/program/node.h b/src/program/node.h
index 00e5a8fc..cf4541ef 100644
--- a/src/program/node.h
+++ b/src/program/node.h
@@ -39,6 +39,28 @@ using Brush::Data::Dataset;
 
 namespace Brush{
 
+// TODO: should I move this declaration to another place?
+template<DataType... T>
+inline auto Isnt(DataType dt) -> bool { return !((dt == T) || ...); }
+
+template<DataType DT>
+inline auto IsWeighable() noexcept -> bool {
+    return Isnt(DT);
+}
+inline auto IsWeighable(DataType dt) noexcept -> bool {
+    return Isnt(dt);
+}
+
 struct uint32_vector_hasher {
     std::size_t operator()(std::vector<uint32_t> const& vec) const {
         std::size_t seed = vec.size();
@@ -134,9 +156,12 @@ struct Node {
         W = 1.0;
         // set_node_hash();
-        set_prob_change(1.0);
         fixed=false;
+        set_prob_change(1.0);
 
+        // can't weight a boolean terminal
+        if (!IsWeighable(this->ret_type))
+            this->is_weighted = false;
     }
 
     /// @brief gets a string version of the node for printing.
@@ -212,20 +237,26 @@ struct Node {
     ////////////////////////////////////////////////////////////////////////////////
     // getters and setters
     //TODO revisit
-    float get_prob_change() const { return this->prob_change;};
+    float get_prob_change() const { return fixed ? 0.0 : this->prob_change;};
     void set_prob_change(float w){ if (!fixed) this->prob_change = w;};
     float get_prob_keep() const { return 1-this->prob_change;};
 
     inline void set_feature(string f){ feature = f; };
     inline string get_feature() const { return feature; };
 
+    inline bool get_is_weighted() const {return this->is_weighted;};
+    inline void set_is_weighted(bool is_weighted){
+        // can't change the weight of a boolean terminal
+        if (IsWeighable(this->ret_type))
+            this->is_weighted = is_weighted;
+    };
+
     private:
         /// @brief feature name for terminals or splitting nodes
         string feature;
 };
 
-//TODO: add nt to template as first argument, make these constexpr
 template<NodeType... T>
 inline auto Is(NodeType nt) -> bool { return ((nt == T) || ...); }
@@ -278,6 +309,7 @@ inline auto IsWeighable(NodeType nt) noexcept -> bool {
                 NodeType::SplitBest
                 >(nt);
 }
+
 ostream& operator<<(ostream& os, const Node& n);
 ostream& operator<<(ostream& os, const NodeType& nt);
diff --git a/src/program/operator.h b/src/program/operator.h
index 28061605..195afc6a 100644
--- a/src/program/operator.h
+++ b/src/program/operator.h
@@ -16,6 +16,7 @@ namespace util{
     /// @param weights optional pointer to a weight array, used in place of node weight
     /// @return
     template
+        requires (!is_one_of_v)
     Scalar get_weight(const TreeNode& tn, const W** weights=nullptr)
     {
         Scalar w;
@@ -31,7 +32,7 @@ namespace util{
             w = **weights;
         // NLS case 2: a Jet/Dual weight is stored in weights, but this constant is a
         // integer type. We need to do some casting
-        else if constexpr (is_same_v && is_same_v) {
+        else if constexpr (is_same_v && is_same_v) {
             using WScalar = typename Scalar::Scalar;
             WScalar tmp = WScalar((**weights).a);
             w = Scalar(tmp);
@@ -44,6 +45,23 @@ namespace util{
         }
         return w;
     };
+    template
+        requires (is_one_of_v)
+    Scalar get_weight(const TreeNode& tn, const W** weights=nullptr)
+    {
+        // we cannot weight a boolean feature. Nevertheless, we need to provide
+        // an implementation for get_weight behavior, so the metaprogramming
+        // doesn't fail to get a matching signature.
+
+        if (tn.data.get_is_weighted())
+            // Node's init() function avoids the creation of weighted nodes,
+            // and the setter for `is_weighted` prevents enabling weights on
+            // boolean values.
+            HANDLE_ERROR_THROW(fmt::format("boolean terminal is weighted, but "
+                                           "it should not be\n"));
+
+        return Scalar(true);
+    };
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Operator class
@@ -199,7 +217,7 @@ struct Operator
         auto inputs = get_kids(d, tn, weights);
         if constexpr (is_one_of_v)
         {
-            if (tn.data.is_weighted)
+            if (tn.data.get_is_weighted())
             {
                 auto w = util::get_weight(tn, weights);
                 return this->apply(inputs)*w;
@@ -218,13 +236,14 @@ struct Operator
     using RetType = typename S::RetType;
     using W = typename S::WeightType;
 
+    // Standard C++ types
     template
         requires (is_one_of_v)
     RetType eval(const Dataset& d, const TreeNode& tn, const W** weights=nullptr) const
     {
         if constexpr (is_one_of_v)
         {
-            if (tn.data.is_weighted)
+            if (tn.data.get_is_weighted())
             {
                 auto w = util::get_weight(tn, weights);
                 return this->get(d, tn.data.get_feature())*w;
@@ -233,6 +252,7 @@ struct Operator
         return this->get(d,tn.data.get_feature());
     };
 
+    // Jet types
     template
         requires( is_one_of_v)
     RetType eval(const Dataset &d, const TreeNode &tn, const W **weights = nullptr) const
     {
         using nonJetType = UnJetify_t;
         if constexpr (is_one_of_v)
         {
-            if (tn.data.is_weighted)
+            if (tn.data.get_is_weighted())
             {
                 auto w = util::get_weight(tn, weights);
                 return this->get(d, tn.data.get_feature()).template cast()*w;
@@ -249,6 +269,7 @@ struct Operator
         return this->get(d, tn.data.get_feature()).template cast();
     };
 
+    // Accessing dataset directly
     template<typename T>
     auto get(const Dataset& d, const string& feature) const
     {
@@ -261,7 +282,6 @@ struct Operator
             ));
             return T();
-
         }
     };
diff --git a/src/program/program.h b/src/program/program.h
index 296b7aad..d1330cc2 100644
--- a/src/program/program.h
+++ b/src/program/program.h
@@ -87,8 +87,74 @@ template<PT PType> struct Program
         SSref = std::optional<std::reference_wrapper<SearchSpace>>{s};
     }
 
-    int size(){
-        return Tree.size();
+    /// @brief count the tree size of the program, including the weights in weighted nodes.
+    /// @param include_weight whether to include the node's weight in the count.
+    /// @return int number of nodes.
+    int size(bool include_weight=true) const{
+        int acc = 0;
+
+        std::for_each(Tree.begin(), Tree.end(),
+            [include_weight, &acc](auto& node){
+                ++acc; // the node operator or terminal
+
+                if (include_weight && node.get_is_weighted()==true)
+                    acc += 2; // weight and multiplication, if enabled
+            });
+
+        return acc;
+    }
+
+    /// @brief count the size of a given subtree, optionally including the
+    /// weights in weighted nodes. This function is not exposed to the python wrapper.
+    /// @param top root node of the subtree.
+    /// @param include_weight whether to include the node's weight in the count.
+    /// @return int number of nodes.
+    int size_at(Iter& top, bool include_weight=true) const{
+
+        int acc = 0;
+
+        // inspired by tree.hh's size(). First create two identical iterators
+        Iter it=top, eit=top;
+
+        // Then make the second one point to the next sibling
+        eit.skip_children();
+        ++eit;
+
+        // calculate the tree size for each node until we reach the next sibling
+        while(it!=eit) {
+            ++acc; // counting the node operator/terminal
+
+            if (include_weight && it.node->data.get_is_weighted()==true)
+                acc += 2; // weight and multiplication, if enabled
+
+            ++it;
+        }
+
+        return acc;
+    }
+
+    /// @brief count the tree depth of the program. The depth is not influenced by weighted nodes.
+    /// @return int tree depth.
+    int depth() const{
+        // tree.hh counts the number of edges; we need to ensure that a single-node
+        // tree has depth>0
+        return 1+Tree.max_depth();
+    }
+
+    /// @brief count the depth of a given subtree. The depth is not influenced by
+    /// weighted nodes. This function is not exposed to the python wrapper.
+    /// @param top root node of the subtree.
+    /// @return int tree depth.
+    int depth_at(Iter& top) const{
+        return 1+Tree.max_depth(top);
+    }
+
+    /// @brief count the depth until reaching the given subtree. The depth is
+    /// not influenced by weighted nodes. This function is not exposed to the python wrapper.
+    /// @param top root node of the subtree.
+    /// @return int tree depth.
+    int depth_to_reach(Iter& top) const{
+        return 1+Tree.depth(top);
     }
 
     Program& fit(const Dataset& d)
@@ -215,7 +281,7 @@ template<PT PType> struct Program
         for (PostIter i = Tree.begin_post(); i != Tree.end_post(); ++i)
         {
             const auto& node = i.node->data;
-            if (node.is_weighted)
+            if (node.get_is_weighted())
                 ++count;
         }
         return count;
@@ -233,7 +299,7 @@ template<PT PType> struct Program
         for (PostIter t = Tree.begin_post(); t != Tree.end_post(); ++t)
         {
             const auto& node = t.node->data;
-            if (node.is_weighted)
+            if (node.get_is_weighted())
             {
                 weights(i) = node.W;
                 ++i;
@@ -258,7 +324,7 @@ template<PT PType> struct Program
         for (PostIter i = Tree.begin_post(); i != Tree.end_post(); ++i)
         {
             auto& node = i.node->data;
-            if (node.is_weighted)
+            if (node.get_is_weighted())
             {
                 node.W = weights(j);
                 ++j;
@@ -327,7 +393,7 @@ template<PT PType> struct Program
         // if the first node is weighted, make a dummy output node so that the
         // first node's weight can be shown
-        if (i==0 && parent->data.is_weighted)
+        if (i==0 && parent->data.get_is_weighted())
         {
             out += "y [shape=box];\n";
             out += fmt::format("y -> \"{}\" [label=\"{:.2f}\"];\n",
@@ -360,7 +426,7 @@ template<PT PType> struct Program
             // string kid_id = fmt::format("{}",fmt::ptr(kid));
             // kid_id = kid_id.substr(2);
-            if (kid->data.is_weighted && Isnt(kid->data.node_type)){
+            if (kid->data.get_is_weighted() && Isnt(kid->data.node_type)){
                 edge_label = fmt::format("{:.2f}",kid->data.W);
             }
@@ -415,15 +481,15 @@ template<PT PType> struct Program
     /// @brief convenience wrapper for :cpp:func:`variation:mutate()` in variation.h
     /// @return a mutated version of this program
-    Program mutate() const;
+    std::optional<Program> mutate() const;
 
     /**
      * @brief convenience wrapper for :cpp:func:`variation:cross` in variation.h
      *
      * @param other another program to cross with this one.
-     * @return a mutated version of this and the other program
+     * @return a new version of this and the other program
      */
-    Program cross(Program other) const;
+    std::optional<Program> cross(Program other) const;
 
     /// @brief turns program tree into a linear program.
     /// @return a vector of nodes encoding the program in reverse polish notation
@@ -455,14 +521,14 @@ void Program<PType>::update_weights(const Dataset& d)
 // mutation and crossover
 #include "../variation.h"
 template<PT PType>
-Program<PType> Program<PType>::mutate() const
+std::optional<Program<PType>> Program<PType>::mutate() const
 {
     return variation::mutate(*this, this->SSref.value().get());
};
 
 /// swaps subtrees between this and other (note the pass by copy)
 template<PT PType>
-Program<PType> Program<PType>::cross(Program<PType> other) const
+std::optional<Program<PType>> Program<PType>::cross(Program<PType> other) const
 {
     return variation::cross(*this, other);
};
diff --git a/src/program/signatures.h b/src/program/signatures.h
index 1d9e13cc..a46e58a9 100644
--- a/src/program/signatures.h
+++ b/src/program/signatures.h
@@ -195,8 +195,9 @@ template struct Signatures;
 template struct Signatures>>{
     using type = std::tuple<
-        Signature,
-        Signature
+        Signature,
+        Signature,
+        Signature
         >;
 };
diff --git a/src/search_space.cpp b/src/search_space.cpp
index cb4a448a..4ea0b518 100644
--- a/src/search_space.cpp
+++ b/src/search_space.cpp
@@ -4,6 +4,35 @@
 
 namespace Brush{
 
+
+float calc_initial_weight(const ArrayXf& value, const ArrayXf& y)
+{
+    // weights are initialized as the slope of the z-score of x and y.
+
+    // If y has a different length from X, we get a core dump here.
+    // TODO: need to make SS (or Dataset) check for this when loading the data
+
+    vector<char> dtypes = {'f', 'f'};
+
+    MatrixXf data(value.size(), 2);
+
+    data.col(0) << value;
+    data.col(1) << y;
+
+    Normalizer n(true);
+    n.fit_normalize(data, dtypes); // normalize works row-wise
+
+    // In the slope function, argument order matters (if not normalized with z-score).
+    // The feature should be the first value, and the true value the second
+    // (it will divide covar(arg1, arg2) by var(arg2)).
+    // Since z-score normalizes so mean=0 and std=1, the order doesn't matter.
+    float prob_change = std::abs(slope(data.col(0).array() ,   // x=variable
+                                       data.col(1).array() )); // y=target
+
+    return prob_change;
+}
+
+
 /// @brief generate terminals from the dataset features and random constants.
 /// @param d a dataset
 /// @return a vector of nodes
@@ -19,20 +48,90 @@ vector<Node> generate_terminals(const Dataset& d)
         using Scalar = typename T::Scalar;
         constexpr bool weighted = std::is_same_v<Scalar, float>;
         // if constexpr (T::Scalar == float)
-        terminals.push_back(Node(
+        auto n = Node(
             NodeType::Terminal,
             Signature<T()>{},
            weighted,
            feature_name
-        ));
+        );
+
+        float prob_change = 1.0; // default value
+
+        // if the value can be cast to a float array, we can calculate the slope
+        if (std::holds_alternative<ArrayXf>(value))
+        {
+            prob_change = calc_initial_weight(std::get<ArrayXf>(value), d.y);
+        }
+        else if (std::holds_alternative<ArrayXi>(value))
+        {
+            // for each variable we create a one-vs-all binary variable, then
+            // calculate the slope. The final value will be the average of the slopes
+
+            auto tmp = std::get<ArrayXi>(value);
+
+            // get number of unique values
+            std::map<float, bool> uniqueMap;
+            for(int i = 0; i < tmp.size(); i++)
+                uniqueMap[(float)tmp(i)] = true;
+
+            ArrayXf slopes = ArrayXf::Ones(uniqueMap.size());
+            int slopesIterator = 0;
+            for (const auto& pair : uniqueMap)
+            {
+                auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast<float>();
+
+                slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y);
+            }
+
+            prob_change = slopes.mean();
+        }
+        else if (std::holds_alternative<ArrayXb>(value))
+        {
+            auto tmp = std::get<ArrayXb>(value).template cast<float>();
+            prob_change = calc_initial_weight(tmp, d.y);
+        }
+        else
+        {
+            auto msg = fmt::format("Brush couldn't calculate the initial weight of variable {}\n",feature_name);
+            HANDLE_ERROR_THROW(msg);
+        }
+
+        n.set_prob_change( prob_change );
+
+        terminals.push_back(n);
         },
         value
     );
     ++i;
     };
 
-    // add a constant
-    terminals.push_back( Node(NodeType::Constant, Signature<ArrayXf()>{}, true, "C"));
+    // iterate through terminals and take the average of the values with the same signature
+    auto signature_avg = [terminals](DataType ret_type){
+        float sum = 0.0;
+        int count = 0;
+
+        for (const auto& n : terminals) {
+            if (n.ret_type == ret_type) {
+                sum += n.get_prob_change();
+                count++;
+            }
+        }
+
+        return sum / count;
+    };
+
+    auto cXf = Node(NodeType::Constant, Signature<ArrayXf()>{}, true, "C");
+    cXf.set_prob_change(signature_avg(cXf.ret_type));
+    terminals.push_back(cXf);
+
+    auto cXi = Node(NodeType::Constant, Signature<ArrayXi()>{}, true, "C");
+    cXi.set_prob_change(signature_avg(cXi.ret_type));
+    terminals.push_back(cXi);
+
+    auto cXb = Node(NodeType::Constant, Signature<ArrayXb()>{}, false, "C");
+    cXb.set_prob_change(signature_avg(cXb.ret_type));
+    terminals.push_back(cXb);
+
     return terminals;
};
 
@@ -46,7 +145,7 @@ void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops)
{
     // fmt::print("constructing search space...\n");
     this->node_map.clear();
-    this->weight_map.clear();
+    this->node_map_weights.clear();
     this->terminal_map.clear();
     this->terminal_types.clear();
     this->terminal_weights.clear();
@@ -56,7 +155,6 @@ void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops)
     for (const auto& [op, weight] : user_ops)
         op_names.push_back(op);
 
-
     // create nodes based on data types
     terminal_types = d.unique_data_types;
 
@@ -72,12 +170,173 @@ void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops)
         /* fmt::print("adding {} to search space...\n", term.get_name()); */
         if (terminal_map.find(term.ret_type) == terminal_map.end())
             terminal_map[term.ret_type] = vector<Node>();
+
         /* fmt::print("terminal ret_type: {}\n", DataTypeName[term.ret_type]); */
         terminal_map[term.ret_type].push_back(term);
-        terminal_weights[term.ret_type].push_back(1.0);
+        terminal_weights[term.ret_type].push_back(term.get_prob_change());
     }
};
+
+std::optional<tree<Node>> SearchSpace::sample_subtree(Node root, int max_d, int max_size) const
+{
+    // public interface to use the PTC2 algorithm
+
+    // PTC2 is designed not to fail (it will persistently try to find nodes with
+    // sampling functions). In pop initialization, this shouldn't be a problem, but
+    // during evolution, due to dynamic changes in node weights by the learners,
+    // it now may fail.
+    // We need to check, before calling it, that the search space has
+    // elements to sample
+    auto ret_match = node_map.at(root.ret_type);
+
+    vector<float> args_w = get_weights(root.ret_type);
+
+    // at least one operator that matches the weight must have positive probability
+    if (!has_solution_space(args_w.begin(), args_w.end()))
+        return std::nullopt;
+
+    if ( (terminal_map.find(root.ret_type) == terminal_map.end())
+    ||   (!has_solution_space(terminal_weights.at(root.ret_type).begin(),
+                              terminal_weights.at(root.ret_type).end())) )
+        return std::nullopt;
+
+    // we should note the difference between the size of a PROGRAM and a TREE:
+    // a program counts weights in its size, while the TREE structure doesn't. Whenever
+    // using the size of a program/tree, make sure you use the function from the correct class
+    return PTC2(root, max_d, max_size);
+};
+
+tree<Node> SearchSpace::PTC2(Node root, int max_d, int max_size) const
+{
+    // PTC2 is agnostic of program type
+
+    // A comment about the PTC2 method:
+    // PTC2 can work with depth or size restrictions, but it does not strictly
+    // satisfy these conditions at all times. Given the `max_size` and `max_depth`
+    // parameters, the real maximum size that can occur is `max_size` plus the
+    // highest operator arity, and the real maximum depth is `max_depth` plus one.
+
+    auto Tree = tree<Node>();
+
+    /* fmt::print("building program with max size {}, max depth {}",max_size,max_d); */
+
+    // Queue of nodes that need children
+    vector<tuple<TreeIter, DataType, int>> queue;
+
+    /* cout << "chose " << n.name << endl; */
+    // auto spot = Tree.set_head(n);
+    /* cout << "inserting...\n"; */
+    auto spot = Tree.insert(Tree.begin(), root);
+    // node depth
+    int d = 1;
+    // current tree size
+    int s = 1;
+    //For each argument position a of n, Enqueue(a; g)
+    for (auto a : root.arg_types)
+    {
+        /* cout << "queuing a node of type " << DataTypeName[a] << endl; */
+        auto child_spot = Tree.append_child(spot);
+        queue.push_back(make_tuple(child_spot, a, d));
+    }
+
+    Node n;
+    // Now we actually start the PTC2 procedure to create the program tree
+    /* cout << "queue size: " << queue.size() << endl; */
+    /* cout << "entering first while loop...\n"; */
+    while ( 3*(queue.size()-1) + s < max_size && queue.size() > 0)
+    {
+        // by default, terminals are weighted (counting as 3 nodes in program size).
+        // since every spot in the queue has the potential to be a terminal, we multiply
+        // its size by 3. We subtract one because this loop will
+        // always insert a non-terminal (which by default has weights off).
+        // This way, PTC2 works properly.
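+        // Worked example (illustrative): right after the root is inserted,
+        // s = 1; if the root has two pending argument slots, queue.size() = 2
+        // and the check is 3*(2-1) + 1 = 4 < max_size. Each remaining queued
+        // slot may still become a weighted terminal costing 3 nodes, so the
+        // loop only adds operators while that worst case fits the budget.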
+ + /* cout << "queue size: " << queue.size() << endl; */ + auto [qspot, t, d] = RandomDequeue(queue); + + /* cout << "current depth: " << d << endl; */ + if (d == max_d) + { + // choose terminal of matching type + /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ + // qspot = sample_terminal(t); + // Tree.replace(qspot, sample_terminal(t)); + // Tree.append_child(qspot, sample_terminal(t)); + + auto opt = sample_terminal(t); + while (!opt) + opt = sample_terminal(t); + + // If we successfully get a terminal, use it + n = opt.value(); + + Tree.replace(qspot, n); + } + else + { + //choose a nonterminal of matching type + /* cout << "getting op of type " << DataTypeName[t] << endl; */ + auto opt = sample_op(t); + /* cout << "chose " << n.name << endl; */ + // TreeIter new_spot = Tree.append_child(qspot, n); + // qspot = n; + + while (!opt) + opt = sample_op(t); + + n = opt.value(); + + auto newspot = Tree.replace(qspot, n); + + // For each arg of n, add to queue + for (auto a : n.arg_types) + { + /* cout << "queing a node of type " << DataTypeName[a] << endl; */ + // queue.push_back(make_tuple(new_spot, a, d+1)); + auto child_spot = Tree.append_child(newspot); + + queue.push_back(make_tuple(child_spot, a, d+1)); + } + } + + // increment is different based on node weights + ++s; + if (n.get_is_weighted()) + s += 2; + + /* cout << "current tree size: " << s << endl; */ + } + /* cout << "entering second while loop...\n"; */ + while (queue.size() > 0) + { + if (queue.size() == 0) + break; + + /* cout << "queue size: " << queue.size() << endl; */ + + auto [qspot, t, d] = RandomDequeue(queue); + + /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ + // Tree.append_child(qspot, sample_terminal(t)); + // qspot = sample_terminal(t); + // auto newspot = Tree.replace(qspot, sample_terminal(t)); + + auto opt = sample_terminal(t); + while (!opt) { + opt = sample_terminal(t); + } + + n = opt.value(); + + auto newspot = Tree.replace(qspot, n); } + /* cout << "final tree:\n" */ + /* << Tree.begin().node->get_model() << "\n" */ + /* << Tree.begin().node->get_tree_model(true) << endl; */ + /* << Tree.get_model() << "\n" */ + /* << Tree.get_model(true) << endl; // pretty */ + return Tree; }; RegressorProgram SearchSpace::make_regressor(int max_d, int max_size) diff --git a/src/search_space.h b/src/search_space.h index d99b03de..ac751a65 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -16,7 +16,6 @@ license: GNU/GPL v3 #include #include - /* Defines the search space of Brush. * The search spaces consists of nodes and their accompanying probability * distribution. @@ -53,26 +52,32 @@ vector generate_terminals(const Dataset& d); extern std::unordered_map ArgsName; /*! @brief Holds a search space, consisting of operations and terminals - * and functions, and methods to sample that space to create programs. - * - * The set of operators is a user controlled parameter; however, we can - * automate, to some extent, the set of possible operators based on the - * data types in the problem. - * Constraints on operators based on data types: - * - only user specified operators are included. - * - operators whose arguments are covered by terminal types are included - * first. Then, a second pass includes any operators whose arguments - * are covered by terminal_types + return types of the current set of - * operators. One could imagine this continuing ad infinitum, but we - * just do two passes for simplicity. 
- *  - assertion check to make sure there is at least one operator that
- *  returns the output type of the model.
- *
- *
- * Parameters
- * ----------
- *
-*/
+ *  - assertion check to make sure there is at least one operator that
+ *  returns the output type of the model.
+ *
+ * When sampling in the search space (using any of the sampling functions
+ * `sample_op` or `sample_terminal`), some methods can fail to return a
+ * value --- given a specific set of parameters to a function, the candidate
+ * solutions set may be empty --- and, for these methods, the return type is
+ * either a valid value or `std::nullopt`. This is controlled by wrapping
+ * the return type with `std::optional`.
+ *
+ * Parameters
+ * ----------
+ *
+ */
struct SearchSpace
{
    using ArgsHash = std::size_t;
@@ -93,7 +98,7 @@ struct SearchSpace
    Map<Node> node_map;
 
    /// @brief A map of weights corresponding to elements in @ref node_map, used to weight probabilities of each node being sampled from the map.
-    Map<float> weight_map;
+    Map<float> node_map_weights;
 
    /**
     * @brief Maps return types to terminals.
@@ -102,11 +107,10 @@ struct SearchSpace
     *
     * { return_type : vector of Nodes }
     *
-     *
     */
    unordered_map<DataType, vector<Node>> terminal_map;
 
-    /// @brief A map of weights corresponding to elements in @ref terminal_map, used to weight probabilities of each node being sampled from the map.
+    /// @brief A map of weights corresponding to elements in @ref terminal_map, used to weight probabilities of each terminal being sampled from the map.
    unordered_map<DataType, vector<float>> terminal_weights;
 
    /// @brief A vector storing the available return types of terminals.
@@ -117,7 +121,7 @@ struct SearchSpace
    NLOHMANN_DEFINE_TYPE_INTRUSIVE(SearchSpace,
        node_map,
-        weight_map,
+        node_map_weights,
        terminal_map,
        terminal_weights,
        terminal_types
    )
@@ -222,6 +226,18 @@ struct SearchSpace
        return true;
    }
 
+    /// @brief Takes iterators to weight vectors and checks if they have a
+    /// non-empty solution space. An empty solution space is defined as
+    /// having no non-zero, positive values
+    /// @tparam Iter type of the iterator.
+    /// @param start Start iterator
+    /// @param end End iterator
+    /// @return true if at least one weight is positive
+    template <typename Iter>
+    bool has_solution_space(Iter start, Iter end) const {
+        return !std::all_of(start, end, [](const auto& w) { return w<=0.0; });
+    }
+
    template<typename T> Node get(const string& name);
 
    /// @brief get a typed node
@@ -244,66 +260,12 @@ struct SearchSpace
    template<typename S>
    Node get(NodeType type, DataType R, S sig){ return get(type, R, sig.hash()); };
 
-    /// @brief Get a specific node type that matches a return value.
-    /// @param type the node type
-    /// @param R the return type
-    /// @return A Node of type `type` with return type `R`.
-    Node get(NodeType type, DataType R)
-    {
-        check(R);
-        auto ret_match = node_map.at(R);
-        vector<Node> matches;
-        vector<float> weights;
-        for (const auto& kv: ret_match)
-        {
-            auto arg_hash = kv.first;
-            auto node_type_map = kv.second;
-            if (node_type_map.find(type) != node_type_map.end())
-            {
-                matches.push_back(node_type_map.at(type));
-                weights.push_back(weight_map.at(R).at(arg_hash).at(type));
-            }
-        }
-
-        return (*r.select_randomly(matches.begin(),
-                                   matches.end(),
-                                   weights.begin(),
-                                   weights.end()));
-    };
-
-    /// get a random terminal
-    Node get_terminal() const
-    {
-        //TODO: match terminal args_type (probably '{}' or something?)
-        // make a separate terminal_map
-        auto match = *r.select_randomly(terminal_map.begin(), terminal_map.end());
-        return *r.select_randomly(
-            match.second.begin(), match.second.end(),
-            terminal_weights.at(match.first).begin(),
-            terminal_weights.at(match.first).end()
-        );
-    };
-
-    /// get a terminal with return type `R`
-    Node get_terminal(DataType R) const
-    {
-        if (terminal_map.find(R) == terminal_map.end()){
-            auto msg = fmt::format("{} not in terminal_map\n",R);
-            HANDLE_ERROR_THROW(msg);
-        }
-        auto rval = *r.select_randomly(terminal_map.at(R).begin(),
-                                       terminal_map.at(R).end(),
-                                       terminal_weights.at(R).begin(),
-                                       terminal_weights.at(R).end());
-        return rval;
-    };
-
    /// @brief get weights of the return types
    /// @return a weight vector, each element corresponding to a return type.
    vector<float> get_weights() const
    {
        vector<float> v;
-        for (auto& [ret, arg_w_map]: weight_map)
+        for (auto& [ret, arg_w_map]: node_map_weights)
        {
            v.push_back(0);
            for (const auto& [arg, name_map] : arg_w_map)
            {
                for (const auto& [name, w]: name_map)
                {
                    v.back() += w;
                }
-
            }
        }
        return v;
    };
@@ -324,7 +285,7 @@ struct SearchSpace
    vector<float> get_weights(DataType ret) const
    {
        vector<float> v;
-        for (const auto& [arg, name_map] : weight_map.at(ret))
+        for (const auto& [arg, name_map] : node_map_weights.at(ret))
        {
            v.push_back(0);
            for (const auto& [name, w]: name_map)
@@ -343,66 +304,167 @@ struct SearchSpace
    vector<float> get_weights(DataType ret, ArgsHash sig_hash) const
    {
        vector<float> v;
-        for (const auto& [name, w]: weight_map.at(ret).at(sig_hash))
+        for (const auto& [name, w]: node_map_weights.at(ret).at(sig_hash))
            v.push_back(w);
        return v;
    };
 
+    /// @brief Get a random terminal
+    /// @return `std::optional` that may contain a terminal Node.
+    std::optional<Node> sample_terminal() const
+    {
+        //TODO: match terminal args_type (probably '{}' or something?)
+        // make a separate terminal_map
+
+        // We weight each terminal type in proportion to the sum of the
+        // weights of the terminals it holds
+        vector<float> data_type_weights(terminal_weights.size());
+        std::transform(
+            terminal_weights.begin(),
+            terminal_weights.end(),
+            data_type_weights.begin(),
+            [](const auto& tw){
+                return std::reduce(tw.second.begin(), tw.second.end()); }
+        );
+
+        if (!has_solution_space(data_type_weights.begin(),
+                                data_type_weights.end()))
+            return std::nullopt;
+
+        // If we got this far, then it is guaranteed that we'll return something.
+        // The match takes into account datatypes with non-zero weights
+        auto match = *r.select_randomly(
+            terminal_map.begin(),
+            terminal_map.end(),
+            data_type_weights.begin(),
+            data_type_weights.end()
+        );
+
+        return *r.select_randomly(
+            match.second.begin(), match.second.end(),
+            terminal_weights.at(match.first).begin(),
+            terminal_weights.at(match.first).end()
+        );
+    };
+
+    /// @brief Get a random terminal with return type `R`
+    /// @return `std::optional` that may contain a terminal Node of type `R`.
+ std::optional sample_terminal(DataType R) const + { + // should I keep doing this check? + // if (terminal_map.find(R) == terminal_map.end()){ + // auto msg = fmt::format("{} not in terminal_map\n",R); + // HANDLE_ERROR_THROW(msg); + // } + + // TODO: try to combine with above function + if ( (terminal_map.find(R) == terminal_map.end()) + || (!has_solution_space(terminal_weights.at(R).begin(), + terminal_weights.at(R).end())) ) + return std::nullopt; + + return *r.select_randomly(terminal_map.at(R).begin(), + terminal_map.at(R).end(), + terminal_weights.at(R).begin(), + terminal_weights.at(R).end()); + }; + /// @brief get an operator matching return type `ret`. /// @param ret return type - /// @return a randomly chosen operator - Node get_op(DataType ret) const + /// @return `std::optional` that may contain a randomly chosen operator matching return type `ret` + std::optional sample_op(DataType ret) const { - check(ret); + // check(ret); + //TODO: match terminal args_type (probably '{}' or something?) auto ret_match = node_map.at(ret); vector args_w = get_weights(ret); + if (!has_solution_space(args_w.begin(), args_w.end())) + return std::nullopt; + auto arg_match = *r.select_randomly(ret_match.begin(), ret_match.end(), args_w.begin(), args_w.end()); vector name_w = get_weights(ret, arg_match.first); + + if (!has_solution_space(name_w.begin(), name_w.end())) + return std::nullopt; + return (*r.select_randomly(arg_match.second.begin(), arg_match.second.end(), name_w.begin(), name_w.end())).second; }; - + /// @brief Get a specific node type that matches a return value. + /// @param type the node type + /// @param R the return type + /// @return `std::optional` that may contain a Node of type `type` with return type `R`. + std::optional sample_op(NodeType type, DataType R) + { + // check(R); + + auto ret_match = node_map.at(R); + + vector matches; + vector weights; + for (const auto& kv: ret_match) + { + auto arg_hash = kv.first; + auto node_type_map = kv.second; + if (node_type_map.find(type) != node_type_map.end()) + { + matches.push_back(node_type_map.at(type)); + weights.push_back(node_map_weights.at(R).at(arg_hash).at(type)); + } + } + + if ( (weights.size()==0) + || (!has_solution_space(weights.begin(), + weights.end())) ) + return std::nullopt; + + return (*r.select_randomly(matches.begin(), + matches.end(), + weights.begin(), + weights.end())); + }; + /// @brief get operator with at least one argument matching arg /// @param ret return type /// @param arg argument type to match /// @param terminal_compatible if true, the other args the returned operator takes must exist in the terminal types. - /// @param max_arg_count if zero, there is no limit on number of arguments of the operator. If not, the operator can have at most `max_arg_count` arguments. - /// @return a matching operator. - Node get_op_with_arg(DataType ret, DataType arg, + /// @param max_args if zero, there is no limit on number of arguments of the operator. If not, the operator can have at most `max_args` arguments. + /// @return `std::optional` that may contain a matching operator respecting all restrictions. + std::optional sample_op_with_arg(DataType ret, DataType arg, bool terminal_compatible=true, - int max_arg_count=0) const + int max_args=0) const { // thoughts (TODO): // this could be templated by return type and arg. although the lookup in the map should be // fairly fast. 
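        // Minimal caller-side sketch (illustrative only, not part of this
        // patch): every `sample_*` member follows the same optional-returning
        // convention described in the class comment, so callers should test
        // the result before dereferencing, e.g.:
        //
        //   std::optional<Node> op = SS.sample_op(DataType::ArrayF);
        //   if (op)
        //       use(op.value());  // `use` is a hypothetical caller function
        //   else
        //       ;                 // empty solution space: no positive weights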
        //TODO: these need to be overhauled
-        // fmt::print("get_op_with_arg");
+        // fmt::print("sample_op_with_arg");
        check(ret);

        auto args_map = node_map.at(ret);
-        vector matches;
-        vector weights;
+        vector matches;
+        vector weights;

        for (const auto& [args_type, name_map]: args_map) {
            for (const auto& [name, node]: name_map) {
                auto node_arg_types = node.get_arg_types();
-
                // has no size limit (max_arg_count==0) or the number of
                // arguments wouldn't exceed the maximum number of arguments
-                auto within_size_limit = !(max_arg_count) || (node.get_arg_count() <= max_arg_count);
-
-                if ( in(node_arg_types, arg) && within_size_limit ) {
+                auto within_size_limit = !(max_args) || (node.get_arg_count() <= max_args);
+
+                if ( in(node_arg_types, arg) && within_size_limit) {
                    // if checking terminal compatibility, make sure there's
                    // a compatible terminal for the node's other arguments
                    if (terminal_compatible) {
@@ -420,26 +482,37 @@
                    }
                    // if we made it this far, include the node as a match!
                    matches.push_back(node);
-                    weights.push_back(weight_map.at(ret).at(args_type).at(name));
+                    weights.push_back(node_map_weights.at(ret).at(args_type).at(name));
                }
            }
        }

+        if ( (weights.size()==0)
+        ||   (!has_solution_space(weights.begin(),
+                                  weights.end())) )
+            return std::nullopt;
+
        return (*r.select_randomly(matches.begin(), matches.end(),
                                   weights.begin(), weights.end()));
    };

    /// @brief get a node with a signature matching `node`
    /// @param node the node to match
-    /// @return a Node
-    Node get_node_like(Node node) const
+    /// @return `std::optional` that may contain a Node
+    std::optional get_node_like(Node node) const
    {
        if (Is(node.node_type)){
-            return get_terminal(node.ret_type);
+            return sample_terminal(node.ret_type);
        }

        auto matches = node_map.at(node.ret_type).at(node.args_type());
        auto match_weights = get_weights(node.ret_type, node.args_type());
+
+        if ( (match_weights.size()==0)
+        ||   (!has_solution_space(match_weights.begin(),
+                                  match_weights.end())) )
+            return std::nullopt;
+
        return (*r.select_randomly(matches.begin(),
                                   matches.end(),
                                   match_weights.begin(),
@@ -447,10 +520,18 @@ struct SearchSpace
                                  ).second;
    };

+    /// @brief create a subtree with maximum size and depth restrictions, rooted at a node like `root`
+    /// @param root the root node of the subtree
+    /// @param max_d the maximum depth
+    /// @param max_size the maximum size of the tree (will be sampled between [1, max_size])
+    /// @return `std::optional` that may contain a tree
+    std::optional> sample_subtree(Node root, int max_d, int max_size) const;
+
    /// @brief prints the search space map.
    void print() const;

private:
+    tree PTC2(Node root, int max_d, int max_size) const;

    template
        requires (!is_in_v)
@@ -490,7 +571,7 @@ struct SearchSpace
                node_map[n.ret_type][n.args_type()][n.node_type] = n;
                // sampling probability map
                float w = use_all? 1.0 : user_ops.at(name);
-                weight_map[n.ret_type][n.args_type()][n.node_type] = w;
+                node_map_weights[n.ret_type][n.args_type()][n.node_type] = w;
            }
        }
@@ -550,125 +631,63 @@ T RandomDequeue(std::vector& Q)
template
P SearchSpace::make_program(int max_d, int max_size)
{
-    // A comment about PTC2 method:
-    // PTC2 can work with depth or size restriction, but it does not strictly
-    // satisfies these conditions all time. Given a `max_size` and `max_depth`
-    // parameters, the real maximum size that can occur is `max_size` plus the
-    // highest operator arity, and the real maximum depth is `max_depth` plus one.
-    if (max_d == 0)
-        max_d = r.rnd_int(1, PARAMS["max_depth"].get());
+        max_d = PARAMS["max_depth"].get<int>();
    if (max_size == 0)
        max_size = r.rnd_int(1, PARAMS["max_size"].get());

    DataType root_type = DataTypeEnum::value;
    ProgramType program_type = P::program_type;
    // ProgramType program_type = ProgramTypeEnum::value;
-
+    auto Tree = tree<Node>();
+    if (max_size == 1)
+    {
+        // auto root = Tree.insert(Tree.begin(), sample_terminal(root_type));

-    /* fmt::print("building program with max size {}, max depth {}",max_size,max_d); */
+        // We can only have a terminal here, but the terminal must be compatible
+        auto opt = sample_terminal(root_type);

-    // Queue of nodes that need children
-    vector> queue;
+        if (!opt){
+            auto msg = fmt::format("Program with size=1 could not be created. "
+                "The search space does not contain any terminal with data type {}.\n",
+                root_type);
+            HANDLE_ERROR_THROW(msg);
+        }

-    if (max_size == 1)
-    {
-        auto root = Tree.insert(Tree.begin(), get_terminal(root_type));
+        Tree.insert(Tree.begin(), opt.value());
    }
-    else
-    {
-        Node n;
+    else {// Our program can (and will) be greater than 1 node
+
+        // building the root node for each program case. We give the root, and it
+        // fills the rest of the tree
+        Node root;

        if (P::program_type == ProgramType::BinaryClassifier)
        {
-            n = get(NodeType::Logistic, DataType::ArrayF, Signature());
-            n.set_prob_change(0.0);
-            n.fixed=true;
+            root = get(NodeType::Logistic, DataType::ArrayF, Signature());
+            root.set_prob_change(0.0);
+            root.fixed=true;
+        }
        else if (P::program_type == ProgramType::MulticlassClassifier)
        {
-            n = get(NodeType::Softmax, DataType::MatrixF);
-            n.set_prob_change(0.0);
-            n.fixed=true;
-        }
-        else
-            n = get_op(root_type);
-
-        /* cout << "chose " << n.name << endl; */
-        // auto spot = Tree.set_head(n);
-        /* cout << "inserting...\n"; */
-        auto spot = Tree.insert(Tree.begin(), n);
-        // node depth
-        int d = 1;
-        // current tree size
-        int s = 1;
-        //For each argument position a of n, Enqueue(a; g)
-        for (auto a : n.arg_types)
-        {
-            /* cout << "queing a node of type " << DataTypeName[a] << endl; */
-            auto child_spot = Tree.append_child(spot);
-            queue.push_back(make_tuple(child_spot, a, d));
+            root = get(NodeType::Softmax, DataType::MatrixF, Signature());
+            root.set_prob_change(0.0);
+            root.fixed=true;
        }
-
-        /* cout << "queue size: " << queue.size() << endl; */
-        /* cout << "entering first while loop...\n"; */
-        while (queue.size() + s < max_size && queue.size() > 0)
-        {
-            /* cout << "queue size: " << queue.size() << endl; */
-            auto [qspot, t, d] = RandomDequeue(queue);
-
-            /* cout << "current depth: " << d << endl; */
-            if (d == max_d)
-            {
-                // choose terminal of matching type
-                /* cout << "getting " << DataTypeName[t] << " terminal\n"; */
-                // qspot = get_terminal(t);
-                Tree.replace(qspot, get_terminal(t));
-                // Tree.append_child(qspot, get_terminal(t));
+        else {
+            // we start with a non-terminal (can be replaced inside PTC2 though, if max_size==1)
+            auto opt = sample_op(root_type);
+            while (!opt) {
+                opt = sample_op(root_type);
            }
-            else
-            {
-                //choose a nonterminal of matching type
-                /* cout << "getting op of type " << DataTypeName[t] << endl; */
-                auto n = get_op(t);
-                /* cout << "chose " << n.name << endl; */
-                // TreeIter new_spot = Tree.append_child(qspot, n);
-                // qspot = n;
-                auto newspot = Tree.replace(qspot, n);
-                // For each arg of n, add to queue
-                for (auto a : n.arg_types)
-                {
-                    /* cout << "queing a node of type " << DataTypeName[a] << endl; */
-                    //
queue.push_back(make_tuple(new_spot, a, d+1)); - auto child_spot = Tree.append_child(newspot); - queue.push_back(make_tuple(child_spot, a, d+1)); - } - } - ++s; - /* cout << "current tree size: " << s << endl; */ - } - /* cout << "entering second while loop...\n"; */ - while (queue.size() > 0) - { - if (queue.size() == 0) - break; - - /* cout << "queue size: " << queue.size() << endl; */ - - auto [qspot, t, d] = RandomDequeue(queue); - - /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ - // Tree.append_child(qspot, get_terminal(t)); - // qspot = get_terminal(t); - auto newspot = Tree.replace(qspot, get_terminal(t)); - + root = opt.value(); } + + Tree = PTC2(root, max_d, max_size); } - /* cout << "final tree:\n" */ - /* << Tree.begin().node->get_model() << "\n" */ - /* << Tree.begin().node->get_tree_model(true) << endl; */ - /* << Tree.get_model() << "\n" */ - /* << Tree.get_model(true) << endl; // pretty */ return P(*this,Tree); }; @@ -692,7 +711,7 @@ template <> struct fmt::formatter: formatter { ArgsName[args_type], node_type, node, - SS.weight_map.at(ret_type).at(args_type).at(node_type) + SS.node_map_weights.at(ret_type).at(args_type).at(node_type) ); } } diff --git a/src/types.h b/src/types.h index 31664384..5badc481 100644 --- a/src/types.h +++ b/src/types.h @@ -179,6 +179,12 @@ template<> struct DataEnumType{ using type = ArrayXXf; }; template<> struct DataEnumType{ using type = Data::TimeSeriesb; }; template<> struct DataEnumType{ using type = Data::TimeSeriesi; }; template<> struct DataEnumType{ using type = Data::TimeSeriesf; }; +template<> struct DataEnumType{ using type = ArrayXbJet; }; +template<> struct DataEnumType{ using type = ArrayXiJet; }; +template<> struct DataEnumType{ using type = ArrayXfJet; }; +template<> struct DataEnumType{ using type = ArrayXXbJet; }; +template<> struct DataEnumType{ using type = ArrayXXiJet; }; +template<> struct DataEnumType{ using type = ArrayXXfJet; }; template struct DataTypeEnum; template <> struct DataTypeEnum { static constexpr DT value = DT::ArrayB; }; diff --git a/src/util/error.cpp b/src/util/error.cpp index a36cc220..e1fb8f55 100644 --- a/src/util/error.cpp +++ b/src/util/error.cpp @@ -13,7 +13,12 @@ namespace Brush{ namespace Util{ void HandleErrorThrow(string err, const char *file, int line ) { fmt::print(stderr, "FATAL ERROR {}:{}: {}\n", file, line, err); - throw; + + // when called with no arguments, will call terminate(), which + // throws a std::terminate_handler (and can't be handled in GTEST). + // Here we throw a runtime_error with same information printed on + // screen. + throw std::runtime_error(fmt::format("FATAL ERROR {}:{}: {}\n", file, line, err)); } ///prints error to stderr and returns diff --git a/src/util/rnd.cpp b/src/util/rnd.cpp index aedb2c32..ac95b699 100644 --- a/src/util/rnd.cpp +++ b/src/util/rnd.cpp @@ -12,17 +12,24 @@ namespace Brush { namespace Util{ Rnd::Rnd() { /*! - * need a random generator for each core to do multiprocessing + * need a random generator for each core to do multiprocessing. + * The constructor will resize the random generators based on + * the number of available cores. */ + //cout << "Max threads are " <set_seed(0); // setting a random initial state } return instance; @@ -36,28 +43,25 @@ namespace Brush { namespace Util{ instance = NULL; } - void Rnd::set_seed(int seed) + void Rnd::set_seed(unsigned int seed) { /*! - * set seeds for each core's random number generator + * set seeds for each core's random number generator. 
*/ - if (seed == 0) - { + if (seed == 0) { + // use a non-deterministic random generator to seed the deterministics std::random_device rd; - - for (auto& r : rg) - r.seed(rd()); + seed = rd(); } - else // seed first rg with seed, then seed rest with random ints from rg[0]. - { - rg[0].seed(seed); - - int imax = std::numeric_limits::max(); - - std::uniform_int_distribution<> dist(0, imax); - for (size_t i = 1; i < rg.size(); ++i) - rg[i].seed(dist(rg[0])); + // generating a seed sequence + std::seed_seq seq{seed}; + + std::vector seeds(rg.size()); + seq.generate(seeds.begin(), seeds.end()); + + for (size_t i = 0; i < rg.size(); ++i) { + rg[i].seed(seeds[i]); } } diff --git a/src/util/rnd.h b/src/util/rnd.h index d4b61f84..0a682e54 100644 --- a/src/util/rnd.h +++ b/src/util/rnd.h @@ -12,6 +12,8 @@ license: GNU/GPL v3 #include "../init.h" +// Defines a multi-core random number generator and its operators. + using namespace std; using std::swap; @@ -32,7 +34,7 @@ namespace Brush { namespace Util{ static void destroy(); - void set_seed(int seed); + void set_seed(unsigned int seed); int rnd_int( int lowerLimit, int upperLimit ); @@ -123,7 +125,6 @@ namespace Brush { namespace Util{ && " attemping to return random choice from empty vector"); return *select_randomly(v.begin(),v.end()); } - template class C, class T> T random_choice(const C>& v, const vector& w ) @@ -163,10 +164,15 @@ namespace Brush { namespace Util{ // Vector of pseudo-random number generators, one for each thread vector rg; + // private static attribute used by every instance of the class. + // All threads share common static members of the class static Rnd* instance; - }; + // `Brush.Util` static attribute holding an singleton instance of Rnd. + // the instance is created by calling `initRand`, which creates + // an instance of the private static attribute `instance`. 
`r` will contain
+    // one generator for each thread (since the constructor creates one per thread)
    static Rnd &r = *Rnd::initRand();
} // Util
} // Brush
diff --git a/src/util/utils.cpp b/src/util/utils.cpp
index 05243649..9f469618 100644
--- a/src/util/utils.cpp
+++ b/src/util/utils.cpp
@@ -105,10 +105,10 @@ void Normalizer::fit(MatrixXf& X, const vector& dt)
    for (unsigned int i=0; iscale_all || dtypes.at(i)=='f')
        {
-            X.row(i) = X.row(i).array() - offset.at(i);
+            X.col(i) = X.col(i).array() - offset.at(i);
            if (scale.at(i) > NEAR_ZERO)
-                X.row(i) = X.row(i).array()/scale.at(i);
+                X.col(i) = X.col(i).array()/scale.at(i);
        }
    }
}
diff --git a/src/variation.h b/src/variation.h
index 8048f21d..dcfd8288 100644
--- a/src/variation.h
+++ b/src/variation.h
@@ -11,6 +11,8 @@ license: GNU/GPL v3
// #include "program/tree_node.h"
// #include "node.h"

+#include <optional>
+
// namespace Brush{

// typedef tree::pre_order_iterator Iter;
@@ -27,83 +29,202 @@ namespace variation {

typedef tree::pre_order_iterator Iter;

-/// point mutation: replace node with same typed node
-inline void point_mutation(tree& Tree, Iter spot, const SearchSpace& SS)
+/// @brief replace node with same typed node
+/// @param Tree the program tree
+/// @param spot an iterator to the node that is being mutated
+/// @param SS the search space to sample a node like `spot`
+/// @return boolean indicating the success (true) or fail (false) of the operation
+inline bool point_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
{
    // cout << "point mutation\n";
-    auto newNode = SS.get_node_like(spot.node->data);
-    Tree.replace(spot, newNode);
+
+    // get_node_like will sample a similar node based on node_map_weights or
+    // terminal_weights, and may return a Node.
+    std::optional<Node> newNode = SS.get_node_like(spot.node->data);
+
+    if (!newNode) // newNode == std::nullopt
+        return false;
+
+    // if optional contains a Node, we access its contained value
+    Tree.replace(spot, *newNode);
+
+    return true;
}

-/// insert a node with spot as a child
-inline void insert_mutation(tree& Tree, Iter spot, const SearchSpace& SS)
+/// @brief insert a node with spot as a child
+/// @param Tree the program tree
+/// @param spot an iterator to the node that is being mutated
+/// @param SS the search space to sample a node like `spot`
+/// @return boolean indicating the success (true) or fail (false) of the operation
+inline bool insert_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
{
    // cout << "insert mutation\n";
    auto spot_type = spot.node->data.ret_type;

-    // pick a compatible random node to insert. We subtract one to count
-    // the op node that is already being inserted.
-    auto n = SS.get_op_with_arg(spot_type, spot_type, true,
+    // pick a random compatible node to insert (with probabilities given by
+    // node_map_weights). The `-1` represents the node being inserted.
+    // Ideally, it should always find at least one match (the same node
+    // used as a reference when calling the function). However, we have a
+    // size restriction, which will be relaxed here (just as it is in the PTC2
+    // algorithm). This mutation can create a new expression that exceeds the
+    // maximum size by the highest arity among the operators.
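+    // Numeric illustration (an assumption for this sketch, not taken from the
+    // code): with PARAMS["max_size"] == 20 and a current tree of size 10, the
+    // call below receives max_args = 20 - 10 - 1 = 9, i.e. the sampled
+    // operator may take at most 9 arguments:
+    //
+    //   auto n = SS.sample_op_with_arg(spot_type, spot_type, true, 9);
+    //   // n may be std::nullopt if no operator fits these restrictions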
+    std::optional<Node> n = SS.sample_op_with_arg(spot_type, spot_type, true,
                                        PARAMS["max_size"].get<int>()-Tree.size()-1);

-    // make node n wrap the subtree at the chosen spot
-    auto parent_node = Tree.wrap(spot, n);
+    if (!n) // there is no operator with compatible arguments
+        return false;

-    // GUI TODO: spot_filled should be any random spot with same type
+    // make node n wrap the subtree at the chosen spot
+    auto parent_node = Tree.wrap(spot, *n);

    // now fill the arguments of n appropriately
    bool spot_filled = false;
-    for (auto a: n.arg_types)
+    for (auto a: (*n).arg_types)
    {
        if (spot_filled)
        {
-            // if spot is in its child position, append children
-            Tree.append_child(parent_node, SS.get_terminal(a));
+            // if spot is in its child position, append children.
+            // TODO: note that sample_terminal may fail as well
+            auto opt = SS.sample_terminal(a);
+
+            if (!opt)
+                return false;
+
+            Tree.append_child(parent_node, opt.value());
        }
        // if types match, treat this spot as filled by the spot node
        else if (a == spot_type)
            spot_filled = true;
        // otherwise, add siblings before spot node
-        else
-            Tree.insert(spot, SS.get_terminal(a));
+        else {
+            auto opt = SS.sample_terminal(a);
+
+            if (!opt)
+                return false;
+
+            Tree.insert(spot, opt.value());
+        }
    }
+
+    return true;
}

-/// delete subtree and replace it with a terminal of the same return type
-inline void delete_mutation(tree& Tree, Iter spot, const SearchSpace& SS)
+/// @brief delete subtree and replace it with a terminal of the same return type
+/// @param Tree the program tree
+/// @param spot an iterator to the node that is being mutated
+/// @param SS the search space to sample a node like `spot`
+/// @return boolean indicating the success (true) or fail (false) of the operation
+inline bool delete_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
{
    // cout << "delete mutation\n";
-    auto terminal = SS.get_terminal(spot.node->data.ret_type);
+
+    // sample_terminal will sample based on terminal_weights. If it succeeds,
+    // then the new terminal will be in `opt.value()`
+    auto opt = SS.sample_terminal(spot.node->data.ret_type);
+
+    if (!opt) // there is no terminal with compatible arguments
+        return false;
+
    Tree.erase_children(spot);
-    Tree.replace(spot, terminal);
+
+    Tree.replace(spot, opt.value());
+
+    return true;
};

-/// @brief toggle the node's weight on or off.
+/// @brief toggle the node's weight ON.
/// @param Tree the program tree
/// @param spot an iterator to the node that is being mutated
/// @param SS the search space (unused)
-inline void toggle_weight_mutation(tree& Tree, Iter spot, const SearchSpace& SS)
+/// @return boolean indicating the success (true) or fail (false) of the operation
+inline bool toggle_weight_on_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
{
-    spot.node->data.is_weighted = !spot.node->data.is_weighted;
+    if (spot.node->data.get_is_weighted()==true // can't turn on what's already on
+    ||  !IsWeighable(spot.node->data.ret_type)) // does not accept weights (e.g. boolean)
+        return false; // false indicates that mutation failed and should return std::nullopt
+
+    spot.node->data.set_is_weighted(true);
+    return true;
+}
+
+/// @brief toggle the node's weight OFF.
+/// @param Tree the program tree
+/// @param spot an iterator to the node that is being mutated
+/// @param SS the search space (unused)
+/// @return boolean indicating the success (true) or fail (false) of the operation
+inline bool toggle_weight_off_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
+{
+    if (spot.node->data.get_is_weighted()==false)
+        return false;
+
+    spot.node->data.set_is_weighted(false);
+    return true;
+}
+
+/// @brief replaces the subtree rooted in `spot`
+/// @param Tree the program tree
+/// @param spot an iterator to the node that is being mutated
+/// @param SS the search space to generate a compatible subtree
+/// @return boolean indicating the success (true) or fail (false) of the operation
+inline bool subtree_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
+{
+    auto spot_type = spot.node->data.ret_type;
+    auto max_size  = PARAMS["max_size"].get<int>() - (Tree.size() - Tree.size(spot));
+    auto max_depth = PARAMS["max_depth"].get<int>() - (Tree.depth(spot));
+
+    // sample subtree uses PTC2, which operates on depth and size of the tree
+    // (and not on the program!). We shouldn't care about weights here
+    auto subtree = SS.sample_subtree(spot.node->data, max_depth, max_size);
+
+    if (!subtree) // no compatible subtree could be sampled
+        return false;
+
+    // if the optional contains a tree, we access its contained value
+    Tree.erase_children(spot);
+    Tree.replace(spot, subtree.value().begin());
+
+    return true;
}

/**
- * @brief Mutate a program.
+ * @brief Stochastically mutate a program.
 *
 * Types of mutation:
 *
 *  - point mutation changes a single node.
 *  - insertion mutation inserts a node as the parent of an existing node, and fills in the other arguments.
- *  - deletion mutation deletes a node
- *  - toggle_weight mutation turns a node's weight on or off.
+ *  - deletion mutation deletes a node.
+ *  - subtree mutation inserts a new subtree into the program.
+ *  - toggle_weight_on mutation turns a node's weight ON.
+ *  - toggle_weight_off mutation turns a node's weight OFF.
+ *
+ * Every mutation has a probability (weight) based on global parameters. The
+ * spot where the mutation will take place is sampled based on attribute
+ * `get_prob_change` of each node in the tree. Inside each type of mutation,
+ * when a new node is inserted, it is sampled based on `terminal_weights`.
+ *
+ * Due to the stochastic behavior, and the several sampling steps, it may happen
+ * that the search space does not hold any possible modification to do in
+ * the program. In this case, the method returns `std::nullopt` (and has overloads
+ * so it can be used in a boolean context).
+ *
+ * If the mutation succeeds, the mutated program can be accessed through the
+ * `.value()` attribute of the `std::optional`.
+ *
+ * This means that, if you use the mutation as `auto opt = mutate(parent, SS)`,
+ * either `opt==false` or `opt.value()` contains the child program.
+ *
 * @tparam T program type
 * @param parent the program to be mutated
 * @param SS a search space
- * @return `child`, the mutated program
+ * @return `std::optional` that may contain the child program of type `T`
 */
template
-Program mutate(const Program& parent, const SearchSpace& SS)
+std::optional<Program<T>> mutate(const Program<T>& parent, const SearchSpace& SS)
{
+    // all mutation validation and setup should be done here. Specific mutation
+    // functions are intended to work on the program tree, and thus cannot access
+    // program functions and attributes.
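+    //
+    // Caller-side sketch (illustrative; it mirrors the doc comment above):
+    //
+    //   auto opt = mutate(parent, SS);    // or parent.mutate() on a Program
+    //   if (opt) {
+    //       auto child = opt.value();     // mutation and limit checks passed
+    //   } else {
+    //       // resample or skip: no valid mutation was possible
+    //   }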
    Program<T> child(parent);

    // choose location by weighted sampling of program
@@ -113,46 +234,91 @@ Program mutate(const Program& parent, const SearchSpace& SS)
                   [](const auto& n){ return n.get_prob_change(); }
                  );

-    auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(),
-                                  weights.begin(), weights.end());
-
    auto options = PARAMS["mutation_options"].get<std::map<string, float>>();

-    // Setting to zero the weight of variations that increase the expression
-    // if the expression is already at the maximum size or depth
-    if (child.Tree.size()+1 >= PARAMS["max_size"].get()
-    ||  child.Tree.max_depth()+1 >= PARAMS["max_depth"].get())
-    {
-        // avoid using mutations that increase size/depth
-        options["insert"] = 0.0;
+    if (std::all_of(weights.begin(), weights.end(), [](const auto& w) {
+        return w<=0.0;
+    }))
+    { // There is no spot that has a probability to be selected
+        return std::nullopt;
    }

+    auto spot = r.select_randomly(child.Tree.begin(), child.Tree.end(),
+                                  weights.begin(), weights.end());
+
+    if (std::all_of(options.begin(), options.end(), [](const auto& kv) {
+        return kv.second<=0.0;
+    }))
+    { // No mutation can be successfully applied to this solution
+        return std::nullopt;
+    }
+
    // choose a valid mutation option
    string choice = r.random_choice(options);

-    if (choice == "insert")
-        insert_mutation(child.Tree, spot, SS);
-    else if (choice == "delete")
-        delete_mutation(child.Tree, spot, SS);
-    else if (choice == "point")
-        point_mutation(child.Tree, spot, SS);
-    else if (choice == "toggle_weight")
-        toggle_weight_mutation(child.Tree, spot, SS);
-    else{
-        string msg = fmt::format("{} not a valid mutation choice", choice);
+    // std::cout << "mutation configuration (choice was " << choice << "):" << std::endl;
+    // for (const auto& [k, v] : options)
+    //     std::cout << " - " << k << " : " << v << std::endl;
+
+    // Every mutation here works in place, so they return bool instead of
+    // std::optional to indicate the result of their manipulation of the
+    // program tree. Here we call the mutation function and return the result
+    using MutationFunc = std::function<bool(tree<Node>&, Iter, const SearchSpace&)>;
+
+    std::map<string, MutationFunc> mutations{
+        {"insert",            insert_mutation},
+        {"delete",            delete_mutation},
+        {"point",             point_mutation},
+        {"subtree",           subtree_mutation},
+        {"toggle_weight_on",  toggle_weight_on_mutation},
+        {"toggle_weight_off", toggle_weight_off_mutation}
+    };
+
+    // Try to find the mutation function based on the choice
+    auto it = mutations.find(choice);
+    if (it == mutations.end()) {
+        std::string msg = fmt::format("{} not a valid mutation choice", choice);
        HANDLE_ERROR_THROW(msg);
    }

-    return child;
+    // apply the mutation and check if it succeeded
+    bool success = it->second(child.Tree, spot, SS);
+
+    if (success
+    && ( (child.size()  <= PARAMS["max_size"].get<int>() )
+    &&   (child.depth() <= PARAMS["max_depth"].get<int>()) )){
+
+        return child;
+    } else {
+        return std::nullopt;
+    }
};

-/// @brief swaps subtrees between root and other, returning new program
-/// @tparam T the program type
-/// @param root the root parent
-/// @param other the donating parent
-/// @return new program of type `T`
+/**
+ * @brief Stochastically swaps subtrees between root and other, returning a new program.
+ *
+ * The spot where the cross will take place in the `root` parent is sampled
+ * based on attribute `get_prob_change` of each node in the tree. After selecting
+ * the cross spot, the program will iterate through the `other` parent searching
+ * for all compatible sub-trees to replace.
+ *
+ * Due to the stochastic behavior, it may come to a case where there is no
+ * candidate to replace the spot node. In this case, the method returns
+ * `std::nullopt` (and has overloads so it can be used in a boolean context).
+ *
+ * If the cross succeeds, the child program can be accessed through the
+ * `.value()` attribute of the `std::optional`.
+ *
+ * This means that, if you use the cross as `auto opt = cross(root, other)`,
+ * either `opt==false` or `opt.value()` contains the child.
+ *
+ * @tparam T the program type
+ * @param root the root parent
+ * @param other the donating parent
+ * @return `std::optional` that may contain the child program of type `T`
+ */
template
-Program cross(const Program& root, const Program& other)
+std::optional<Program<T>> cross(const Program<T>& root, const Program<T>& other)
{
    /* subtree crossover between this and other, producing new Program */
    // choose location by weighted sampling of program
@@ -162,65 +328,64 @@ Program cross(const Program& root, const Program& other)
    // pick a subtree to replace
    vector<float> child_weights(child.Tree.size());
    std::transform(child.Tree.begin(), child.Tree.end(),
-            child_weights.begin(),
-            [](const auto& n){ return n.get_prob_change(); }
-            );
+                   child_weights.begin(),
+                   [](const auto& n){ return n.get_prob_change(); }
+                  );

-    // GUI TODO: Keep doing random attempts, or test all possilities?
-    bool matching_spots_found = false;
-    for (int tries = 0; tries < 3; ++tries)
-    {
-        auto child_spot = r.select_randomly(child.Tree.begin(),
-                                            child.Tree.end(),
-                                            child_weights.begin(),
-                                            child_weights.end()
-                                           );
-
-        auto child_ret_type = child_spot.node->data.ret_type;
-
-        auto allowed_size = PARAMS["max_size"].get() -
-                            ( child.Tree.size() - child.Tree.size(child_spot) );
-        auto allowed_depth = PARAMS["max_depth"].get() -
-                             ( child.Tree.max_depth() - child.Tree.depth(child_spot) );
-
-        // pick a subtree to insert. Selection is based on other_weights
-        vector other_weights(other.Tree.size());
-
-        // iterator to get the size of subtrees inside transform
-        auto other_iter = other.Tree.begin();
-
-        // lambda function to check feasibility of solution and increment the iterator
-        const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool {
-            int s = other.Tree.size(other_iter);
-            int d = other.Tree.depth(other_iter);
-
-            std::advance(other_iter, 1);
-            return (s <= allowed_size) && (d <= allowed_depth);
-        };
-
-        std::transform(other.Tree.begin(), other.Tree.end(),
-                other_weights.begin(),
-                [child_ret_type, check_and_incrm](const auto& n){
-                    // need to pick a node that has a matching output type to the child_spot.
- // also need to check if swaping this node wouldn't exceed max_size - if (check_and_incrm() && (n.ret_type == child_ret_type)) - return n.get_prob_change(); - else - // setting the weight to zero to indicate a non-feasible crossover point - return float(0.0); - } - ); - - for (const auto& w: other_weights) - { - matching_spots_found = w > 0.0; + if (std::all_of(child_weights.begin(), child_weights.end(), [](const auto& w) { + return w<=0.0; + })) + { // There is no spot that has a probability to be selected + return std::nullopt; + } - if (matching_spots_found) - break; + auto child_spot = r.select_randomly(child.Tree.begin(), + child.Tree.end(), + child_weights.begin(), + child_weights.end() + ); + + auto child_ret_type = child_spot.node->data.ret_type; + + auto allowed_size = PARAMS["max_size"].get() - + ( child.size() - child.size_at(child_spot) ); + auto allowed_depth = PARAMS["max_depth"].get() - + ( child.depth_to_reach(child_spot) ); + + // pick a subtree to insert. Selection is based on other_weights + vector other_weights(other.Tree.size()); + + // iterator to get the size of subtrees inside transform + auto other_iter = other.Tree.begin(); + + // lambda function to check feasibility of solution and increment the iterator + const auto check_and_incrm = [other, &other_iter, allowed_size, allowed_depth]() -> bool { + int s = other.size_at( other_iter ); + int d = other.depth_at( other_iter ); + + std::advance(other_iter, 1); + return (s <= allowed_size) && (d <= allowed_depth); + }; + + std::transform(other.Tree.begin(), other.Tree.end(), + other_weights.begin(), + [child_ret_type, check_and_incrm](const auto& n){ + // need to pick a node that has a matching output type to the child_spot. + // also need to check if swaping this node wouldn't exceed max_size + if (check_and_incrm() && (n.ret_type == child_ret_type)) + return n.get_prob_change(); + else + // setting the weight to zero to indicate a non-feasible crossover point + return float(0.0); } + ); - if (matching_spots_found) - { + bool matching_spots_found = false; + for (const auto& w: other_weights) + { + matching_spots_found = w > 0.0; + + if (matching_spots_found) { auto other_spot = r.select_randomly( other.Tree.begin(), other.Tree.end(), @@ -233,10 +398,9 @@ Program cross(const Program& root, const Program& other) child.Tree.move_ontop(child_spot, other_spot); return child; } - // fmt::print("try {} failed\n",tries); } - return child; + return std::nullopt; }; -} //namespace vary +} //namespace variation #endif \ No newline at end of file diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index e69de29b..09893c2c 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -0,0 +1,117 @@ +#include "testsHeader.h" +#include "../../src/search_space.h" +#include "../../src/program/program.h" +#include "../../src/program/dispatch_table.h" + +TEST(Data, ErrorHandling) +{ + // Creating an empty dataset throws error + EXPECT_THROW({ + MatrixXf X(0,0); + ArrayXf y(0); + + try + { + Dataset dt(X, y); + } + catch( const std::runtime_error& err ) + { + const string msg = err.what(); + ASSERT_NE( + msg.find("Error during the initialization of the dataset"), + std::string::npos); + throw; + } + }, std::runtime_error); +} + +TEST(Data, MixedVariableTypes) +{ + // We need to set at least the mutation options (and respective + // probabilities) in order to call PRG.predict() + PARAMS["mutation_options"] = { + {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight_on", 0.125}, {"toggle_weight_off", 
0.125} + }; + + MatrixXf X(5,3); + X << 0 , 1, 0 , // binary with integer values + 0.0, 1.0, 1.0, // binary with float values + 2 , 1.0, -3.0, // integer with float and negative values + 2 , 1 , 3 , // integer with integer values + 2.1, 3.7, -5.2; // float values + + X.transposeInPlace(); + + ArrayXf y(3); + + y << 6.1, 7.7, -4.2; // y = x_0 + x_1 + x_2 + + unordered_map user_ops = { + {"Add", 1}, + {"Sub", 1}, + {"SplitOn", 1} + }; + + Dataset dt(X, y); + SearchSpace SS; + SS.init(dt, user_ops); + + dt.print(); + SS.print(); + + for (int d = 1; d < 5; ++d) + for (int s = 1; s < 5; ++s) + { + + PARAMS["max_size"] = s; + PARAMS["max_depth"] = d; + + RegressorProgram PRG = SS.make_regressor(d, s); + fmt::print( + "=================================================\n" + "Tree model for depth = {}, size= {}: {}\n", + d, s, PRG.get_model("compact", true) + ); + + // visualizing detailed information for the model + std::for_each(PRG.Tree.begin(), PRG.Tree.end(), + [](const auto& n) { + fmt::print("Name {}, node {}, feature {}\n" + " sig_hash {}\n ret_type {}\n ret_type type {}\n", + n.name, n.node_type, n.get_feature(), + n.sig_hash, n.ret_type, typeid(n.ret_type).name()); + }); + + std::cout << std::endl; + + fmt::print( "PRG fit\n"); + PRG.fit(dt); + fmt::print( "PRG predict\n"); + ArrayXf y_pred = PRG.predict(dt); + fmt::print( "y_pred: {}\n", y_pred); + + // creating and fitting a child + auto opt = PRG.mutate(); + + if (!opt){ + fmt::print("Mutation failed to create a child\n"); + } + else { + auto Child = opt.value(); + + fmt::print("Child model: {}\n", Child.get_model("compact", true)); + + fmt::print( "Child fit\n"); + Child.fit(dt); + fmt::print( "Child predict\n"); + ArrayXf y_pred_child = Child.predict(dt); + fmt::print( "y_pred: {}\n", y_pred); + } + } + + // Brush exports two DispatchTable structs named dtable_fit and dtable_predict. + // These structures holds the mapping between nodes and its corresponding + // operations, and are used to resolve the evaluation of an expression. 
+ // dtable_fit.print(); + // dtable_predict.print(); +} \ No newline at end of file diff --git a/tests/cpp/test_optimization.cpp b/tests/cpp/test_optimization.cpp index e087cec0..857ea47d 100644 --- a/tests/cpp/test_optimization.cpp +++ b/tests/cpp/test_optimization.cpp @@ -7,9 +7,9 @@ using testing::TestWithParam; // Hashes corresponding to a 3-ary Prod operator -const std::size_t sig_hash = 5617655905677279916; -const std::size_t sig_dual_hash = 10188582206427064428; -const std::size_t complete_hash = 1786662244046809282; +const std::size_t sig_hash = 5617655905677279916u; +const std::size_t sig_dual_hash = 10188582206427064428u; +const std::size_t complete_hash = 1786662244046809282u; class OptimizerTest : public TestWithParam< std::tuple> > { diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp index acd0f67b..3fb2de8b 100644 --- a/tests/cpp/test_program.cpp +++ b/tests/cpp/test_program.cpp @@ -9,8 +9,6 @@ TEST(Program, MakeRegressor) Dataset data = Data::read_csv("docs/examples/datasets/d_enc.csv","label"); - - SearchSpace SS; SS.init(data); @@ -179,10 +177,6 @@ TEST(Operators, ProgramSizeAndDepthPARAMS) SearchSpace SS; SS.init(data); - // split operator --> arity 3 - // prod operator --> arity 4 - int max_arity = 4; - for (int d = 1; d < 10; ++d) { for (int s = 1; s < 10; ++s) @@ -201,23 +195,21 @@ TEST(Operators, ProgramSizeAndDepthPARAMS) "Model size : {}\n" "=================================================\n", d, s, - PRG.get_model("compact", true), PRG.Tree.max_depth(), PRG.Tree.size() + PRG.get_model("compact", true), PRG.depth(), PRG.size() ); - // There are two ways of assessing the size of a program - // GUI TODO: which style are we going to use? i) implement - // PRG.max_depth() (or PRG.depth()); or ii) get rid of PRG.size() - // and use always PRG.Tree.<>) - ASSERT_TRUE(PRG.Tree.size() > 0); - ASSERT_TRUE(PRG.Tree.size() <= s+max_arity); - - ASSERT_TRUE(PRG.size() > 0); - ASSERT_TRUE(PRG.size() <= s+max_arity); + // Terminals are weighted by default, while operators not. Since we + // include the weights in the calculation of the size of the program, + // and PTC2 uses the tree size (not the program size), it is not + // expected that initial trees will strictly respect `max_size`. + ASSERT_TRUE(PRG.size() > 0); // size is always positive + + // PTC2: maximum size is s+max(arity). Since in Brush terminals are + // weighted by default, we set it to 3*max(arity) + ASSERT_TRUE(PRG.size() <= s+3*4); - // GUI TODO: max_depth() counts the number of edges (so a program with - // one node has max_depth()==0). Is this how it should behave? 
-            ASSERT_TRUE(PRG.Tree.max_depth() >= 0);
-            ASSERT_TRUE(PRG.Tree.max_depth() <= d+1);
+            ASSERT_TRUE(PRG.depth() <= d+1);
+            ASSERT_TRUE(PRG.depth() > 0); // depth is always positive
        }
    }
}
\ No newline at end of file
diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp
index 9904684b..02eaaf19 100644
--- a/tests/cpp/test_search_space.cpp
+++ b/tests/cpp/test_search_space.cpp
@@ -5,40 +5,64 @@
TEST(SearchSpace, Initialization)
{
-
-    MatrixXf X(4,2);
-    MatrixXf X_v(3,2);
-    X << 0,1,
-         0.47942554,0.87758256,
-         0.84147098, 0.54030231,
-         0.99749499, 0.0707372;
-    X_v << 0.90929743, -0.41614684,
-           0.59847214, -0.80114362,
-           0.14112001,-0.9899925;
-
-    X.transposeInPlace();
-    X_v.transposeInPlace();
    ArrayXf y(4);
-    ArrayXf y_v(3);
-    // y = 2*x1 + 3.x2
-    y << 3.0, 3.59159876, 3.30384889, 2.20720158;
-    y_v << 0.57015434, -1.20648656, -2.68773747;
-
+    y << 3.00000, 3.59876, 7.18622, 15.19294;
+
+    // variables have different pairwise correlations with y. The idea is to
+    // see the mutation weights for each floating variable. The slopes were
+    // calculated using Python's np.cov(xprime, yprime)[0][1]/np.var(xprime),
+    // where xprime and yprime are the z-score normalized arrays obtained
+    // from x and y.
+    MatrixXf X(5,4);
+    X << 0,       0,       1,       1,       // x0, binary, expected weight=1
+         2,       0,       1,       2,       // x1, categorical, expected weight=1
+         0.05699, 0.62737, 0.72406, 0.99294, // x2, slope ~= 1.069
+         0.03993, 0.36558, 0.01393, 0.25878, // x3, slope ~= 0.25
+         5.17539, 7.63579,-2.82560, 0.24645; // x4, slope ~= -0.799
+    X.transposeInPlace(); // 4 rows x 5 variables

    Dataset dt(X, y);
-    Dataset dv(X_v, y_v);
-
+
+    // different weights to check if the search space is initialized correctly
    unordered_map user_ops = {
-        {"Add", 1},
-        {"Sub", 1},
-        {"Div", .5},
-        {"Times", 0.5}
+        {"Add", 1},
+        {"Sub", 1},
+        {"Div", .5},
+        {"Mul", 0.5}
    };

-    // SearchSpace SS;
    SearchSpace SS;
    SS.init(dt, user_ops);

-/* dtable_fit.print(); */
-/* dtable_predict.print(); */
-}
+    dt.print();
+    SS.print();
+    // dtable_fit.print();
+    // dtable_predict.print();
+
+    // manually calculated. The last value is the average of the previous values
+    ArrayXf expected_weights_Xf(4); // 4 elements (x2, x3, x4 and c)
+    expected_weights_Xf << 0.80240685, 0.19270448, 0.5994426, 0.531518;
+
+    auto actual_weights_f = SS.terminal_weights.at(DataType::ArrayF);
+    Eigen::Map<ArrayXf> actual_weights_Xf(actual_weights_f.data(), actual_weights_f.size());
+
+    ASSERT_TRUE(expected_weights_Xf.isApprox(actual_weights_Xf));
+
+
+    ArrayXf expected_weights_Xi(2); // 2 elements (x1 and c)
+    expected_weights_Xi << 0.2736814, 0.2736814;
+
+    auto actual_weights_i = SS.terminal_weights.at(DataType::ArrayI);
+    Eigen::Map<ArrayXf> actual_weights_Xi(actual_weights_i.data(), actual_weights_i.size());
+
+    ASSERT_TRUE(expected_weights_Xi.isApprox(actual_weights_Xi));
+
+
+    ArrayXf expected_weights_Xb(2); // 2 elements (x0 and c)
+    expected_weights_Xb << 0.8117065, 0.8117065;
+
+    auto actual_weights_b = SS.terminal_weights.at(DataType::ArrayB);
+    Eigen::Map<ArrayXf> actual_weights_Xb(actual_weights_b.data(), actual_weights_b.size());
+
+    ASSERT_TRUE(expected_weights_Xb.isApprox(actual_weights_Xb));
+}
\ No newline at end of file
diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp
index 1f505754..d0eb9bcf 100644
--- a/tests/cpp/test_variation.cpp
+++ b/tests/cpp/test_variation.cpp
@@ -4,46 +4,69 @@
#include "../../src/program/dispatch_table.h"
#include "../../src/data/io.h"

-TEST(Operators, Mutation)
+TEST(Operators, InsertMutationWorks)
{
+    // TODO: this test could be parameterized (see the sketch below).
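+    // A possible parameterization sketch (hypothetical, using the GTest
+    // value-parameterized API already used in test_optimization.cpp):
+    //
+    //   class MutationTest : public testing::TestWithParam<std::string> {};
+    //   INSTANTIATE_TEST_SUITE_P(Mutations, MutationTest,
+    //       testing::Values("insert", "delete", "point", "subtree"));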
+
+    // To understand the design of this test, check the Mutation test
+
    PARAMS["mutation_options"] = {
-        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight", 0.25}
+        {"point", 0.0}, {"insert", 1.0}, {"delete", 0.0}, {"subtree", 0.0}, {"toggle_weight_on", 0.0}, {"toggle_weight_off", 0.0}
    };
-    // test mutation
-    // TODO: set random seed
+
+    // retrieving the options to check if everything was set right
+    std::cout << "Initial mutation configuration" << std::endl;
+    auto options = PARAMS["mutation_options"].get<std::map<string, float>>();
+    for (const auto& [k, v] : options)
+        std::cout << k << " : " << v << std::endl;
+
    MatrixXf X(10,2);
    ArrayXf y(10);
    X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376,
         0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151,
+
         0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973,
         0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526;
+
    y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517,
-        3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179;
+         3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179;
+
    Dataset data(X,y);

    SearchSpace SS;
    SS.init(data);

-    for (int d = 1; d < 10; ++d)
+    int successes = 0;
+    for (int attempt = 0; attempt < 100; ++attempt)
    {
-        for (int s = 1; s < 10; ++s)
-        {
-            fmt::print("d={},s={}\n",d,s);
-            fmt::print("make_regressor\n");
-            RegressorProgram PRG = SS.make_regressor(d, s);
-            fmt::print("PRG.fit(data);\n");
-            PRG.fit(data);
-            ArrayXf y_pred = PRG.predict(data);
-            fmt::print("auto Child = PRG.mutate(SS);\n");
-            auto Child = PRG.mutate();
+        // we need to have big values here so the mutation will work
+        // (when the mutated child exceeds the maximum limits, mutation returns
+        // std::nullopt)
+        PARAMS["max_size"]  = 20;
+        PARAMS["max_depth"] = 10;
+
+        fmt::print("d={},s={}\n", PARAMS["max_depth"].get<int>(), PARAMS["max_size"].get<int>());
+        fmt::print("make_regressor\n");
+
+        // creating a "small" program (with plenty of room to insert stuff)
+        RegressorProgram PRG = SS.make_regressor(5, 5);
+
+        fmt::print("PRG.fit(data);\n");
+        PRG.fit(data);
+        ArrayXf y_pred = PRG.predict(data);
+
+        // applying mutation and checking if the optional result is non-empty
+        fmt::print("auto Child = PRG.mutate();\n");
+        auto opt = PRG.mutate(); // it should always be the insert mutation

-            fmt::print("print\n");
+        if (opt){
+            successes += 1;
+            auto Child = opt.value();
            fmt::print(
                "=================================================\n"
                "depth = {}, size= {}\n"
                "Initial Model: {}\n"
                "Mutated Model: {}\n",
-                d, s,
+                PARAMS["max_depth"].get<int>(), PARAMS["max_size"].get<int>(),
                PRG.get_model("compact", true),
                Child.get_model("compact", true)
            );
@@ -51,15 +74,52 @@ TEST(Operators, Mutation)
            fmt::print("child fit\n");
            Child.fit(data);
            y_pred = Child.predict(data);
+
+            // since we successfully inserted a node, this should always be true
+            ASSERT_TRUE(Child.size() > PRG.size());
+
+            // maybe the insertion spot was a shorter branch than the maximum
+            // depth. At least, the child's depth should be no smaller than its parent's
+            ASSERT_TRUE(Child.depth() >= PRG.depth());
+        }
+
+        // let's also check that it always fails when the child would exceed the maximum limits
+        PARAMS["max_size"]  = PRG.size();
+        PARAMS["max_depth"] = PRG.depth();
+
+        auto opt2 = PRG.mutate();
+        if (opt2){ // This shouldn't happen. We'll print the error
+            auto Child2 = opt2.value();
+
+            std::cout << "Mutation should have failed. 
Mutation weights:" << std::endl; + auto options2 = PARAMS["mutation_options"].get>(); + for (const auto& [k, v] : options2) + std::cout << k << " : " << v << std::endl; + + fmt::print( + "=================================================\n" + "depth = {}, size= {}\n" + "Initial Model: {}\n" + "Mutated Model: {}\n", + PARAMS["max_depth"].get(), PARAMS["max_size"].get(), + PRG.get_model("compact", true), + Child2.get_model("compact", true) + ); + ASSERT_TRUE(opt2==std::nullopt); } } + ASSERT_TRUE(successes > 0); } -TEST(Operators, Crossover) +TEST(Operators, Mutation) { // test mutation // TODO: set random seed + PARAMS["mutation_options"] = { + {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"subtree", 0.0}, {"toggle_weight_on", 0.125}, {"toggle_weight_off", 0.125} + }; + MatrixXf X(10,2); ArrayXf y(10); X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, @@ -78,54 +138,62 @@ TEST(Operators, Crossover) for (int d = 1; d < 10; ++d) { + int successes = 0; for (int s = 1; s < 10; ++s) { - RegressorProgram PRG1 = SS.make_regressor(d, s); - RegressorProgram PRG2 = SS.make_regressor(d, s); - PRG1.fit(data); - PRG2.fit(data); - - fmt::print( - "=================================================\n" - "depth = {}, size= {}\n" - "Initial Model 1: {}\n" - "Initial Model 2: {}\n", - d, s, - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) - ); - ArrayXf y_pred = PRG1.predict(data); - fmt::print("cross one\n"); - auto Child1 = PRG1.cross(PRG2); - fmt::print( - "Model 1 after cross: {}\n" - "Model 2 after cross: {}\n", - PRG1.get_model("compact", true), - PRG2.get_model("compact", true) - ); - fmt::print("cross two\n"); - auto Child2 = PRG2.cross(PRG1); + fmt::print("d={},s={}\n",d,s); + fmt::print("make_regressor\n"); - fmt::print( - "Crossed Model 1: {}\n" - "Crossed Model 2: {}\n" - "=================================================\n", - Child1.get_model("compact", true), - Child2.get_model("compact", true) - ); + // if we set max_size and max_depth to zero, it will use the + // values in the global PARAMS. Otherwise, it will respect the + // values passed as argument. 
+            RegressorProgram PRG = SS.make_regressor(d, s);

-            Child1.fit(data);
-            Child2.fit(data);
-            auto child_pred1 = Child1.predict(data);
-            auto child_pred2 = Child2.predict(data);
+            fmt::print("PRG.fit(data);\n");
+            PRG.fit(data);
+            ArrayXf y_pred = PRG.predict(data);
+
+            // applying mutation and checking if the optional result is non-empty
+            fmt::print("auto Child = PRG.mutate();\n");
+            auto opt = PRG.mutate();
+
+            if (!opt){
+                fmt::print(
+                    "=================================================\n"
+                    "depth = {}, size= {}\n"
+                    "Initial Model: {}\n"
+                    "Mutation failed to create a child",
+                    d, s,
+                    PRG.get_model("compact", true)
+                );
+            }
+            else {
+                successes += 1;
+                auto Child = opt.value();
+                fmt::print(
+                    "=================================================\n"
+                    "depth = {}, size= {}\n"
+                    "Initial Model: {}\n"
+                    "Mutated Model: {}\n",
+                    d, s,
+                    PRG.get_model("compact", true),
+                    Child.get_model("compact", true)
+                );
+
+                fmt::print("child fit\n");
+                Child.fit(data);
+                y_pred = Child.predict(data);
+            }
        }
+        // since x1 and x2 have the same type, we shouldn't get failures
+        ASSERT_TRUE(successes > 0);
    }
}

TEST(Operators, MutationSizeAndDepthLimit)
{
    PARAMS["mutation_options"] = {
-        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight", 0.25}
+        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"subtree", 0.0}, {"toggle_weight_on", 0.125}, {"toggle_weight_off", 0.125}
    };

    MatrixXf X(10,2);
@@ -150,6 +218,7 @@ TEST(Operators, MutationSizeAndDepthLimit)

    for (int d = 5; d < 15; ++d)
    {
+        int successes = 0;
        for (int s = 5; s < 15; ++s)
        {
            PARAMS["max_size"] = s;
@@ -165,43 +234,59 @@ TEST(Operators, MutationSizeAndDepthLimit)

            auto PRG_model = PRG.get_model("compact", true);

-            auto Child = PRG.mutate();
-
-            fmt::print("print\n");
-            fmt::print(
-                "=================================================\n"
-                "depth = {}, size= {}\n"
-                "Initial Model: {}\n"
-                "Mutated Model: {}\n"
-                "Mutated depth: {}\n"
-                "Mutated size : {}\n",
-                d, s,
-                PRG.get_model("compact", true),
-                Child.get_model("compact", true),
-                Child.Tree.max_depth(),
-                Child.Tree.size()
-            );
-
-            // Original didn't change
-            ASSERT_TRUE(PRG_model == PRG.get_model("compact", true));
-
-            // Child is within restrictions. Here we expect the generated
-            // expression to have at most max_size nodes (there is no tolerance
-            // gap as PTC2 has). Notice that this is only valid if the original
-            // parent is already respecting the max_size
-            ASSERT_TRUE(Child.size() > 0);
-            ASSERT_TRUE(Child.size() <= s);
-
-            ASSERT_TRUE(Child.Tree.size() > 0);
-            ASSERT_TRUE(Child.Tree.size() <= s);
-
-            ASSERT_TRUE(Child.Tree.max_depth() >= 0);
-            ASSERT_TRUE(Child.Tree.max_depth() <= d);
+            auto opt = PRG.mutate();
+
+            if (!opt){
+                fmt::print(
+                    "=================================================\n"
+                    "depth = {}, size= {}\n"
+                    "Initial Model: {}\n"
+                    "Mutation failed to create a child",
+                    d, s,
+                    PRG.get_model("compact", true)
+                );
+            }
+            else {
+                successes += 1;
+
+                // Extracting the child from the std::optional and checking
+                // if it is within size and depth restrictions. There is no
+                // margin for having slightly bigger expressions.
+                auto Child = opt.value();
+
+                fmt::print("print\n");
+                fmt::print(
+                    "=================================================\n"
+                    "depth = {}, size= {}\n"
+                    "Initial Model: {}\n"
+                    "Mutated Model: {}\n"
+                    "Mutated depth: {}\n"
+                    "Mutated size : {}\n",
+                    d, s,
+                    PRG.get_model("compact", true),
+                    Child.get_model("compact", true),
+                    Child.depth(),
+                    Child.size()
+                );
+
+                // Original didn't change
+                ASSERT_TRUE(PRG_model == PRG.get_model("compact", true));
+
+                ASSERT_TRUE(Child.size() > 0);
+                ASSERT_TRUE(Child.size() <= s);
+
+                ASSERT_TRUE(Child.depth() >= 0);
+                ASSERT_TRUE(Child.depth() <= d);
+            }
        }
+        ASSERT_TRUE(successes > 0);
    }
}

-TEST(Operators, CrossoverSizeAndDepthLimit)
+TEST(Operators, Crossover)
{
    MatrixXf X(10,2);
    ArrayXf y(10);
@@ -219,24 +304,15 @@
    SearchSpace SS;
    SS.init(data);

-    // split operator --> arity 3
-    // prod operator --> arity 4
-    int max_arity = 4;
-
-    for (int d = 5; d < 15; ++d)
+    for (int d = 1; d < 10; ++d)
    {
-        for (int s = 5; s < 15; ++s)
+        int successes = 0;
+        for (int s = 1; s < 10; ++s)
        {
-            PARAMS["max_size"] = s;
-            PARAMS["max_depth"] = d;
-
-            // Enforcing that the parents does not exceed max_size by
-            // taking into account the highest arity of the function nodes
-            RegressorProgram PRG1 = SS.make_regressor(d-1, s-max_arity);
-            RegressorProgram PRG2 = SS.make_regressor(d-1, s-max_arity);
-
-            auto PRG1_model = PRG1.get_model("compact", true);
-            auto PRG2_model = PRG2.get_model("compact", true);
+            RegressorProgram PRG1 = SS.make_regressor(d, s);
+            RegressorProgram PRG2 = SS.make_regressor(d, s);
+            PRG1.fit(data);
+            PRG2.fit(data);

            fmt::print(
                "=================================================\n"
                "depth = {}, size= {}\n"
                "Initial Model 1: {}\n"
                "Initial Model 2: {}\n",
                d, s,
                PRG1.get_model("compact", true),
                PRG2.get_model("compact", true)
            );

+            ArrayXf y_pred = PRG1.predict(data);
            fmt::print("cross one\n");
-            auto Child1 = PRG1.cross(PRG2);
-            fmt::print(
-                "Model 1 after cross: {}\n"
-                "Model 2 after cross: {}\n",
-                PRG1.get_model("compact", true),
-                PRG2.get_model("compact", true)
-            );
-
-            fmt::print("cross two\n");
-            auto Child2 = PRG2.cross(PRG1);
-            fmt::print(
-                "Crossed Model 1      : {}\n"
-                "Crossed Model 1 depth: {}\n"
-                "Crossed Model 1 size : {}\n"
-                "Crossed Model 2      : {}\n"
-                "Crossed Model 2 depth: {}\n"
-                "Crossed Model 2 size : {}\n"
-                "=================================================\n",
-                Child1.get_model("compact", true),
-                Child1.Tree.max_depth(), Child1.Tree.size(),
-                Child2.get_model("compact", true),
-                Child2.Tree.max_depth(), Child2.Tree.size()
-            );
-
-            // Original didn't change
-            ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true));
-            ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true));
-
-            // Child1 is within restrictions
-            ASSERT_TRUE(Child1.size() > 0);
-            ASSERT_TRUE(Child1.size() <= s);
-            ASSERT_TRUE(Child1.Tree.size() > 0);
-            ASSERT_TRUE(Child1.Tree.size() <= s);
-            ASSERT_TRUE(Child1.Tree.max_depth() >= 0);
-            ASSERT_TRUE(Child1.Tree.max_depth() <= d);
-
-            // Child2 is within restrictions
-            ASSERT_TRUE(Child2.size() > 0);
-            ASSERT_TRUE(Child2.size() <= s);
-            ASSERT_TRUE(Child2.Tree.size() > 0);
-            ASSERT_TRUE(Child2.Tree.size() <= s);
-
-            ASSERT_TRUE(Child2.Tree.max_depth() >= 0);
-            ASSERT_TRUE(Child2.Tree.max_depth() <= d);
+            auto opt = PRG1.cross(PRG2);
+            if (!opt){
+                fmt::print(
+                    "=================================================\n"
+                    "depth = {}, size= {}\n"
+                    "Original model 1: {}\n"
+                    "Original model 2: {}\n"
+                    "Crossover failed to create a child\n",
+                    d, s,
+
PRG1.get_model("compact", true), + PRG2.get_model("compact", true) + ); + } + else { + successes += 1; + auto Child = opt.value(); + fmt::print( + "Original model 1 after cross: {}\n" + "Original model 2 after cross: {}\n", + PRG1.get_model("compact", true), + PRG2.get_model("compact", true) + ); + fmt::print( + "Crossed Model: {}\n" + "=================================================\n", + Child.get_model("compact", true) + ); + Child.fit(data); + auto child_pred1 = Child.predict(data); + } } + ASSERT_TRUE(successes > 0); } } -TEST(Operators, MutationSizeAndDepthPARAMS) -{ - PARAMS["mutation_options"] = { - {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight", 0.25} - }; - - MatrixXf X(10,2); - ArrayXf y(10); - X << 0.85595296, 0.55417453, 0.8641915 , 0.99481109, 0.99123376, - 0.9742618 , 0.70894019, 0.94940306, 0.99748867, 0.54205151, - - 0.5170537 , 0.8324005 , 0.50316305, 0.10173936, 0.13211973, - 0.2254195 , 0.70526861, 0.31406024, 0.07082619, 0.84034526; - - y << 3.55634251, 3.13854087, 3.55887523, 3.29462895, 3.33443517, - 3.4378868 , 3.41092345, 3.5087468 , 3.25110243, 3.11382179; - - Dataset data(X,y); - - SearchSpace SS; - SS.init(data); - - // split operator --> arity 3 - // prod operator --> arity 4 - int max_arity = 4; - - for (int d = 1; d < 10; ++d) - { - for (int s = 1; s < 10; ++s) - { - PARAMS["max_size"] = s; - PARAMS["max_depth"] = d; - - fmt::print("d={},s={}\n",d,s); - fmt::print("make_regressor\n"); - - RegressorProgram PRG = SS.make_regressor(0, 0); - - auto PRG_model = PRG.get_model("compact", true); - - auto Child = PRG.mutate(); - - // Child is within restrictions. Here we allow the mutation - // to generate slightly bigger expressions (because the original - // parents can also have this offset due to PTC2 generation method) - ASSERT_TRUE(Child.size() > 0); - ASSERT_TRUE(Child.size() <= s+max_arity); - - ASSERT_TRUE(Child.Tree.size() > 0); - ASSERT_TRUE(Child.Tree.size() <= s+max_arity); - - ASSERT_TRUE(Child.Tree.max_depth() >= 0); - ASSERT_TRUE(Child.Tree.max_depth() <= d+1); - } - } -} - -TEST(Operators, CrossoverSizeAndDepthPARAMS) +TEST(Operators, CrossoverSizeAndDepthLimit) { MatrixXf X(10,2); ArrayXf y(10); @@ -377,39 +384,67 @@ TEST(Operators, CrossoverSizeAndDepthPARAMS) // prod operator --> arity 4 int max_arity = 4; - for (int d = 1; d < 10; ++d) + for (int d = 5; d < 15; ++d) { - for (int s = 1; s < 10; ++s) + int successes = 0; + for (int s = 5; s < 15; ++s) { PARAMS["max_size"] = s; PARAMS["max_depth"] = d; - RegressorProgram PRG1 = SS.make_regressor(0, 0); - RegressorProgram PRG2 = SS.make_regressor(0, 0); + // Enforcing that the parents does not exceed max_size by + // taking into account the highest arity of the function nodes + RegressorProgram PRG1 = SS.make_regressor(d-1, s-max_arity); + RegressorProgram PRG2 = SS.make_regressor(d-1, s-max_arity); auto PRG1_model = PRG1.get_model("compact", true); auto PRG2_model = PRG2.get_model("compact", true); - auto Child1 = PRG1.cross(PRG2); - auto Child2 = PRG2.cross(PRG1); - - // Child1 is within restrictions - ASSERT_TRUE(Child1.size() > 0); - ASSERT_TRUE(Child1.size() <= s+max_arity); - ASSERT_TRUE(Child1.Tree.size() > 0); - ASSERT_TRUE(Child1.Tree.size() <= s+max_arity); - - ASSERT_TRUE(Child1.Tree.max_depth() >= 0); - ASSERT_TRUE(Child1.Tree.max_depth() <= d+1); - - // Child2 is within restrictions - ASSERT_TRUE(Child2.size() > 0); - ASSERT_TRUE(Child2.size() <= s+max_arity); - ASSERT_TRUE(Child2.Tree.size() > 0); - ASSERT_TRUE(Child2.Tree.size() <= s+max_arity); + fmt::print( + 
"=================================================\n" + "settings: depth = {}, size= {}\n" + "Original model 1: {}\n" + "depth = {}, size= {}\n" + "Original model 2: {}\n" + "depth = {}, size= {}\n", + d, s, + PRG1.get_model("compact", true), + PRG1.depth(), PRG1.size(), + PRG2.get_model("compact", true), + PRG2.depth(), PRG2.size() + ); - ASSERT_TRUE(Child2.Tree.max_depth() >= 0); - ASSERT_TRUE(Child2.Tree.max_depth() <= d+1); + fmt::print("cross\n"); + auto opt = PRG1.cross(PRG2); + + if (!opt){ + fmt::print("Crossover failed to create a child" + "=================================================\n"); + } + else { + successes += 1; + auto Child = opt.value(); + fmt::print( + "Child Model : {}\n" + "Child Model depth: {}\n" + "Child Model size : {}\n" + "=================================================\n", + Child.get_model("compact", true), + Child.depth(), Child.size() + ); + + // Original didn't change + ASSERT_TRUE(PRG1_model == PRG1.get_model("compact", true)); + ASSERT_TRUE(PRG2_model == PRG2.get_model("compact", true)); + + // Child is within restrictions + ASSERT_TRUE(Child.size() > 0); + ASSERT_TRUE(Child.size() <= s); + + ASSERT_TRUE(Child.depth() >= 0); + ASSERT_TRUE(Child.depth() <= d); + } } + ASSERT_TRUE(successes > 0); } -} +} \ No newline at end of file diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py index 2fadd1e4..5e38898f 100644 --- a/tests/python/test_brush.py +++ b/tests/python/test_brush.py @@ -60,3 +60,14 @@ def test_fit(setup, brush_args, request): pytest.fail(f"Unexpected Exception caught: {e}") logging.error(traceback.format_exc()) + +# def test_random_state(): # TODO: make it work +# test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) +# test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], +# [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T + +# est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) +# est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y) + +# assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \ +# "random state failed to generate same results" \ No newline at end of file diff --git a/tests/python/test_params.py b/tests/python/test_params.py new file mode 100644 index 00000000..03d08bc4 --- /dev/null +++ b/tests/python/test_params.py @@ -0,0 +1,95 @@ +import pytest + +import _brush +import time +from multiprocessing import Pool +import numpy as np + + +def test_param_random_state(): + # Check if make_regressor, mutation and crossover will create the same expressions + test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. 
+    test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0],
+                       [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T
+
+    data = _brush.Dataset(test_X, test_y)
+    SS = _brush.SearchSpace(data)
+
+    _brush.set_random_state(123)
+
+    first_run = []
+    for d in range(1,4):
+        for s in range(1,20):
+            prg = SS.make_regressor(d, s)
+            prg = prg.mutate()
+
+            if prg is not None: prg = prg.cross(prg)
+            if prg is not None: first_run.append(prg.get_model())
+
+    assert len(first_run) > 0, "either mutation or crossover is always failing"
+
+    _brush.set_random_state(123)
+
+    second_run = []
+    for d in range(1,4):
+        for s in range(1,20):
+            prg = SS.make_regressor(d, s)
+            prg = prg.mutate()
+
+            if prg is not None: prg = prg.cross(prg)
+            if prg is not None: second_run.append(prg.get_model())
+
+    assert len(second_run) > 0, "either mutation or crossover is always failing"
+
+    for fr, sr in zip(first_run, second_run):
+        assert fr == sr, "random state failed to generate same expressions"
+
+
+def _change_and_wait(config):
+    """Set only the mutation weight at `index` to 1, then wait `seconds`
+    seconds before retrieving the _brush PARAMS and printing the weight values."""
+    index, seconds = config
+
+    # Sample configuration
+    params = {
+        'verbosity': False,
+        'pop_size' : 100,
+        'max_gen'  : 100,
+        'max_depth': 5,
+        'max_size' : 50,
+        'mutation_options': {'point'            : 0.0,
+                             'insert'           : 0.0,
+                             'delete'           : 0.0,
+                             'subtree'          : 0.0,
+                             'toggle_weight_on' : 0.0,
+                             'toggle_weight_off': 0.0}
+    }
+
+    # We need a guaranteed order to use the index correctly
+    mutations = ['point', 'insert', 'delete', 'subtree', 'toggle_weight_on', 'toggle_weight_off']
+
+    for i, m in enumerate(mutations):
+        params['mutation_options'][m] = 0.0 if i != index else 1.0
+
+    print(f"(Process {index}) Setting mutation {mutations[index]} to 1 and waiting {seconds} seconds")
+
+    _brush.set_params(params)
+    time.sleep(seconds)
+
+    print(f"(Process {index}) Retrieving PARAMS: {_brush.get_params()['mutation_options']}")
+
+    assert params['mutation_options'] == _brush.get_params()['mutation_options'], \
+        f"(Process {index}) BRUSH FAILED TO KEEP SEPARATE INSTANCES OF `PARAMS` BETWEEN MULTIPLE PROCESSES"
+
+def test_global_PARAMS_sharing():
+    print("By default, all processes start with all mutations having weight zero.")
+
+    scale = 0.25 # Scale the wait time of each process (for manual inspection)
+
+    # Check that brush's PARAMS can be modified inside a pool without collateral
+    # effects. The configurations start in the order they are listed, but they
+    # finish at different times. All of them modify brush's PARAMS.
+    Pool(processes=3).map(_change_and_wait, [(0, 3*scale),
+                                             (1, 1*scale),
+                                             (2, 2*scale)])
+    
\ No newline at end of file
diff --git a/tests/python/test_program.py b/tests/python/test_program.py
index 5c7e51b7..78356bee 100644
--- a/tests/python/test_program.py
+++ b/tests/python/test_program.py
@@ -13,7 +13,7 @@ def test_data():
     test_y = np.array([1.,0.,1.4,1.,0.,1.,1.,0.,0.,0.])
 
     test_X = np.array([[1.1,2.0,3.0,4.0,5.0,6.5,7.0,8.0,9.0,10.0],
-                       [2.0,1.2,6.0,4.0,5.0,8.0,7.0,5.0,9.0,10.0]])
+                       [2.0,1.2,6.0,4.0,5.0,8.0,7.0,5.0,9.0,10.0]]).T
 
     return (test_X, test_y)
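
A note on the variation API these tests exercise: on the Python side, `mutate()` and `cross()` return None whenever no child satisfying `max_size`/`max_depth` can be produced, so callers are expected to check the result and retry, as test_param_random_state does inline. Below is a minimal sketch of that retry pattern, assuming only the binding behavior shown in the tests above; the helper name `variate_with_retries` and the `max_tries` parameter are illustrative, not part of this patch.

def variate_with_retries(prg, other=None, max_tries=10):
    # Retry mutation (or crossover, when `other` is given) until a child
    # is produced; both operators return None when they fail.
    for _ in range(max_tries):
        child = prg.mutate() if other is None else prg.cross(other)
        if child is not None:
            return child
    return None  # every attempt failed

With the fixtures above, `child = variate_with_retries(SS.make_regressor(3, 10))` would centralize the `is not None` checks that the tests currently repeat.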