From d70339ae310e8a366573fc39fa88a8b6f12c264b Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Wed, 11 Oct 2023 16:25:52 -0400
Subject: [PATCH] Implemented complexity. User can specify objectives

---
 src/bindings/bind_programs.h |  1 +
 src/brush/estimator.py       | 63 ++++++++++++++++-----------
 src/program/nodetype.h       |  3 +-
 src/program/program.h        |  8 ++++
 src/program/tree_node.cpp    | 96 ++++++++++++++++++++++++++++++++++++
 src/program/tree_node.h      |  2 +
 6 files changed, 148 insertions(+), 25 deletions(-)

diff --git a/src/bindings/bind_programs.h b/src/bindings/bind_programs.h
index f18e188e..2211aa90 100644
--- a/src/bindings/bind_programs.h
+++ b/src/bindings/bind_programs.h
@@ -46,6 +46,7 @@ void bind_program(py::module& m, string name)
         .def("get_dot_model", &T::get_dot_model, py::arg("extras")="")
         .def("get_weights", &T::get_weights)
         .def("size", &T::size, py::arg("include_weight")=true)
+        .def("complexity", &T::complexity)
         .def("depth", &T::depth)
         .def("cross", &T::cross, py::return_value_policy::automatic,
              "Performs one attempt to stochastically swap subtrees between two programs and generate a child")
diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index ce79efe4..cd99af71 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -57,6 +57,10 @@ class BrushEstimator(BaseEstimator):
         A dictionary with keys naming the function set and values giving the probability of
         sampling them, or a list of functions which will be weighted uniformly.
         If empty, all available functions are included in the search space.
+    objectives : list[str], default ["error", "size"]
+        list with one or more objectives to use. Options are `"error", "size", "complexity"`.
+        If `"error"` is used, then it will be the mean squared error for regression,
+        and accuracy for classification.
     initialization : {"grow", "full"}, default "grow"
         Strategy to create the initial population. If `full`, then every expression is created
         with `max_size` nodes. If `grow`, size will be uniformly distributed.
@@ -111,6 +115,7 @@ def __init__(
         mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6,
                             "toggle_weight_on":1/6, "toggle_weight_off":1/6},
         functions: list[str]|dict[str,float] = {},
+        objectives=["error", "size"],
         initialization="grow",
         algorithm="nsga2",
         random_state=None,
@@ -127,6 +132,7 @@ def __init__(
         self.cx_prob=cx_prob
         self.mutation_options=mutation_options
         self.functions=functions
+        self.objectives=objectives
         self.initialization=initialization
         self.random_state=random_state
         self.batch_size=batch_size
@@ -239,6 +245,13 @@ def fit(self, X, y):
             # elif "Softmax" not in self.functions_: # TODO: implement multiclassific.
             #     self.functions_["Softmax"] = 1.0
 
+        # Weight of each objective (+ for maximization, - for minimization)
+        obj_weight = {
+            "error"      : +1.0 if self.mode=="classification" else -1.0,
+            "size"       : -1.0,
+            "complexity" : -1.0
+        }
+        self.weights = [obj_weight[w] for w in self.objectives]
 
         # These have a default behavior to return something meaningfull if
         # no values are set
@@ -370,23 +383,24 @@ class BrushClassifier(BrushEstimator,ClassifierMixin):
     def __init__( self, **kwargs):
         super().__init__(mode='classification',**kwargs)
 
-        # Weight of each objective (+ for maximization, - for minimization)
-        self.weights = (+1.0,-1.0)
-
+    def _error(self, ind, data: _brush.Dataset):
+        return (data.y==ind.prg.predict(data)).sum() / data.y.shape[0]
+
     def _fitness_validation(self, ind, data: _brush.Dataset):
         # Fitness without fitting the expression, used with validation data
-        return ( # (accuracy, size)
-                (data.y==ind.prg.predict(data)).sum() / data.y.shape[0],
-                ind.prg.size()
-            )
+
+        ind_objectives = {
+            "error"     : self._error(ind, data),
+            "size"      : ind.prg.size(),
+            "complexity": ind.prg.complexity()
+        }
+        return [ ind_objectives[obj] for obj in self.objectives ]
 
     def _fitness_function(self, ind, data: _brush.Dataset):
         ind.prg.fit(data)
-        return ( # (accuracy, size)
-                (data.y==ind.prg.predict(data)).sum() / data.y.shape[0],
-                ind.prg.size()
-            )
-
+
+        return self._fitness_validation(ind, data)
+
     def _make_individual(self):
         # C++'s PTC2-based `make_individual` will create a tree of at least
         # the given size. By uniformly sampling the size, we can instantiate a
@@ -461,26 +475,27 @@ class BrushRegressor(BrushEstimator, RegressorMixin):
     def __init__(self, **kwargs):
         super().__init__(mode='regressor',**kwargs)
 
-        # Weight of each objective (+ for maximization, - for minimization)
-        self.weights = (-1.0,-1.0)
-
-    def _fitness_validation(self, ind, data: _brush.Dataset):
-        # Fitness without fitting the expression, used with validation data
-
+    def _error(self, ind, data: _brush.Dataset):
         MSE = np.mean( (data.y-ind.prg.predict(data))**2 )
         if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf
             MSE = np.inf
 
-        return ( MSE, ind.prg.size() )
+        return MSE
+
+    def _fitness_validation(self, ind, data: _brush.Dataset):
+        # Fitness without fitting the expression, used with validation data
+
+        ind_objectives = {
+            "error"     : self._error(ind, data),
+            "size"      : ind.prg.size(),
+            "complexity": ind.prg.complexity()
+        }
+        return [ ind_objectives[obj] for obj in self.objectives ]
 
     def _fitness_function(self, ind, data: _brush.Dataset):
         ind.prg.fit(data)
 
-        MSE = np.mean( (data.y-ind.prg.predict(data))**2 )
-        if not np.isfinite(MSE): # numeric erros, np.nan, +-np.inf
-            MSE = np.inf
-
-        return ( MSE, ind.prg.size() )
+        return self._fitness_validation(ind, data)
 
     def _make_individual(self):
         if self.initialization not in ["grow", "full"]:
diff --git a/src/program/nodetype.h b/src/program/nodetype.h
index ac00ccfd..7d153ab7 100644
--- a/src/program/nodetype.h
+++ b/src/program/nodetype.h
@@ -28,7 +28,8 @@ using Brush::Data::TimeSeriesf;
 
 namespace Brush {
 
-enum class NodeType : uint64_t {
+enum class NodeType : uint64_t { // Each node type must have a complexity
+                                 // in operator_complexities@tree_node.cpp
     // Unary
     Abs = 1UL << 0UL,
     Acos = 1UL << 1UL,
diff --git a/src/program/program.h b/src/program/program.h
index 523a0b90..ec0305b9 100644
--- a/src/program/program.h
+++ b/src/program/program.h
@@ -89,6 +89,14 @@ template<PT PType> struct Program
         SSref = std::optional<std::reference_wrapper<SearchSpace>>{s};
     }
 
+    /// @brief count the complexity of the program.
+    /// @return int complexity.
+    int complexity() const{
+        auto head = Tree.begin();
+
+        return head.node->get_complexity();
+    }
+
     /// @brief count the tree size of the program, including the weights in weighted nodes.
     /// @param include_weight whether to include the node's weight in the count.
     /// @return int number of nodes.
diff --git a/src/program/tree_node.cpp b/src/program/tree_node.cpp
index 0e4dfcd3..59b4088a 100644
--- a/src/program/tree_node.cpp
+++ b/src/program/tree_node.cpp
@@ -75,4 +75,100 @@ void from_json(const json &j, tree<Node> &t)
         stack.push_back(subtree);
     }
     t = stack.back();
+}
+
+/// Per-operator weights used by TreeNode::get_complexity().
+/// Every NodeType needs an entry here: the lookup uses `at()`, which throws otherwise.
+unordered_map<NodeType, int> operator_complexities = {
+    // Unary
+    {NodeType::Abs     , 3},
+    {NodeType::Acos    , 3},
+    {NodeType::Asin    , 3},
+    {NodeType::Atan    , 3},
+    {NodeType::Cos     , 3},
+    {NodeType::Cosh    , 3},
+    {NodeType::Sin     , 3},
+    {NodeType::Sinh    , 3},
+    {NodeType::Tan     , 3},
+    {NodeType::Tanh    , 3},
+    {NodeType::Ceil    , 3},
+    {NodeType::Floor   , 3},
+    {NodeType::Exp     , 3},
+    {NodeType::Log     , 3},
+    {NodeType::Logabs  , 3},
+    {NodeType::Log1p   , 3},
+    {NodeType::Sqrt    , 3},
+    {NodeType::Sqrtabs , 3},
+    {NodeType::Square  , 3},
+    {NodeType::Logistic, 3},
+
+    // timing masks
+    {NodeType::Before, 2},
+    {NodeType::After , 2},
+    {NodeType::During, 2},
+
+    // Reducers
+    {NodeType::Min   , 4},
+    {NodeType::Max   , 4},
+    {NodeType::Mean  , 4},
+    {NodeType::Median, 4},
+    {NodeType::Sum   , 4},
+    {NodeType::Prod  , 4},
+
+    // Transformers
+    {NodeType::Softmax, 4},
+
+    // Binary
+    {NodeType::Add, 1},
+    {NodeType::Sub, 1},
+    {NodeType::Mul, 1},
+    {NodeType::Div, 1},
+    {NodeType::Pow, 1},
+
+    //split
+    {NodeType::SplitBest, 2},
+    {NodeType::SplitOn  , 2},
+
+    // boolean
+    {NodeType::And, 1},
+    {NodeType::Or , 1},
+    {NodeType::Not, 1},
+
+    // leaves
+    {NodeType::MeanLabel, 1},
+    {NodeType::Constant , 1},
+    {NodeType::Terminal , 2},
+    {NodeType::ArgMax   , 2},
+    {NodeType::Count    , 2},
+
+    // custom
+    {NodeType::CustomUnaryOp , 5},
+    {NodeType::CustomBinaryOp, 5},
+    {NodeType::CustomSplit   , 5}
+};
+
+/// @brief Recursively compute the complexity of the subtree rooted at this node.
+///
+/// The result is this node's weight from `operator_complexities` multiplied by
+/// the sum of its children's complexities; when that sum is zero (no children),
+/// the node's own weight is returned instead.
+/// @return int complexity of the subtree.
+int TreeNode::get_complexity() const
+{
+    int node_complexity = operator_complexities.at(data.node_type);
+    int children_complexity = 0;
+
+    auto child = first_child;
+    for(int i = 0; i < data.get_arg_count(); ++i)
+    {
+        children_complexity += child->get_complexity();
+        child = child->next_sibling;
+    }
+
+    // Without children the sum stays 0 and the product below would collapse
+    // to 0 for every leaf, and therefore for every tree built on top of them.
+    if (children_complexity < 1)
+        children_complexity = 1;
+
+    return node_complexity*children_complexity;
 }
\ No newline at end of file
diff --git a/src/program/tree_node.h b/src/program/tree_node.h
index 81836137..64b54149 100644
--- a/src/program/tree_node.h
+++ b/src/program/tree_node.h
@@ -49,6 +49,8 @@ class tree_node_<Node> { // size: 5*4=20 bytes (on 32 bit arch), can be reduced
 
         string get_model(bool pretty=false) const;
         string get_tree_model(bool pretty=false, string offset="") const;
+
+        int get_complexity() const;
 };
 
 using TreeNode = class tree_node_<Node>;