Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented complexity. User can specify objectives #51

Merged
merged 1 commit into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/bindings/bind_programs.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void bind_program(py::module& m, string name)
.def("get_dot_model", &T::get_dot_model, py::arg("extras")="")
.def("get_weights", &T::get_weights)
.def("size", &T::size, py::arg("include_weight")=true)
.def("complexity", &T::complexity)
.def("depth", &T::depth)
.def("cross", &T::cross, py::return_value_policy::automatic,
"Performs one attempt to stochastically swap subtrees between two programs and generate a child")
Expand Down
63 changes: 39 additions & 24 deletions src/brush/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ class BrushEstimator(BaseEstimator):
A dictionary with keys naming the function set and values giving the probability
of sampling them, or a list of functions which will be weighted uniformly.
If empty, all available functions are included in the search space.
objectives : list[str], default ["error", "size"]
list with one or more objectives to use. Options are `"error", "size", "complexity"`.
If `"error"` is used, then it will be the mean squared error for regression,
and accuracy for classification.
initialization : {"grow", "full"}, default "grow"
Strategy to create the initial population. If `full`, then every expression is created
with `max_size` nodes. If `grow`, size will be uniformly distributed.
Expand Down Expand Up @@ -111,6 +115,7 @@ def __init__(
mutation_options = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6,
"toggle_weight_on":1/6, "toggle_weight_off":1/6},
functions: list[str]|dict[str,float] = {},
objectives=["error", "size"],
initialization="grow",
algorithm="nsga2",
random_state=None,
Expand All @@ -127,6 +132,7 @@ def __init__(
self.cx_prob=cx_prob
self.mutation_options=mutation_options
self.functions=functions
self.objectives=objectives
self.initialization=initialization
self.random_state=random_state
self.batch_size=batch_size
Expand Down Expand Up @@ -239,6 +245,13 @@ def fit(self, X, y):
# elif "Softmax" not in self.functions_: # TODO: implement multiclassific.
# self.functions_["Softmax"] = 1.0

# Weight of each objective (+ for maximization, - for minimization)
obj_weight = {
"error" : +1.0 if self.mode=="classification" else -1.0,
"size" : -1.0,
"complexity" : -1.0
}
self.weights = [obj_weight[w] for w in self.objectives]

# These have a default behavior to return something meaningful if
# no values are set
Expand Down Expand Up @@ -370,23 +383,24 @@ class BrushClassifier(BrushEstimator,ClassifierMixin):
def __init__( self, **kwargs):
    """Classification specialization of :class:`BrushEstimator`.

    All keyword arguments are forwarded unchanged to the base estimator;
    only ``mode`` is fixed to ``'classification'``.
    """
    super().__init__(mode='classification',**kwargs)
    # NOTE(review): the old hardcoded `self.weights = (+1.0, -1.0)` was
    # removed — objective weights are now derived from `self.objectives`
    # inside `fit()`, so setting them here was stale residue.

def _error(self, ind, data: _brush.Dataset):
    """Accuracy of individual ``ind`` on ``data`` (fraction of exact hits)."""
    n_correct = (data.y == ind.prg.predict(data)).sum()
    return n_correct / data.y.shape[0]
def _fitness_validation(self, ind, data: _brush.Dataset):
    """Fitness without fitting the expression, used with validation data.

    Returns one value per name in ``self.objectives``, in the same order.
    (The unreachable legacy ``return (accuracy, size)`` left over from the
    old two-objective code was removed — it made the objectives lookup
    below dead code.)
    """
    # NOTE(review): every supported objective is evaluated eagerly here,
    # even ones not requested in self.objectives — presumably cheap, but
    # worth confirming for `complexity()` on large trees.
    ind_objectives = {
        "error"     : self._error(ind, data),
        "size"      : ind.prg.size(),
        "complexity": ind.prg.complexity()
    }
    return [ ind_objectives[obj] for obj in self.objectives ]

def _fitness_function(self, ind, data: _brush.Dataset):
    """Fit ``ind`` on ``data``, then score it with the configured objectives.

    Delegates the scoring to ``_fitness_validation`` so training and
    validation fitness use the same objective list. (The unreachable
    legacy ``return (accuracy, size)`` diff residue was removed.)
    """
    ind.prg.fit(data)
    return self._fitness_validation(ind, data)

def _make_individual(self):
# C++'s PTC2-based `make_individual` will create a tree of at least
# the given size. By uniformly sampling the size, we can instantiate a
Expand Down Expand Up @@ -461,26 +475,27 @@ class BrushRegressor(BrushEstimator, RegressorMixin):
def __init__(self, **kwargs):
    """Regression specialization of :class:`BrushEstimator`.

    All keyword arguments are forwarded unchanged to the base estimator;
    only ``mode`` is fixed. NOTE(review): the mode string is 'regressor'
    while the classifier uses 'classification' — the code only ever tests
    `mode == "classification"`, so this works, but confirm before renaming.
    """
    super().__init__(mode='regressor',**kwargs)
    # NOTE(review): the old hardcoded `self.weights = (-1.0, -1.0)` was
    # removed — objective weights are now derived from `self.objectives`
    # inside `fit()`, so setting them here was stale residue.

def _error(self, ind, data: _brush.Dataset):
    """Mean squared error of ``ind`` on ``data``.

    Non-finite results (np.nan, +-np.inf from numeric errors) are mapped
    to ``np.inf`` so they sort as worst fitness. (A body-less leftover
    ``def _fitness_validation`` stub — a SyntaxError — and an unreachable
    duplicate return of ``(MSE, size)`` were removed; both were diff
    residue from the old two-objective code.)
    """
    MSE = np.mean( (data.y-ind.prg.predict(data))**2 )
    if not np.isfinite(MSE): # numeric errors, np.nan, +-np.inf
        MSE = np.inf

    return MSE

def _fitness_validation(self, ind, data: _brush.Dataset):
    """Score ``ind`` on validation data without refitting the expression.

    Produces one value per name in ``self.objectives``, preserving order.
    """
    objective_values = {
        "error"     : self._error(ind, data),
        "size"      : ind.prg.size(),
        "complexity": ind.prg.complexity(),
    }
    return [objective_values[name] for name in self.objectives]

def _fitness_function(self, ind, data: _brush.Dataset):
    """Fit ``ind`` on ``data``, then score it with the configured objectives.

    Delegates scoring to ``_fitness_validation`` so training and validation
    fitness agree on the objective list. (The dead legacy MSE computation
    and its unreachable ``return (MSE, size)`` — diff residue shadowed by
    the delegation below — were removed; the MSE now lives in ``_error``.)
    """
    ind.prg.fit(data)

    return self._fitness_validation(ind, data)

def _make_individual(self):
if self.initialization not in ["grow", "full"]:
Expand Down
3 changes: 2 additions & 1 deletion src/program/nodetype.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ using Brush::Data::TimeSeriesf;

namespace Brush {

enum class NodeType : uint64_t {
enum class NodeType : uint64_t { // Each node type must have a complexity
// in operator_complexities@tree_node.cpp
// Unary
Abs = 1UL << 0UL,
Acos = 1UL << 1UL,
Expand Down
8 changes: 8 additions & 0 deletions src/program/program.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,14 @@ template<PT PType> struct Program
SSref = std::optional<std::reference_wrapper<SearchSpace>>{s};
}

/// @brief Complexity of the whole program, computed recursively from the
/// root node (see TreeNode::get_complexity).
/// @return int complexity score.
int complexity() const{
    // Delegate straight to the root node of the expression tree.
    return Tree.begin().node->get_complexity();
}

/// @brief count the tree size of the program, including the weights in weighted nodes.
/// @param include_weight whether to include the node's weight in the count.
/// @return int number of nodes.
Expand Down
83 changes: 83 additions & 0 deletions src/program/tree_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,87 @@ void from_json(const json &j, tree<Node> &t)
stack.push_back(subtree);
}
t = stack.back();
}

// Per-operator complexity scores used by TreeNode::get_complexity().
// Higher values penalize the operator more when "complexity" is an
// optimization objective. Every NodeType must have an entry here —
// get_complexity() uses .at(), which throws for missing keys.
unordered_map<NodeType, int> operator_complexities = {
    // Unary
    {NodeType::Abs     , 3},
    {NodeType::Acos    , 3},
    {NodeType::Asin    , 3},
    {NodeType::Atan    , 3},
    {NodeType::Cos     , 3},
    {NodeType::Cosh    , 3},
    {NodeType::Sin     , 3},
    {NodeType::Sinh    , 3},
    {NodeType::Tan     , 3},
    {NodeType::Tanh    , 3},
    {NodeType::Ceil    , 3},
    {NodeType::Floor   , 3},
    {NodeType::Exp     , 3},
    {NodeType::Log     , 3},
    {NodeType::Logabs  , 3},
    {NodeType::Log1p   , 3},
    {NodeType::Sqrt    , 3},
    {NodeType::Sqrtabs , 3},
    {NodeType::Square  , 3},
    {NodeType::Logistic, 3},

    // timing masks
    {NodeType::Before, 2},
    {NodeType::After , 2},
    {NodeType::During, 2},

    // Reducers
    {NodeType::Min   , 4},
    {NodeType::Max   , 4},
    {NodeType::Mean  , 4},
    {NodeType::Median, 4},
    {NodeType::Sum   , 4},
    {NodeType::Prod  , 4},

    // Transformers
    {NodeType::Softmax, 4},

    // Binary
    {NodeType::Add, 1},
    {NodeType::Sub, 1},
    {NodeType::Mul, 1},
    {NodeType::Div, 1},
    {NodeType::Pow, 1},

    //split
    {NodeType::SplitBest, 2},
    {NodeType::SplitOn  , 2},

    // boolean
    {NodeType::And, 1},
    {NodeType::Or , 1},
    {NodeType::Not, 1},

    // leaves
    {NodeType::MeanLabel, 1},
    {NodeType::Constant , 1},
    {NodeType::Terminal , 2},
    {NodeType::ArgMax   , 2},
    {NodeType::Count    , 2},

    // custom
    {NodeType::CustomUnaryOp , 5},
    {NodeType::CustomBinaryOp, 5},
    {NodeType::CustomSplit   , 5}
};

/// @brief Recursive complexity of the subtree rooted at this node.
/// A leaf's complexity is its own operator score; an internal node's is
/// its operator score multiplied by the sum of its children's scores.
/// @return int complexity, always >= 1.
int TreeNode::get_complexity() const
{
    int node_complexity = operator_complexities.at(data.node_type);
    int children_complexity = 0;

    auto child = first_child;
    for(int i = 0; i < data.get_arg_count(); ++i)
    {
        children_complexity += child->get_complexity();
        child = child->next_sibling;
    }

    // BUG FIX: leaves have arg_count == 0, so children_complexity stayed 0
    // and node_complexity*0 returned 0 — and that zero propagated up the
    // product, making every tree's complexity 0. Clamp the child sum to 1
    // so leaves contribute their own score and products stay meaningful.
    if (children_complexity == 0)
        children_complexity = 1;

    return node_complexity*children_complexity;
}
2 changes: 2 additions & 0 deletions src/program/tree_node.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class tree_node_<Node> { // size: 5*4=20 bytes (on 32 bit arch), can be reduced

string get_model(bool pretty=false) const;
string get_tree_model(bool pretty=false, string offset="") const;

int get_complexity() const;
};
using TreeNode = class tree_node_<Node>;

Expand Down
Loading