From 3328d77751d46d148ca594058ade1eb454be9306 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Tue, 25 Jul 2023 10:16:24 -0400
Subject: [PATCH 01/17] `get_prob_change` returns 0 if node is fixed

Avoid choosing fixed nodes in variation operations
---
 src/program/node.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/program/node.h b/src/program/node.h
index 8742fdc7..42791173 100644
--- a/src/program/node.h
+++ b/src/program/node.h
@@ -237,7 +237,7 @@ struct Node {
 ////////////////////////////////////////////////////////////////////////////////
 // getters and setters
 //TODO revisit
-    float get_prob_change() const { return this->prob_change;};
+    float get_prob_change() const { return fixed ? 0.0 : this->prob_change;};
     void set_prob_change(float w){ if (!fixed) this->prob_change = w;};
     float get_prob_keep() const { return 1-this->prob_change;};

From e36b205cae342e0e189f5f450bd5b85727e9608e Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Wed, 26 Jul 2023 14:11:25 -0400
Subject: [PATCH 02/17] Added comment about `random_state` with multithreads

---
 src/brush/estimator.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index 3fe983bb..e62971de 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -48,7 +48,12 @@ class BrushEstimator(BaseEstimator):
         If empty, all available functions are included in the search space.
     random_state: int or None, default None
         If int, then the value is used to seed the c++ random generator; if None,
-        then a seed will be generated using a non-deterministic generator.
+        then a seed will be generated using a non-deterministic generator. Note
+        that, even if the random state is fixed, it is unlikely that running
+        brush using multiple threads will produce the same results. This
+        happens because the Operating System's scheduler is responsible for
+        choosing which thread will run at any given time, thus reproducibility
+        is not guaranteed.

     Attributes
     ----------

From 39cf94f6ce64c7b300f5ee220b180b6868e49d25 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Wed, 26 Jul 2023 14:12:46 -0400
Subject: [PATCH 03/17] Fixed test cases that cause core dumps

Although these core dumps were not occurring, the size mismatch between
X and y would crash the tests if we attempted any operation that relies
on these values. I fixed this, as I'm starting to make the search space
initialize weights (the tests were crashing for this exact reason)
---
 src/program/node.h           | 2 +-
 src/program/operator.h       | 4 ++--
 tests/python/test_params.py  | 6 +++---
 tests/python/test_program.py | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/program/node.h b/src/program/node.h
index 42791173..cf4541ef 100644
--- a/src/program/node.h
+++ b/src/program/node.h
@@ -39,7 +39,7 @@ using Brush::Data::Dataset;

 namespace Brush{

-// should I move this declaration to another place?
+// TODO: should I move this declaration to another place?
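// (Aside — a usage sketch of the helper declared below, not prescribed by
// this patch: `Isnt` folds an equality test over its template parameter
// pack, so it returns true only when `dt` matches none of the listed types:
//     Isnt<DataType::ArrayB, DataType::ArrayI>(DataType::ArrayF); // true
//     Isnt<DataType::ArrayB>(DataType::ArrayB);                   // false
// )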
template inline auto Isnt(DataType dt) -> bool { return !((dt == T) || ...); } diff --git a/src/program/operator.h b/src/program/operator.h index 3f8b8927..195afc6a 100644 --- a/src/program/operator.h +++ b/src/program/operator.h @@ -16,7 +16,7 @@ namespace util{ /// @param weights option pointer to a weight array, used in place of node weight /// @return template - requires (!is_one_of_v) + requires (!is_one_of_v) Scalar get_weight(const TreeNode& tn, const W** weights=nullptr) { Scalar w; @@ -46,7 +46,7 @@ namespace util{ return w; }; template - requires (is_one_of_v) + requires (is_one_of_v) Scalar get_weight(const TreeNode& tn, const W** weights=nullptr) { // we cannot weight a boolean feature. Nevertheless, we need to provide diff --git a/tests/python/test_params.py b/tests/python/test_params.py index 0b91b7d5..a24a2a5c 100644 --- a/tests/python/test_params.py +++ b/tests/python/test_params.py @@ -7,9 +7,9 @@ def test_random_state(): - test_y = np.array([1.,0.,1.4,1.,0.,1.,1.,0.,0.,0.]) - test_X = np.array([[1.1,2.0,3.0,4.0,5.0,6.5,7.0,8.0,9.0,10.0], - [2.0,1.2,6.0,4.0,5.0,8.0,7.0,5.0,9.0,10.0]]) + test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ]) + test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0], + [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T data = _brush.Dataset(test_X, test_y) SS = _brush.SearchSpace(data) diff --git a/tests/python/test_program.py b/tests/python/test_program.py index 5c7e51b7..78356bee 100644 --- a/tests/python/test_program.py +++ b/tests/python/test_program.py @@ -13,7 +13,7 @@ def test_data(): test_y = np.array([1.,0.,1.4,1.,0.,1.,1.,0.,0.,0.]) test_X = np.array([[1.1,2.0,3.0,4.0,5.0,6.5,7.0,8.0,9.0,10.0], - [2.0,1.2,6.0,4.0,5.0,8.0,7.0,5.0,9.0,10.0]]) + [2.0,1.2,6.0,4.0,5.0,8.0,7.0,5.0,9.0,10.0]]).T return (test_X, test_y) From 3bd7063b525af3357721ff2dcef4bb6d9b530df3 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 26 Jul 2023 14:14:20 -0400 Subject: [PATCH 04/17] Fix wrong axis used in normalize --- src/util/utils.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/util/utils.cpp b/src/util/utils.cpp index 05243649..9f469618 100644 --- a/src/util/utils.cpp +++ b/src/util/utils.cpp @@ -105,10 +105,10 @@ void Normalizer::fit(MatrixXf& X, const vector& dt) for (unsigned int i=0; iscale_all || dtypes.at(i)=='f') { - X.row(i) = X.row(i).array() - offset.at(i); + X.col(i) = X.col(i).array() - offset.at(i); if (scale.at(i) > NEAR_ZERO) - X.row(i) = X.row(i).array()/scale.at(i); + X.col(i) = X.col(i).array()/scale.at(i); } } } From fb925a168d6a62b02a3e8652228e775b585c8db6 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Wed, 26 Jul 2023 14:16:22 -0400 Subject: [PATCH 05/17] Initializing weights of floating number terminals --- src/search_space.cpp | 60 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/src/search_space.cpp b/src/search_space.cpp index 5949b5a6..2b6d6d9f 100644 --- a/src/search_space.cpp +++ b/src/search_space.cpp @@ -4,6 +4,35 @@ namespace Brush{ + +float calc_initial_weight(const ArrayXf& value, const ArrayXf& y) +{ + // weights are initialized as the slope of the z-score of x and y. + + // If y has different length from X, we get a core dump here. 
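    // (A minimal sketch of such a guard — illustrative only, assuming Eigen's
    // size() and the HANDLE_ERROR_THROW macro used elsewhere in Brush:
    //     if (value.size() != y.size())
    //         HANDLE_ERROR_THROW("terminal and target lengths differ");
    // )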
+    // TODO: need to make SS (or Dataset) check for this when loading the data

+    vector dtypes = {'f', 'f'};

+    MatrixXf data(value.size(), 2);

+    data.col(0) << value;
+    data.col(1) << y;

+    Normalizer n(true);
+    n.fit_normalize(data, dtypes); // normalize works column-wise

+    // In the slope function, argument order matters (if not normalized with z-score)
+    // The feature should be the first value, and the true value the second
+    // (it will divide covar(arg1, arg2) by var(arg2)).
+    // Since z-score normalizes so mean=0 and std=1, order doesn't matter
+    float prob_change = std::abs(slope(data.col(0).array(),   // x
+                                       data.col(1).array())); // y

+    return prob_change;
+}
+
+
 /// @brief generate terminals from the dataset features and random constants.
 /// @param d a dataset
 /// @return a vector of nodes
@@ -19,21 +48,40 @@ vector generate_terminals(const Dataset& d)
         using Scalar = typename T::Scalar;
         constexpr bool weighted = std::is_same_v;
         // if constexpr (T::Scalar == float)
-        terminals.push_back(Node(
+        auto n = Node(
             NodeType::Terminal,
             Signature{},
             weighted,
             feature_name
-        ));
+        );
+
+        float prob_change = 1.0;
+
+        // if the value can be cast to a float array, we can calculate the slope
+        if (std::holds_alternative(value)) {
+            prob_change = calc_initial_weight(std::get(value), d.y);
+        }
+
+        n.set_prob_change( prob_change );
+
+        terminals.push_back(n);
         },
         value
     );
     ++i;
 };

-    // add a constant
-    terminals.push_back( Node(NodeType::Constant, Signature{}, true, "C"));
-    terminals.push_back( Node(NodeType::Constant, Signature{}, true, "C"));
+    // add constants
+    float num_const_prob_change = calc_initial_weight(VectorXf::Ones(d.y.size()), d.y);
+
+    auto cXf = Node(NodeType::Constant, Signature{}, true, "C");
+    // cXf.set_prob_change(num_const_prob_change);
+    terminals.push_back(cXf);
+
+    auto cXi = Node(NodeType::Constant, Signature{}, true, "C");
+    // cXi.set_prob_change(num_const_prob_change);
+    terminals.push_back(cXi);
+
     terminals.push_back( Node(NodeType::Constant, Signature{}, false, "C"));

     return terminals;
@@ -77,7 +125,7 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user
             /* fmt::print("terminal ret_type: {}\n", DataTypeName[term.ret_type]); */
             terminal_map[term.ret_type].push_back(term);
-            terminal_weights[term.ret_type].push_back(1.0);
+            terminal_weights[term.ret_type].push_back(term.get_prob_change());
         }
     };

From 174d9a55e967d300b18c3f6500fcf6106cb88e96 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Wed, 26 Jul 2023 14:16:43 -0400
Subject: [PATCH 06/17] Improved test with comments explaining expected
 behavior

---
 tests/cpp/test_search_space.cpp | 57 +++++++++++++++------------------
 1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/tests/cpp/test_search_space.cpp b/tests/cpp/test_search_space.cpp
index 18e1fa8c..84dd9bf3 100644
--- a/tests/cpp/test_search_space.cpp
+++ b/tests/cpp/test_search_space.cpp
@@ -5,44 +5,37 @@

 TEST(SearchSpace, Initialization)
 {
-    MatrixXf X(4,2);
-    MatrixXf X_v(3,2);
-    X << 0,1,
-         0.47942554,0.87758256,
-         0.84147098, 0.54030231,
-         0.99749499, 0.0707372;
-    X_v << 0.90929743, -0.41614684,
-           0.59847214, -0.80114362,
-           0.14112001,-0.9899925;
+    ArrayXf y(4);
+    y << 3.00000, 3.59876, 7.18622, 15.19294;

-    X.transposeInPlace();
-    X_v.transposeInPlace();
+    // variables have different pairwise correlations with y. The idea is to
+    // see the mutation weights for each floating variable. The slopes were
+    // calculated using python np.cov(xprime, yprime)[0][1]/np.var(xprime),
+    // where xprime and yprime are the z-score normalized arrays obtained
+    // from x and y.
+    MatrixXf X(5,4);
+    X << 0,       0,       1,       1,       // x0, binary, expected weight=1
+         2,       0,       1,       2,       // x1, categorical, expected weight=1
+         0.05699, 0.62737, 0.72406, 0.99294, // x2, slope ~= 1.069
+         0.03993, 0.36558, 0.01393, 0.25878, // x3, slope ~= 0.25
+         5.17539, 7.63579,-2.82560, 0.24645; // x4, slope ~= -0.799
+    X.transposeInPlace(); // 4 rows x 5 variables

-    ArrayXf y(4);
-    ArrayXf y_v(3);
-    // y = 2*x1 + 3.x2
-    y << 3.0, 3.59159876, 3.30384889, 2.20720158;
-    y_v << 0.57015434, -1.20648656, -2.68773747;

     Dataset dt(X, y);
-    Dataset dv(X_v, y_v);
-
+
+    // different weights to check if the searchspace is initialized correctly
     unordered_map user_ops = {
-        {"Add", 1},
-        {"Sub", 1},
-        {"Div", .5},
-        {"Times", 0.5}
+        {"Add", 1},
+        {"Sub", 1},
+        {"Div", .5},
+        {"Mul", 0.5}
     };

-    // SearchSpace SS;
     SearchSpace SS;
     SS.init(dt, user_ops);

-/* dtable_fit.print(); */
-/* dtable_predict.print(); */
-}
-
-// TODO: check if searchspace recognizes when PTC2 is not going to work (avoid infinite loops)
-
-// TODO: test search space when I set only incompatible functions and terminals (should work)
\ No newline at end of file
+    dt.print();
+    SS.print();
+    // dtable_fit.print();
+    // dtable_predict.print();
+}
\ No newline at end of file

From 542727b40e43ec0deeaf5e858f3887bd364cb8c1 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Fri, 4 Aug 2023 15:23:11 -0400
Subject: [PATCH 07/17] Test to check if random_state works (currently not)

---
 tests/python/test_brush.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py
index 2fadd1e4..38c381dc 100644
--- a/tests/python/test_brush.py
+++ b/tests/python/test_brush.py
@@ -60,3 +60,14 @@ def test_fit(setup, brush_args, request):
         pytest.fail(f"Unexpected Exception caught: {e}")
         logging.error(traceback.format_exc())
+
+# def test_random_state():
+#     test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ])
+#     test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0],
+#                        [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T

+#     est1 = brush.BrushRegressor(random_state=42).fit(test_X, test_y)
+#     est2 = brush.BrushRegressor(random_state=42).fit(test_X, test_y)

+#     assert est1.best_estimator_.get_model() == est2.best_estimator_.get_model(), \
+#         "random state failed to generate same results"
\ No newline at end of file

From 3a150f8f8ca8503ed4da8ec6b2f6e562d147ce56 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Fri, 4 Aug 2023 15:23:40 -0400
Subject: [PATCH 08/17] Test to check if brush's random_state works (yes!)

---
 tests/python/test_params.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/python/test_params.py b/tests/python/test_params.py
index a24a2a5c..e6166c4e 100644
--- a/tests/python/test_params.py
+++ b/tests/python/test_params.py
@@ -6,7 +6,8 @@
 import numpy as np


-def test_random_state():
+def test_param_random_state():
+    # Check if make_regressor, mutation and crossover will create the same expressions
     test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ])
     test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0],
                        [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T

     data = _brush.Dataset(test_X, test_y)
     SS = _brush.SearchSpace(data)

     for d in range(1,4):
         for s in range(1,20):
             prg = SS.make_regressor(d, s)
-            first_run.append(prg.get_model())
+            prg = prg.mutate()
+
+            if prg is not None: prg = prg.cross(prg)
+            if prg is not None: first_run.append(prg.get_model())
+
+    assert len(first_run) > 0, "either mutation or crossover is always failing"
+
     _brush.set_random_state(123)

     second_run = []
     for d in range(1,4):
         for s in range(1,20):
             prg = SS.make_regressor(d, s)
-            second_run.append(prg.get_model())
+            prg = prg.mutate()
+
+            if prg is not None: prg = prg.cross(prg)
+            if prg is not None: second_run.append(prg.get_model())
+
+    assert len(second_run) > 0, "either mutation or crossover is always failing"
+
     for fr, sr in zip(first_run, second_run):
         assert fr==sr, "random state failed to generate same expressions"

From d450c219b0b8e63ec4f0ae6b6a5c15857f614c37 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Fri, 4 Aug 2023 15:24:16 -0400
Subject: [PATCH 09/17] Implemented logic for train and validation partitions,
 and batch learning

---
 src/bindings/bind_dataset.cpp | 87 +++++++++++++++++++++++++++++------
 src/data/data.cpp             | 52 +++++++++++++++++++--
 src/data/data.h               | 57 +++++++++++++++++++----
 3 files changed, 171 insertions(+), 25 deletions(-)

diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp
index 3c405f8f..6ff01ffb 100644
--- a/src/bindings/bind_dataset.cpp
+++ b/src/bindings/bind_dataset.cpp
@@ -9,29 +9,90 @@ namespace nl = nlohmann;
 void bind_dataset(py::module & m)
 {
     py::class_(m, "Dataset")
+
         // construct from X
-        .def(py::init &>())
+        // .def(py::init &>())
+        // construct from X (and optional validation and batch sizes) with constructor 3.
+        .def(py::init([](const Ref& X,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+            return br::Data::Dataset(
+                X, {}, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
+        )

         // construct from X, feature names
-        .def(py::init<
-            const Ref&,
-            const vector&
-            >()
-        )
+        // .def(py::init<
+        //     const Ref&,
+        //     const vector&
+        //     >()
+        // )
+        // construct from X, feature names (and optional validation and batch sizes) with constructor 3.
+        .def(py::init([](const Ref& X,
+                         const vector& vn,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+            return br::Data::Dataset(
+                X, vn, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("vn"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
+        )

-        // construct from X, y arrays
-        .def(py::init &, Ref &>())
+        // construct from X, y arrays
+        // .def(py::init &, Ref &>())
+        // construct from X, y arrays (and optional validation and batch sizes) with constructor 2.
+        .def(py::init([](const Ref& X,
+                         const Ref& y,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+            return br::Data::Dataset(
+                X, y, {}, {}, false, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("y"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
+        )

        // construct from X, y, feature names
-        .def(py::init<
-            const Ref&,
-            const Ref&,
-            const vector&
-            >()
-        )
+        // .def(py::init<
+        //     const Ref&,
+        //     const Ref&,
+        //     const vector&
+        //     >()
+        // )
+        // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2.
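        // (Python-side usage sketch for the overload defined below —
        // illustrative values only, with `vn` being the feature-names
        // keyword at this point in the series:
        //     d = _brush.Dataset(X, y, vn=["x1", "x2"],
        //                        validation_size=0.2, batch_size=0.5)
        // )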
+        .def(py::init([](const Ref& X,
+                         const Ref& y,
+                         const vector& vn,
+                         const float validation_size=0.0,
+                         const float batch_size=1.0){
+            return br::Data::Dataset(
+                X, y, vn, {}, false, validation_size, batch_size);
+            }),
+            py::arg("X"),
+            py::arg("y"),
+            py::arg("vn"),
+            py::arg("validation_size") = 0.0,
+            py::arg("batch_size") = 1.0
+        )
+
         .def_readwrite("y", &br::Data::Dataset::y)
-        // .def_readwrite("features", &br::Data::Dataset::features)
+        // .def_readwrite("features", &br::Data::Dataset::features)
         .def("get_n_samples", &br::Data::Dataset::get_n_samples)
         .def("get_n_features", &br::Data::Dataset::get_n_features)
         .def("print", &br::Data::Dataset::print)
         .def("get_batch", &br::Data::Dataset::get_batch)
+        .def("get_training_data", &br::Data::Dataset::get_training_data)
+        .def("get_validation_data", &br::Data::Dataset::get_validation_data)
+        .def("get_batch_size", &br::Data::Dataset::get_batch_size)
+        .def("set_batch_size", &br::Data::Dataset::set_batch_size)
+        .def("split", &br::Data::Dataset::split)
         .def("get_X", &br::Data::Dataset::get_X)
     ;

diff --git a/src/data/data.cpp b/src/data/data.cpp
index 449f2d00..c2fdaafc 100644
--- a/src/data/data.cpp
+++ b/src/data/data.cpp
@@ -130,20 +130,36 @@ Dataset Dataset::operator()(const vector& idx) const
     return Dataset(new_features, new_y, this->classification);
 }

-Dataset Dataset::get_batch(int batch_size) const
+Dataset Dataset::get_batch() const
 {
+    // will always return a new dataset, even when use_batch is false (in this case, it returns itself)

-    batch_size = std::min(batch_size,int(this->get_n_samples()));
-    return (*this)(r.shuffled_index(get_n_samples()));
+    if (!use_batch)
+        return (*this);
+
+    auto n_samples = int(this->get_n_samples());
+    // guarantee that at least one sample is returned, since
+    // use_batch is true only if batch_size is in (0, 1), and ceil will round
+    // up
+    n_samples = int(ceil(n_samples*batch_size));
+
+    return (*this)(r.shuffled_index(n_samples));
 }

 array Dataset::split(const ArrayXb& mask) const
 {
+    // TODO: assert that mask is not filled with zeros or ones (would create
+    // one empty partition)
+
     // split data into two based on mask.
     auto idx1 = Util::mask_to_index(mask);
     auto idx2 = Util::mask_to_index((!mask));
     return std::array{ (*this)(idx1), (*this)(idx2) };
 }
+
+Dataset Dataset::get_training_data() const { return (*this)(training_data_idx); }
+Dataset Dataset::get_validation_data() const { return (*this)(validation_data_idx); }
+
 /// call init at the end of constructors
 /// to define metafeatures of the data.
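/// (Sketch of the behavior implemented in init() below, assuming
/// validation_size=0.25 and 100 samples: indices are shuffled once, the
/// first ceil(100*(1-0.25)) = 75 become training_data_idx and the remaining
/// 25 become validation_data_idx; with validation_size=0, both index sets
/// are identical, so training and validation data coincide.)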
void Dataset::init()
 {
@@ -172,6 +188,36 @@ void Dataset::init()

         // add feature to appropriate map list
         this->features_of_type[feature_type].push_back(name);
     }
+
+    // setting the training and validation data indices
+    auto n_samples = int(this->get_n_samples());
+    auto idx = r.shuffled_index(n_samples);
+
+    // guarantee that at least one training sample is kept, since
+    // use_validation is true only if validation_size is in (0, 1), and ceil
+    // will round up
+    auto n_train_samples = int(ceil(n_samples*(1-validation_size)));
+
+    training_data_idx.resize(0);
+    std::transform(idx.begin(), idx.begin() + n_train_samples,
+                   back_inserter(training_data_idx),
+                   [&](int element) { return element; });
+
+    if ( use_validation && (n_samples - n_train_samples != 0) ) {
+        validation_data_idx.resize(0);
+        std::transform(idx.begin() + n_train_samples, idx.end(),
+                       back_inserter(validation_data_idx),
+                       [&](int element) { return element; });
+    }
+    else {
+        validation_data_idx = training_data_idx;
+    }
+}
+
+float Dataset::get_batch_size() { return batch_size; }
+void Dataset::set_batch_size(float new_size) {
+    batch_size = new_size;
+    use_batch = batch_size > 0.0 && batch_size < 1.0;
 }

 /// turns input data into a feature map
diff --git a/src/data/data.h b/src/data/data.h
index 79814fe5..629c02b5 100644
--- a/src/data/data.h
+++ b/src/data/data.h
@@ -50,27 +50,39 @@ class Dataset
     //Dataset(ArrayXXf& X, ArrayXf& y, std::map, vector>>& Z): X(X), y(y), Z(Z){}

 private:
+    vector training_data_idx;
+    vector validation_data_idx;
+
 public:
     /// @brief keeps track of the unique data types in the dataset.
     std::vector unique_data_types;

     /// @brief types of data in the features.
     std::vector feature_types;

     /// @brief map from data types to features having that type.
     std::unordered_map> features_of_type;

     /// @brief dataset features, as key value pairs
     std::map features;
     // TODO: this should probably be a more complex type to include feature type
     // and potentially other info, like arbitrary relations between features

     /// @brief length N array, the target label
     ArrayXf y;

     /// @brief whether this is a classification problem
     bool classification;

     std::optional> Xref;

+    /// @brief percentage of original data used for training. If 0.0, then all data is used for training and validation
+    float validation_size;
+    bool use_validation;
+
+    /// @brief percentage of training data size to use in each batch. If 1.0, then all data is used
+    float batch_size;
+    bool use_batch;
+
     Dataset operator()(const vector& idx) const;

     /// call init at the end of constructors
     /// to define metafeatures of the data.
     void init();

     Dataset(std::map& d,
             const Ref& y_ = ArrayXf(),
             const vector& vn = {}
         );

-    /// initialize data from a map.
+    /// 1. initialize data from a map.
     Dataset(std::map& d,
             const Ref& y_ = ArrayXf(),
-            bool c = false
+            bool c = false,
+            float validation_size = 0.0,
+            float batch_size = 1.0
             )
             : features(d)
             , y(y_)
             , classification(c)
+            , validation_size(validation_size)
+            , use_validation(validation_size > 0.0 && validation_size < 1.0)
+            , batch_size(batch_size)
+            , use_batch(batch_size > 0.0 && batch_size < 1.0)
             {init();};

-    /// initialize data from a matrix with feature columns.
+    /// 2. initialize data from a matrix with feature columns.
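    /// (Usage sketch for this constructor — the names and fractions are
    /// illustrative only:
    ///     Dataset d(X, y, {"x1","x2"}, {}, false,
    ///               /*validation_size=*/0.2f, /*batch_size=*/0.5f);
    /// )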
Dataset(const ArrayXXf& X, const Ref& y_ = ArrayXf(), const vector& vn = {}, const map& Z = {}, - bool c = false + bool c = false, + float validation_size = 0.0, + float batch_size = 1.0 ) : features(make_features(X,Z,vn)) , y(y_) , classification(c) + , validation_size(validation_size) + , use_validation(validation_size > 0.0 && validation_size < 1.0) + , batch_size(batch_size) + , use_batch(batch_size > 0.0 && batch_size < 1.0) { init(); Xref = optional>{X}; } - Dataset(const ArrayXXf& X, const vector& vn) + /// 3. initialize data from X and feature names + Dataset(const ArrayXXf& X, const vector& vn, + float validation_size = 0.0, + float batch_size = 1.0) : classification(false) , features(make_features(X,map{},vn)) + , validation_size(validation_size) + , use_validation(validation_size > 0.0 && validation_size < 1.0) + , batch_size(batch_size) + , use_batch(batch_size > 0.0 && batch_size < 1.0) { init(); Xref = optional>{X}; @@ -137,7 +168,12 @@ class Dataset HANDLE_ERROR_THROW("Dataset does not hold a reference to X."); return this->Xref.value().get(); } - void set_validation(bool v=true); + + // inner partition of original dataset for train and validation. + // if split is not set, then training = validation. + Dataset get_training_data() const; + Dataset get_validation_data() const; + inline int get_n_samples() const { return std::visit( [&](auto&& arg) -> int { return int(arg.size());}, @@ -146,7 +182,10 @@ class Dataset }; inline int get_n_features() const { return this->features.size(); }; /// select random subset of data for training weights. - Dataset get_batch(int batch_size) const; + Dataset get_batch() const; + + float get_batch_size(); + void set_batch_size(float new_size); std::array split(const ArrayXb& mask) const; From 665057bfd0831fef881c7f7e1271205304365c99 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Fri, 4 Aug 2023 15:28:08 -0400 Subject: [PATCH 10/17] Using validation partition and batch learning in nsga2 --- src/brush/deap_api/nsga2.py | 56 ++++++++++++----- src/brush/estimator.py | 116 ++++++++++++++++++++++++++++++------ 2 files changed, 140 insertions(+), 32 deletions(-) diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py index b189fead..ccb0ab59 100644 --- a/src/brush/deap_api/nsga2.py +++ b/src/brush/deap_api/nsga2.py @@ -1,28 +1,45 @@ from deap import tools from deap.benchmarks.tools import diversity, convergence, hypervolume import numpy as np -import random +import functools -def nsga2(toolbox, NGEN, MU, CXPB, verbosity): + +def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rng): # NGEN = 250 # MU = 100 # CXPB = 0.9 - stats = tools.Statistics(lambda ind: ind.fitness.values) - stats.register("ave", np.mean, axis=0) - stats.register("std", np.std, axis=0) - stats.register("min", np.min, axis=0) + def calculate_statistics(ind): + on_train = ind.fitness.values + on_val = toolbox.evaluateValidation(ind) + + return (*on_train, *on_val) + + stats = tools.Statistics(calculate_statistics) + + stats.register("ave train", np.mean, axis=0) + stats.register("std train", np.std, axis=0) + stats.register("min train", np.min, axis=0) + + stats.register("ave val", np.mean, axis=0) + stats.register("std val", np.std, axis=0) + stats.register("min val", np.min, axis=0) + # stats.register("max", np.max, axis=0) logbook = tools.Logbook() - logbook.header = "gen", "evals", "ave", "std", "min" + logbook.header = "gen", "evals", "ave train", "std train", "min train", \ + "ave val", "std val", "min val" pop = toolbox.population(n=MU) - # 
Evaluate the individuals with an invalid fitness
-    invalid_ind = [ind for ind in pop if not ind.fitness.valid]
-    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
-    for ind, fit in zip(invalid_ind, fitnesses):
+    batch = toolbox.getBatch() # every time this function is called, a new random batch is generated
+
+    # OBS: evaluate calls fit in the individual. It is different from using it to predict. The
+    # evaluateValidation function doesn't call fit
+    fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop)
+
+    for ind, fit in zip(pop, fitnesses):
         ind.fitness.values = fit

     # This is just to assign the crowding distance to the individuals
     pop = toolbox.survive(pop, len(pop))

     record = stats.compile(pop)
-    logbook.record(gen=0, evals=len(invalid_ind), **record)
+    logbook.record(gen=0, evals=len(pop), **record)
+
     if verbosity > 0:
         print(logbook.stream)

     # Begin the generational process
     for gen in range(1, NGEN):
+        if (use_batch): # batch will be random only if it is not the size of the entire train set. In this case, we don't need to reevaluate the whole pop
+            batch = toolbox.getBatch()
+            fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), pop)
+
+            for ind, fit in zip(pop, fitnesses):
+                ind.fitness.values = fit
+
         # Vary the population
         # offspring = tools.selTournamentDCD(pop, len(pop))
         parents = toolbox.select(pop, len(pop))

         offspring = []
         for ind1, ind2 in zip(parents[::2], parents[1::2]):
-            if random.random() < CXPB:
+            if rng.random() < CXPB:
                 off1, off2 = toolbox.mate(ind1, ind2)
             else:
                 off1, off2 = ind1, ind2

         # archive.update(offspring)
         # Evaluate the individuals with an invalid fitness
         invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
-        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
+        fitnesses = toolbox.map(functools.partial(toolbox.evaluate, data=batch), invalid_ind)
         for ind, fit in zip(invalid_ind, fitnesses):
             ind.fitness.values = fit

         # Select the next generation population
         pop = toolbox.survive(pop + offspring, MU)
         record = stats.compile(pop)
-        logbook.record(gen=gen, evals=len(offspring), **record)
+        logbook.record(gen=gen, evals=len(offspring)+(len(pop) if use_batch else 0), **record)
+
         if verbosity > 0:
             print(logbook.stream)

diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index e62971de..04b5ee20 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -5,6 +5,7 @@
 control of the underlying GP objects.
 """
 from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
+from sklearn.utils import check_random_state
 # from sklearn.metrics import mean_squared_error
 import numpy as np
 import pandas as pd
@@ -44,8 +45,21 @@ class BrushEstimator(BaseEstimator):
         A dictionary with keys naming the types of mutation and floating point values
         specifying the fraction of total mutations to do with that method.
     functions: dict[str,float] or list[str], default {}
-        A dictionary with keys naming the function set and values giving the probability of sampling them, or a list of functions which will be weighted uniformly.
+        A dictionary with keys naming the function set and values giving the probability
+        of sampling them, or a list of functions which will be weighted uniformly.
         If empty, all available functions are included in the search space.
+ initialization : {"grow", "full"}, default "grow" + Strategy to create the initial population. If `full`, then every expression is created + with `max_size` nodes. If `grow`, size will be uniformly distributed. + validation_size : float, default 0.0 + Percentage of samples to use as a hold-out partition. These samples are used + to calculate statistics during evolution, but not used to train the models. + The `best_estimator_` will be selected using this partition. If zero, then + the same data used for training is used for validation. + batch_size : float, default 1.0 + Percentage of training data to sample every generation. If `1.0`, then + all data is used. Very small values can improve execution time, but + also lead to underfit. random_state: int or None, default None If int, then the value is used to seed the c++ random generator; if None, then a seed will be generated using a non-deterministic generator. It is @@ -62,7 +76,11 @@ class BrushEstimator(BaseEstimator): archive_ : list[deap_api.DeapIndividual] The final population from training. data_ : _brush.Dataset - The training data in Brush format. + The complete data in Brush format. + train_ : _brush.Dataset + Partition of `data_` containing `(1-validation_size)`% of the data, in Brush format. + validation_ : _brush.Dataset + Partition of `data_` containing `(validation_size)`% of the data, in Brush format. search_space_ : a Brush `SearchSpace` object. Holds the operators and terminals and sampling utilities to update programs. toolbox_ : deap.Toolbox @@ -81,8 +99,10 @@ def __init__( cx_prob=0.9, mutation_options = {"point":0.25, "insert":0.25, "delete":0.25, "toggle_weight":0.25}, functions: list[str]|dict[str,float] = {}, + initialization="grow", random_state=None, - batch_size: int = 0 + validation_size: float = 0.0, + batch_size: float = 1.0 ): self.pop_size=pop_size self.max_gen=max_gen @@ -93,11 +113,13 @@ def __init__( self.cx_prob=cx_prob self.mutation_options=mutation_options self.functions=functions + self.initialization=initialization self.random_state=random_state self.batch_size=batch_size + self.validation_size=validation_size - def _setup_toolbox(self, data): + def _setup_toolbox(self, data_train, data_validation, rng): """Setup the deap toolbox""" toolbox: base.Toolbox = base.Toolbox() @@ -122,11 +144,16 @@ def _setup_toolbox(self, data): toolbox.register("survive", tools.selNSGA2) # toolbox.population will return a list of elements by calling toolbox.individual - toolbox.register("population", tools.initRepeat, list, self._make_individual) - toolbox.register( "evaluate", self._fitness_function, data=data) + toolbox.register("createRandom", self._make_individual, rng=rng) + toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) + + toolbox.register("getBatch", data_train.get_batch) + toolbox.register("evaluate", self._fitness_function) + toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation) return toolbox + def _crossover(self, ind1, ind2): offspring = [] @@ -138,6 +165,7 @@ def _crossover(self, ind1, ind2): offspring.append(None) return offspring[0], offspring[1] + def _mutate(self, ind1): # offspring = (creator.Individual(ind1.prg.mutate(self.search_space_)),) @@ -148,6 +176,7 @@ def _mutate(self, ind1): return None + def fit(self, X, y): """ Fit an estimator to X,y. @@ -160,26 +189,39 @@ def fit(self, X, y): 1-d array of (boolean) target values. 
""" _brush.set_params(self.get_params()) - self.data_ = self._make_data(X,y) - + + rng = check_random_state(self.random_state) if self.random_state != None: _brush.set_random_state(self.random_state) + self.data_ = self._make_data(X,y) + # set n classes if relevant if self.mode=="classification": self.n_classes_ = len(np.unique(y)) + # These have a default behavior to return something meaningfull if + # no values are set + self.train_ = self.data_.get_training_data() + self.train_.set_batch_size(self.batch_size) + self.validation_ = self.data_.get_validation_data() + if isinstance(self.functions, list): self.functions_ = {k:1.0 for k in self.functions} else: self.functions_ = self.functions - self.search_space_ = _brush.SearchSpace(self.data_, self.functions_) - self.toolbox_ = self._setup_toolbox(data=self.data_) + self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) + self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_, rng=rng) - archive, logbook = nsga2(self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, self.verbosity) + archive, logbook = nsga2( + self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, + (0.0 0: @@ -191,6 +233,9 @@ def fit(self, X, y): return self def _make_data(self, X, y=None): + # This function should not partition data (as it is used in predict). + # partitioning is done in fit(). + if isinstance(y, pd.Series): y = y.values if isinstance(X, pd.DataFrame): @@ -203,11 +248,13 @@ def _make_data(self, X, y=None): return _brush.Dataset(X, y, feature_names) assert isinstance(X, np.ndarray) + # if there is no label, don't include it in library call to Dataset if isinstance(y, NoneType): return _brush.Dataset(X) - return _brush.Dataset(X,y) + return _brush.Dataset(X, y) + def predict(self, X): """Predict using the best estimator in the archive. """ @@ -250,6 +297,13 @@ class BrushClassifier(BrushEstimator,ClassifierMixin): def __init__( self, **kwargs): super().__init__(mode='classification',**kwargs) + def _fitness_validation(self, ind, data: _brush.Dataset): + return ( # (accuracy, size) + (data.y==ind.prg.predict(data)).sum() / data.y.shape[0], + ind.prg.size() + ) + + def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.fit(data) return ( # (accuracy, size) @@ -257,15 +311,23 @@ def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.size() ) - def _make_individual(self): + def _make_individual(self, rng): # C++'s PTC2-based `make_individual` will create a tree of at least # the given size. By uniformly sampling the size, we can instantiate a # population with more diversity + s = 0 + if self.initialization=="grow": + s = rng.randint(1, self.max_size) + elif self.initialization=="full": + s = self.max_size + else: + raise ValueError(f"Invalid argument value for `initialization`. " + f"expected 'full' or 'grow'. 
got {self.initialization}")
+
         return creator.Individual(
-            self.search_space_.make_classifier(self.max_depth, self.max_size)
+            self.search_space_.make_classifier(self.max_depth, s)
             if self.n_classes_ == 2 else
-            self.search_space_.make_multiclass_classifier(self.max_depth, self.max_size)
+            self.search_space_.make_multiclass_classifier(self.max_depth, s)
         )

     def predict_proba(self, X):
         """Predict class probabilities for X.

@@ -306,6 +368,16 @@ class BrushRegressor(BrushEstimator, RegressorMixin):
     def __init__(self, **kwargs):
         super().__init__(mode='regressor',**kwargs)

+
+    def _fitness_validation(self, ind, data: _brush.Dataset):
+        MSE = np.mean( (data.y-ind.prg.predict(data))**2 )
+        if not np.isfinite(MSE): # numeric errors, np.nan, +-np.inf
+            MSE = np.inf
+
+        # We squash the error, making it a maximization problem
+        return ( 1/(1+MSE), ind.prg.size() )
+
+
     def _fitness_function(self, ind, data: _brush.Dataset):
         ind.prg.fit(data)

@@ -316,9 +388,19 @@ def _fitness_function(self, ind, data: _brush.Dataset):

         # We squash the error, making it a maximization problem
         return ( 1/(1+MSE), ind.prg.size() )

-    def _make_individual(self):
+    def _make_individual(self, rng):
+        s = 0
+        if self.initialization=="grow":
+            s = rng.randint(1, self.max_size)
+        elif self.initialization=="full":
+            s = self.max_size
+        else:
+            raise ValueError(f"Invalid argument value for `initialization`. "
+                             f"expected 'full' or 'grow'. got {self.initialization}")
+
         return creator.Individual(
-            self.search_space_.make_regressor(self.max_depth, self.max_size)
+            self.search_space_.make_regressor(self.max_depth, s)
         )

# Under development

From dea6776ef5b4984bd53309738475f0f801f89163 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Fri, 4 Aug 2023 17:25:58 -0400
Subject: [PATCH 11/17] Implemented MCDM to select final solution

---
 src/bindings/bind_params.cpp |  1 +
 src/brush/deap_api/nsga2.py  | 23 ++++++------
 src/brush/estimator.py       | 63 ++++++++++++++++----------------
 src/search_space.h           | 24 ++++++--------
 4 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/src/bindings/bind_params.cpp b/src/bindings/bind_params.cpp
index 7ad12b5d..75521ab3 100644
--- a/src/bindings/bind_params.cpp
+++ b/src/bindings/bind_params.cpp
@@ -17,4 +17,5 @@ void bind_params(py::module& m)
     m.def("set_random_state", [](unsigned int seed)
         { br::Util::r = *br::Util::Rnd::initRand();
           br::Util::r.set_seed(seed); });
+    m.def("rnd_flt", [](){ return br::Util::r.rnd_flt(); });
 }
\ No newline at end of file
diff --git a/src/brush/deap_api/nsga2.py b/src/brush/deap_api/nsga2.py
index ccb0ab59..a7d44f4d 100644
--- a/src/brush/deap_api/nsga2.py
+++ b/src/brush/deap_api/nsga2.py
@@ -4,10 +4,11 @@
 import functools


-def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rng):
+def nsga2(toolbox, NGEN, MU, CXPB, use_batch, verbosity, rnd_flt):
     # NGEN = 250
     # MU = 100
     # CXPB = 0.9
+    # rnd_flt: random number generator to sample crossover prob

     def calculate_statistics(ind):
         on_train = ind.fitness.values
         on_val = toolbox.evaluateValidation(ind)

         return (*on_train, *on_val)

     stats = tools.Statistics(calculate_statistics)

-    stats.register("ave train", np.mean, axis=0)
-    stats.register("std train", np.std, axis=0)
-    stats.register("min train", np.min, axis=0)
-
-    stats.register("ave val", np.mean, axis=0)
-    stats.register("std val", np.std, axis=0)
-    stats.register("min val", np.min, axis=0)
-
-    # stats.register("max", np.max, axis=0)
+    stats.register("ave", np.mean, axis=0)
+    stats.register("std", np.std, axis=0)
+    stats.register("min", np.min, axis=0)
+    stats.register("max", np.max, axis=0)

     logbook = 
tools.Logbook() - logbook.header = "gen", "evals", "ave train", "std train", "min train", \ - "ave val", "std val", "min val" + logbook.header = "gen", "evals", "ave (O1 train, O2 train, O1 val, O2 val)", \ + "std (O1 train, O2 train, O1 val, O2 val)", \ + "min (O1 train, O2 train, O1 val, O2 val)" pop = toolbox.population(n=MU) @@ -68,7 +65,7 @@ def calculate_statistics(ind): offspring = [] for ind1, ind2 in zip(parents[::2], parents[1::2]): - if rng.random() < CXPB: + if rnd_flt() < CXPB: off1, off2 = toolbox.mate(ind1, ind2) else: off1, off2 = ind1, ind2 diff --git a/src/brush/estimator.py b/src/brush/estimator.py index 04b5ee20..e16689f8 100644 --- a/src/brush/estimator.py +++ b/src/brush/estimator.py @@ -5,7 +5,6 @@ control of the underlying GP objects. """ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin -from sklearn.utils import check_random_state # from sklearn.metrics import mean_squared_error import numpy as np import pandas as pd @@ -119,7 +118,7 @@ def __init__( self.validation_size=validation_size - def _setup_toolbox(self, data_train, data_validation, rng): + def _setup_toolbox(self, data_train, data_validation): """Setup the deap toolbox""" toolbox: base.Toolbox = base.Toolbox() @@ -131,6 +130,8 @@ def _setup_toolbox(self, data_train, data_validation, rng): # Comparing fitnesses: https://deap.readthedocs.io/en/master/api/base.html#deap.base.Fitness creator.create("FitnessMulti", base.Fitness, weights=(+1.0,-1.0)) + # TODO: make this weights attributes of each derivate class (creator is global) + # create Individual class, inheriting from self.Individual with a fitness attribute creator.create("Individual", DeapIndividual, fitness=creator.FitnessMulti) @@ -144,7 +145,7 @@ def _setup_toolbox(self, data_train, data_validation, rng): toolbox.register("survive", tools.selNSGA2) # toolbox.population will return a list of elements by calling toolbox.individual - toolbox.register("createRandom", self._make_individual, rng=rng) + toolbox.register("createRandom", self._make_individual) toolbox.register("population", tools.initRepeat, list, toolbox.createRandom) toolbox.register("getBatch", data_train.get_batch) @@ -190,7 +191,6 @@ def fit(self, X, y): """ _brush.set_params(self.get_params()) - rng = check_random_state(self.random_state) if self.random_state != None: _brush.set_random_state(self.random_state) @@ -212,17 +212,28 @@ def fit(self, X, y): self.functions_ = self.functions self.search_space_ = _brush.SearchSpace(self.train_, self.functions_) - self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_, rng=rng) + self.toolbox_ = self._setup_toolbox(data_train=self.train_, data_validation=self.validation_) archive, logbook = nsga2( self.toolbox_, self.max_gen, self.pop_size, self.cx_prob, - (0.0 0: print(f'best model {self.best_estimator_.get_model()}'+ @@ -311,24 +322,22 @@ def _fitness_function(self, ind, data: _brush.Dataset): ind.prg.size() ) - def _make_individual(self, rng): + def _make_individual(self): # C++'s PTC2-based `make_individual` will create a tree of at least # the given size. By uniformly sampling the size, we can instantiate a # population with more diversity - s = 0 - if self.initialization=="grow": - s = rng.randint(1, self.max_size) - elif self.initialization=="full": - s = self.max_size - else: + + if self.initialization not in ["grow", "full"]: raise ValueError(f"Invalid argument value for `initialization`. " f"expected 'full' or 'grow'. 
got {self.initialization}") return creator.Individual( - self.search_space_.make_classifier(self.max_depth, s) - if self.n_classes_ == 2 else - self.search_space_.make_multiclass_classifier(self.max_depth, s) - ) + self.search_space_.make_classifier( + self.max_depth,(0 if self.initialization=='grow' else self.max_size)) + if self.n_classes_ == 2 else + self.search_space_.make_multiclass_classifier( + self.max_depth, (0 if self.initialization=='grow' else self.max_size)) + ) def predict_proba(self, X): """Predict class probabilities for X. @@ -389,20 +398,18 @@ def _fitness_function(self, ind, data: _brush.Dataset): return ( 1/(1+MSE), ind.prg.size() ) - def _make_individual(self, rng): - s = 0 - if self.initialization=="grow": - s = rng.randint(1, self.max_size) - elif self.initialization=="full": - s = self.max_size - else: + def _make_individual(self): + if self.initialization not in ["grow", "full"]: raise ValueError(f"Invalid argument value for `initialization`. " f"expected 'full' or 'grow'. got {self.initialization}") - - return creator.Individual( - self.search_space_.make_regressor(self.max_depth, s) + + return creator.Individual( # No arguments (or zero): brush will use PARAMS passed in set_params. max_size is sampled between 1 and params['max_size'] if zero is provided + self.search_space_.make_regressor( + self.max_depth, (0 if self.initialization=='grow' else self.max_size)) ) + + # Under development # class BrushRepresenter(BrushEstimator, TransformerMixin): # """Brush for representation learning. diff --git a/src/search_space.h b/src/search_space.h index 396817a0..69d6eb50 100644 --- a/src/search_space.h +++ b/src/search_space.h @@ -630,7 +630,7 @@ P SearchSpace::make_program(int max_d, int max_size) // highest operator arity, and the real maximum depth is `max_depth` plus one. 
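// (Concrete instance of the caveat stated above: with max_size=10 and a
// maximum sampled-operator arity of 3, PTC2 may emit trees of up to 13
// nodes, and a requested max_depth of 4 may yield a tree of depth 5.)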
if (max_d == 0) - max_d = r.rnd_int(1, PARAMS["max_depth"].get()); + max_d = PARAMS["max_depth"].get(); if (max_size == 0) max_size = r.rnd_int(1, PARAMS["max_size"].get()); @@ -678,7 +678,7 @@ P SearchSpace::make_program(int max_d, int max_size) n.fixed=true; } else - { + { // we start with a non-terminal auto opt = sample_op(root_type); while (!opt) { opt = sample_op(root_type); @@ -699,7 +699,7 @@ P SearchSpace::make_program(int max_d, int max_size) { /* cout << "queing a node of type " << DataTypeName[a] << endl; */ auto child_spot = Tree.append_child(spot); - queue.push_back(make_tuple(child_spot, a, d)); + queue.push_back(make_tuple(child_spot, a, d+1)); } // Now we actually start the PTC2 procedure to create the program tree @@ -720,10 +720,8 @@ P SearchSpace::make_program(int max_d, int max_size) // Tree.append_child(qspot, sample_terminal(t)); auto opt = sample_terminal(t); - - if (!opt) { // lets push back and try again later - queue.push_back(make_tuple(qspot, t, d)); - continue; + while (!opt) { + opt = sample_terminal(t); } // If we successfully get a terminal, use it @@ -740,9 +738,8 @@ P SearchSpace::make_program(int max_d, int max_size) // TreeIter new_spot = Tree.append_child(qspot, n); // qspot = n; - if (!opt) { // lets push back and try again later - queue.push_back(make_tuple(qspot, t, d)); - continue; + while (!opt) { + opt = sample_op(t); } n = opt.value(); @@ -779,11 +776,10 @@ P SearchSpace::make_program(int max_d, int max_size) // auto newspot = Tree.replace(qspot, sample_terminal(t)); auto opt = sample_terminal(t); - - if (!opt) { // set push back and try again later - queue.push_back(make_tuple(qspot, t, d)); - continue; + while (!opt) { + opt = sample_terminal(t); } + n = opt.value(); auto newspot = Tree.replace(qspot, n); From 55e630b9e636bea7449ac2e66dd890a3bf845b3d Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Sat, 5 Aug 2023 15:28:17 -0400 Subject: [PATCH 12/17] Python wrapper lets user specify batch size and validation partition --- src/bindings/bind_dataset.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/bindings/bind_dataset.cpp b/src/bindings/bind_dataset.cpp index 6ff01ffb..872750d5 100644 --- a/src/bindings/bind_dataset.cpp +++ b/src/bindings/bind_dataset.cpp @@ -31,14 +31,14 @@ void bind_dataset(py::module & m) // ) // construct from X, feature names (and optional validation and batch sizes) with constructor 3. .def(py::init([](const Ref& X, - const vector& vn, + const vector& feature_names, const float validation_size=0.0, const float batch_size=1.0){ return br::Data::Dataset( - X, vn, validation_size, batch_size); + X, feature_names, validation_size, batch_size); }), py::arg("X"), - py::arg("vn"), + py::arg("feature_names"), py::arg("validation_size") = 0.0, py::arg("batch_size") = 1.0 ) @@ -69,15 +69,15 @@ void bind_dataset(py::module & m) // construct from X, y, feature names (and optional validation and batch sizes) with constructor 2. 
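    // (Python-side usage sketch after this rename — illustrative values:
    //     d = _brush.Dataset(X, y, feature_names=["x1","x2"],
    //                        validation_size=0.2, batch_size=0.5)
    // matching the keyword names the Python estimator passes later in this
    // series.)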
.def(py::init([](const Ref& X,
                      const Ref& y,
-                     const vector& vn,
+                     const vector& feature_names,
                      const float validation_size=0.0,
                      const float batch_size=1.0){
         return br::Data::Dataset(
-            X, y, vn, {}, false, validation_size, batch_size);
+            X, y, feature_names, {}, false, validation_size, batch_size);
         }),
         py::arg("X"),
         py::arg("y"),
-        py::arg("vn"),
+        py::arg("feature_names"),
         py::arg("validation_size") = 0.0,
         py::arg("batch_size") = 1.0
     )

From ac3b67a0c28b4d88c991a42e1fee86782d89651d Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Sat, 5 Aug 2023 15:29:48 -0400
Subject: [PATCH 13/17] New subtree mutation

Also fixed PTC2 generating programs twice as big as they should be. Now
PTC2 takes into account that terminal nodes are weighted by default, and
discounts these nodes while expanding the tree.
---
 src/brush/estimator.py |  20 +++---
 src/search_space.cpp   | 158 +++++++++++++++++++++++++++++++++++++++++
 src/search_space.h     | 158 ++++++++--------------------------------
 src/variation.h        |  26 +++++++
 4 files changed, 223 insertions(+), 139 deletions(-)

diff --git a/src/brush/estimator.py b/src/brush/estimator.py
index e16689f8..78f94d27 100644
--- a/src/brush/estimator.py
+++ b/src/brush/estimator.py
@@ -40,7 +40,7 @@ class BrushEstimator(BaseEstimator):
         Maximum number of nodes in a tree. Use 0 for no limit.
     cx_prob : float, default 0.9
         Probability of applying the crossover variation when generating the offspring
-    mutation_options : dict, default {"point":0.25, "insert":0.25, "delete":0.25, "toggle_weight":0.25}
+    mutation_options : dict, default {"point":0.2, "insert":0.2, "delete":0.2, "subtree":0.2, "toggle_weight":0.2}
         A dictionary with keys naming the types of mutation and floating point values
         specifying the fraction of total mutations to do with that method.
     functions: dict[str,float] or list[str], default {}
@@ -96,7 +96,7 @@ def __init__(
         max_depth=3,
         max_size=20,
         cx_prob=0.9,
-        mutation_options = {"point":0.25, "insert":0.25, "delete":0.25, "toggle_weight":0.25},
+        mutation_options = {"point":0.2, "insert":0.2, "delete":0.2, "subtree":0.2, "toggle_weight":0.2},
         functions: list[str]|dict[str,float] = {},
         initialization="grow",
         random_state=None,
@@ -149,7 +149,7 @@ def _setup_toolbox(self, data_train, data_validation):
         toolbox.register("population", tools.initRepeat, list, toolbox.createRandom)

         toolbox.register("getBatch", data_train.get_batch)
-        toolbox.register("evaluate", self._fitness_function)
+        toolbox.register("evaluate", self._fitness_function, data=data_train)
         toolbox.register("evaluateValidation", self._fitness_validation, data=data_validation)

         return toolbox
@@ -194,7 +194,7 @@ def fit(self, X, y):
         if self.random_state != None:
             _brush.set_random_state(self.random_state)

-        self.data_ = self._make_data(X,y)
+        self.data_ = self._make_data(X,y, validation_size=self.validation_size)

         # set n classes if relevant
         if self.mode=="classification":
             self.n_classes_ = len(np.unique(y))
@@ -243,7 +243,7 @@ def fit(self, X, y):
         return self

-    def _make_data(self, X, y=None):
+    def _make_data(self, X, y=None, validation_size=0.0):
+        # This function should not partition data (as it is used in predict).
+        # partitioning is done in fit().
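# (Usage sketch of the knobs wired up above — illustrative values, assuming
# the estimator defaults introduced in this series:
#
#     est = BrushRegressor(validation_size=0.2, batch_size=0.5, max_gen=50)
#     est.fit(X, y)   # trains on ~80% of the rows; validation statistics
#                     # and final model selection use the held-out 20%
# )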
@@ -254,17 +254,19 @@ def _make_data(self, X, y=None):
         if isinstance(y, pd.Series):
             y = y.values
         if isinstance(X, pd.DataFrame):
             feature_names = X.columns.to_list()
             X = X.values
             if isinstance(y, NoneType):
-                return _brush.Dataset(X, feature_names)
+                return _brush.Dataset(X,
+                    feature_names=feature_names, validation_size=validation_size)
             else:
-                return _brush.Dataset(X, y, feature_names)
+                return _brush.Dataset(X, y,
+                    feature_names=feature_names, validation_size=validation_size)

         assert isinstance(X, np.ndarray)

         # if there is no label, don't include it in library call to Dataset
         if isinstance(y, NoneType):
-            return _brush.Dataset(X)
+            return _brush.Dataset(X, validation_size=validation_size)

-        return _brush.Dataset(X,y)
+        return _brush.Dataset(X, y, validation_size=validation_size)

     def predict(self, X):

diff --git a/src/search_space.cpp b/src/search_space.cpp
index 2b6d6d9f..436f074f 100644
--- a/src/search_space.cpp
+++ b/src/search_space.cpp
@@ -129,6 +129,164 @@ void SearchSpace::init(const Dataset& d, const unordered_map& user
     }
 };

+std::optional> SearchSpace::sample_subtree(Node root, int max_d, int max_size) const
+{
+    // public interface to use the PTC2 algorithm
+
+    // PTC2 is designed not to fail (it will persistently try to find nodes with
+    // the sampling functions). In pop initialization, this shouldn't be a problem, but
+    // during evolution, due to dynamic changes in node weights by the learners,
+    // it may now fail. We need to check, before calling it, that it has elements
+    // in the search space to sample
+    auto ret_match = node_map.at(root.ret_type);
+
+    vector args_w = get_weights(root.ret_type);
+
+    // at least one operator that matches the return type must have positive probability
+    if (!has_solution_space(args_w.begin(), args_w.end()))
+        return std::nullopt;
+
+    if ( (terminal_map.find(root.ret_type) == terminal_map.end())
+    ||   (!has_solution_space(terminal_weights.at(root.ret_type).begin(),
+                              terminal_weights.at(root.ret_type).end())) )
+        return std::nullopt;
+
+    // note the difference between the size of a PROGRAM and a TREE: programs
+    // count weights in their size, while the TREE structure doesn't. Whenever
+    // using the size of a program/tree, make sure you use the function from the correct class
+    return PTC2(root, max_d, max_size);
+};
+
+tree SearchSpace::PTC2(Node root, int max_d, int max_size) const
+{
+    // PTC2 is agnostic of program type
+
+    // A comment about the PTC2 method:
+    // PTC2 can work with depth or size restriction, but it does not strictly
+    // satisfy these conditions at all times. Given `max_size` and `max_depth`
+    // parameters, the real maximum size that can occur is `max_size` plus the
+    // highest operator arity, and the real maximum depth is `max_depth` plus one.
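    // (Size-accounting sketch for the bound used below: a weighted terminal
    // such as 2.3*x1 counts as 3 nodes of program size — the multiplication,
    // the weight constant, and the feature — while the non-terminals this
    // loop inserts default to weights off and count as 1 each; hence the
    // 3*(queue.size()-1) + s < max_size condition on the first loop.)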
+ + auto Tree = tree(); + + /* fmt::print("building program with max size {}, max depth {}",max_size,max_d); */ + + // Queue of nodes that need children + vector> queue; + + /* cout << "chose " << n.name << endl; */ + // auto spot = Tree.set_head(n); + /* cout << "inserting...\n"; */ + auto spot = Tree.insert(Tree.begin(), root); + // node depth + int d = 1; + // current tree size + int s = 1; + //For each argument position a of n, Enqueue(a; g) + for (auto a : root.arg_types) + { + /* cout << "queing a node of type " << DataTypeName[a] << endl; */ + auto child_spot = Tree.append_child(spot); + queue.push_back(make_tuple(child_spot, a, d)); + } + + Node n; + // Now we actually start the PTC2 procedure to create the program tree + /* cout << "queue size: " << queue.size() << endl; */ + /* cout << "entering first while loop...\n"; */ + while ( 3*(queue.size()-1) + s < max_size && queue.size() > 0) + { + // by default, terminals are weighted (counts as 3 nodes in program size). + // since every spot in queue has potential to be a terminal, we multiply + // its size by 3. Subtracting one due to the fact that this loop will + // always insert a non terminal (which by default has weights off). + // this way, we can have PTC2 working properly. + + /* cout << "queue size: " << queue.size() << endl; */ + auto [qspot, t, d] = RandomDequeue(queue); + + /* cout << "current depth: " << d << endl; */ + if (d == max_d) + { + // choose terminal of matching type + /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ + // qspot = sample_terminal(t); + // Tree.replace(qspot, sample_terminal(t)); + // Tree.append_child(qspot, sample_terminal(t)); + + auto opt = sample_terminal(t); + while (!opt) + opt = sample_terminal(t); + + // If we successfully get a terminal, use it + n = opt.value(); + + Tree.replace(qspot, n); + } + else + { + //choose a nonterminal of matching type + /* cout << "getting op of type " << DataTypeName[t] << endl; */ + auto opt = sample_op(t); + /* cout << "chose " << n.name << endl; */ + // TreeIter new_spot = Tree.append_child(qspot, n); + // qspot = n; + + while (!opt) + opt = sample_op(t); + + n = opt.value(); + + auto newspot = Tree.replace(qspot, n); + + // For each arg of n, add to queue + for (auto a : n.arg_types) + { + /* cout << "queing a node of type " << DataTypeName[a] << endl; */ + // queue.push_back(make_tuple(new_spot, a, d+1)); + auto child_spot = Tree.append_child(newspot); + + queue.push_back(make_tuple(child_spot, a, d+1)); + } + } + + ++s; + /* cout << "current tree size: " << s << endl; */ + } + /* cout << "entering second while loop...\n"; */ + while (queue.size() > 0) + { + if (queue.size() == 0) + break; + + /* cout << "queue size: " << queue.size() << endl; */ + + auto [qspot, t, d] = RandomDequeue(queue); + + /* cout << "getting " << DataTypeName[t] << " terminal\n"; */ + // Tree.append_child(qspot, sample_terminal(t)); + // qspot = sample_terminal(t); + // auto newspot = Tree.replace(qspot, sample_terminal(t)); + + auto opt = sample_terminal(t); + while (!opt) { + opt = sample_terminal(t); + } + + n = opt.value(); + + auto newspot = Tree.replace(qspot, n); + } + + /* cout << "final tree:\n" */ + /* << Tree.begin().node->get_model() << "\n" */ + /* << Tree.begin().node->get_tree_model(true) << endl; */ + /* << Tree.get_model() << "\n" */ + /* << Tree.get_model(true) << endl; // pretty */ + + return Tree; +}; + RegressorProgram SearchSpace::make_regressor(int max_d, int max_size) { return make_program(max_d, max_size); diff --git a/src/search_space.h 
b/src/search_space.h
index 69d6eb50..ac751a65 100644
--- a/src/search_space.h
+++ b/src/search_space.h
@@ -520,10 +520,18 @@ struct SearchSpace
     ).second;
     };

+    /// @brief create a subtree with maximum size and depth restrictions and root of type `root_type`
+    /// @param root_type return type
+    /// @param max_d the maximum depth
+    /// @param max_size the maximum size of the tree (will be sampled between [1, max_size])
+    /// @return `std::optional` that may contain a tree
+    std::optional> sample_subtree(Node root, int max_d, int max_size) const;
+
     /// @brief prints the search space map.
     void print() const;

 private:
+    tree PTC2(Node root, int max_d, int max_size) const;
+
     template
     requires (!is_in_v)
@@ -623,12 +631,6 @@ T RandomDequeue(std::vector& Q)
 template
 P SearchSpace::make_program(int max_d, int max_size)
 {
-    // A comment about PTC2 method:
-    // PTC2 can work with depth or size restriction, but it does not strictly
-    // satisfies these conditions all time. Given a `max_size` and `max_depth`
-    // parameters, the real maximum size that can occur is `max_size` plus the
-    // highest operator arity, and the real maximum depth is `max_depth` plus one.
-
     if (max_d == 0)
         max_d = PARAMS["max_depth"].get();
     if (max_size == 0)
         max_size = r.rnd_int(1, PARAMS["max_size"].get());
@@ -637,14 +639,8 @@ P SearchSpace::make_program(int max_d, int max_size)
     DataType root_type = DataTypeEnum::value;
     ProgramType program_type = P::program_type;
     // ProgramType program_type = ProgramTypeEnum::value;
-
+
     auto Tree = tree();
-
-    /* fmt::print("building program with max size {}, max depth {}",max_size,max_d); */
-
-    // Queue of nodes that need children
-    vector> queue;
-
     if (max_size == 1)
     {
         // auto root = Tree.insert(Tree.begin(), sample_terminal(root_type));
@@ -659,137 +655,39 @@ P SearchSpace::make_program(int max_d, int max_size)
             HANDLE_ERROR_THROW(msg);
         }

-        auto root = Tree.insert(Tree.begin(), opt.value());
+        Tree.insert(Tree.begin(), opt.value());
     }
-    else // Our program can (and will) be greater than 1 node
-    {
+    else {// Our program can (and will) be greater than 1 node
+
+        // building the root node for each program case. 
+        // Building the root node for each program case. We create the root
+        // here and hand it to PTC2, which fills in the rest of the tree.
+        Node root;
+
         // building the root node for each program case
-        Node n;
         if (P::program_type == ProgramType::BinaryClassifier)
         {
-            n = get(NodeType::Logistic, DataType::ArrayF, Signature<ArrayXf(ArrayXf)>());
-            n.set_prob_change(0.0);
-            n.fixed=true;
+            root = get(NodeType::Logistic, DataType::ArrayF, Signature<ArrayXf(ArrayXf)>());
+            root.set_prob_change(0.0);
+            root.fixed=true;
+
         }
         else if (P::program_type == ProgramType::MulticlassClassifier)
         {
-            n = get(NodeType::Softmax, DataType::MatrixF, Signature<ArrayXXf(ArrayXXf)>());
-            n.set_prob_change(0.0);
-            n.fixed=true;
+            root = get(NodeType::Softmax, DataType::MatrixF, Signature<ArrayXXf(ArrayXXf)>());
+            root.set_prob_change(0.0);
+            root.fixed=true;
         }
-        else
-        {   // we start with a non-terminal
+        else {
+            // we start with a non-terminal (though it can be replaced inside PTC2 if max_size==1)
             auto opt = sample_op(root_type);
             while (!opt) {
                 opt = sample_op(root_type);
             }
-            n = opt.value();
-        }
-
-        /* cout << "chose " << n.name << endl; */
-        // auto spot = Tree.set_head(n);
-        /* cout << "inserting...\n"; */
-        auto spot = Tree.insert(Tree.begin(), n);
-        // node depth
-        int d = 1;
-        // current tree size
-        int s = 1;
-        //For each argument position a of n, Enqueue(a; g)
-        for (auto a : n.arg_types)
-        {
-            /* cout << "queing a node of type " << DataTypeName[a] << endl; */
-            auto child_spot = Tree.append_child(spot);
-            queue.push_back(make_tuple(child_spot, a, d+1));
-        }
-
-        // Now we actually start the PTC2 procedure to create the program tree
-        /* cout << "queue size: " << queue.size() << endl; */
-        /* cout << "entering first while loop...\n"; */
-        while (queue.size() + s < max_size && queue.size() > 0)
-        {
-            /* cout << "queue size: " << queue.size() << endl; */
-            auto [qspot, t, d] = RandomDequeue(queue);
-
-            /* cout << "current depth: " << d << endl; */
-            if (d == max_d)
-            {
-                // choose terminal of matching type
-                /* cout << "getting " << DataTypeName[t] << " terminal\n"; */
-                // qspot = sample_terminal(t);
-                // Tree.replace(qspot, sample_terminal(t));
-                // Tree.append_child(qspot, sample_terminal(t));
-
-                auto opt = sample_terminal(t);
-                while (!opt) {
-                    opt = sample_terminal(t);
-                }
-
-                // If we successfully get a terminal, use it
-                n = opt.value();
-
-                Tree.replace(qspot, n);
-            }
-            else
-            {
-                //choose a nonterminal of matching type
-                /* cout << "getting op of type " << DataTypeName[t] << endl; */
-                auto opt = sample_op(t);
-                /* cout << "chose " << n.name << endl; */
-                // TreeIter new_spot = Tree.append_child(qspot, n);
-                // qspot = n;
-
-                while (!opt) {
-                    opt = sample_op(t);
-                }
-
-                n = opt.value();
-
-                auto newspot = Tree.replace(qspot, n);
-
-                // For each arg of n, add to queue
-                for (auto a : n.arg_types)
-                {
-                    /* cout << "queing a node of type " << DataTypeName[a] << endl; */
-                    // queue.push_back(make_tuple(new_spot, a, d+1));
-                    auto child_spot = Tree.append_child(newspot);
-
-                    queue.push_back(make_tuple(child_spot, a, d+1));
-                }
-            }
-
-            ++s;
-            /* cout << "current tree size: " << s << endl; */
-        }
-        /* cout << "entering second while loop...\n"; */
-        while (queue.size() > 0)
-        {
-            if (queue.size() == 0)
-                break;
-
-            /* cout << "queue size: " << queue.size() << endl; */
-
-            auto [qspot, t, d] = RandomDequeue(queue);
-
-            /* cout << "getting " << DataTypeName[t] << " terminal\n"; */
-            // Tree.append_child(qspot, sample_terminal(t));
-            // qspot = sample_terminal(t);
-            // auto newspot = Tree.replace(qspot, sample_terminal(t));
-
-            auto opt = sample_terminal(t);
-            while (!opt) {
-                opt = sample_terminal(t);
-            }
-
-            n = opt.value();
-
-            auto newspot = Tree.replace(qspot, n);
+            root = opt.value();
         }
+
+        Tree = PTC2(root, max_d, max_size);
     }
-    /* cout << "final tree:\n" */
-    /*      << Tree.begin().node->get_model() << "\n" */
-    /*      << Tree.begin().node->get_tree_model(true) << endl; */
-    /*      << Tree.get_model() << "\n" */
-    /*      << Tree.get_model(true) << endl; // pretty */
 
     return P(*this,Tree);
 };
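For orientation, the public entry points are unchanged by this refactor; only the tree construction moved into PTC2. A minimal usage sketch, assuming a Brush::Data::Dataset named `data` and that the search space is populated via an `init` call (that setup call is an assumption, as it is not shown in this patch):

    SearchSpace SS;
    SS.init(data);                        // assumed setup: registers terminals and operators
    auto prg = SS.make_regressor(5, 20);  // builds the root, then PTC2(root, 5, 20)
                                          // grows the rest of the tree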
diff --git a/src/variation.h b/src/variation.h
index 6682a561..8f1717cd 100644
--- a/src/variation.h
+++ b/src/variation.h
@@ -144,6 +144,31 @@ inline bool toggle_weight_mutation(tree<Node>& Tree, Iter spot, const SearchSpac
     return true;
 }
 
+/// @brief replaces the subtree rooted in `spot`
+/// @param Tree the program tree
+/// @param spot an iterator to the node that is being mutated
+/// @param SS the search space used to generate a compatible subtree
+/// @return boolean indicating the success (true) or failure (false) of the operation
+inline bool subtree_mutation(tree<Node>& Tree, Iter spot, const SearchSpace& SS)
+{
+    auto spot_type = spot.node->data.ret_type;
+    auto max_size  = PARAMS["max_size"].get<int>() - (Tree.size() - Tree.size(spot));
+    auto max_depth = PARAMS["max_depth"].get<int>() - (Tree.depth(spot));
+
+    // sample_subtree uses PTC2, which operates on the depth and size of the
+    // tree (and not of the program!), so we shouldn't care about weights here
+    auto subtree = SS.sample_subtree(spot.node->data, max_depth, max_size);
+
+    if (!subtree) // there is no terminal with compatible arguments
+        return false;
+
+    // the optional contains a tree; access its value and splice it in place of spot
+    Tree.erase_children(spot);
+    Tree.replace(spot, subtree.value().begin());
+
+    return true;
+}
+
 /**
  * @brief Stochastically mutate a program.
  *
@@ -248,6 +273,7 @@ std::optional<Program<T>> mutate(const Program<T>& parent, const SearchSpace& SS
         {"insert", insert_mutation},
         {"delete", delete_mutation},
         {"point", point_mutation},
+        {"subtree", subtree_mutation},
         {"toggle_weight", toggle_weight_mutation}
     };

From a564896995748226344015de14e53f6dfa7506b5 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Sat, 5 Aug 2023 15:30:50 -0400
Subject: [PATCH 14/17] Updated tests after new mutation

---
 tests/cpp/test_program.cpp   | 4 ++++
 tests/cpp/test_variation.cpp | 6 +++---
 tests/python/test_brush.py   | 2 +-
 tests/python/test_params.py  | 3 ++-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tests/cpp/test_program.cpp b/tests/cpp/test_program.cpp
index a8da41b8..3fb2de8b 100644
--- a/tests/cpp/test_program.cpp
+++ b/tests/cpp/test_program.cpp
@@ -203,6 +203,10 @@ TEST(Operators, ProgramSizeAndDepthPARAMS)
         // and PTC2 uses the tree size (not the program size), it is not
         // expected that initial trees will strictly respect `max_size`.
         ASSERT_TRUE(PRG.size() > 0); // size is always positive
+
+        // PTC2's size overshoot is at most max(arity); since Brush terminals
+        // are weighted by default (3 nodes each), allow s + 3*max(arity) = s + 3*4
+        ASSERT_TRUE(PRG.size() <= s+3*4);
 
         ASSERT_TRUE(PRG.depth() <= d+1);
         ASSERT_TRUE(PRG.depth() > 0); // depth is always positive
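Spelling the bound out with illustrative numbers: for max_size s = 20 and a maximum operator arity of 4, the assertion above tolerates

    PRG.size() <= 20 + 3*4 = 32

the worst case being a final operator with four child slots, each filled by a weighted terminal that counts as 3 nodes.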
diff --git a/tests/cpp/test_variation.cpp b/tests/cpp/test_variation.cpp
index 41c60389..d7058714 100644
--- a/tests/cpp/test_variation.cpp
+++ b/tests/cpp/test_variation.cpp
@@ -10,7 +10,7 @@ TEST(Operators, InsertMutationWorks)
     // To understand design implementation of this test, check Mutation test
 
     PARAMS["mutation_options"] = {
-        {"point", 0.0}, {"insert", 1.0}, {"delete", 0.0}, {"toggle_weight", 0.0}
+        {"point", 0.0}, {"insert", 1.0}, {"delete", 0.0}, {"subtree", 0.0}, {"toggle_weight", 0.0}
     };
 
     // retrieving the options to check if everything was set right
@@ -117,7 +117,7 @@ TEST(Operators, Mutation)
     // TODO: set random seed
 
     PARAMS["mutation_options"] = {
-        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight", 0.25}
+        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"subtree", 0.0}, {"toggle_weight", 0.25}
     };
 
     MatrixXf X(10,2);
@@ -193,7 +193,7 @@ TEST(Operators, Mutation)
 TEST(Operators, MutationSizeAndDepthLimit)
 {
     PARAMS["mutation_options"] = {
-        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"toggle_weight", 0.25}
+        {"point",0.25}, {"insert", 0.25}, {"delete", 0.25}, {"subtree", 0.0}, {"toggle_weight", 0.25}
     };
 
     MatrixXf X(10,2);
diff --git a/tests/python/test_brush.py b/tests/python/test_brush.py
index 38c381dc..5e38898f 100644
--- a/tests/python/test_brush.py
+++ b/tests/python/test_brush.py
@@ -61,7 +61,7 @@ def test_fit(setup, brush_args, request):
         logging.error(traceback.format_exc())
 
 
-# def test_random_state():
+# def test_random_state(): # TODO: make it work
 #     test_y = np.array( [1. , 0. , 1.4, 1. , 0. , 1. , 1. , 0. , 0. , 0. ])
 #     test_X = np.array([[1.1, 2.0, 3.0, 4.0, 5.0, 6.5, 7.0, 8.0, 9.0, 10.0],
 #                        [2.0, 1.2, 6.0, 4.0, 5.0, 8.0, 7.0, 5.0, 9.0, 10.0]]).T
diff --git a/tests/python/test_params.py b/tests/python/test_params.py
index e6166c4e..069a8987 100644
--- a/tests/python/test_params.py
+++ b/tests/python/test_params.py
@@ -60,11 +60,12 @@ def _change_and_wait(config):
         'mutation_options': {'point'        : 0.0,
                              'insert'       : 0.0,
                              'delete'       : 0.0,
+                             'subtree'      : 0.0,
                              'toggle_weight': 0.0}
     }
 
     # We need to guarantee order to use the index correctly
-    mutations = ['point', 'insert', 'delete', 'toggle_weight']
+    mutations = ['point', 'insert', 'delete', 'subtree', 'toggle_weight']
 
     for i, m in enumerate(mutations):
         params['mutation_options'][m] = 0 if i != index else 1.0

From 1a4c968732ccb610cb74116ce8137290392291fe Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Mon, 7 Aug 2023 09:39:01 -0400
Subject: [PATCH 15/17] Improved how PTC2 deals with weighted terminals

---
 src/search_space.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/search_space.cpp b/src/search_space.cpp
index 436f074f..625685c9 100644
--- a/src/search_space.cpp
+++ b/src/search_space.cpp
@@ -222,6 +222,9 @@ tree<Node> SearchSpace::PTC2(Node root, int max_d, int max_size) const
             n = opt.value();
 
             Tree.replace(qspot, n);
+
+            s = s + 2; // the (*) and (weight) nodes of a weighted terminal;
+                       // the terminal itself is counted at the end of the while loop
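To picture which two nodes that adjustment adds (a hypothetical program, not from the test suite): a weighted terminal such as 3.14*x1 contributes three nodes to program size: the terminal x1, the weight constant, and the implicit multiplication. A tree like

    add(3.14*x1, 2.0*x2)   // tree size 3: add, x1, x2

therefore has program size 1 + 3 + 3 = 7; the s = s + 2 above, plus the ++s at the end of the loop, accounts for exactly 3 per weighted terminal.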
From 9b1d26eac70d79d266922e90b361df3d1d708f18 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Tue, 8 Aug 2023 13:47:22 -0400
Subject: [PATCH 16/17] Fixed bug of different array types :D

A feature flagged as categorical but having exactly 10 unique values was
falling through to the float branch, yielding a different array type than
expected downstream.

---
 src/data/data.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/data/data.cpp b/src/data/data.cpp
index c2fdaafc..b80668df 100644
--- a/src/data/data.cpp
+++ b/src/data/data.cpp
@@ -90,7 +90,7 @@ State check_type(const ArrayXf& x)
     }
     else
     {
-        if(isCategorical && uniqueMap.size() < 10)
+        if(isCategorical && uniqueMap.size() <= 10)
         {
             tmp = ArrayXi(x.cast<int>());
         }

From 1ac652cc22bb505719312ce8f462d6f9867789d0 Mon Sep 17 00:00:00 2001
From: Guilherme Aldeia
Date: Tue, 8 Aug 2023 13:48:20 -0400
Subject: [PATCH 17/17] Implemented node weight initialization

---
 src/search_space.cpp | 71 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 11 deletions(-)

diff --git a/src/search_space.cpp b/src/search_space.cpp
index 625685c9..94a93c0f 100644
--- a/src/search_space.cpp
+++ b/src/search_space.cpp
@@ -55,13 +55,47 @@ vector<Node> generate_terminals(const Dataset& d)
             feature_name
         );
 
-        float prob_change = 1.0;
+        float prob_change = 1.0; // default value
 
         // if the value can be casted to float array, we can calculate slope
-        if (std::holds_alternative<ArrayXf>(value)) {
+        if (std::holds_alternative<ArrayXf>(value))
+        {
             prob_change = calc_initial_weight(std::get<ArrayXf>(value), d.y);
         }
-
+        else if (std::holds_alternative<ArrayXi>(value))
+        {
+            // for each category we create a one-vs-all binary variable, then
+            // calculate its slope. The final value is the average of the slopes
+
+            auto tmp = std::get<ArrayXi>(value);
+
+            // get the number of unique values
+            std::map<float, bool> uniqueMap;
+            for(int i = 0; i < tmp.size(); i++)
+                uniqueMap[(float)tmp(i)] = true;
+
+            ArrayXf slopes = ArrayXf::Ones(uniqueMap.size());
+            int slopesIterator = 0;
+            for (const auto& pair : uniqueMap)
+            {
+                auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast<float>();
+
+                slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y);
+            }
+
+            prob_change = slopes.mean();
+        }
+        else if (std::holds_alternative<ArrayXb>(value))
+        {
+            auto tmp = std::get<ArrayXb>(value).template cast<float>();
+            prob_change = calc_initial_weight(tmp, d.y);
+        }
+        else
+        {
+            auto msg = fmt::format("Brush couldn't calculate the initial weight of variable {}\n", feature_name);
+            HANDLE_ERROR_THROW(msg);
+        }
+
         n.set_prob_change( prob_change );
 
         terminals.push_back(n);
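To illustrate the categorical branch just above (hypothetical feature values, not from the repository): for tmp = [0, 1, 2, 1, 0], which has three unique values, three one-vs-all arrays are built and their slopes averaged:

    // uniqueMap keys: {0, 1, 2}
    // one_vs_all for 0 -> [1, 0, 0, 0, 1]
    // one_vs_all for 1 -> [0, 1, 0, 1, 0]
    // one_vs_all for 2 -> [0, 0, 1, 0, 0]
    // prob_change = mean of calc_initial_weight(one_vs_all_k, d.y) over k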
@@ -71,18 +105,32 @@ vector<Node> generate_terminals(const Dataset& d)
         ++i;
     };
 
-    // add constants
-    float num_const_prob_change = calc_initial_weight(VectorXf::Ones(d.y.size()), d.y);
+    // iterate through the terminals and take the average `prob_change` of the
+    // ones sharing the same signature (i.e., the same return type)
+    auto signature_avg = [terminals](DataType ret_type){
+        float sum = 0.0;
+        int count = 0;
+
+        for (const auto& n : terminals) {
+            if (n.ret_type == ret_type) {
+                sum += n.get_prob_change();
+                count++;
+            }
+        }
+
+        return sum / count;
+    };
 
     auto cXf = Node(NodeType::Constant, Signature<ArrayXf()>{}, true, "C");
-    // cXf.set_prob_change(num_const_prob_change);
+    cXf.set_prob_change(signature_avg(cXf.ret_type));
     terminals.push_back(cXf);
 
     auto cXi = Node(NodeType::Constant, Signature<ArrayXi()>{}, true, "C");
-    // cXi.set_prob_change(num_const_prob_change);
+    cXi.set_prob_change(signature_avg(cXi.ret_type));
     terminals.push_back(cXi);
 
-    terminals.push_back( Node(NodeType::Constant, Signature<ArrayXb()>{}, false, "C"));
+    auto cXb = Node(NodeType::Constant, Signature<ArrayXb()>{}, false, "C");
+    cXb.set_prob_change(signature_avg(cXb.ret_type));
+    terminals.push_back(cXb);
 
     return terminals;
 };
@@ -222,9 +270,6 @@ tree<Node> SearchSpace::PTC2(Node root, int max_d, int max_size) const
             n = opt.value();
 
             Tree.replace(qspot, n);
-
-            s = s + 2; // the (*) and (weight) nodes of a weighted terminal;
-                       // the terminal itself is counted at the end of the while loop
         }
         else
         {
@@ -253,7 +298,11 @@ tree<Node> SearchSpace::PTC2(Node root, int max_d, int max_size) const
             }
         }
 
+        // the size increment depends on whether the inserted node is weighted
         ++s;
+        if (n.get_is_weighted())
+            s += 2;
+
         /* cout << "current tree size: " << s << endl; */
     }
     /* cout << "entering second while loop...\n"; */
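Tying the constant-weight logic together with hypothetical numbers: if the dataset produced two float terminals whose slopes gave prob_change values 0.8 and 0.2, then signature_avg(cXf.ret_type) returns (0.8 + 0.2) / 2 = 0.5, which becomes the initial prob_change of the float constant "C". Note that the lambda divides sum by count without a guard, so it assumes at least one terminal of each constant's signature exists; for a dataset with no boolean features, for instance, signature_avg(cXb.ret_type) computes 0.0/0 and the boolean constant's prob_change becomes NaN.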