From c0e0ec82a08755564fa71e70f7fd6c0959f22257 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Tue, 3 Sep 2024 08:41:21 -0300 Subject: [PATCH] Accuracy metric. Fixed hall of fame when merging fronts of multiple islands --- pybrush/EstimatorInterface.py | 3 +- src/eval/evaluation.h | 1 + src/eval/metrics.cpp | 62 +++++++++++++++++++++++++++++++++++ src/eval/metrics.h | 25 ++++++++++++++ src/eval/scorer.h | 3 ++ src/ind/individual.h | 2 +- src/pop/archive.cpp | 1 + src/pop/population.cpp | 30 ++++++++++++++++- src/selection/lexicase.cpp | 2 +- 9 files changed, 125 insertions(+), 4 deletions(-) diff --git a/pybrush/EstimatorInterface.py b/pybrush/EstimatorInterface.py index 4de7793d..ddfd1b79 100644 --- a/pybrush/EstimatorInterface.py +++ b/pybrush/EstimatorInterface.py @@ -254,7 +254,8 @@ def _wrap_parameters(self, **extra_kwargs): assert self.scorer in ['mse'], \ "Invalid scorer for the regression mode" else: - assert self.scorer in ['log', 'multi_log', 'average_precision_score'], \ + assert self.scorer in ['log', 'multi_log', + 'accuracy', 'average_precision_score'], \ "Invalid scorer for the classification mode" params.scorer = self.scorer diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index 1c8a85b4..fc5b5748 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -33,6 +33,7 @@ class Evaluation { Evaluation(){ // TODO: make eval update loss_v accordingly, and set to th same as train loss if there is no batch or no validation + // TODO: make accuracy the main classification metric? 
string scorer; if ( (T == Brush::ProgramType::MulticlassClassifier) || (T == Brush::ProgramType::Representer) ) diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index 52036d63..ee9e3a3b 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -57,6 +57,18 @@ float mean_log_loss(const VectorXf& y, return loss.mean(); } +float zero_one_loss(const VectorXf& y, + const VectorXf& predict_proba, VectorXf& loss, + const vector& class_weights ) +{ + VectorXi yhat = (predict_proba.array() > 0.5).cast(); + + loss = (yhat.array() != y.cast().array()).cast(); + + //TODO: weight loss by sample weights + return 1.0 - loss.mean(); +} + float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, const vector& class_weights) { @@ -199,5 +211,55 @@ float mean_multi_log_loss(const VectorXf& y, return loss.mean(); } +float bal_zero_one_loss(const VectorXf& y, + const ArrayXXf& predict_proba, VectorXf& loss, + const vector& class_weights ) +{ + // TODO: implement this + // vector uc = unique(y); + // vector c; + // for (const auto& i : uc) + // c.push_back(int(i)); + + // // sensitivity (TP) and specificity (TN) + // vector TP(c.size(),0.0), TN(c.size(), 0.0), P(c.size(),0.0), N(c.size(),0.0); + // ArrayXf class_accuracies(c.size()); + + // // get class counts + + // for (unsigned i=0; i< c.size(); ++i) + // { + // P.at(i) = (y.array().cast() == c.at(i)).count(); // total positives for this class + // N.at(i) = (y.array().cast() != c.at(i)).count(); // total negatives for this class + // } + + + // for (unsigned i = 0; i < y.rows(); ++i) + // { + // if (yhat(i) == y(i)) // true positive + // ++TP.at(y(i) == -1 ? 0 : y(i)); // if-then ? 
accounts for -1 class encoding + + // for (unsigned j = 0; j < c.size(); ++j) + // if ( y(i) !=c.at(j) && yhat(i) != c.at(j) ) // true negative + // ++TN.at(j); + + // } + + // // class-wise accuracy = 1/2 ( true positive rate + true negative rate) + // for (unsigned i=0; i< c.size(); ++i){ + // class_accuracies(i) = (TP.at(i)/P.at(i) + TN.at(i)/N.at(i))/2; + // //std::cout << "TP(" << i << "): " << TP.at(i) << ", P[" << i << "]: " << P.at(i) << "\n"; + // //std::cout << "TN(" << i << "): " << TN.at(i) << ", N[" << i << "]: " << N.at(i) << "\n"; + // //std::cout << "class accuracy(" << i << "): " << class_accuracies(i) << "\n"; + // } + + // // set loss vectors if third argument supplied + // loss = (yhat.cast().array() != y.cast().array()).cast(); + + // return 1.0 - class_accuracies.mean(); + + return 0.0; +} + } // metrics } // Brush \ No newline at end of file diff --git a/src/eval/metrics.h b/src/eval/metrics.h index 7a66f8e5..49e36b57 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -60,6 +60,18 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, VectorXf& loss, const vector& class_weights=vector()); +/** + * @brief Accuracy for binary classification + * @param y The true labels. + * @param predict_proba The predicted probabilities. + * @param loss Reference to store the calculated losses for each sample. + * @param class_weights The optional class weights. + * @return The final accuracy. + */ +float zero_one_loss(const VectorXf& y, const VectorXf& predict_proba, + VectorXf& loss, + const vector& class_weights=vector() ); + // multiclass classification --------------------------------------------------- /** @@ -84,6 +96,19 @@ float mean_multi_log_loss(const VectorXf& y, const ArrayXXf& predict_proba, VectorXf& loss, const vector& class_weights=vector()); +/** + * @brief Accuracy for multi-classification + * @param y The true labels. + * @param predict_proba The predicted probabilities. 
+ * @param loss Reference to store the calculated losses for each sample. + * @param class_weights The optional class weights. + * @return The average accuracy in a one-vs-all schema. + */ +float bal_zero_one_loss(const VectorXf& y, const ArrayXXf& predict_proba, + VectorXf& loss, + const vector& class_weights=vector() ); + + } // metrics } // Brush diff --git a/src/eval/scorer.h b/src/eval/scorer.h index a47e4c9f..6f67f4a6 100644 --- a/src/eval/scorer.h +++ b/src/eval/scorer.h @@ -89,6 +89,7 @@ typedef float (*funcPointer)(const VectorXf&, Scorer(string scorer="log") { score_hash["log"] = &mean_log_loss; score_hash["average_precision_score"] = &average_precision_score; + score_hash["accuracy"] = &zero_one_loss; this->set_scorer(scorer); }; @@ -138,8 +139,10 @@ typedef float (*funcPointer)(const VectorXf&, std::map score_hash; string scorer; + // TODO: I actually need to test this stuff Scorer(string scorer="multi_log") { score_hash["multi_log"] = &mean_multi_log_loss; + score_hash["accuracy"] = &bal_zero_one_loss; this->set_scorer(scorer); }; diff --git a/src/ind/individual.h b/src/ind/individual.h index 79ab1cb3..4eb96936 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -141,7 +141,7 @@ class Individual{ {"log", -1.0}, {"multi_log", -1.0}, {"average_precision_score", +1.0}, - {"accuracy", +1.0} + {"accuracy", +1.0} // TODO: make sure we are maximizing accuracy // {"error", -1.0} }; diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 7d0f4579..abe78c4f 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -110,6 +110,7 @@ void Archive::update(Population& pop, const Parameters& params) individuals.resize(0); // clear archive // refill archive with new pareto fronts (one pareto front for each island!) 
+ // TODO: refill with fast nds just like hall of fame for (int island =0; island< pop.num_islands; ++island) { vector indices = pop.get_island_indexes(island); diff --git a/src/pop/population.cpp b/src/pop/population.cpp index 20070901..bdfbae00 100644 --- a/src/pop/population.cpp +++ b/src/pop/population.cpp @@ -260,7 +260,8 @@ vector> Population::sorted_front(unsigned rank) template vector Population::hall_of_fame(unsigned rank) { - // TODO: hall of fame should unify all pareto fronts by doing a new fast_nds. + // Inspired in fast nds from nsga2 + // TODO: use hall of fame instead of re-implementing this feature in // archive init and update functions @@ -279,6 +280,33 @@ vector Population::hall_of_fame(unsigned rank) } } + // checking if there is no dominance between different fronts + // (without updating their fitness objects) + vector hof; + hof.clear(); + + for (int i = 0; i < pf.size(); ++i) { + + std::vector dom; + int dcount = 0; + + auto p = individuals.at(pf[i]); + + for (int j = 0; j < pf.size(); ++j) { + const Individual& q = (*individuals.at(pf[j])); + + int compare = p->fitness.dominates(q.fitness); + if (compare == -1) { // q dominates p + //p.dcounter += 1; + dcount += 1; + } + } + + if (dcount == 0) { + hof.push_back(pf[i]); + } + } + // TODO: should I sort the hall of fame by complexity? or error? std::sort(pf.begin(),pf.end(),SortComplexity(*this)); diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index ac450a2c..8c74f3e0 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -116,7 +116,7 @@ vector Lexicase::select(Population& pop, int island, // minimum error on case float minfit = std::numeric_limits::max(); - // get minimum + // get minimum (assuming minimization of individual errors) for (size_t j = 0; jerror(cases[h]) < minfit) minfit = pop.individuals.at(pool[j])->error(cases[h]);