From 0308ac6f9e10f4ddde28931e1bb27bc953c3c3a3 Mon Sep 17 00:00:00 2001 From: kasim Date: Sun, 12 Aug 2018 10:18:17 +0300 Subject: [PATCH 01/17] Rename complexity_bases_scorer to combo_based_scorer --- moses/moses/deme/deme_expander.cc | 4 ++-- moses/moses/representation/instance_scorer.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/moses/deme/deme_expander.cc b/moses/moses/deme/deme_expander.cc index ec72e98315..89b9c5044f 100644 --- a/moses/moses/deme/deme_expander.cc +++ b/moses/moses/deme/deme_expander.cc @@ -487,8 +487,8 @@ void deme_expander::optimize_demes(int max_evals, time_t max_time) } // Optimize - complexity_based_scorer cpx_scorer = - complexity_based_scorer(_cscorer, _reps[i], _params.reduce_all); + combo_based_scorer cpx_scorer = + combo_based_scorer(_cscorer, _reps[i], _params.reduce_all); _optimize(_demes[i][j], cpx_scorer, max_evals_per_deme, max_time); } diff --git a/moses/moses/representation/instance_scorer.h b/moses/moses/representation/instance_scorer.h index 85dea68cb9..7ec50afdb8 100644 --- a/moses/moses/representation/instance_scorer.h +++ b/moses/moses/representation/instance_scorer.h @@ -67,16 +67,16 @@ struct distance_based_scorer : public iscorer_base const instance& target_inst; }; -struct complexity_based_scorer : public iscorer_base +struct combo_based_scorer : public iscorer_base { - complexity_based_scorer(behave_cscore& cs, + combo_based_scorer(behave_cscore& cs, representation& rep, bool reduce) : _cscorer(cs), _rep(rep), _reduce(reduce) {} composite_score operator()(const instance& inst) const { if (logger().is_fine_enabled()) { - logger().fine() << "complexity_based_scorer - Evaluate instance: " + logger().fine() << "combo_based_scorer - Evaluate instance: " << _rep.fields().to_string(inst); } From 9bf1139ebea5070dd18248d100fbe793bd3b3b8d Mon Sep 17 00:00:00 2001 From: kasim Date: Sun, 12 Aug 2018 14:03:35 +0300 Subject: [PATCH 02/17] Add atomese_based_scorer --- 
moses/moses/moses/complexity.cc | 6 +++ moses/moses/moses/complexity.h | 3 ++ moses/moses/representation/instance_scorer.h | 29 ++++++++++++++ moses/moses/scoring/behave_cscore.cc | 42 +++++++++++++++++++- moses/moses/scoring/behave_cscore.h | 12 ++++++ moses/moses/scoring/scoring_base.cc | 8 ++++ moses/moses/scoring/scoring_base.h | 9 ++++- 7 files changed, 107 insertions(+), 2 deletions(-) diff --git a/moses/moses/moses/complexity.cc b/moses/moses/moses/complexity.cc index a81b4aa70d..dc38cea7b0 100644 --- a/moses/moses/moses/complexity.cc +++ b/moses/moses/moses/complexity.cc @@ -119,5 +119,11 @@ complexity_t tree_complexity(const combo_tree& tr, return tree_complexity(tr.begin(), stopper); } +complexity_t atomese_complexity(const Handle &) +{ + OC_ASSERT(false, "Atomese Complexity not implemented yet"); + return 0; +} + } // ~namespace moses } // ~namespace opencog diff --git a/moses/moses/moses/complexity.h b/moses/moses/moses/complexity.h index caee027d86..04f4b11984 100644 --- a/moses/moses/moses/complexity.h +++ b/moses/moses/moses/complexity.h @@ -28,6 +28,7 @@ // of particular combo programs. #include +#include namespace opencog { namespace moses { @@ -44,6 +45,8 @@ namespace opencog { namespace moses { complexity_t tree_complexity(const combo::combo_tree&, bool (*)(const combo::combo_tree::iterator&) = NULL); + complexity_t atomese_complexity(const Handle&); + } //~namespace moses } //~namespace opencog diff --git a/moses/moses/representation/instance_scorer.h b/moses/moses/representation/instance_scorer.h index 7ec50afdb8..c99aedfed0 100644 --- a/moses/moses/representation/instance_scorer.h +++ b/moses/moses/representation/instance_scorer.h @@ -29,6 +29,7 @@ #include "field_set.h" #include "representation.h" #include "../scoring/behave_cscore.h" +#include "moses/comboreduct/converter/combo_atomese.h" namespace opencog { namespace moses { @@ -106,6 +107,34 @@ struct combo_based_scorer : public iscorer_base // hits. 
}; +struct atomese_based_scorer : public iscorer_base +{ + atomese_based_scorer(behave_cscore &cs, + representation &rep, bool reduce) + : _cscorer(cs), _rep(rep), _reduce(reduce) + {} + + composite_score operator()(const instance &inst) const + { + if (logger().is_fine_enabled()) { + logger().fine() << "atomese_based_scorer - Evaluate instance: " + << _rep.fields().to_string(inst); + } + + combo_tree tr = _rep.get_candidate(inst, _reduce); + Handle handle = atomese_combo(tr); + return _cscorer.get_cscore(handle); + } + +protected: + behave_cscore &_cscorer; + representation &_rep; + bool _reduce; // whether the exemplar should be reduced before being + // evaluated. This is advantagous when _cscorer is + // also a cache; the reduced form will have more cache + // hits. +}; + } //~namespace moses } //~namespace opencog diff --git a/moses/moses/scoring/behave_cscore.cc b/moses/moses/scoring/behave_cscore.cc index f97934093d..1dd94624d7 100644 --- a/moses/moses/scoring/behave_cscore.cc +++ b/moses/moses/scoring/behave_cscore.cc @@ -33,7 +33,8 @@ namespace opencog { namespace moses { behave_cscore::behave_cscore(bscore_base& b, size_t initial_cache_size) : _bscorer(b), _have_cache(0get_cscore_nocache(handle); +} + +composite_score behave_cscore::get_cscore_nocache(const Handle &handle) +{ + behavioral_score bs; + try { + bs = get_bscore(handle); + } + catch (...) 
+ { + return worst_composite_score; + } + score_t res = _bscorer.sum_bscore(bs); + + complexity_t cpxy = _bscorer.get_complexity(handle); + score_t cpxy_coef = _bscorer.get_complexity_coef(); + if (logger().is_fine_enabled()) { + logger().fine() << "behave_cscore: " << res + << " complexity: " << cpxy + << " cpxy_coeff: " << cpxy_coef; + } + + return composite_score(res, cpxy, cpxy * cpxy_coef, 0.0); +} + score_t behave_cscore::best_possible_score() const { // This uses a flat, uniform weighting diff --git a/moses/moses/scoring/behave_cscore.h b/moses/moses/scoring/behave_cscore.h index d68a36151f..74f17d2a40 100644 --- a/moses/moses/scoring/behave_cscore.h +++ b/moses/moses/scoring/behave_cscore.h @@ -27,6 +27,7 @@ #define _MOSES_BEHAVE_CSCORE_H #include +#include #include "scoring_base.h" @@ -54,8 +55,10 @@ class behave_cscore behavioral_score get_bscore(const combo_tree&) const; behavioral_score get_bscore(const scored_combo_tree_set&) const; + behavioral_score get_bscore(const Handle&) const; composite_score get_cscore(const combo_tree&); composite_score get_cscore(const scored_combo_tree_set&); + composite_score get_cscore(const Handle&); /// Returns the best score reachable for this problem. Used as /// termination condition. @@ -116,10 +119,19 @@ class behave_cscore behave_cscore* self; }; + struct atomese_wrapper : public std::unary_function + { + composite_score operator()(const Handle&) const; + behave_cscore* self; + }; + bool _have_cache; wrapper _wrapper; + atomese_wrapper _atomese_wrapper; prr_cache_threaded _cscore_cache; + prr_cache_threaded _atomese_cscore_cache; composite_score get_cscore_nocache(const combo_tree&); + composite_score get_cscore_nocache(const Handle&); public: // weird hack for subsample scoring... 
diff --git a/moses/moses/scoring/scoring_base.cc b/moses/moses/scoring/scoring_base.cc index 6b6098c2a3..5372b06586 100644 --- a/moses/moses/scoring/scoring_base.cc +++ b/moses/moses/scoring/scoring_base.cc @@ -31,6 +31,7 @@ #include #include +#include #include "scoring_base.h" namespace opencog { namespace moses { @@ -69,6 +70,13 @@ void bscore_base::set_complexity_coef(score_t complexity_ratio) logger().info() << "BScore complexity ratio = " << 1.0/_complexity_coef; } +behavioral_score bscore_base::operator()(const Handle &) const +{ + OC_ASSERT(false, "Ensemble scoring not implemented for bscorer %s", + typeid(*this).name()); + return behavioral_score(); +} + behavioral_score bscore_base::operator()(const scored_combo_tree_set& ensemble) const { diff --git a/moses/moses/scoring/scoring_base.h b/moses/moses/scoring/scoring_base.h index 95141c35c1..01364a1cd2 100644 --- a/moses/moses/scoring/scoring_base.h +++ b/moses/moses/scoring/scoring_base.h @@ -54,7 +54,7 @@ score_t contin_complexity_coef(unsigned alphabet_size, double stdev); /// Abstract base class for behavioral scoring. /// A behavioral score is a vector of scores, one per sample of a dataset. 
-struct bscore_base : public std::unary_function +struct bscore_base { bscore_base() : _return_weighted_score(false), _complexity_coef(0.0), _size(0) {}; virtual ~bscore_base() {}; @@ -62,6 +62,9 @@ struct bscore_base : public std::unary_function /// Return the behavioral score for the combo_tree virtual behavioral_score operator()(const combo_tree&) const = 0; + /// Return the behavioral score for the Handle + virtual behavioral_score operator()(const Handle&) const; + /// Return the behavioral score for the ensemble virtual behavioral_score operator()(const scored_combo_tree_set&) const; @@ -203,6 +206,10 @@ struct bscore_base : public std::unary_function } virtual complexity_t get_complexity(const scored_combo_tree_set&) const; + virtual complexity_t get_complexity(const Handle &handle)const + { + return atomese_complexity(handle); + } /// Return the complexity coefficient. This is used to obtain the /// complexity penalty for the score, which is meant to be computed /// as penalty = get_complexity_coef() * get_complexity(tree); From dc4b078b7259ea624de0302a519282c9e1925976 Mon Sep 17 00:00:00 2001 From: kasim Date: Sun, 12 Aug 2018 17:37:33 +0300 Subject: [PATCH 03/17] Add logical_bscore --- moses/comboreduct/table/table.h | 11 +++++++++++ moses/moses/scoring/bscores.cc | 16 ++++++++++++++++ moses/moses/scoring/bscores.h | 2 ++ 3 files changed, 29 insertions(+) diff --git a/moses/comboreduct/table/table.h b/moses/comboreduct/table/table.h index 64944a171f..1bb7ebc027 100644 --- a/moses/comboreduct/table/table.h +++ b/moses/comboreduct/table/table.h @@ -41,6 +41,7 @@ #include #include #include +#include #include "../type_checker/type_tree.h" #include "../interpreter/eval.h" @@ -1454,6 +1455,16 @@ class complete_truth_table : public bool_seq populate(tr); } + complete_truth_table(const Handle&) + { + OC_ASSERT(false, "Truth table from Handle not implemented yet"); + } + + complete_truth_table(const Handle&, arity_t arity) + { + OC_ASSERT(false, "Truth table from 
Handle not implemented yet"); + } + template complete_truth_table(const Func& f, arity_t arity) : super(pow2(arity)), _arity(arity) { diff --git a/moses/moses/scoring/bscores.cc b/moses/moses/scoring/bscores.cc index cfd0cb5aae..88d2910a08 100644 --- a/moses/moses/scoring/bscores.cc +++ b/moses/moses/scoring/bscores.cc @@ -133,6 +133,22 @@ score_t logical_bscore::get_error(const behavioral_score& bs) const return - sum_bscore(bs) / ((score_t) _size); } +behavioral_score logical_bscore::operator()(const Handle &handle) const +{ + combo::complete_truth_table tt(handle, _arity); + behavioral_score bs(_size); + + // Compare the predictions of the tree to that of the desired + // result. A correct prdiction gets a score of 0, an incorrect + // prediction gets a score of -1. + boost::transform(tt, _target, bs.begin(), [](bool b1, bool b2) + { + return -score_t(b1 != b2); + }); + + return bs; +} + /////////////////// // contin_bscore // /////////////////// diff --git a/moses/moses/scoring/bscores.h b/moses/moses/scoring/bscores.h index 0623a3ac24..264bc9ca98 100644 --- a/moses/moses/scoring/bscores.h +++ b/moses/moses/scoring/bscores.h @@ -68,6 +68,8 @@ struct logical_bscore : public bscore_base behavioral_score operator()(const combo_tree&) const; behavioral_score operator()(const scored_combo_tree_set&) const; + behavioral_score operator()(const Handle&) const; + behavioral_score best_possible_bscore() const; behavioral_score worst_possible_bscore() const; score_t get_error(const behavioral_score&) const; From b7681610b1551e33e056d4f350369c30cfc38615 Mon Sep 17 00:00:00 2001 From: kasim Date: Mon, 13 Aug 2018 15:27:25 +0300 Subject: [PATCH 04/17] Add atomese_complexity calc --- moses/moses/moses/complexity.cc | 27 +++++++++++++++++++++++++-- moses/moses/moses/complexity.h | 4 +++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/moses/moses/moses/complexity.cc b/moses/moses/moses/complexity.cc index dc38cea7b0..ff9b8b9525 100644 --- 
a/moses/moses/moses/complexity.cc +++ b/moses/moses/moses/complexity.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include "complexity.h" @@ -119,9 +120,31 @@ complexity_t tree_complexity(const combo_tree& tr, return tree_complexity(tr.begin(), stopper); } -complexity_t atomese_complexity(const Handle &) +complexity_t atomese_complexity(const Handle &handle, + bool (*stopper)(const Handle&)) { - OC_ASSERT(false, "Atomese Complexity not implemented yet"); + if (stopper && stopper(handle)) return 0; + + Type t = handle->get_type(); + + if (SCHEMA_NODE == t || PREDICATE_NODE == t || NUMBER_NODE == t){ + return 1; + } + if (TIMES_LINK == t){ + for (Handle h : handle->getOutgoingSet()){ + if (h->get_type() == NUMBER_NODE && NumberNodeCast(h)->get_value() == 0) + return 0; + } + } + if (nameserver().isA(t, LINK)){ + int c = int(t==DIVIDE_LINK + ||t==RANDOM_CHOICE_LINK + ||t==RANDOM_NUMBER_LINK); + for (Handle h : handle->getOutgoingSet()){ + c += atomese_complexity(h, stopper); + } + return c; + } return 0; } diff --git a/moses/moses/moses/complexity.h b/moses/moses/moses/complexity.h index 04f4b11984..a8e1e58413 100644 --- a/moses/moses/moses/complexity.h +++ b/moses/moses/moses/complexity.h @@ -29,6 +29,7 @@ #include #include +#include namespace opencog { namespace moses { @@ -45,7 +46,8 @@ namespace opencog { namespace moses { complexity_t tree_complexity(const combo::combo_tree&, bool (*)(const combo::combo_tree::iterator&) = NULL); - complexity_t atomese_complexity(const Handle&); + complexity_t atomese_complexity(const Handle&, + bool (*)(const Handle&) = NULL); } //~namespace moses } //~namespace opencog From c45a24c1a20f4f53bd6bd6e440f991deae14c419 Mon Sep 17 00:00:00 2001 From: kasim Date: Sat, 18 Aug 2018 15:50:40 +0300 Subject: [PATCH 05/17] Fix _atomese_wrapper.self not initialized --- moses/moses/scoring/behave_cscore.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/moses/scoring/behave_cscore.cc b/moses/moses/scoring/behave_cscore.cc index 
1dd94624d7..918b8d1f87 100644 --- a/moses/moses/scoring/behave_cscore.cc +++ b/moses/moses/scoring/behave_cscore.cc @@ -37,6 +37,7 @@ behave_cscore::behave_cscore(bscore_base& b, size_t initial_cache_size) _atomese_cscore_cache(initial_cache_size, _atomese_wrapper, "compositescore") { _wrapper.self = this; + _atomese_wrapper.self = this; } behavioral_score behave_cscore::get_bscore(const combo_tree& tr) const From 8446ea4df744f090b1d1f065ad72293a576cdb65 Mon Sep 17 00:00:00 2001 From: kasim Date: Sat, 18 Aug 2018 16:49:44 +0300 Subject: [PATCH 06/17] Move instance_scorer implementations to instance_scorer.cc --- moses/moses/CMakeLists.txt | 1 + moses/moses/representation/instance_scorer.cc | 83 +++++++++++++++++++ moses/moses/representation/instance_scorer.h | 49 +---------- 3 files changed, 87 insertions(+), 46 deletions(-) create mode 100644 moses/moses/representation/instance_scorer.cc diff --git a/moses/moses/CMakeLists.txt b/moses/moses/CMakeLists.txt index 2f33b02bd7..c03bdbc8be 100644 --- a/moses/moses/CMakeLists.txt +++ b/moses/moses/CMakeLists.txt @@ -54,6 +54,7 @@ ADD_LIBRARY (moses SHARED representation/knob_mapper representation/knobs representation/representation + representation/instance_scorer scoring/behave_cscore scoring/bscores diff --git a/moses/moses/representation/instance_scorer.cc b/moses/moses/representation/instance_scorer.cc new file mode 100644 index 0000000000..f771163c07 --- /dev/null +++ b/moses/moses/representation/instance_scorer.cc @@ -0,0 +1,83 @@ +/* + * moses/moses/representation/instance_scorer.h + * + * Copyright (C) 2002-2008 Novamente LLC + * Copyright (C) 2012,2013 Poulin Holdings LLC + * All Rights Reserved + * + * Written by Moshe Looks, Nil Geisweiller, Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at 
http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "instance_scorer.h" + +namespace opencog +{ +namespace moses +{ + +composite_score distance_based_scorer::operator()(const instance &inst) const +{ + score_t sc = -fs.hamming_distance(target_inst, inst); + // Logger + if (logger().is_fine_enabled()) { + logger().fine() << "distance_based_scorer - Evaluate instance: " + << fs.to_string(inst) << "\n" + << "Score = " << sc << std::endl; + } + // ~Logger + return composite_score(sc, 0, 0, 0); +} + +composite_score combo_based_scorer::operator()(const instance &inst) const +{ + if (logger().is_fine_enabled()) { + logger().fine() << "combo_based_scorer - Evaluate instance: " + << _rep.fields().to_string(inst); + } + + try { + combo_tree tr = _rep.get_candidate(inst, _reduce); + return _cscorer.get_cscore(tr); + } catch (...) { +// XXX FIXME, calling score_tree above does not throw the exception; this should be done +// differntly, maybe call bscorer directly, then ascorer... +// ??? Huh? why couldn't we evaluate a tree anyway? why would we want an exception here? 
+ combo_tree raw_tr = _rep.get_candidate(inst, false); + combo_tree red_tr = _rep.get_candidate(inst, true); + logger().warn() << "The following instance could not be evaluated: " + << _rep.fields().to_string(inst) + << "\nUnreduced tree: " << raw_tr + << "\nreduced tree: "<< red_tr; + } + return worst_composite_score; +} + +composite_score atomese_based_scorer::operator()(const instance &inst) const +{ + if (logger().is_fine_enabled()) { + logger().fine() << "atomese_based_scorer - Evaluate instance: " + << _rep.fields().to_string(inst); + } + + combo_tree tr = _rep.get_candidate(inst, _reduce); + Handle handle = atomese_combo(tr); + return _cscorer.get_cscore(handle); +} + +} +} diff --git a/moses/moses/representation/instance_scorer.h b/moses/moses/representation/instance_scorer.h index c99aedfed0..40025fff46 100644 --- a/moses/moses/representation/instance_scorer.h +++ b/moses/moses/representation/instance_scorer.h @@ -50,18 +50,7 @@ struct distance_based_scorer : public iscorer_base const instance& _target_inst) : fs(_fs), target_inst(_target_inst) {} - composite_score operator()(const instance& inst) const - { - score_t sc = -fs.hamming_distance(target_inst, inst); - // Logger - if (logger().is_fine_enabled()) { - logger().fine() << "distance_based_scorer - Evaluate instance: " - << fs.to_string(inst) << "\n" - << "Score = " << sc << std::endl; - } - // ~Logger - return composite_score(sc, 0, 0, 0); - } + composite_score operator()(const instance& inst) const; protected: const field_set& fs; @@ -74,29 +63,7 @@ struct combo_based_scorer : public iscorer_base representation& rep, bool reduce) : _cscorer(cs), _rep(rep), _reduce(reduce) {} - composite_score operator()(const instance& inst) const - { - if (logger().is_fine_enabled()) { - logger().fine() << "combo_based_scorer - Evaluate instance: " - << _rep.fields().to_string(inst); - } - - try { - combo_tree tr = _rep.get_candidate(inst, _reduce); - return _cscorer.get_cscore(tr); - } catch (...) 
{ -// XXX FIXME, calling score_tree above does not throw the exception; this should be done -// differntly, maybe call bscorer directly, then ascorer... -// ??? Huh? why couldn't we evaluate a tree anyway? why would we want an exception here? - combo_tree raw_tr = _rep.get_candidate(inst, false); - combo_tree red_tr = _rep.get_candidate(inst, true); - logger().warn() << "The following instance could not be evaluated: " - << _rep.fields().to_string(inst) - << "\nUnreduced tree: " << raw_tr - << "\nreduced tree: "<< red_tr; - } - return worst_composite_score; - } + composite_score operator()(const instance& inst) const; protected: behave_cscore& _cscorer; @@ -114,17 +81,7 @@ struct atomese_based_scorer : public iscorer_base : _cscorer(cs), _rep(rep), _reduce(reduce) {} - composite_score operator()(const instance &inst) const - { - if (logger().is_fine_enabled()) { - logger().fine() << "atomese_based_scorer - Evaluate instance: " - << _rep.fields().to_string(inst); - } - - combo_tree tr = _rep.get_candidate(inst, _reduce); - Handle handle = atomese_combo(tr); - return _cscorer.get_cscore(handle); - } + composite_score operator()(const instance &inst) const; protected: behave_cscore &_cscorer; From a129435642e56a9e8b0b2678149426cb3f0bdede Mon Sep 17 00:00:00 2001 From: kasim Date: Sat, 18 Aug 2018 16:53:08 +0300 Subject: [PATCH 07/17] Replace NULL with nullptr --- moses/moses/moses/complexity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/moses/moses/complexity.h b/moses/moses/moses/complexity.h index a8e1e58413..b55a7bfdb6 100644 --- a/moses/moses/moses/complexity.h +++ b/moses/moses/moses/complexity.h @@ -47,7 +47,7 @@ namespace opencog { namespace moses { bool (*)(const combo::combo_tree::iterator&) = NULL); complexity_t atomese_complexity(const Handle&, - bool (*)(const Handle&) = NULL); + bool (*)(const Handle&) = nullptr); } //~namespace moses } //~namespace opencog From 87b726ca99b48399158cb24ae20ca133c6329c05 Mon Sep 17 00:00:00 2001 
From: kasim Date: Fri, 31 Aug 2018 12:41:13 +0300 Subject: [PATCH 08/17] Reformat files to oc_code_standard --- moses/moses/moses/complexity.cc | 144 +- moses/moses/moses/complexity.h | 27 +- moses/moses/representation/instance_scorer.cc | 2 +- moses/moses/representation/instance_scorer.h | 67 +- moses/moses/scoring/behave_cscore.cc | 153 +- moses/moses/scoring/behave_cscore.h | 162 +- moses/moses/scoring/bscores.cc | 1535 ++++++++--------- moses/moses/scoring/bscores.h | 455 ++--- moses/moses/scoring/scoring_base.cc | 398 ++--- moses/moses/scoring/scoring_base.h | 568 +++--- 10 files changed, 1786 insertions(+), 1725 deletions(-) diff --git a/moses/moses/moses/complexity.cc b/moses/moses/moses/complexity.cc index ff9b8b9525..daa35981c2 100644 --- a/moses/moses/moses/complexity.cc +++ b/moses/moses/moses/complexity.cc @@ -27,7 +27,10 @@ #include "complexity.h" -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ using namespace opencog::combo; @@ -55,92 +58,93 @@ using namespace opencog::combo; // // Note Bene: this function returns a POSITIVE number! complexity_t tree_complexity(combo_tree::iterator it, - bool (*stopper)(const combo_tree::iterator&)) + bool (*stopper)(const combo_tree::iterator &)) { - // base cases - // null_vertex marks the location of a logical knob. Halt - // recursion past logical knobs. - if (*it == id::logical_true - || *it == id::logical_false - || *it == id::null_vertex) - return 0; - - // If the stopper function is defined, and it returns true, we halt - // recursion. This is needed for knob-probing across boundaries - // between logical and contin expressions. - if (stopper && stopper(it)) return 0; - - // *(0 stuff) marks the location of a contin knob. Halt recursion - // past contin knobs. This is for knob-probing. - if ((*it==id::times) && is_contin(*it.begin()) && - (0 == get_contin(*it.begin()))) - return 0; - - // Contins get a complexity of 1. 
But perhaps, contins should - // get a complexity of 2 or more, if they are very large, or - // require many digits of precision. - if (is_argument(*it) - || is_contin(*it) - || is_builtin_action(*it) - || is_ann_type(*it) - || is_action_result(*it)) - return 1; - - // recursive cases - if (*it == id::logical_not) - return tree_complexity(it.begin(), stopper); - - // If an operator is not listed below, it has a complexity of zero. - // Note that logical_and, logical_or are not listed, these have a - // complexity of zero. - // - // div and trigonometric functions have complexity one. - // But greatere_than_zero, impulse, plus, times are all treated - // with complexity zero. Why? I dunno; maybe because impulse is - // like an unavoidable type conversion? Kind of like id::not above ??? - int c = int(*it==id::div - || *it==id::exp - || *it==id::log - || *it==id::sin - || *it==id::rand - || *it==id::equ - || *it==id::cond); - - for (combo_tree::sibling_iterator sib = it.begin(); sib != it.end(); ++sib) - c += tree_complexity(sib, stopper); - return c; + // base cases + // null_vertex marks the location of a logical knob. Halt + // recursion past logical knobs. + if (*it == id::logical_true + || *it == id::logical_false + || *it == id::null_vertex) + return 0; + + // If the stopper function is defined, and it returns true, we halt + // recursion. This is needed for knob-probing across boundaries + // between logical and contin expressions. + if (stopper && stopper(it)) return 0; + + // *(0 stuff) marks the location of a contin knob. Halt recursion + // past contin knobs. This is for knob-probing. + if ((*it == id::times) && is_contin(*it.begin()) && + (0 == get_contin(*it.begin()))) + return 0; + + // Contins get a complexity of 1. But perhaps, contins should + // get a complexity of 2 or more, if they are very large, or + // require many digits of precision. 
+ if (is_argument(*it) + || is_contin(*it) + || is_builtin_action(*it) + || is_ann_type(*it) + || is_action_result(*it)) + return 1; + + // recursive cases + if (*it == id::logical_not) + return tree_complexity(it.begin(), stopper); + + // If an operator is not listed below, it has a complexity of zero. + // Note that logical_and, logical_or are not listed, these have a + // complexity of zero. + // + // div and trigonometric functions have complexity one. + // But greatere_than_zero, impulse, plus, times are all treated + // with complexity zero. Why? I dunno; maybe because impulse is + // like an unavoidable type conversion? Kind of like id::not above ??? + int c = int(*it == id::div + || *it == id::exp + || *it == id::log + || *it == id::sin + || *it == id::rand + || *it == id::equ + || *it == id::cond); + + for (combo_tree::sibling_iterator sib = it.begin(); sib != it.end(); ++sib) + c += tree_complexity(sib, stopper); + return c; } -complexity_t tree_complexity(const combo_tree& tr, - bool (*stopper)(const combo_tree::iterator&)) +complexity_t tree_complexity(const combo_tree &tr, + bool (*stopper)(const combo_tree::iterator &)) { - combo_tree::iterator it = tr.begin(); - if (it == tr.end()) return 0; + combo_tree::iterator it = tr.begin(); + if (it == tr.end()) return 0; - return tree_complexity(tr.begin(), stopper); + return tree_complexity(tr.begin(), stopper); } complexity_t atomese_complexity(const Handle &handle, - bool (*stopper)(const Handle&)) + bool (*stopper)(const Handle &)) { - if (stopper && stopper(handle)) return 0; + if (stopper && stopper(handle)) + return 0; Type t = handle->get_type(); - if (SCHEMA_NODE == t || PREDICATE_NODE == t || NUMBER_NODE == t){ + if (SCHEMA_NODE == t || PREDICATE_NODE == t || NUMBER_NODE == t) return 1; - } - if (TIMES_LINK == t){ - for (Handle h : handle->getOutgoingSet()){ + + if (TIMES_LINK == t) { + for (Handle h : handle->getOutgoingSet()) { if (h->get_type() == NUMBER_NODE && NumberNodeCast(h)->get_value() == 
0) return 0; } } - if (nameserver().isA(t, LINK)){ - int c = int(t==DIVIDE_LINK - ||t==RANDOM_CHOICE_LINK - ||t==RANDOM_NUMBER_LINK); - for (Handle h : handle->getOutgoingSet()){ + if (nameserver().isA(t, LINK)) { + int c = int(t == DIVIDE_LINK + || t == RANDOM_CHOICE_LINK + || t == RANDOM_NUMBER_LINK); + for (Handle h : handle->getOutgoingSet()) { c += atomese_complexity(h, stopper); } return c; diff --git a/moses/moses/moses/complexity.h b/moses/moses/moses/complexity.h index b55a7bfdb6..2b0d0e1a9a 100644 --- a/moses/moses/moses/complexity.h +++ b/moses/moses/moses/complexity.h @@ -31,23 +31,26 @@ #include #include -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ - // Right now, the algorithmic complexity of any combo program - // is always an (unsigned) int. I guess it could be made a float, - // if need be... there's no fundamental reason for an int here. - typedef unsigned complexity_t; +// Right now, the algorithmic complexity of any combo program +// is always an (unsigned) int. I guess it could be made a float, +// if need be... there's no fundamental reason for an int here. 
+typedef unsigned complexity_t; - static const complexity_t least_complexity = 0; +static const complexity_t least_complexity = 0; - complexity_t tree_complexity(combo::combo_tree::iterator, - bool (*)(const combo::combo_tree::iterator&) = NULL); +complexity_t tree_complexity(combo::combo_tree::iterator, + bool (*)(const combo::combo_tree::iterator &) = NULL); - complexity_t tree_complexity(const combo::combo_tree&, - bool (*)(const combo::combo_tree::iterator&) = NULL); +complexity_t tree_complexity(const combo::combo_tree &, + bool (*)(const combo::combo_tree::iterator &) = NULL); - complexity_t atomese_complexity(const Handle&, - bool (*)(const Handle&) = nullptr); +complexity_t atomese_complexity(const Handle &, + bool (*)(const Handle &) = nullptr); } //~namespace moses } //~namespace opencog diff --git a/moses/moses/representation/instance_scorer.cc b/moses/moses/representation/instance_scorer.cc index f771163c07..502f50c56a 100644 --- a/moses/moses/representation/instance_scorer.cc +++ b/moses/moses/representation/instance_scorer.cc @@ -62,7 +62,7 @@ composite_score combo_based_scorer::operator()(const instance &inst) const logger().warn() << "The following instance could not be evaluated: " << _rep.fields().to_string(inst) << "\nUnreduced tree: " << raw_tr - << "\nreduced tree: "<< red_tr; + << "\nreduced tree: " << red_tr; } return worst_composite_score; } diff --git a/moses/moses/representation/instance_scorer.h b/moses/moses/representation/instance_scorer.h index 40025fff46..c228acbc65 100644 --- a/moses/moses/representation/instance_scorer.h +++ b/moses/moses/representation/instance_scorer.h @@ -31,12 +31,17 @@ #include "../scoring/behave_cscore.h" #include "moses/comboreduct/converter/combo_atomese.h" -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ struct iscorer_base : public std::unary_function { - virtual composite_score operator()(const instance&) const = 0; - virtual ~iscorer_base() {} + virtual composite_score 
operator()(const instance &) const = 0; + + virtual ~iscorer_base() + {} }; /** @@ -46,50 +51,52 @@ struct iscorer_base : public std::unary_function */ struct distance_based_scorer : public iscorer_base { - distance_based_scorer(const field_set& _fs, - const instance& _target_inst) - : fs(_fs), target_inst(_target_inst) {} + distance_based_scorer(const field_set &_fs, + const instance &_target_inst) + : fs(_fs), target_inst(_target_inst) + {} - composite_score operator()(const instance& inst) const; + composite_score operator()(const instance &inst) const; protected: - const field_set& fs; - const instance& target_inst; + const field_set &fs; + const instance &target_inst; }; struct combo_based_scorer : public iscorer_base { - combo_based_scorer(behave_cscore& cs, - representation& rep, bool reduce) - : _cscorer(cs), _rep(rep), _reduce(reduce) {} + combo_based_scorer(behave_cscore &cs, + representation &rep, bool reduce) + : _cscorer(cs), _rep(rep), _reduce(reduce) + {} - composite_score operator()(const instance& inst) const; + composite_score operator()(const instance &inst) const; protected: - behave_cscore& _cscorer; - representation& _rep; - bool _reduce; // whether the exemplar should be reduced before being - // evaluated. This is advantagous when _cscorer is - // also a cache; the reduced form will have more cache - // hits. + behave_cscore &_cscorer; + representation &_rep; + bool _reduce; // whether the exemplar should be reduced before being + // evaluated. This is advantagous when _cscorer is + // also a cache; the reduced form will have more cache + // hits. 
}; struct atomese_based_scorer : public iscorer_base { - atomese_based_scorer(behave_cscore &cs, - representation &rep, bool reduce) - : _cscorer(cs), _rep(rep), _reduce(reduce) - {} + atomese_based_scorer(behave_cscore &cs, + representation &rep, bool reduce) + : _cscorer(cs), _rep(rep), _reduce(reduce) + {} - composite_score operator()(const instance &inst) const; + composite_score operator()(const instance &inst) const; protected: - behave_cscore &_cscorer; - representation &_rep; - bool _reduce; // whether the exemplar should be reduced before being - // evaluated. This is advantagous when _cscorer is - // also a cache; the reduced form will have more cache - // hits. + behave_cscore &_cscorer; + representation &_rep; + bool _reduce; // whether the exemplar should be reduced before being + // evaluated. This is advantagous when _cscorer is + // also a cache; the reduced form will have more cache + // hits. }; } //~namespace moses diff --git a/moses/moses/scoring/behave_cscore.cc b/moses/moses/scoring/behave_cscore.cc index 918b8d1f87..b8d2230055 100644 --- a/moses/moses/scoring/behave_cscore.cc +++ b/moses/moses/scoring/behave_cscore.cc @@ -28,26 +28,29 @@ #include "behave_cscore.h" -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ -behave_cscore::behave_cscore(bscore_base& b, size_t initial_cache_size) - : _bscorer(b), - _have_cache(0get_cscore_nocache(tr); + return self->get_cscore_nocache(tr); } -composite_score behave_cscore::get_cscore_nocache(const combo_tree& tr) +composite_score behave_cscore::get_cscore_nocache(const combo_tree &tr) { - behavioral_score bs; - try { - bs = get_bscore(tr); - } - catch (combo::EvalException& ee) - { - // Exceptions are raised when operands are out of their - // valid domain (negative input log or division by zero), - // or outputs a value which is not representable (too - // large exp or log). 
The error is logged as level fine - // because this happens very often when learning continuous - // functions, and it clogs up the log when logged at a - // higher level. - if (logger().is_fine_enabled()) { - logger().fine() - << "The following candidate: " << tr << "\n" - << "has failed to be evaluated, " - << "raising the following exception: " - << ee.get_message() << " " << ee.get_vertex(); - } - return worst_composite_score; - } - score_t res = _bscorer.sum_bscore(bs); - - complexity_t cpxy = _bscorer.get_complexity(tr); - score_t cpxy_coef = _bscorer.get_complexity_coef(); - if (logger().is_fine_enabled()) { - logger().fine() << "behave_cscore: " << res - << " complexity: " << cpxy - << " cpxy_coeff: " << cpxy_coef; - } - - return composite_score(res, cpxy, cpxy * cpxy_coef, 0.0); + behavioral_score bs; + try { + bs = get_bscore(tr); + } + catch (combo::EvalException &ee) { + // Exceptions are raised when operands are out of their + // valid domain (negative input log or division by zero), + // or outputs a value which is not representable (too + // large exp or log). The error is logged as level fine + // because this happens very often when learning continuous + // functions, and it clogs up the log when logged at a + // higher level. 
+ if (logger().is_fine_enabled()) { + logger().fine() + << "The following candidate: " << tr << "\n" + << "has failed to be evaluated, " + << "raising the following exception: " + << ee.get_message() << " " << ee.get_vertex(); + } + return worst_composite_score; + } + score_t res = _bscorer.sum_bscore(bs); + + complexity_t cpxy = _bscorer.get_complexity(tr); + score_t cpxy_coef = _bscorer.get_complexity_coef(); + if (logger().is_fine_enabled()) { + logger().fine() << "behave_cscore: " << res + << " complexity: " << cpxy + << " cpxy_coeff: " << cpxy_coef; + } + + return composite_score(res, cpxy, cpxy * cpxy_coef, 0.0); } -composite_score behave_cscore::get_cscore(const scored_combo_tree_set& ensemble) +composite_score behave_cscore::get_cscore(const scored_combo_tree_set &ensemble) { - behavioral_score bs(get_bscore(ensemble)); - - // Listen up, this is confusing ... For ensembles, this method is - // called to obtain the "true" composite score, as it would hold - // for the unadulterated dataset. Thus we do NOT use the row - // weights as the weighted scorer would, but use the flat, uniform - // weighting. - // score_t res = _bscorer.score(bs); // this returns the weighted score. - score_t res = boost::accumulate(bs, 0.0); - - complexity_t cpxy = _bscorer.get_complexity(ensemble); - score_t cpxy_coef = _bscorer.get_complexity_coef(); - if (logger().is_fine_enabled()) { - logger().fine() << "ensemble behave_cscore: " << res - << " complexity: " << cpxy - << " cpxy_coeff: " << cpxy_coef; - } - - return composite_score(res, cpxy, cpxy * cpxy_coef, 0.0); + behavioral_score bs(get_bscore(ensemble)); + + // Listen up, this is confusing ... For ensembles, this method is + // called to obtain the "true" composite score, as it would hold + // for the unadulterated dataset. Thus we do NOT use the row + // weights as the weighted scorer would, but use the flat, uniform + // weighting. + // score_t res = _bscorer.score(bs); // this returns the weighted score. 
+ score_t res = boost::accumulate(bs, 0.0); + + complexity_t cpxy = _bscorer.get_complexity(ensemble); + score_t cpxy_coef = _bscorer.get_complexity_coef(); + if (logger().is_fine_enabled()) { + logger().fine() << "ensemble behave_cscore: " << res + << " complexity: " << cpxy + << " cpxy_coeff: " << cpxy_coef; + } + + return composite_score(res, cpxy, cpxy * cpxy_coef, 0.0); } composite_score behave_cscore::get_cscore(const Handle &handle) @@ -144,8 +146,7 @@ composite_score behave_cscore::get_cscore_nocache(const Handle &handle) try { bs = get_bscore(handle); } - catch (...) - { + catch (...) { return worst_composite_score; } score_t res = _bscorer.sum_bscore(bs); @@ -163,13 +164,13 @@ composite_score behave_cscore::get_cscore_nocache(const Handle &handle) score_t behave_cscore::best_possible_score() const { - // This uses a flat, uniform weighting - return boost::accumulate(_bscorer.best_possible_bscore(), 0.0); + // This uses a flat, uniform weighting + return boost::accumulate(_bscorer.best_possible_bscore(), 0.0); } score_t behave_cscore::worst_possible_score() const { - return boost::accumulate(_bscorer.worst_possible_bscore(), 0.0); + return boost::accumulate(_bscorer.worst_possible_bscore(), 0.0); } } // ~namespace moses diff --git a/moses/moses/scoring/behave_cscore.h b/moses/moses/scoring/behave_cscore.h index 74f17d2a40..6d949815cb 100644 --- a/moses/moses/scoring/behave_cscore.h +++ b/moses/moses/scoring/behave_cscore.h @@ -31,7 +31,10 @@ #include "scoring_base.h" -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ /** * Composite score calculated from the behavioral score. 
@@ -51,91 +54,102 @@ namespace opencog { namespace moses { class behave_cscore { public: - behave_cscore(bscore_base& b, size_t initial_cache_size=0); - - behavioral_score get_bscore(const combo_tree&) const; - behavioral_score get_bscore(const scored_combo_tree_set&) const; - behavioral_score get_bscore(const Handle&) const; - composite_score get_cscore(const combo_tree&); - composite_score get_cscore(const scored_combo_tree_set&); - composite_score get_cscore(const Handle&); - - /// Returns the best score reachable for this problem. Used as - /// termination condition. - score_t best_possible_score() const; - - /// Returns the worst score reachable for this problem. Used to - /// compute the scoring error during boosting. - score_t worst_possible_score() const; - - /// Return the minimum value considered for improvement. - score_t min_improv() const - { - return _bscorer.min_improv(); - } - - /// In table-based scorers, fitness function evaluation can be sped - /// up when unused features are ignored. The unused features must - /// not subsequently appear in the combo tree to be scored. Calling - /// this with the empty set restores all features. The features are - /// indicated as set of indices (from 0). 
- void ignore_cols(const std::set& idxs) const - { - _bscorer.ignore_cols(idxs); - } - - // In case one wants to evaluate the fitness on a subset of the - // data, one can provide a set of row indexes to ignore - void ignore_rows(const std::set& idxs) const - { - _bscorer.ignore_rows(idxs); - } - - // Like ignore_rows but consider timestamps instead of indexes - void ignore_rows_at_times(const std::set& timestamps) const - { - _bscorer.ignore_rows_at_times(timestamps); - } - - // Return the uncompressed size of the CTable - unsigned get_ctable_usize() const - { - return _bscorer.get_ctable_usize(); - } - - // Return the original CTable - const CTable& get_ctable() const { - return _bscorer.get_ctable(); - } + behave_cscore(bscore_base &b, size_t initial_cache_size = 0); + + behavioral_score get_bscore(const combo_tree &) const; + + behavioral_score get_bscore(const scored_combo_tree_set &) const; + + behavioral_score get_bscore(const Handle &) const; + + composite_score get_cscore(const combo_tree &); + + composite_score get_cscore(const scored_combo_tree_set &); + + composite_score get_cscore(const Handle &); + + /// Returns the best score reachable for this problem. Used as + /// termination condition. + score_t best_possible_score() const; + + /// Returns the worst score reachable for this problem. Used to + /// compute the scoring error during boosting. + score_t worst_possible_score() const; + + /// Return the minimum value considered for improvement. + score_t min_improv() const + { + return _bscorer.min_improv(); + } + + /// In table-based scorers, fitness function evaluation can be sped + /// up when unused features are ignored. The unused features must + /// not subsequently appear in the combo tree to be scored. Calling + /// this with the empty set restores all features. The features are + /// indicated as set of indices (from 0). 
+ void ignore_cols(const std::set &idxs) const + { + _bscorer.ignore_cols(idxs); + } + + // In case one wants to evaluate the fitness on a subset of the + // data, one can provide a set of row indexes to ignore + void ignore_rows(const std::set &idxs) const + { + _bscorer.ignore_rows(idxs); + } + + // Like ignore_rows but consider timestamps instead of indexes + void ignore_rows_at_times(const std::set ×tamps) const + { + _bscorer.ignore_rows_at_times(timestamps); + } + + // Return the uncompressed size of the CTable + unsigned get_ctable_usize() const + { + return _bscorer.get_ctable_usize(); + } + + // Return the original CTable + const CTable &get_ctable() const + { + return _bscorer.get_ctable(); + } private: - bscore_base& _bscorer; + bscore_base &_bscorer; - // Below follows some assorted infrastructure to allow composite - // scoress for trees to be cached. - struct wrapper : public std::unary_function - { - composite_score operator()(const combo_tree&) const; - behave_cscore* self; - }; + // Below follows some assorted infrastructure to allow composite + // scoress for trees to be cached. + struct wrapper : public std::unary_function + { + composite_score operator()(const combo_tree &) const; + + behave_cscore *self; + }; struct atomese_wrapper : public std::unary_function { - composite_score operator()(const Handle&) const; - behave_cscore* self; + composite_score operator()(const Handle &) const; + + behave_cscore *self; }; - bool _have_cache; - wrapper _wrapper; + bool _have_cache; + wrapper _wrapper; atomese_wrapper _atomese_wrapper; - prr_cache_threaded _cscore_cache; + prr_cache_threaded _cscore_cache; prr_cache_threaded _atomese_cscore_cache; - composite_score get_cscore_nocache(const combo_tree&); - composite_score get_cscore_nocache(const Handle&); + + composite_score get_cscore_nocache(const combo_tree &); + + composite_score get_cscore_nocache(const Handle &); public: - // weird hack for subsample scoring... 
- bscore_base& get_bscorer() { return _bscorer; } + // weird hack for subsample scoring... + bscore_base &get_bscorer() + { return _bscorer; } }; diff --git a/moses/moses/scoring/bscores.cc b/moses/moses/scoring/bscores.cc index 88d2910a08..63b6dc2188 100644 --- a/moses/moses/scoring/bscores.cc +++ b/moses/moses/scoring/bscores.cc @@ -47,7 +47,10 @@ #include "bscores.h" -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ using namespace std; using boost::adaptors::map_values; @@ -64,73 +67,74 @@ using namespace boost::accumulators; // logical_bscore // //////////////////// -behavioral_score logical_bscore::operator()(const combo_tree& tr) const +behavioral_score logical_bscore::operator()(const combo_tree &tr) const { - combo::complete_truth_table tt(tr, _arity); - behavioral_score bs(_size); + combo::complete_truth_table tt(tr, _arity); + behavioral_score bs(_size); - // Compare the predictions of the tree to that of the desired - // result. A correct prdiction gets a score of 0, an incorrect - // prediction gets a score of -1. - boost::transform(tt, _target, bs.begin(), [](bool b1, bool b2) { - return -score_t(b1 != b2); }); + // Compare the predictions of the tree to that of the desired + // result. A correct prdiction gets a score of 0, an incorrect + // prediction gets a score of -1. + boost::transform(tt, _target, bs.begin(), [](bool b1, bool b2) { + return -score_t(b1 != b2); + }); - return bs; + return bs; } /// Boolean ensemble scorer. Assumes that the ensemble signature outputs /// a boolean value. All of the trees in the ensmble get a weighted vote, /// that vote is totalled to get the prediction of the ensemble, and then /// compared to the desired output. -behavioral_score logical_bscore::operator()(const scored_combo_tree_set& ensemble) const +behavioral_score logical_bscore::operator()(const scored_combo_tree_set &ensemble) const { - // Step 1: accumulate the weighted prediction of each tree in - // the ensemble. 
- behavioral_score hypoth(_size, 0.0); - for (const scored_combo_tree& sct: ensemble) { - combo::complete_truth_table tt(sct.get_tree(), _arity); - score_t weight = sct.get_weight(); - for (size_t i=0; i<_size; i++) { - // Add +1 if prediction is true and -1 if prediction is false. - // We could gain some minor performance improvement if we - // moved this out of the loop, but who cares, this scorer is - // used only for the demo problems. - hypoth[i] += weight * (2.0 * ((score_t) tt[i]) - 1.0); - } - } - - // Step 2: compare the prediction of the ensemble to the desired - // result. The array "hypoth" is positive to predict true, and - // negative to predict false. The resulting score is 0 if correct, - // and -1 if incorrect. - behavioral_score bs(_size); - boost::transform(hypoth, _target, bs.begin(), - [](score_t hyp, bool b2) { - bool b1 = (hyp > 0.0) ? true : false; - return -score_t(b1 != b2); - }); - return bs; + // Step 1: accumulate the weighted prediction of each tree in + // the ensemble. + behavioral_score hypoth(_size, 0.0); + for (const scored_combo_tree &sct: ensemble) { + combo::complete_truth_table tt(sct.get_tree(), _arity); + score_t weight = sct.get_weight(); + for (size_t i = 0; i < _size; i++) { + // Add +1 if prediction is true and -1 if prediction is false. + // We could gain some minor performance improvement if we + // moved this out of the loop, but who cares, this scorer is + // used only for the demo problems. + hypoth[i] += weight * (2.0 * ((score_t) tt[i]) - 1.0); + } + } + + // Step 2: compare the prediction of the ensemble to the desired + // result. The array "hypoth" is positive to predict true, and + // negative to predict false. The resulting score is 0 if correct, + // and -1 if incorrect. + behavioral_score bs(_size); + boost::transform(hypoth, _target, bs.begin(), + [](score_t hyp, bool b2) { + bool b1 = (hyp > 0.0) ? 
true : false; + return -score_t(b1 != b2); + }); + return bs; } behavioral_score logical_bscore::best_possible_bscore() const { - return behavioral_score(_size, 0); + return behavioral_score(_size, 0); } behavioral_score logical_bscore::worst_possible_bscore() const { - return behavioral_score(_size, -1); + return behavioral_score(_size, -1); } score_t logical_bscore::min_improv() const { - return 0.5; + return 0.5; } -score_t logical_bscore::get_error(const behavioral_score& bs) const +score_t logical_bscore::get_error(const behavioral_score &bs) const { - // Its minus the score: 0.0 is perfect score, 1.0 is worst score. - return - sum_bscore(bs) / ((score_t) _size); + // Its minus the score: 0.0 is perfect score, 1.0 is worst score. + return -sum_bscore(bs) / ((score_t) _size); } behavioral_score logical_bscore::operator()(const Handle &handle) const @@ -141,302 +145,303 @@ behavioral_score logical_bscore::operator()(const Handle &handle) const // Compare the predictions of the tree to that of the desired // result. A correct prdiction gets a score of 0, an incorrect // prediction gets a score of -1. - boost::transform(tt, _target, bs.begin(), [](bool b1, bool b2) - { + boost::transform(tt, _target, bs.begin(), [](bool b1, bool b2) { return -score_t(b1 != b2); }); - return bs; + return bs; } /////////////////// // contin_bscore // /////////////////// -behavioral_score contin_bscore::operator()(const combo_tree& tr) const +behavioral_score contin_bscore::operator()(const combo_tree &tr) const { - // OTable target is the table of output we want to get. - behavioral_score bs; - - // boost/range/algorithm/transform. - // Take the input vectors cit, target, feed the elts to anon - // funtion[] (which just computes square of the difference) and - // put the results into bs. 
- interpreter_visitor iv(tr); - auto interpret_tr = boost::apply_visitor(iv); - boost::transform(cti, target, back_inserter(bs), - [&](const multi_type_seq& mts, const vertex& v) { - contin_t tar = get_contin(v), - res = get_contin(interpret_tr(mts.get_variant())); - return -err_func(res, tar); - }); - - log_candidate_bscore(tr, bs); - - return bs; + // OTable target is the table of output we want to get. + behavioral_score bs; + + // boost/range/algorithm/transform. + // Take the input vectors cit, target, feed the elts to anon + // funtion[] (which just computes square of the difference) and + // put the results into bs. + interpreter_visitor iv(tr); + auto interpret_tr = boost::apply_visitor(iv); + boost::transform(cti, target, back_inserter(bs), + [&](const multi_type_seq &mts, const vertex &v) { + contin_t tar = get_contin(v), + res = get_contin(interpret_tr(mts.get_variant())); + return -err_func(res, tar); + }); + + log_candidate_bscore(tr, bs); + + return bs; } behavioral_score contin_bscore::best_possible_bscore() const { - return behavioral_score(target.size(), 0); + return behavioral_score(target.size(), 0); } score_t contin_bscore::min_improv() const { - // The backwards compat version of this is 0.0. But for - // continuously-variable scores, this is crazy, as the - // system falls into a state of tweaking the tenth decimal place, - // Limit any such tweaking to 4 decimal places of precision. - // (thus 1e-4 below). - // - // Note: positive min_improv is taken as an absolute score. - // Negative min_improve is treated as a relative score. - return -1.0e-4; + // The backwards compat version of this is 0.0. But for + // continuously-variable scores, this is crazy, as the + // system falls into a state of tweaking the tenth decimal place, + // Limit any such tweaking to 4 decimal places of precision. + // (thus 1e-4 below). + // + // Note: positive min_improv is taken as an absolute score. + // Negative min_improve is treated as a relative score. 
+ return -1.0e-4; } void contin_bscore::set_complexity_coef(unsigned alphabet_size, float stdev) { - _complexity_coef = 0.0; - if (stdev > 0.0) - _complexity_coef = contin_complexity_coef(alphabet_size, stdev); + _complexity_coef = 0.0; + if (stdev > 0.0) + _complexity_coef = contin_complexity_coef(alphabet_size, stdev); - logger().info() << "contin_bscore noise = " << stdev - << " alphabest size = " << alphabet_size - << " complexity ratio = " << 1.0/_complexity_coef; + logger().info() << "contin_bscore noise = " << stdev + << " alphabest size = " << alphabet_size + << " complexity ratio = " << 1.0 / _complexity_coef; } ////////////////////////////// // discretize_contin_bscore // ////////////////////////////// -discretize_contin_bscore::discretize_contin_bscore(const OTable& ot, - const ITable& it, - const vector& thres, +discretize_contin_bscore::discretize_contin_bscore(const OTable &ot, + const ITable &it, + const vector &thres, bool wa) - : target(ot), cit(it), thresholds(thres), weighted_accuracy(wa), - classes(ot.size()), weights(thresholds.size() + 1, 1) { - // enforce that thresholds is sorted - boost::sort(thresholds); - // precompute classes - boost::transform(target, classes.begin(), [&](const vertex& v) { - return this->class_idx(get_contin(v)); }); - // precompute weights - multiset cs(classes.begin(), classes.end()); - if (weighted_accuracy) - for (size_t i = 0; i < weights.size(); ++i) - weights[i] = classes.size() / (float)(weights.size() * cs.count(i)); - - OC_ASSERT(ot.size() == it.size(), - "Error: discretize_contin_bscore: input and output table size do not match: %d %d", - it.size(), ot.size()); - _size = ot.size(); + : target(ot), cit(it), thresholds(thres), weighted_accuracy(wa), + classes(ot.size()), weights(thresholds.size() + 1, 1) +{ + // enforce that thresholds is sorted + boost::sort(thresholds); + // precompute classes + boost::transform(target, classes.begin(), [&](const vertex &v) { + return this->class_idx(get_contin(v)); + }); + 
// precompute weights + multiset cs(classes.begin(), classes.end()); + if (weighted_accuracy) + for (size_t i = 0; i < weights.size(); ++i) + weights[i] = classes.size() / (float) (weights.size() * cs.count(i)); + + OC_ASSERT(ot.size() == it.size(), + "Error: discretize_contin_bscore: input and output table size do not match: %d %d", + it.size(), ot.size()); + _size = ot.size(); } behavioral_score discretize_contin_bscore::best_possible_bscore() const { - return behavioral_score(target.size(), 0); + return behavioral_score(target.size(), 0); } score_t discretize_contin_bscore::min_improv() const { - // not necessarily right, just the backwards-compat behavior - return 0.0; + // not necessarily right, just the backwards-compat behavior + return 0.0; } size_t discretize_contin_bscore::class_idx(contin_t v) const { - if (v < thresholds[0]) - return 0; - size_t s = thresholds.size(); - if (v >= thresholds[s - 1]) - return s; - return class_idx_within(v, 1, s); + if (v < thresholds[0]) + return 0; + size_t s = thresholds.size(); + if (v >= thresholds[s - 1]) + return s; + return class_idx_within(v, 1, s); } size_t discretize_contin_bscore::class_idx_within(contin_t v, size_t l_idx, size_t u_idx) const { - // base case - if(u_idx - l_idx == 1) - return l_idx; - // recursive case - size_t m_idx = l_idx + (u_idx - l_idx) / 2; - contin_t t = thresholds[m_idx - 1]; - if(v < t) - return class_idx_within(v, l_idx, m_idx); - else - return class_idx_within(v, m_idx, u_idx); + // base case + if (u_idx - l_idx == 1) + return l_idx; + // recursive case + size_t m_idx = l_idx + (u_idx - l_idx) / 2; + contin_t t = thresholds[m_idx - 1]; + if (v < t) + return class_idx_within(v, l_idx, m_idx); + else + return class_idx_within(v, m_idx, u_idx); } -behavioral_score discretize_contin_bscore::operator()(const combo_tree& tr) const +behavioral_score discretize_contin_bscore::operator()(const combo_tree &tr) const { - /// @todo could be optimized by avoiding computing the OTable and - /// 
directly using the results on the fly. On really big table - /// (dozens of thousands of data points and about 100 inputs, this - /// has overhead of about 10% of the overall time) - OTable ct(tr, cit); - behavioral_score bs(target.size()); - boost::transform(ct, classes, bs.begin(), [&](const vertex& v, size_t c_idx) { - return (c_idx != this->class_idx(get_contin(v))) * this->weights[c_idx]; - }); - - log_candidate_bscore(tr, bs); - return bs; + /// @todo could be optimized by avoiding computing the OTable and + /// directly using the results on the fly. On really big table + /// (dozens of thousands of data points and about 100 inputs, this + /// has overhead of about 10% of the overall time) + OTable ct(tr, cit); + behavioral_score bs(target.size()); + boost::transform(ct, classes, bs.begin(), [&](const vertex &v, size_t c_idx) { + return (c_idx != this->class_idx(get_contin(v))) * this->weights[c_idx]; + }); + + log_candidate_bscore(tr, bs); + return bs; } ///////////////////////// // ctruth_table_bscore // ///////////////////////// -behavioral_score ctruth_table_bscore::operator()(const combo_tree& tr) const +behavioral_score ctruth_table_bscore::operator()(const combo_tree &tr) const { - behavioral_score bs; + behavioral_score bs; - interpreter_visitor iv(tr); - auto interpret_tr = boost::apply_visitor(iv); - // Evaluate the bscore components for all rows of the ctable - for (const CTable::value_type& vct : _wrk_ctable) { - const CTable::counter_t& c = vct.second; - score_t sc = c.get(negate_vertex(interpret_tr(vct.first.get_variant()))); - bs.push_back(-sc); - } + interpreter_visitor iv(tr); + auto interpret_tr = boost::apply_visitor(iv); + // Evaluate the bscore components for all rows of the ctable + for (const CTable::value_type &vct : _wrk_ctable) { + const CTable::counter_t &c = vct.second; + score_t sc = c.get(negate_vertex(interpret_tr(vct.first.get_variant()))); + bs.push_back(-sc); + } if (_return_weighted_score) { // if boosting - // Report the 
score only relative to the best-possible score. - bs -= _best_possible_score; - } + // Report the score only relative to the best-possible score. + bs -= _best_possible_score; + } - log_candidate_bscore(tr, bs); + log_candidate_bscore(tr, bs); - return bs; + return bs; } /// Boolean ensemble scorer. Assumes that the ensemble signature outputs /// a boolean value. All of the trees in the ensmble get a weighted vote, /// that vote is totalled to get the prediction of the ensemble, and then /// compared to the desired output. -behavioral_score ctruth_table_bscore::operator()(const scored_combo_tree_set& ensemble) const +behavioral_score ctruth_table_bscore::operator()(const scored_combo_tree_set &ensemble) const { - size_t sz = _wrk_ctable.size(); - - // Step 1: accumulate the weighted prediction of each tree in - // the ensemble. - behavioral_score hypoth(sz, 0.0); - for (const scored_combo_tree& sct: ensemble) { - // apply each tree, in turn. - interpreter_visitor iv(sct.get_tree()); - auto interpret_tr = boost::apply_visitor(iv); - score_t weight = sct.get_weight(); - - // Evaluate the tree for all rows of the ctable - size_t i=0; - for (const CTable::value_type& vct : _wrk_ctable) { - // Add +1 if prediction is up and -1 if prediction is down. - vertex prediction(interpret_tr(vct.first.get_variant())); - hypoth[i] += (id::logical_true == prediction)? weight : -weight; - i++; - } - } - - // Step 2: compare the prediction of the ensemble to the desired - // result. The array "hypoth" is positive to predict up, and - // negative to predict down. The compressed table holds the - // (possibly weighted) count of up and down values; in the limit - // of an uncompressed table, the number of 'wrong' answers is - // the count of the iverted prediction. - behavioral_score bs(sz); - size_t i =0; - for (const CTable::value_type& vct : _wrk_ctable) { - const CTable::counter_t& cnt = vct.second; - vertex inverted_prediction = (hypoth[i] > 0.0) ? 
- id::logical_false : id::logical_true; - bs[i] = -cnt.get(inverted_prediction); - i++; - } + size_t sz = _wrk_ctable.size(); + + // Step 1: accumulate the weighted prediction of each tree in + // the ensemble. + behavioral_score hypoth(sz, 0.0); + for (const scored_combo_tree &sct: ensemble) { + // apply each tree, in turn. + interpreter_visitor iv(sct.get_tree()); + auto interpret_tr = boost::apply_visitor(iv); + score_t weight = sct.get_weight(); + + // Evaluate the tree for all rows of the ctable + size_t i = 0; + for (const CTable::value_type &vct : _wrk_ctable) { + // Add +1 if prediction is up and -1 if prediction is down. + vertex prediction(interpret_tr(vct.first.get_variant())); + hypoth[i] += (id::logical_true == prediction) ? weight : -weight; + i++; + } + } + + // Step 2: compare the prediction of the ensemble to the desired + // result. The array "hypoth" is positive to predict up, and + // negative to predict down. The compressed table holds the + // (possibly weighted) count of up and down values; in the limit + // of an uncompressed table, the number of 'wrong' answers is + // the count of the iverted prediction. + behavioral_score bs(sz); + size_t i = 0; + for (const CTable::value_type &vct : _wrk_ctable) { + const CTable::counter_t &cnt = vct.second; + vertex inverted_prediction = (hypoth[i] > 0.0) ? + id::logical_false : id::logical_true; + bs[i] = -cnt.get(inverted_prediction); + i++; + } if (_return_weighted_score) { // if boosting - // Report the score only relative to the best-possible score. - bs -= _best_possible_score; - } + // Report the score only relative to the best-possible score. 
+ bs -= _best_possible_score; + } - return bs; + return bs; } void ctruth_table_bscore::set_best_possible_bscore() const { - _best_possible_score.clear(); - transform(_wrk_ctable | map_values, - back_inserter(_best_possible_score), - [](const CTable::counter_t& c) { - // OK, this looks like magic, but here's what it does: - // CTable is a compressed table; different rows may - // have identical inputs, differing only in output. - // Clearly, in such a case, both outputs cannot be - // simultanously satisfied, but we can try to satisfy - // the one of which there is more (the max). Thus, - // we take the fmin of the two possiblities as the - // number of wrong answers we are doomed to get. - // - // fmin is used, because the rows may be weighted - // by fractional values. This gives the correct - // result for weighted datasets: the most heavily - // weighted outcome is the prefered one. - return -score_t(fmin(c.get(id::logical_true), - c.get(id::logical_false))); - }); - - logger().info() << "ctruth_table_bscore: Best possible: " - << _best_possible_score; + _best_possible_score.clear(); + transform(_wrk_ctable | map_values, + back_inserter(_best_possible_score), + [](const CTable::counter_t &c) { + // OK, this looks like magic, but here's what it does: + // CTable is a compressed table; different rows may + // have identical inputs, differing only in output. + // Clearly, in such a case, both outputs cannot be + // simultanously satisfied, but we can try to satisfy + // the one of which there is more (the max). Thus, + // we take the fmin of the two possiblities as the + // number of wrong answers we are doomed to get. + // + // fmin is used, because the rows may be weighted + // by fractional values. This gives the correct + // result for weighted datasets: the most heavily + // weighted outcome is the prefered one. 
+ return -score_t(fmin(c.get(id::logical_true), + c.get(id::logical_false))); + }); + + logger().info() << "ctruth_table_bscore: Best possible: " + << _best_possible_score; } behavioral_score ctruth_table_bscore::best_possible_bscore() const { if (_return_weighted_score) { - // The returned best score will always be zero, because the actual - // best score is subtracted; this is required to get boosting to - // work. - return behavioral_score(_size, 0.0); - } - set_best_possible_bscore(); - return _best_possible_score; + // The returned best score will always be zero, because the actual + // best score is subtracted; this is required to get boosting to + // work. + return behavioral_score(_size, 0.0); + } + set_best_possible_bscore(); + return _best_possible_score; } behavioral_score ctruth_table_bscore::worst_possible_bscore() const { - behavioral_score bs; - for (const CTable::value_type& vct : _wrk_ctable) { - const CTable::counter_t& cnt = vct.second; - - // The most that the score can improve is to flip true to false, - // or v.v. The worst score is to get the majority wrong. This - // workes correctly even for weighted tables, where the counts - // could be arbitrary float-point values. - score_t w = fabs (cnt.get(id::logical_true) - - cnt.get(id::logical_false)); - bs.push_back(-w); - } - return bs; + behavioral_score bs; + for (const CTable::value_type &vct : _wrk_ctable) { + const CTable::counter_t &cnt = vct.second; + + // The most that the score can improve is to flip true to false, + // or v.v. The worst score is to get the majority wrong. This + // workes correctly even for weighted tables, where the counts + // could be arbitrary float-point values. + score_t w = fabs(cnt.get(id::logical_true) - + cnt.get(id::logical_false)); + bs.push_back(-w); + } + return bs; } score_t ctruth_table_bscore::min_improv() const { - // A return value of 0.5 would be correct only if all rows had - // a weight of exactly 1.0. 
Otherwise, we look for the row with - // the smallest (positive) weight, and return that. - // return 0.5; - - score_t min_weight = FLT_MAX; - for (const CTable::value_type& vct : _wrk_ctable) { - const CTable::counter_t& cnt = vct.second; - - // The most that the score can improve is to flip - // true to false, or v.v. - score_t w = fabs (cnt.get(id::logical_true) - - cnt.get(id::logical_false)); - if (w != 0.0 and w < min_weight) min_weight = w; - } - return 0.5 * min_weight; + // A return value of 0.5 would be correct only if all rows had + // a weight of exactly 1.0. Otherwise, we look for the row with + // the smallest (positive) weight, and return that. + // return 0.5; + + score_t min_weight = FLT_MAX; + for (const CTable::value_type &vct : _wrk_ctable) { + const CTable::counter_t &cnt = vct.second; + + // The most that the score can improve is to flip + // true to false, or v.v. + score_t w = fabs(cnt.get(id::logical_true) - + cnt.get(id::logical_false)); + if (w != 0.0 and w < min_weight) min_weight = w; + } + return 0.5 * min_weight; } @@ -444,96 +449,96 @@ score_t ctruth_table_bscore::min_improv() const // enum_table_bscore // ///////////////////////// -behavioral_score enum_table_bscore::operator()(const combo_tree& tr) const +behavioral_score enum_table_bscore::operator()(const combo_tree &tr) const { - behavioral_score bs; - - // Evaluate the bscore components for all rows of the ctable - interpreter_visitor iv(tr); - auto interpret_tr = boost::apply_visitor(iv); - for (const CTable::value_type& vct : _ctable) { - const CTable::counter_t& c = vct.second; - // The number that are wrong equals total minus num correct. 
- score_t sc = score_t(c.get(interpret_tr(vct.first.get_variant()))); - sc -= score_t(c.total_count()); - bs.push_back(sc); - } - - log_candidate_bscore(tr, bs); - return bs; + behavioral_score bs; + + // Evaluate the bscore components for all rows of the ctable + interpreter_visitor iv(tr); + auto interpret_tr = boost::apply_visitor(iv); + for (const CTable::value_type &vct : _ctable) { + const CTable::counter_t &c = vct.second; + // The number that are wrong equals total minus num correct. + score_t sc = score_t(c.get(interpret_tr(vct.first.get_variant()))); + sc -= score_t(c.total_count()); + bs.push_back(sc); + } + + log_candidate_bscore(tr, bs); + return bs; } behavioral_score enum_table_bscore::best_possible_bscore() const { - behavioral_score bs; - transform(_ctable | map_values, back_inserter(bs), - [](const CTable::counter_t& c) { - // OK, this looks like magic, but here's what it does: - // CTable is a compressed table; multiple rows may - // have identical inputs, differing only in output. - // Clearly, in such a case, different outputs cannot be - // simultanously satisfied, but we can try to satisfy - // the one of which there is the most. - unsigned most = 0; - CTable::counter_t::const_iterator it = c.begin(); - for (; it != c.end(); ++it) { - if (most < it->second) most = it->second; - } - return score_t (most - c.total_count()); - }); - - return bs; + behavioral_score bs; + transform(_ctable | map_values, back_inserter(bs), + [](const CTable::counter_t &c) { + // OK, this looks like magic, but here's what it does: + // CTable is a compressed table; multiple rows may + // have identical inputs, differing only in output. + // Clearly, in such a case, different outputs cannot be + // simultanously satisfied, but we can try to satisfy + // the one of which there is the most. 
+ unsigned most = 0; + CTable::counter_t::const_iterator it = c.begin(); + for (; it != c.end(); ++it) { + if (most < it->second) most = it->second; + } + return score_t(most - c.total_count()); + }); + + return bs; } score_t enum_table_bscore::min_improv() const { - return 0.5; + return 0.5; } ///////////////////////// // enum_filter_bscore // ///////////////////////// -behavioral_score enum_filter_bscore::operator()(const combo_tree& tr) const +behavioral_score enum_filter_bscore::operator()(const combo_tree &tr) const { - behavioral_score bs; + behavioral_score bs; - typedef combo_tree::sibling_iterator sib_it; - typedef combo_tree::iterator pre_it; + typedef combo_tree::sibling_iterator sib_it; + typedef combo_tree::iterator pre_it; - pre_it it = tr.begin(); - if (is_enum_type(*it)) - return enum_table_bscore::operator()(tr); + pre_it it = tr.begin(); + if (is_enum_type(*it)) + return enum_table_bscore::operator()(tr); - OC_ASSERT(*it == id::cond, "Error: unexpcected candidate!"); - sib_it predicate = it.begin(); - vertex consequent = *next(predicate); + OC_ASSERT(*it == id::cond, "Error: unexpcected candidate!"); + sib_it predicate = it.begin(); + vertex consequent = *next(predicate); - // Evaluate the bscore components for all rows of the ctable - interpreter_visitor iv_tr(tr), iv_predicate(predicate); - auto interpret_tr = boost::apply_visitor(iv_tr); - auto interpret_predicate = boost::apply_visitor(iv_predicate); - for (const CTable::value_type& vct : _ctable) { - const CTable::counter_t& c = vct.second; + // Evaluate the bscore components for all rows of the ctable + interpreter_visitor iv_tr(tr), iv_predicate(predicate); + auto interpret_tr = boost::apply_visitor(iv_tr); + auto interpret_predicate = boost::apply_visitor(iv_predicate); + for (const CTable::value_type &vct : _ctable) { + const CTable::counter_t &c = vct.second; - unsigned total = c.total_count(); + unsigned total = c.total_count(); - // The number that are wrong equals total minus num 
correct. - score_t sc = score_t(c.get(interpret_tr(vct.first.get_variant()))); - sc -= score_t(total); + // The number that are wrong equals total minus num correct. + score_t sc = score_t(c.get(interpret_tr(vct.first.get_variant()))); + sc -= score_t(total); - // Punish the first predicate, if it is wrong. - vertex pr = interpret_predicate(vct.first.get_variant()); - if (pr == id::logical_true) { - if (total != c.get(consequent)) - sc -= punish * total; - } + // Punish the first predicate, if it is wrong. + vertex pr = interpret_predicate(vct.first.get_variant()); + if (pr == id::logical_true) { + if (total != c.get(consequent)) + sc -= punish * total; + } - bs.push_back(sc); - } + bs.push_back(sc); + } - log_candidate_bscore(tr, bs); - return bs; + log_candidate_bscore(tr, bs); + return bs; } ///////////////////////// @@ -547,199 +552,199 @@ behavioral_score enum_filter_bscore::operator()(const combo_tree& tr) const /// retro-graded: punish more complex, later predicates... score_t enum_graded_bscore::graded_complexity(combo_tree::iterator it) const { - typedef combo_tree::sibling_iterator sib_it; - typedef combo_tree::iterator pre_it; - - if (it.is_childless()) return 0.0; - sib_it predicate = it.begin(); - score_t cpxy = 0.0; - score_t weight = 1.0; - while (1) { - cpxy += weight * score_t(tree_complexity((pre_it) predicate)); - - // Is it the last one, the else clause? - if (is_enum_type(*predicate)) - break; - - // advance - predicate = next(predicate, 2); - weight /= grading; - - } - return cpxy; + typedef combo_tree::sibling_iterator sib_it; + typedef combo_tree::iterator pre_it; + + if (it.is_childless()) return 0.0; + sib_it predicate = it.begin(); + score_t cpxy = 0.0; + score_t weight = 1.0; + while (1) { + cpxy += weight * score_t(tree_complexity((pre_it) predicate)); + + // Is it the last one, the else clause? 
+ if (is_enum_type(*predicate)) + break; + + // advance + predicate = next(predicate, 2); + weight /= grading; + + } + return cpxy; } -behavioral_score enum_graded_bscore::operator()(const combo_tree& tr) const +behavioral_score enum_graded_bscore::operator()(const combo_tree &tr) const { - behavioral_score bs; - - typedef combo_tree::sibling_iterator sib_it; - typedef combo_tree::iterator pre_it; - - pre_it it = tr.begin(); - if (is_enum_type(*it)) - return enum_table_bscore::operator()(tr); - - OC_ASSERT(*it == id::cond, "Error: unexpected candidate!"); - - // Evaluate the bscore components for all rows of the ctable - // TODO - sib_it predicate = it.begin(); - for (const CTable::value_type& vct : _ctable) { - const CTable::counter_t& c = vct.second; - - unsigned total = c.total_count(); - score_t weight = 1.0; - - // The number that are wrong equals total minus num correct. - score_t sc = -score_t(total); - while (1) { - // Is it the last one, the else clause? - if (is_enum_type(*predicate)) { - vertex consequent = *predicate; - sc += c.get(consequent); - sc *= weight; - break; - } - - // The first true predicate terminates. 
- interpreter_visitor iv(predicate); - vertex pr = boost::apply_visitor(iv, vct.first.get_variant()); - if (pr == id::logical_true) { - vertex consequent = *next(predicate); - sc += c.get(consequent); - sc *= weight; - break; - } - - // advance - predicate = next(predicate, 2); - weight *= grading; - } - bs.push_back(sc); - } - - log_candidate_bscore(tr, bs); - return bs; + behavioral_score bs; + + typedef combo_tree::sibling_iterator sib_it; + typedef combo_tree::iterator pre_it; + + pre_it it = tr.begin(); + if (is_enum_type(*it)) + return enum_table_bscore::operator()(tr); + + OC_ASSERT(*it == id::cond, "Error: unexpected candidate!"); + + // Evaluate the bscore components for all rows of the ctable + // TODO + sib_it predicate = it.begin(); + for (const CTable::value_type &vct : _ctable) { + const CTable::counter_t &c = vct.second; + + unsigned total = c.total_count(); + score_t weight = 1.0; + + // The number that are wrong equals total minus num correct. + score_t sc = -score_t(total); + while (1) { + // Is it the last one, the else clause? + if (is_enum_type(*predicate)) { + vertex consequent = *predicate; + sc += c.get(consequent); + sc *= weight; + break; + } + + // The first true predicate terminates. + interpreter_visitor iv(predicate); + vertex pr = boost::apply_visitor(iv, vct.first.get_variant()); + if (pr == id::logical_true) { + vertex consequent = *next(predicate); + sc += c.get(consequent); + sc *= weight; + break; + } + + // advance + predicate = next(predicate, 2); + weight *= grading; + } + bs.push_back(sc); + } + + log_candidate_bscore(tr, bs); + return bs; } -complexity_t enum_graded_bscore::get_complexity(const combo::combo_tree& tr) const +complexity_t enum_graded_bscore::get_complexity(const combo::combo_tree &tr) const { - return graded_complexity(tr.begin()); + return graded_complexity(tr.begin()); } score_t enum_graded_bscore::min_improv() const { - // Negative values are interpreted as percentages by the optimizer. 
- // So -0.05 means "a 5% improvement". Problem is, the grading - // wrecks any sense of an absolute score improvement... - return -0.05; + // Negative values are interpreted as percentages by the optimizer. + // So -0.05 means "a 5% improvement". Problem is, the grading + // wrecks any sense of an absolute score improvement... + return -0.05; } // Much like enum_graded_score, above, except that we exchange the // inner and outer loops. This makes the algo slower and bulkier, but // it does allow the effectiveness of predicates to be tracked. // -behavioral_score enum_effective_bscore::operator()(const combo_tree& tr) const +behavioral_score enum_effective_bscore::operator()(const combo_tree &tr) const { - typedef combo_tree::sibling_iterator sib_it; - typedef combo_tree::iterator pre_it; - - behavioral_score bs(_ctable_usize); - - // Is this just a constant? Then just add them up. - pre_it it = tr.begin(); - if (is_enum_type(*it)) { - behavioral_score::iterator bit = bs.begin(); - for (const CTable::value_type& vct : _ctable) { - const CTable::counter_t& c = vct.second; - - // The number that are wrong equals total minus num correct. - *bit++ = c.get(*it) - score_t(c.total_count()); - } - return bs; - } - - OC_ASSERT(*it == id::cond, "Error: unexpcected candidate!"); - - // Accumulate the score with multiple passes, so zero them out here. - for (score_t& sc : bs) sc = 0.0; - - // Are we done yet? - vector done(_ctable_usize); - vector::iterator dit = done.begin(); - for (; dit != done.end(); ++dit) *dit = false; - - sib_it predicate = it.begin(); - score_t weight = 1.0; - while (1) { - - // Is it the last one, the else clause? - if (is_enum_type(*predicate)) { - vertex consequent = *predicate; - - behavioral_score::iterator bit = bs.begin(); - vector::iterator dit = done.begin(); - for (const CTable::value_type& vct : _ctable) { - if (*dit == false) { - const CTable::counter_t& c = vct.second; - - // The number that are wrong equals total minus num correct. 
- score_t sc = -score_t(c.total_count()); - sc += c.get(consequent); - *bit += weight * sc; - } - ++bit; - ++dit; - } - break; - } - - vertex consequent = *next(predicate); - - // Evaluate the bscore components for all rows of the ctable - behavioral_score::iterator bit = bs.begin(); - vector::iterator dit = done.begin(); - - bool effective = false; - interpreter_visitor iv(predicate); - auto interpret_predicate = boost::apply_visitor(iv); - for (const CTable::value_type& vct : _ctable) { - if (*dit == false) { - vertex pr = interpret_predicate(vct.first.get_variant()); - if (pr == id::logical_true) { - const CTable::counter_t& c = vct.second; - int sc = c.get(consequent); - // A predicate is effective if it evaluates to true, - // and at least gets a right answr when it does... - if (0 != sc) effective = true; - - // The number that are wrong equals total minus num correct. - sc -= c.total_count(); - *bit += weight * score_t(sc); - - *dit = true; - } - } - ++bit; - ++dit; - } - - // advance - predicate = next(predicate, 2); - if (effective) weight *= grading; - } - - log_candidate_bscore(tr, bs); - return bs; + typedef combo_tree::sibling_iterator sib_it; + typedef combo_tree::iterator pre_it; + + behavioral_score bs(_ctable_usize); + + // Is this just a constant? Then just add them up. + pre_it it = tr.begin(); + if (is_enum_type(*it)) { + behavioral_score::iterator bit = bs.begin(); + for (const CTable::value_type &vct : _ctable) { + const CTable::counter_t &c = vct.second; + + // The number that are wrong equals total minus num correct. + *bit++ = c.get(*it) - score_t(c.total_count()); + } + return bs; + } + + OC_ASSERT(*it == id::cond, "Error: unexpcected candidate!"); + + // Accumulate the score with multiple passes, so zero them out here. + for (score_t &sc : bs) sc = 0.0; + + // Are we done yet? 
+ vector done(_ctable_usize); + vector::iterator dit = done.begin(); + for (; dit != done.end(); ++dit) *dit = false; + + sib_it predicate = it.begin(); + score_t weight = 1.0; + while (1) { + + // Is it the last one, the else clause? + if (is_enum_type(*predicate)) { + vertex consequent = *predicate; + + behavioral_score::iterator bit = bs.begin(); + vector::iterator dit = done.begin(); + for (const CTable::value_type &vct : _ctable) { + if (*dit == false) { + const CTable::counter_t &c = vct.second; + + // The number that are wrong equals total minus num correct. + score_t sc = -score_t(c.total_count()); + sc += c.get(consequent); + *bit += weight * sc; + } + ++bit; + ++dit; + } + break; + } + + vertex consequent = *next(predicate); + + // Evaluate the bscore components for all rows of the ctable + behavioral_score::iterator bit = bs.begin(); + vector::iterator dit = done.begin(); + + bool effective = false; + interpreter_visitor iv(predicate); + auto interpret_predicate = boost::apply_visitor(iv); + for (const CTable::value_type &vct : _ctable) { + if (*dit == false) { + vertex pr = interpret_predicate(vct.first.get_variant()); + if (pr == id::logical_true) { + const CTable::counter_t &c = vct.second; + int sc = c.get(consequent); + // A predicate is effective if it evaluates to true, + // and at least gets a right answr when it does... + if (0 != sc) effective = true; + + // The number that are wrong equals total minus num correct. 
+ sc -= c.total_count(); + *bit += weight * score_t(sc); + + *dit = true; + } + } + ++bit; + ++dit; + } + + // advance + predicate = next(predicate, 2); + if (effective) weight *= grading; + } + + log_candidate_bscore(tr, bs); + return bs; } ////////////////////////////////// // interesting_predicate_bscore // ////////////////////////////////// -interesting_predicate_bscore::interesting_predicate_bscore(const CTable& ctable_, +interesting_predicate_bscore::interesting_predicate_bscore(const CTable &ctable_, weight_t kld_w_, weight_t skewness_w_, weight_t stdU_w_, @@ -750,181 +755,184 @@ interesting_predicate_bscore::interesting_predicate_bscore(const CTable& ctable_ bool positive_, bool abs_skewness_, bool decompose_kld_) - : _ctable(ctable_), - _kld_w(kld_w_), _skewness_w(skewness_w_), _abs_skewness(abs_skewness_), - _stdU_w(stdU_w_), _skew_U_w(skew_U_w_), _min_activation(min_activation_), - _max_activation(max_activation_), _penalty(penalty_), _positive(positive_), - _decompose_kld(decompose_kld_) + : _ctable(ctable_), + _kld_w(kld_w_), _skewness_w(skewness_w_), _abs_skewness(abs_skewness_), + _stdU_w(stdU_w_), _skew_U_w(skew_U_w_), _min_activation(min_activation_), + _max_activation(max_activation_), _penalty(penalty_), _positive(positive_), + _decompose_kld(decompose_kld_) { - // Define counter (mapping between observation and its number of occurences) - // That is, create a historgram showing how often each output value - // occurs in the ctable. 
- boost::for_each(_ctable | map_values, [this](const CTable::mapped_type& mv) { - boost::for_each(mv, [this](const CTable::counter_t::value_type& v) { - _counter[get_contin(v.first.value)] += v.second; }); }); - - // Precompute pdf (probability distribution function) - if (_kld_w > 0) { - _pdf = _counter; - _klds.set_p_pdf(_pdf); - - // Compute the skewness of the pdf - accumulator_t acc; - for (const auto& v : _pdf) - acc(v.first, weight = v.second); - _skewness = weighted_skewness(acc); - logger().debug("interesting_predicate_bscore::_skewness = %f", _skewness); - } + // Define counter (mapping between observation and its number of occurences) + // That is, create a historgram showing how often each output value + // occurs in the ctable. + boost::for_each(_ctable | map_values, [this](const CTable::mapped_type &mv) { + boost::for_each(mv, [this](const CTable::counter_t::value_type &v) { + _counter[get_contin(v.first.value)] += v.second; + }); + }); + + // Precompute pdf (probability distribution function) + if (_kld_w > 0) { + _pdf = _counter; + _klds.set_p_pdf(_pdf); + + // Compute the skewness of the pdf + accumulator_t acc; + for (const auto &v : _pdf) + acc(v.first, weight = v.second); + _skewness = weighted_skewness(acc); + logger().debug("interesting_predicate_bscore::_skewness = %f", _skewness); + } } -behavioral_score interesting_predicate_bscore::operator()(const combo_tree& tr) const +behavioral_score interesting_predicate_bscore::operator()(const combo_tree &tr) const { - // OK, here's the deal. The combo tree evaluates to T/F on each - // input table row. That is, the combo tree is a predicate that - // selects certain rows of the input table. Here, pred_cache is just - // a cache, to avoid multiple evaluations of the combo tree: its - // just a table with just one column, equal to the value of the - // combo tree on each input row. - OTable pred_cache(tr, _ctable); - - // target simply negates (inverts) the predicate. 
- vertex target = bool_to_vertex(_positive); - - // Count how many rows the predicate selected. - unsigned total = 0; // total number of observations (could be optimized) - unsigned actives = 0; // total number of positive (or negative if - // positive is false) predicate values - boost::for_each(_ctable | map_values, pred_cache, - [&](const CTable::counter_t& c, const vertex& v) { - unsigned tc = c.total_count(); - if (v == target) - actives += tc; - total += tc; - }); - - logger().fine("ip scorer: total = %u", total); - logger().fine("ip scorer: actives = %u", actives); - - // Create a histogram of output values, ignoring non-selected rows. - // Do this by filtering the ctable output column according to the, - // predicate, discarding non-selected rows. Then total up how often - // each distinct output value occurs. - counter_t pred_counter; - boost::for_each(_ctable | map_values, pred_cache, - [&](const CTable::counter_t& c, const vertex& v) { - if (v == target) { - for (const auto& mv : c) - pred_counter[get_contin(mv.first.value)] = mv.second; - }}); - - logger().fine("ip scorer: pred_cache.size() = %u", pred_cache.size()); - logger().fine("ip scorer: pred_counter.size() = %u", pred_counter.size()); - - // If there's only one output value left, then punt. Statistics - // like skewness need a distribution that isn't a single spike. - if (pred_counter.size() == 1) { - behavioral_score bs; - bs.push_back(very_worst_score); - log_candidate_bscore(tr, bs); - return bs; - } - - behavioral_score bs; - // Compute Kullback-Leibler divergence (KLD) of the filetered - // distribution. - if (_kld_w > 0.0) { - if (_decompose_kld) { - _klds(pred_counter, back_inserter(bs)); - boost::transform(bs, bs.begin(), _kld_w * arg1); - } else { - score_t pred_klds = _klds(pred_counter); - logger().fine("ip scorer: klds = %f", pred_klds); - bs.push_back(_kld_w * pred_klds); - } - } - - // Compute skewness of the filtered distribution. 
- if (_skewness_w > 0 || _stdU_w > 0 || _skew_U_w > 0) { - - // Gather statistics with a boost accumulator - accumulator_t acc; - for (const auto& v : pred_counter) - acc(v.first, weight = v.second); - - score_t diff_skewness = 0; - if (_skewness_w > 0 || _skew_U_w > 0) { - // push the absolute difference between the - // unconditioned skewness and conditioned one - score_t pred_skewness = weighted_skewness(acc); - diff_skewness = pred_skewness - _skewness; - score_t val_skewness = (_abs_skewness? - abs(diff_skewness): - diff_skewness); - logger().fine("ip scorer: pred_skewness = %f", pred_skewness); - if (_skewness_w > 0) - bs.push_back(_skewness_w * val_skewness); - } - - score_t stdU = 0; - if (_stdU_w > 0 || _skew_U_w > 0) { - - // Compute the standardized Mann–Whitney U - stdU = standardizedMannWhitneyU(_counter, pred_counter); - logger().fine("ip scorer: stdU = %f", stdU); - if (_stdU_w > 0.0) - bs.push_back(_stdU_w * abs(stdU)); - } - - // push the product of the relative differences of the - // shift (stdU) and the skewness (so that if both go - // in the same direction the value if positive, and - // negative otherwise) - if (_skew_U_w > 0) - bs.push_back(_skew_U_w * stdU * diff_skewness); - } - - // add activation_penalty component - score_t activation = actives / (score_t) total; - score_t activation_penalty = get_activation_penalty(activation); - logger().fine("ip scorer: activation = %f", activation); - logger().fine("ip scorer: activation penalty = %e", activation_penalty); - bs.push_back(activation_penalty); - - log_candidate_bscore(tr, bs); - return bs; + // OK, here's the deal. The combo tree evaluates to T/F on each + // input table row. That is, the combo tree is a predicate that + // selects certain rows of the input table. Here, pred_cache is just + // a cache, to avoid multiple evaluations of the combo tree: its + // just a table with just one column, equal to the value of the + // combo tree on each input row. 
+ OTable pred_cache(tr, _ctable); + + // target simply negates (inverts) the predicate. + vertex target = bool_to_vertex(_positive); + + // Count how many rows the predicate selected. + unsigned total = 0; // total number of observations (could be optimized) + unsigned actives = 0; // total number of positive (or negative if + // positive is false) predicate values + boost::for_each(_ctable | map_values, pred_cache, + [&](const CTable::counter_t &c, const vertex &v) { + unsigned tc = c.total_count(); + if (v == target) + actives += tc; + total += tc; + }); + + logger().fine("ip scorer: total = %u", total); + logger().fine("ip scorer: actives = %u", actives); + + // Create a histogram of output values, ignoring non-selected rows. + // Do this by filtering the ctable output column according to the, + // predicate, discarding non-selected rows. Then total up how often + // each distinct output value occurs. + counter_t pred_counter; + boost::for_each(_ctable | map_values, pred_cache, + [&](const CTable::counter_t &c, const vertex &v) { + if (v == target) { + for (const auto &mv : c) + pred_counter[get_contin(mv.first.value)] = mv.second; + } + }); + + logger().fine("ip scorer: pred_cache.size() = %u", pred_cache.size()); + logger().fine("ip scorer: pred_counter.size() = %u", pred_counter.size()); + + // If there's only one output value left, then punt. Statistics + // like skewness need a distribution that isn't a single spike. + if (pred_counter.size() == 1) { + behavioral_score bs; + bs.push_back(very_worst_score); + log_candidate_bscore(tr, bs); + return bs; + } + + behavioral_score bs; + // Compute Kullback-Leibler divergence (KLD) of the filetered + // distribution. 
+ if (_kld_w > 0.0) { + if (_decompose_kld) { + _klds(pred_counter, back_inserter(bs)); + boost::transform(bs, bs.begin(), _kld_w * arg1); + } else { + score_t pred_klds = _klds(pred_counter); + logger().fine("ip scorer: klds = %f", pred_klds); + bs.push_back(_kld_w * pred_klds); + } + } + + // Compute skewness of the filtered distribution. + if (_skewness_w > 0 || _stdU_w > 0 || _skew_U_w > 0) { + + // Gather statistics with a boost accumulator + accumulator_t acc; + for (const auto &v : pred_counter) + acc(v.first, weight = v.second); + + score_t diff_skewness = 0; + if (_skewness_w > 0 || _skew_U_w > 0) { + // push the absolute difference between the + // unconditioned skewness and conditioned one + score_t pred_skewness = weighted_skewness(acc); + diff_skewness = pred_skewness - _skewness; + score_t val_skewness = (_abs_skewness ? + abs(diff_skewness) : + diff_skewness); + logger().fine("ip scorer: pred_skewness = %f", pred_skewness); + if (_skewness_w > 0) + bs.push_back(_skewness_w * val_skewness); + } + + score_t stdU = 0; + if (_stdU_w > 0 || _skew_U_w > 0) { + + // Compute the standardized Mann–Whitney U + stdU = standardizedMannWhitneyU(_counter, pred_counter); + logger().fine("ip scorer: stdU = %f", stdU); + if (_stdU_w > 0.0) + bs.push_back(_stdU_w * abs(stdU)); + } + + // push the product of the relative differences of the + // shift (stdU) and the skewness (so that if both go + // in the same direction the value if positive, and + // negative otherwise) + if (_skew_U_w > 0) + bs.push_back(_skew_U_w * stdU * diff_skewness); + } + + // add activation_penalty component + score_t activation = actives / (score_t) total; + score_t activation_penalty = get_activation_penalty(activation); + logger().fine("ip scorer: activation = %f", activation); + logger().fine("ip scorer: activation penalty = %e", activation_penalty); + bs.push_back(activation_penalty); + + log_candidate_bscore(tr, bs); + return bs; } behavioral_score 
interesting_predicate_bscore::best_possible_bscore() const { - return behavioral_score(1, very_best_score); + return behavioral_score(1, very_best_score); } void interesting_predicate_bscore::set_complexity_coef(unsigned alphabet_size, float stdev) { - _complexity_coef = 0.0; - if (stdev > 0) - _complexity_coef = contin_complexity_coef(alphabet_size, stdev); + _complexity_coef = 0.0; + if (stdev > 0) + _complexity_coef = contin_complexity_coef(alphabet_size, stdev); - logger().info() << "intersting_predicate_bscore noise = " << stdev - << " alphabest size = " << alphabet_size - << " complexity ratio = " << 1.0/_complexity_coef; + logger().info() << "intersting_predicate_bscore noise = " << stdev + << " alphabest size = " << alphabet_size + << " complexity ratio = " << 1.0 / _complexity_coef; } score_t interesting_predicate_bscore::get_activation_penalty(score_t activation) const { - score_t dst = fmax(fmax(_min_activation - activation, score_t(0)) - / _min_activation, - fmax(activation - _max_activation, score_t(0)) - / (1.0 - _max_activation)); - logger().fine("ip scorer: dst = %f", dst); - return log(pow((1.0 - dst), _penalty)); + score_t dst = fmax(fmax(_min_activation - activation, score_t(0)) + / _min_activation, + fmax(activation - _max_activation, score_t(0)) + / (1.0 - _max_activation)); + logger().fine("ip scorer: dst = %f", dst); + return log(pow((1.0 - dst), _penalty)); } score_t interesting_predicate_bscore::min_improv() const { - return 0.0; // not necessarily right, just the - // backwards-compatible behavior + return 0.0; // not necessarily right, just the + // backwards-compatible behavior } // ==================================================================== @@ -945,156 +953,143 @@ score_t interesting_predicate_bscore::min_improv() const /// /// XXX this should probably be removed! 
TODO FIXME -cluster_bscore::cluster_bscore(const ITable& itable) - : _itable(itable) +cluster_bscore::cluster_bscore(const ITable &itable) + : _itable(itable) { } -behavioral_score cluster_bscore::operator()(const combo_tree& tr) const +behavioral_score cluster_bscore::operator()(const combo_tree &tr) const { - // evaluate the tree on the table - OTable oned(tr, _itable); - - size_t nclusters = 3; - - OC_ASSERT(nclusters < oned.size()); - - // Initial guess for the centroids - vector centers(nclusters); - size_t i; - for (i=0; i edges(nclusters); - for (i=0; i vals(numvals); - size_t j; - for (j=0; j edge_idx(nclusters-1); - bool changed = true; - while (changed) { - vector cnt(nclusters); - vector sum(nclusters); - changed = false; - i = 0; - for (j=0; j centers(nclusters); + size_t i; + for (i = 0; i < nclusters; i++) { + centers[i] = get_contin(oned[i]); + } + std::sort(centers.begin(), centers.end()); + + vector edges(nclusters); + for (i = 0; i < nclusters - 1; i++) { + edges[i] = 0.5 * (centers[i] + centers[i + 1]); + } + edges[nclusters - 1] = INFINITY; + + // sort the values. This makes assignment easier. + size_t numvals = oned.size(); + vector vals(numvals); + size_t j; + for (j = 0; j < numvals; j++) + vals[j] = get_contin(oned[j]); + std::sort(vals.begin(), vals.end()); + + // One-dimensional k-means algorithm (LLoyd's algorithm) + vector edge_idx(nclusters - 1); + bool changed = true; + while (changed) { + vector cnt(nclusters); + vector sum(nclusters); + changed = false; + i = 0; + for (j = 0; j < numvals; j++) { + score_t sc = vals[j]; + + if (isinf(sc) || isnan(sc)) { + behavioral_score bs; + bs.push_back(-INFINITY); + return bs; + } + + if (sc <= edges[i]) { + cnt[i] += 1.0; + sum[i] += sc; + } else { + OC_ASSERT(i < nclusters - 1); + if (j != edge_idx[i]) changed = true; + edge_idx[i] = j; + i++; + } + } + + // Compute cluster centers. 
+ for (i = 0; i < nclusters; i++) { + // A cluster must have at least two points in it, + // as otherwise the RMS would be zero. Heck, lets make it three. + if (cnt[i] < 3.5) { + behavioral_score bs; + bs.push_back(-INFINITY); + return bs; + } + sum[i] /= cnt[i]; + } + for (i = 0; i < nclusters - 1; i++) edges[i] = 0.5 * (sum[i] + sum[i + 1]); + } + + + // Compute the RMS width of each cluster. + score_t final = 0.0; + score_t cnt = 0.0; + score_t sum = 0.0; + score_t squ = 0.0; + i = 0; + for (j = 0; j < numvals; j++) { + score_t sc = vals[j]; + if (sc <= edges[i]) { + cnt += 1.0; + sum += sc; + squ += sc * sc; + } else { + sum /= cnt; + squ /= cnt; + + final += squ - sum * sum; + i++; + cnt = 0.0; + sum = 0.0; + squ = 0.0; + } + } + + // normalize by bind-width + final = sqrt(final); + // score_t binwidth = edges[nclusters-2] - edges[0]; + score_t binwidth = vals[numvals - 1] - vals[0]; + final /= binwidth; + // The narrower the peaks, the higher the score. + // This way of doing it works better with the complexity penalty + final = 1.0 / final; + + behavioral_score bs; + bs.push_back(final); #if 0 - if (final > 80) { - logger().debug() << "cluster tr="< 80) { + logger().debug() << "cluster tr="< - logical_bscore(const Func& func, int a) - : _target(func, a), _arity(a) - { - _size = _target.size(); - reset_weights(); - } - logical_bscore(const combo_tree& tr, int a) - : _target(tr, a), _arity(a) - { - _size = _target.size(); - reset_weights(); - } - - behavioral_score operator()(const combo_tree&) const; - behavioral_score operator()(const scored_combo_tree_set&) const; - - behavioral_score operator()(const Handle&) const; - - behavioral_score best_possible_bscore() const; - behavioral_score worst_possible_bscore() const; - score_t get_error(const behavioral_score&) const; - - score_t min_improv() const; + template + logical_bscore(const Func &func, int a) + : _target(func, a), _arity(a) + { + _size = _target.size(); + reset_weights(); + } + + 
logical_bscore(const combo_tree &tr, int a) + : _target(tr, a), _arity(a) + { + _size = _target.size(); + reset_weights(); + } + + behavioral_score operator()(const combo_tree &) const; + + behavioral_score operator()(const scored_combo_tree_set &) const; + + behavioral_score operator()(const Handle &) const; + + behavioral_score best_possible_bscore() const; + + behavioral_score worst_possible_bscore() const; + + score_t get_error(const behavioral_score &) const; + + score_t min_improv() const; protected: - complete_truth_table _target; - int _arity; + complete_truth_table _target; + int _arity; }; - + /** * Fitness function based on discretization of the output. If the * classes match the bscore element is 0, or -1 otherwise. If @@ -92,46 +99,47 @@ struct logical_bscore : public bscore_base */ struct discretize_contin_bscore : public bscore_base { - discretize_contin_bscore(const OTable& ot, const ITable& it, - const std::vector& thres, - bool weighted_average); + discretize_contin_bscore(const OTable &ot, const ITable &it, + const std::vector &thres, + bool weighted_average); - // @todo when switching to gcc 4.6 use constructor delagation to - // simplify that - // discretize_contin_bscore(const Table& table, - // const std::vector& thres, - // bool weighted_average, - // float alphabet_size, float p); + // @todo when switching to gcc 4.6 use constructor delagation to + // simplify that + // discretize_contin_bscore(const Table& table, + // const std::vector& thres, + // bool weighted_average, + // float alphabet_size, float p); - behavioral_score operator()(const combo_tree& tr) const; + behavioral_score operator()(const combo_tree &tr) const; - // The best possible bscore is a vector of zeros. That's probably - // not quite true, because there could be duplicated inputs, but - // that's acceptable for now. - behavioral_score best_possible_bscore() const; + // The best possible bscore is a vector of zeros. 
That's probably + // not quite true, because there could be duplicated inputs, but + // that's acceptable for now. + behavioral_score best_possible_bscore() const; - score_t min_improv() const; + score_t min_improv() const; protected: - OTable target; - ITable cit; - std::vector thresholds; - bool weighted_accuracy; // Whether the bscore is weighted to - // deal with unbalanced data. - - // Return the index of the class of value v. - size_t class_idx(contin_t v) const; - // Like class_idx but assume that the value v is within the class - // [l_idx, u_idx) - size_t class_idx_within(contin_t v, size_t l_idx, size_t u_idx) const; - - std::vector classes; // classes of the output, alligned with target - - // Weight of each class, so that each one weighs as much as the - // others, even in case of unbalance sampling. Specifically: - // weights[i] = s / (n * c_i) where s is the sample size, n the - // number of classes and c_i the number of samples for class i. - std::vector weights; + OTable target; + ITable cit; + std::vector thresholds; + bool weighted_accuracy; // Whether the bscore is weighted to + // deal with unbalanced data. + + // Return the index of the class of value v. + size_t class_idx(contin_t v) const; + + // Like class_idx but assume that the value v is within the class + // [l_idx, u_idx) + size_t class_idx_within(contin_t v, size_t l_idx, size_t u_idx) const; + + std::vector classes; // classes of the output, alligned with target + + // Weight of each class, so that each one weighs as much as the + // others, even in case of unbalance sampling. Specifically: + // weights[i] = s / (n * c_i) where s is the sample size, n the + // number of classes and c_i the number of samples for class i. 
+ std::vector weights; }; /** @@ -182,69 +190,71 @@ struct discretize_contin_bscore : public bscore_base */ struct contin_bscore : public bscore_base { - enum err_function_type { - squared_error, - abs_error - }; - - void init(err_function_type eft = squared_error) - { - switch (eft) { - case squared_error: - err_func = [](contin_t y1, contin_t y2) { return sq(y1 - y2); }; - break; - case abs_error: - err_func = [](contin_t y1, contin_t y2) { return std::abs(y1 - y2); }; - break; - default: - OC_ASSERT(false); - } - }; - - template - contin_bscore(const Scoring& score, const ITable& r, - err_function_type eft = squared_error) - : target(score, r), cti(r) - { - init(eft); - _size = r.size(); - } - - contin_bscore(const OTable& t, const ITable& r, - err_function_type eft = squared_error) - : target(t), cti(r) - { - init(eft); - _size = r.size(); - } - - contin_bscore(const Table& table, - err_function_type eft = squared_error) - : target(table.otable), cti(table.itable) { - init(eft); - _size = table.size(); - } - - behavioral_score operator()(const combo_tree& tr) const; - - // The best possible bscore is a vector of zeros. That's probably - // not quite true, because there could be duplicated inputs, but - // that's acceptable for now. 
- behavioral_score best_possible_bscore() const; - - score_t min_improv() const; - - virtual void set_complexity_coef(unsigned alphabet_size, float stddev); - using bscore_base::set_complexity_coef; // Avoid hiding/shadowing + enum err_function_type + { + squared_error, + abs_error + }; + + void init(err_function_type eft = squared_error) + { + switch (eft) { + case squared_error: + err_func = [](contin_t y1, contin_t y2) { return sq(y1 - y2); }; + break; + case abs_error: + err_func = [](contin_t y1, contin_t y2) { return std::abs(y1 - y2); }; + break; + default: OC_ASSERT(false); + } + }; + + template + contin_bscore(const Scoring &score, const ITable &r, + err_function_type eft = squared_error) + : target(score, r), cti(r) + { + init(eft); + _size = r.size(); + } + + contin_bscore(const OTable &t, const ITable &r, + err_function_type eft = squared_error) + : target(t), cti(r) + { + init(eft); + _size = r.size(); + } + + contin_bscore(const Table &table, + err_function_type eft = squared_error) + : target(table.otable), cti(table.itable) + { + init(eft); + _size = table.size(); + } + + behavioral_score operator()(const combo_tree &tr) const; + + // The best possible bscore is a vector of zeros. That's probably + // not quite true, because there could be duplicated inputs, but + // that's acceptable for now. 
+ behavioral_score best_possible_bscore() const; + + score_t min_improv() const; + + virtual void set_complexity_coef(unsigned alphabet_size, float stddev); + + using bscore_base::set_complexity_coef; // Avoid hiding/shadowing protected: - OTable target; - ITable cti; + OTable target; + ITable cti; private: - // for a given data point calculate the error of the target - // compared to the candidate output - std::function err_func; + // for a given data point calculate the error of the target + // compared to the candidate output + std::function err_func; }; /** @@ -361,27 +371,30 @@ struct contin_bscore : public bscore_base */ struct ctruth_table_bscore : public bscore_ctable_base { - ctruth_table_bscore(const CTable& ctt) - : bscore_ctable_base(ctt) - { - _size = _wrk_ctable.size(); - reset_weights(); - set_best_possible_bscore(); - } + ctruth_table_bscore(const CTable &ctt) + : bscore_ctable_base(ctt) + { + _size = _wrk_ctable.size(); + reset_weights(); + set_best_possible_bscore(); + } + + behavioral_score operator()(const combo_tree &tr) const; - behavioral_score operator()(const combo_tree& tr) const; - behavioral_score operator()(const scored_combo_tree_set&) const; + behavioral_score operator()(const scored_combo_tree_set &) const; - // Return the best possible bscore. Used as one of the - // termination conditions (when the best bscore is reached). - behavioral_score best_possible_bscore() const; - behavioral_score worst_possible_bscore() const; + // Return the best possible bscore. Used as one of the + // termination conditions (when the best bscore is reached). 
+ behavioral_score best_possible_bscore() const; - score_t min_improv() const; + behavioral_score worst_possible_bscore() const; + + score_t min_improv() const; protected: - mutable behavioral_score _best_possible_score; - void set_best_possible_bscore() const; + mutable behavioral_score _best_possible_score; + + void set_best_possible_bscore() const; }; /** @@ -410,19 +423,19 @@ struct ctruth_table_bscore : public bscore_ctable_base */ struct enum_table_bscore : public bscore_base { - enum_table_bscore(const CTable& ctt) : _ctable(ctt) - { _size = _ctable.size(); } + enum_table_bscore(const CTable &ctt) : _ctable(ctt) + { _size = _ctable.size(); } - behavioral_score operator()(const combo_tree& tr) const; + behavioral_score operator()(const combo_tree &tr) const; - // Return the best possible bscore. Used as one of the - // termination conditions (when the best bscore is reached). - behavioral_score best_possible_bscore() const; + // Return the best possible bscore. Used as one of the + // termination conditions (when the best bscore is reached). 
+ behavioral_score best_possible_bscore() const; - virtual score_t min_improv() const; + virtual score_t min_improv() const; protected: - CTable _ctable; + CTable _ctable; }; /** @@ -447,13 +460,13 @@ struct enum_table_bscore : public bscore_base */ struct enum_filter_bscore : public enum_table_bscore { - enum_filter_bscore(const CTable& ctt) - : enum_table_bscore(ctt), punish(1.0) - {} + enum_filter_bscore(const CTable &ctt) + : enum_table_bscore(ctt), punish(1.0) + {} - behavioral_score operator()(const combo_tree& tr) const; + behavioral_score operator()(const combo_tree &tr) const; - score_t punish; + score_t punish; }; /** @@ -491,18 +504,19 @@ struct enum_filter_bscore : public enum_table_bscore */ struct enum_graded_bscore : public enum_table_bscore { - enum_graded_bscore(const CTable& ctt) - : enum_table_bscore(ctt), grading(0.9) - {} + enum_graded_bscore(const CTable &ctt) + : enum_table_bscore(ctt), grading(0.9) + {} - behavioral_score operator()(const combo_tree&) const; + behavioral_score operator()(const combo_tree &) const; - virtual score_t min_improv() const; - virtual complexity_t get_complexity(const combo_tree&) const; + virtual score_t min_improv() const; - score_t grading; + virtual complexity_t get_complexity(const combo_tree &) const; + + score_t grading; protected: - score_t graded_complexity(combo_tree::iterator) const; + score_t graded_complexity(combo_tree::iterator) const; }; /** @@ -517,13 +531,14 @@ struct enum_graded_bscore : public enum_table_bscore */ struct enum_effective_bscore : public enum_graded_bscore { - enum_effective_bscore(const CTable& ctt) - : enum_graded_bscore(ctt), _ctable_usize(ctt.uncompressed_size()) - { _size = _ctable_usize; } + enum_effective_bscore(const CTable &ctt) + : enum_graded_bscore(ctt), _ctable_usize(ctt.uncompressed_size()) + { _size = _ctable_usize; } + + behavioral_score operator()(const combo_tree &tr) const; - behavioral_score operator()(const combo_tree& tr) const; protected: - size_t 
_ctable_usize; + size_t _ctable_usize; }; // Bscore to find interesting predicates. @@ -554,78 +569,80 @@ struct enum_effective_bscore : public enum_graded_bscore // predicate is false). struct interesting_predicate_bscore : public bscore_base { - typedef score_t weight_t; - typedef Counter counter_t; - typedef Counter pdf_t; - typedef boost::accumulators::accumulator_set, contin_t> accumulator_t; - - interesting_predicate_bscore(const CTable& ctable, - weight_t kld_weight = 1.0, - weight_t skewness_weight = 1.0, - weight_t stdU_weight = 1.0, - weight_t skew_U_weight = 1.0, - score_t min_activation = 0.0, - score_t max_activation = 1.0, - score_t penalty = 1.0, - bool positive = true, - bool abs_skewness = false, - bool decompose_kld = false); - behavioral_score operator()(const combo_tree& tr) const; - - // the KLD has no upper boundary so the best of possible score is - // the maximum value a behavioral_score can represent - behavioral_score best_possible_bscore() const; - - score_t min_improv() const; - - virtual void set_complexity_coef(unsigned alphabet_size, float p); - using bscore_base::set_complexity_coef; // Avoid hiding/shadowing + typedef score_t weight_t; + typedef Counter counter_t; + typedef Counter pdf_t; + typedef boost::accumulators::accumulator_set, contin_t> accumulator_t; + + interesting_predicate_bscore(const CTable &ctable, + weight_t kld_weight = 1.0, + weight_t skewness_weight = 1.0, + weight_t stdU_weight = 1.0, + weight_t skew_U_weight = 1.0, + score_t min_activation = 0.0, + score_t max_activation = 1.0, + score_t penalty = 1.0, + bool positive = true, + bool abs_skewness = false, + bool decompose_kld = false); + + behavioral_score operator()(const combo_tree &tr) const; + + // the KLD has no upper boundary so the best of possible score is + // the maximum value a behavioral_score can represent + behavioral_score best_possible_bscore() const; + + score_t min_improv() const; + + virtual void set_complexity_coef(unsigned alphabet_size, 
float p); + + using bscore_base::set_complexity_coef; // Avoid hiding/shadowing protected: - counter_t _counter; // counter of the unconditioned distribution - pdf_t _pdf; // pdf of the unconditioned distribution - mutable KLDS _klds; /// @todo dangerous: not thread safe!!! - CTable _ctable; - contin_t _skewness; // skewness of the unconditioned distribution - - // weights of the various features - weight_t _kld_w; - weight_t _skewness_w; - bool _abs_skewness; - weight_t _stdU_w; - weight_t _skew_U_w; - score_t _min_activation, _max_activation; - score_t _penalty; - bool _positive; - // If true then each component of the computation of KLD - // corresponds to an element of the bscore. Otherwise the whole - // KLD occupies just one bscore element - bool _decompose_kld; + counter_t _counter; // counter of the unconditioned distribution + pdf_t _pdf; // pdf of the unconditioned distribution + mutable KLDS _klds; /// @todo dangerous: not thread safe!!! + CTable _ctable; + contin_t _skewness; // skewness of the unconditioned distribution + + // weights of the various features + weight_t _kld_w; + weight_t _skewness_w; + bool _abs_skewness; + weight_t _stdU_w; + weight_t _skew_U_w; + score_t _min_activation, _max_activation; + score_t _penalty; + bool _positive; + // If true then each component of the computation of KLD + // corresponds to an element of the bscore. Otherwise the whole + // KLD occupies just one bscore element + bool _decompose_kld; private: - score_t get_activation_penalty(score_t activation) const; + score_t get_activation_penalty(score_t activation) const; }; // ============================================================================ struct cluster_bscore : public bscore_base { - cluster_bscore(const ITable&); + cluster_bscore(const ITable &); - behavioral_score operator()(const combo_tree& tr) const; + behavioral_score operator()(const combo_tree &tr) const; - // Return the best possible bscore. 
Used as one of the - // termination conditions (when the best bscore is reached). - behavioral_score best_possible_bscore() const; + // Return the best possible bscore. Used as one of the + // termination conditions (when the best bscore is reached). + behavioral_score best_possible_bscore() const; - score_t min_improv() const; + score_t min_improv() const; protected: - ITable _itable; + ITable _itable; }; } //~namespace moses diff --git a/moses/moses/scoring/scoring_base.cc b/moses/moses/scoring/scoring_base.cc index 5372b06586..c4d49d4ab5 100644 --- a/moses/moses/scoring/scoring_base.cc +++ b/moses/moses/scoring/scoring_base.cc @@ -34,40 +34,43 @@ #include #include "scoring_base.h" -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ // Note that this function returns a POSITIVE number, since p < 0.5 score_t discrete_complexity_coef(unsigned alphabet_size, double p) { - return -log((double)alphabet_size) / log(p/(1-p)); + return -log((double) alphabet_size) / log(p / (1 - p)); } - // Note that this returns a POSITIVE number. +// Note that this returns a POSITIVE number. score_t contin_complexity_coef(unsigned alphabet_size, double stdev) { - return log(alphabet_size) * 2 * sq(stdev); + return log(alphabet_size) * 2 * sq(stdev); } void bscore_base::set_complexity_coef(unsigned alphabet_size, float p) { - // Both p==0.0 and p==0.5 are singularities in the forumla. - // See the explanation in the comment above ctruth_table_bscore. - _complexity_coef = 0.0; - if (p > 0.0f && p < 0.5f) - _complexity_coef = discrete_complexity_coef(alphabet_size, p); - - logger().info() << "BScore noise = " << p - << " alphabest size = " << alphabet_size - << " complexity ratio = " << 1.0/_complexity_coef; + // Both p==0.0 and p==0.5 are singularities in the forumla. + // See the explanation in the comment above ctruth_table_bscore. 
+ _complexity_coef = 0.0; + if (p > 0.0f && p < 0.5f) + _complexity_coef = discrete_complexity_coef(alphabet_size, p); + + logger().info() << "BScore noise = " << p + << " alphabest size = " << alphabet_size + << " complexity ratio = " << 1.0 / _complexity_coef; } void bscore_base::set_complexity_coef(score_t complexity_ratio) { - _complexity_coef = 0.0; - if (complexity_ratio > 0.0) - _complexity_coef = 1.0 / complexity_ratio; + _complexity_coef = 0.0; + if (complexity_ratio > 0.0) + _complexity_coef = 1.0 / complexity_ratio; - logger().info() << "BScore complexity ratio = " << 1.0/_complexity_coef; + logger().info() << "BScore complexity ratio = " << 1.0 / _complexity_coef; } behavioral_score bscore_base::operator()(const Handle &) const @@ -78,20 +81,20 @@ behavioral_score bscore_base::operator()(const Handle &) const } behavioral_score -bscore_base::operator()(const scored_combo_tree_set& ensemble) const +bscore_base::operator()(const scored_combo_tree_set &ensemble) const { - OC_ASSERT(false, "Ensemble scoring not implemented for bscorer %s", - typeid(*this).name()); - return behavioral_score(); + OC_ASSERT(false, "Ensemble scoring not implemented for bscorer %s", + typeid(*this).name()); + return behavioral_score(); } behavioral_score bscore_base::worst_possible_bscore() const { - // Can't assert; this will fail during ensemble setup. - // OC_ASSERT(false, "Worst possible score not implemented for bscorer %s", - // typeid(*this).name()); - return behavioral_score(); + // Can't assert; this will fail during ensemble setup. + // OC_ASSERT(false, "Worst possible score not implemented for bscorer %s", + // typeid(*this).name()); + return behavioral_score(); } /** @@ -101,102 +104,102 @@ bscore_base::worst_possible_bscore() const * I dunno .. something. Unclear how the theory should even work for this * case. 
*/ -complexity_t bscore_base::get_complexity(const scored_combo_tree_set& ensemble) const +complexity_t bscore_base::get_complexity(const scored_combo_tree_set &ensemble) const { - if (ensemble.empty()) return 0.0; - - double cpxy = 0.0; - double norm = 0.0; - for (const scored_combo_tree& sct : ensemble) { - double w = sct.get_weight(); - cpxy += w * tree_complexity(sct.get_tree()); - norm += w; - } - - // XXX FIXME complexity_t should be a double not an int ... - return (complexity_t) floor (cpxy / norm + 0.5); + if (ensemble.empty()) return 0.0; + + double cpxy = 0.0; + double norm = 0.0; + for (const scored_combo_tree &sct : ensemble) { + double w = sct.get_weight(); + cpxy += w * tree_complexity(sct.get_tree()); + norm += w; + } + + // XXX FIXME complexity_t should be a double not an int ... + return (complexity_t) floor(cpxy / norm + 0.5); } score_t -bscore_base::get_error(const behavioral_score&) const +bscore_base::get_error(const behavioral_score &) const { - OC_ASSERT(false, "bscore error not implemented for bscorer %s", - typeid(*this).name()); - return 1.0; + OC_ASSERT(false, "bscore error not implemented for bscorer %s", + typeid(*this).name()); + return 1.0; } score_t -bscore_base::get_error(const combo_tree& tr) const +bscore_base::get_error(const combo_tree &tr) const { - // This is the correct result for most cases, except for precision- - // scorer-like cases. However, it should never be called for the - // 'usual' non-precion case; the cached bascore should be used - // instead. This should only be called for the precision scorer. - // Thus, to avoid mis-use, we assert here. - // - // return get_error(operator()(tr)); - OC_ASSERT(false, "tree error not implemented for bscorer %s", - typeid(*this).name()); - return 1.0; + // This is the correct result for most cases, except for precision- + // scorer-like cases. However, it should never be called for the + // 'usual' non-precion case; the cached bascore should be used + // instead. 
This should only be called for the precision scorer. + // Thus, to avoid mis-use, we assert here. + // + // return get_error(operator()(tr)); + OC_ASSERT(false, "tree error not implemented for bscorer %s", + typeid(*this).name()); + return 1.0; } -score_t bscore_base::sum_bscore(const behavioral_score& bs) const +score_t bscore_base::sum_bscore(const behavioral_score &bs) const { - // Don't use weights if not boosting. - if (not _return_weighted_score or _size == 0 or _weights.size() == 0) - return boost::accumulate(bs, 0.0); - - size_t bsz = bs.size(); - OC_ASSERT(_size <= bsz, "Behavioral score too small!"); - - size_t i=0; - score_t res = 0.0; - for (; i < _size; i++) { - res += _weights[i] * bs[i]; - } - - // Any extra penalties tacked onto the end of the bscore get added - // without any weights. For example, the "pre" scoer tacks these - // on, so that the minimum activation can be hit. - for (; i < bsz; i++) { - res += bs[i]; - } - return res; + // Don't use weights if not boosting. + if (not _return_weighted_score or _size == 0 or _weights.size() == 0) + return boost::accumulate(bs, 0.0); + + size_t bsz = bs.size(); + OC_ASSERT(_size <= bsz, "Behavioral score too small!"); + + size_t i = 0; + score_t res = 0.0; + for (; i < _size; i++) { + res += _weights[i] * bs[i]; + } + + // Any extra penalties tacked onto the end of the bscore get added + // without any weights. For example, the "pre" scoer tacks these + // on, so that the minimum activation can be hit. 
+ for (; i < bsz; i++) { + res += bs[i]; + } + return res; } void bscore_base::reset_weights() { - if (_return_weighted_score) - _weights = std::vector(_size, 1.0); - else - _weights = std::vector(); + if (_return_weighted_score) + _weights = std::vector(_size, 1.0); + else + _weights = std::vector(); } -void bscore_base::update_weights(const std::vector& rew) +void bscore_base::update_weights(const std::vector &rew) { - OC_ASSERT(_return_weighted_score, - "Unexpected use of weights in the bscorer!"); + OC_ASSERT(_return_weighted_score, + "Unexpected use of weights in the bscorer!"); - OC_ASSERT(rew.size() == _size, - "Unexpected size of weight array!"); + OC_ASSERT(rew.size() == _size, + "Unexpected size of weight array!"); #if 1 - // Simple rescaling, without normalization. - for (size_t i = 0; i < _size; i++) { - _weights[i] *= rew[i]; - } + // Simple rescaling, without normalization. + for (size_t i = 0; i < _size; i++) { + _weights[i] *= rew[i]; + } #else - double znorm = 0.0; - for (size_t i = 0; i < _size; i++) { - _weights[i] *= rew[i]; - znorm += _weights[i]; - } - - // Normalization: sum of weights must equal 1.0 - // Uhhh, not all scorers need this. Mostly, this is used - // to make get_error() return the right thing ... - znorm = 1.0 / znorm; - for (size_t i=0; i<_size; i++) _weights[i] *= znorm; + double znorm = 0.0; + for (size_t i = 0; i < _size; i++) { + _weights[i] *= rew[i]; + znorm += _weights[i]; + } + + // Normalization: sum of weights must equal 1.0 + // Uhhh, not all scorers need this. Mostly, this is used + // to make get_error() return the right thing ... 
+ znorm = 1.0 / znorm; + for (size_t i=0; i<_size; i++) _weights[i] *= znorm; #endif } @@ -204,133 +207,132 @@ void bscore_base::update_weights(const std::vector& rew) // bscore_ctable_base // //////////////////////// -bscore_ctable_base::bscore_ctable_base(const CTable& ctable) - : _orig_ctable(ctable), _wrk_ctable(_orig_ctable), - _all_rows_wrk_ctable(_wrk_ctable), - _ctable_usize(_orig_ctable.uncompressed_size()) +bscore_ctable_base::bscore_ctable_base(const CTable &ctable) + : _orig_ctable(ctable), _wrk_ctable(_orig_ctable), + _all_rows_wrk_ctable(_wrk_ctable), + _ctable_usize(_orig_ctable.uncompressed_size()) { - _size = ctable.size(); - recompute_weight(); + _size = ctable.size(); + recompute_weight(); } void bscore_ctable_base::recompute_weight() const { - // Sum of all of the weights in the working table. - // Note that these weights can be fractional! - _ctable_weight = 0.0; - for (const CTable::value_type& vct : _wrk_ctable) { - const CTable::counter_t& cnt = vct.second; - - _ctable_weight += cnt.total_count(); - } + // Sum of all of the weights in the working table. + // Note that these weights can be fractional! + _ctable_weight = 0.0; + for (const CTable::value_type &vct : _wrk_ctable) { + const CTable::counter_t &cnt = vct.second; + + _ctable_weight += cnt.total_count(); + } } -void bscore_ctable_base::ignore_cols(const std::set& idxs) const +void bscore_ctable_base::ignore_cols(const std::set &idxs) const { - // Must not compress if boosting: the booster keeps track of a - // weight for each row, and so altering the number of rows will - // just confuse the mechanisn. - if (_return_weighted_score) return; - - if (logger().is_debug_enabled()) - { - std::stringstream ss; - ss << "Compress CTable for optimization by ignoring features: "; - ostream_container(ss, idxs, ","); - logger().debug(ss.str()); - } - - // Get permitted idxs. 
- auto irng = boost::irange(0, _orig_ctable.get_arity()); - std::set all_idxs(irng.begin(), irng.end()); - std::set permitted_idxs = opencog::set_difference(all_idxs, idxs); - - // Filter orig_table with permitted idxs. - _wrk_ctable = _orig_ctable.filtered_preserve_idxs(permitted_idxs); - recompute_weight(); - - // for debugging, keep that around till we fix best_possible_bscore - // fully_filtered_ctable = _orig_ctable.filtered(permitted_idxs); - - logger().debug("Original CTable size = %u", _orig_ctable.size()); - logger().debug("Working CTable size = %u", _wrk_ctable.size()); - - if (logger().is_fine_enabled()) { - std::stringstream ss; - ss << "Contents of _wrk_ctable =" << std::endl; - ostreamCTable(ss, _wrk_ctable); - logger().fine(ss.str()); - // for debugging, keep that around till we fix best_possible_bscore - // std::stringstream ss2; - // ss2 << "fully_filtered_ctable =" << std::endl; - // ostreamCTable(ss2, fully_filtered_ctable); - // logger().fine(ss2.str()); - } - - // Copy the working ctable in a temporary ctable that keeps track - // of all rows (so ignore_rows can be applied several times) - _all_rows_wrk_ctable = _wrk_ctable; + // Must not compress if boosting: the booster keeps track of a + // weight for each row, and so altering the number of rows will + // just confuse the mechanisn. + if (_return_weighted_score) return; + + if (logger().is_debug_enabled()) { + std::stringstream ss; + ss << "Compress CTable for optimization by ignoring features: "; + ostream_container(ss, idxs, ","); + logger().debug(ss.str()); + } + + // Get permitted idxs. + auto irng = boost::irange(0, _orig_ctable.get_arity()); + std::set all_idxs(irng.begin(), irng.end()); + std::set permitted_idxs = opencog::set_difference(all_idxs, idxs); + + // Filter orig_table with permitted idxs. 
+ _wrk_ctable = _orig_ctable.filtered_preserve_idxs(permitted_idxs); + recompute_weight(); + + // for debugging, keep that around till we fix best_possible_bscore + // fully_filtered_ctable = _orig_ctable.filtered(permitted_idxs); + + logger().debug("Original CTable size = %u", _orig_ctable.size()); + logger().debug("Working CTable size = %u", _wrk_ctable.size()); + + if (logger().is_fine_enabled()) { + std::stringstream ss; + ss << "Contents of _wrk_ctable =" << std::endl; + ostreamCTable(ss, _wrk_ctable); + logger().fine(ss.str()); + // for debugging, keep that around till we fix best_possible_bscore + // std::stringstream ss2; + // ss2 << "fully_filtered_ctable =" << std::endl; + // ostreamCTable(ss2, fully_filtered_ctable); + // logger().fine(ss2.str()); + } + + // Copy the working ctable in a temporary ctable that keeps track + // of all rows (so ignore_rows can be applied several times) + _all_rows_wrk_ctable = _wrk_ctable; } -void bscore_ctable_base::ignore_rows(const std::set& idxs) const +void bscore_ctable_base::ignore_rows(const std::set &idxs) const { - // Must not compress if boosting: the booster keeps track of a - // weight for each row, and so altering the number of rows will - // just confuse the mechanisn. 
- if (_return_weighted_score) return; - - _wrk_ctable = _all_rows_wrk_ctable; // to include all rows in _wrk_ctable - - // if (logger().isFineEnabled()) - // logger().fine() << "Remove " << idxs.size() << " uncompressed rows from " - // << "_wrk_ctable of compressed size " << _wrk_ctable.size() - // << ", uncompressed size = " << _wrk_ctable.uncompressed_size(); - _wrk_ctable.remove_rows(idxs); - _ctable_usize = _wrk_ctable.uncompressed_size(); - _size = _wrk_ctable.size(); - recompute_weight(); - - // if (logger().isFineEnabled()) - // logger().fine() << "New _wrk_ctable compressed size = " << _wrk_ctable.size() - // << ", uncompressed size = " << _ctable_usize; + // Must not compress if boosting: the booster keeps track of a + // weight for each row, and so altering the number of rows will + // just confuse the mechanisn. + if (_return_weighted_score) return; + + _wrk_ctable = _all_rows_wrk_ctable; // to include all rows in _wrk_ctable + + // if (logger().isFineEnabled()) + // logger().fine() << "Remove " << idxs.size() << " uncompressed rows from " + // << "_wrk_ctable of compressed size " << _wrk_ctable.size() + // << ", uncompressed size = " << _wrk_ctable.uncompressed_size(); + _wrk_ctable.remove_rows(idxs); + _ctable_usize = _wrk_ctable.uncompressed_size(); + _size = _wrk_ctable.size(); + recompute_weight(); + + // if (logger().isFineEnabled()) + // logger().fine() << "New _wrk_ctable compressed size = " << _wrk_ctable.size() + // << ", uncompressed size = " << _ctable_usize; } -void bscore_ctable_base::ignore_rows_at_times(const std::set& timestamps) const +void bscore_ctable_base::ignore_rows_at_times(const std::set ×tamps) const { - // Must not compress if boosting: the booster keeps track of a - // weight for each row, and so altering the number of rows will - // just confuse the mechanisn. 
- if (_return_weighted_score) return; + // Must not compress if boosting: the booster keeps track of a + // weight for each row, and so altering the number of rows will + // just confuse the mechanisn. + if (_return_weighted_score) return; - // logger().fine() << "bscore_ctable_base::ignore_rows_at_times"; - // ostreamContainer(logger().fine() << "timestamps = ", timestamps); + // logger().fine() << "bscore_ctable_base::ignore_rows_at_times"; + // ostreamContainer(logger().fine() << "timestamps = ", timestamps); - _wrk_ctable = _all_rows_wrk_ctable; // to include all rows in _wrk_ctable + _wrk_ctable = _all_rows_wrk_ctable; // to include all rows in _wrk_ctable - // if (logger().isFineEnabled()) - // logger().fine() << "Remove " << timestamps.size() << " dates from " - // << "_wrk_ctable of compressed size " << _wrk_ctable.size() - // << ", uncompressed size = " << _wrk_ctable.uncompressed_size(); + // if (logger().isFineEnabled()) + // logger().fine() << "Remove " << timestamps.size() << " dates from " + // << "_wrk_ctable of compressed size " << _wrk_ctable.size() + // << ", uncompressed size = " << _wrk_ctable.uncompressed_size(); - _wrk_ctable.remove_rows_at_times(timestamps); - _ctable_usize = _wrk_ctable.uncompressed_size(); - _size = _wrk_ctable.size(); - recompute_weight(); + _wrk_ctable.remove_rows_at_times(timestamps); + _ctable_usize = _wrk_ctable.uncompressed_size(); + _size = _wrk_ctable.size(); + recompute_weight(); - // if (logger().isFineEnabled()) - // logger().fine() << "New _wrk_ctable compressed size = " << _wrk_ctable.size() - // << ", uncompressed size = " << _ctable_usize; + // if (logger().isFineEnabled()) + // logger().fine() << "New _wrk_ctable compressed size = " << _wrk_ctable.size() + // << ", uncompressed size = " << _ctable_usize; } unsigned bscore_ctable_base::get_ctable_usize() const { - return _ctable_usize; + return _ctable_usize; } -const CTable& bscore_ctable_base::get_ctable() const +const CTable 
&bscore_ctable_base::get_ctable() const { - return _orig_ctable; + return _orig_ctable; } } // ~namespace moses diff --git a/moses/moses/scoring/scoring_base.h b/moses/moses/scoring/scoring_base.h index 01364a1cd2..2e317372cb 100644 --- a/moses/moses/scoring/scoring_base.h +++ b/moses/moses/scoring/scoring_base.h @@ -30,7 +30,10 @@ #include #include -namespace opencog { namespace moses { +namespace opencog +{ +namespace moses +{ using combo::combo_tree; using combo::arity_t; @@ -56,300 +59,315 @@ score_t contin_complexity_coef(unsigned alphabet_size, double stdev); /// A behavioral score is a vector of scores, one per sample of a dataset. struct bscore_base { - bscore_base() : _return_weighted_score(false), _complexity_coef(0.0), _size(0) {}; - virtual ~bscore_base() {}; - - /// Return the behavioral score for the combo_tree - virtual behavioral_score operator()(const combo_tree&) const = 0; - - /// Return the behavioral score for the Handle - virtual behavioral_score operator()(const Handle&) const; - - /// Return the behavioral score for the ensemble - virtual behavioral_score operator()(const scored_combo_tree_set&) const; - - /// Return the size (length) of the behavioral_score that operator() - /// above would return. - virtual size_t size() const { return _size; } - - /// Return the best possible bscore achievable with this fitness - /// function. This is useful for stopping MOSES when the best - /// possible score has been reached. - virtual behavioral_score best_possible_bscore() const = 0; - - /// Return the worst possible bscore achievable with this fitness - /// function. This is needed during boosting, to ascertain if at - // least half the answers are correct. - virtual behavioral_score worst_possible_bscore() const; - - /// Return the smallest change in the score which can be considered - /// to be an improvement over the previous score. This is useful for - /// avoiding local maxima which have a very flat top. 
That is, where - /// all combo trees in the same local maximum have almost exactly - /// the same score, and so scores improve by very small amounts - /// during the search. In such cases, one can save a lot of CPU - /// time by terminating the search when the imrpovements are smaller - /// than the min_improv(). Returns 0.0 by default. - virtual score_t min_improv() const { return 0.0; } - - /// Return weighted scores instead of flat scores. The weighted - /// scores are needed by the boosting algorithms; the unweighted - /// scores are needed to find out what the "actual" score would be. - void use_weighted_scores() { _return_weighted_score = true; } - - /// Return the (possbily weighted) sum of the behavioral score. - /// If _return_weighted_score is false, then this returns the "flat" - /// score, a simple sum over all samples: - /// - /// score = sum_x BScore(x) - /// - /// Otherwise, it returns a weighted sum of the bscore: - /// - /// score = sum_x weight(x) * BScore(x) - /// - /// Each element in the bscore typically corresponds to a sample in - /// a supervised training set, that is, a row of a table contianing - /// the training data. By default, the weight is 1.0 for each entry. - /// The intended use of the weights is for boosting, so that the - /// the score for erroneous rows can be magnified, such as in AdaBoost. - /// - /// See, for example, http://en.wikipedia.org/wiki/AdaBoost -- - /// However, CAUTION! That wikipedia article currently (as of July - /// 2014) contains serious, fundamental mistakes in it's desciption - /// of the boosting algo! - virtual score_t sum_bscore(const behavioral_score&) const; - - /// Reset the weights to a uniform distribution. - virtual void reset_weights(); - - /// A vector of per-bscore weights, used to tote up the behavioral - /// score into a single number. - // XXX TODO should be a std::valarray not a vector. 
- virtual void update_weights(const std::vector&); - - /// Return the amount by which the bscore differs from a perfect - /// score. This is used by the boosting algorithm to weight the - /// a scored combo tree. - /// - /// The returned value must be normalized so that 0.0 stands for - /// a perfect score (all answers are the best possible), a value - /// of 0.5 corresponds to "random guessing", and 1.0 corresponds - /// to a worst-possible score (all answers are the worst possible.) - /// This error amount does not have to be a metric or distance - /// measure, nor does it have to be linear; however, boosting will - /// probably work better if the error is vaguely metric-like and - /// quasi-linear. - /// - /// See the notes below, for the CTable sccorer, for special - /// considerations that CTable-based scorers must make. - virtual score_t get_error(const behavioral_score&) const; - virtual score_t get_error(const combo_tree&) const; - - /// Indicate a set of features that should be ignored during scoring, - /// The features are indicated as indexes, starting from 0. - /// - /// The primary intended use of this function is to improve - /// performance by avoiding evaluation of the ignored features. - /// At this time, the only users of this method are the table-based - /// scorers. By ignoring most columns, the table can typically be - /// significantly compressed, thus reducing evaluation time. - /// - /// It is important that the combo trees to be scored do not use - /// any of the ignored indexes, as otherwise, a faulty scoring will - /// result. Thus, the typical use case is to remove all columns - /// that do not appear in a knob-decorated combo tree. The resulting - /// table is then safe to use during instance scoring, because no - /// instance could ever reference one of the ignored columns. - /// - /// Note that the best_possible_score may depend on the set of - /// ignored features. 
Thus, the best_possible_score() method should - /// be called only after this method. - /// - /// This method may be called multiple times; with each call, the - /// previously-ignored features will first be restored, before the - /// new index set is ignored. Thus, calling this with the empty set - /// will have the effect of restoring all columns that were previously - /// ignored. - virtual void ignore_cols(const std::set&) const {} - - /// In case one wants to evaluate the fitness on a subset of the - /// data, one can provide a set of row indexes to ignore. - /// - /// This method may be called multiple times. With each call, the - /// previously ignored rows will be restored, before the - /// newly-specified rows are removed. Thus, calling this with the - /// empty set has the effect of restoring all ignored rows. - virtual void ignore_rows(const std::set&) const {} - - // Like ignore_rows but consider timestamps instead of indexes - virtual void ignore_rows_at_times(const std::set&) const {} - - // Return the uncompressed size of the CTable - virtual unsigned get_ctable_usize() const { - OC_ASSERT(false, "You must implement me in the derived class"); - return 0U; - } - - // Return the original CTable - virtual const CTable& get_ctable() const { - static const CTable empty_ctable; - OC_ASSERT(false, "You must implement me in the derived class"); - return empty_ctable; - } - - /// Get the appropriate complexity measure for the indicated combo - /// tree. By default, this is the tree complexity, although it may - /// depend on the scorer. 
- virtual complexity_t get_complexity(const combo_tree& tr) const - { - return tree_complexity(tr); - } - virtual complexity_t get_complexity(const scored_combo_tree_set&) const; - - virtual complexity_t get_complexity(const Handle &handle)const + bscore_base() : _return_weighted_score(false), _complexity_coef(0.0), _size(0) {}; + + virtual ~bscore_base() + {}; + + /// Return the behavioral score for the combo_tree + virtual behavioral_score operator()(const combo_tree &) const = 0; + + /// Return the behavioral score for the Handle + virtual behavioral_score operator()(const Handle &) const; + + /// Return the behavioral score for the ensemble + virtual behavioral_score operator()(const scored_combo_tree_set &) const; + + /// Return the size (length) of the behavioral_score that operator() + /// above would return. + virtual size_t size() const + { return _size; } + + /// Return the best possible bscore achievable with this fitness + /// function. This is useful for stopping MOSES when the best + /// possible score has been reached. + virtual behavioral_score best_possible_bscore() const = 0; + + /// Return the worst possible bscore achievable with this fitness + /// function. This is needed during boosting, to ascertain if at + // least half the answers are correct. + virtual behavioral_score worst_possible_bscore() const; + + /// Return the smallest change in the score which can be considered + /// to be an improvement over the previous score. This is useful for + /// avoiding local maxima which have a very flat top. That is, where + /// all combo trees in the same local maximum have almost exactly + /// the same score, and so scores improve by very small amounts + /// during the search. In such cases, one can save a lot of CPU + /// time by terminating the search when the imrpovements are smaller + /// than the min_improv(). Returns 0.0 by default. + virtual score_t min_improv() const + { return 0.0; } + + /// Return weighted scores instead of flat scores. 
The weighted + /// scores are needed by the boosting algorithms; the unweighted + /// scores are needed to find out what the "actual" score would be. + void use_weighted_scores() + { _return_weighted_score = true; } + + /// Return the (possbily weighted) sum of the behavioral score. + /// If _return_weighted_score is false, then this returns the "flat" + /// score, a simple sum over all samples: + /// + /// score = sum_x BScore(x) + /// + /// Otherwise, it returns a weighted sum of the bscore: + /// + /// score = sum_x weight(x) * BScore(x) + /// + /// Each element in the bscore typically corresponds to a sample in + /// a supervised training set, that is, a row of a table contianing + /// the training data. By default, the weight is 1.0 for each entry. + /// The intended use of the weights is for boosting, so that the + /// the score for erroneous rows can be magnified, such as in AdaBoost. + /// + /// See, for example, http://en.wikipedia.org/wiki/AdaBoost -- + /// However, CAUTION! That wikipedia article currently (as of July + /// 2014) contains serious, fundamental mistakes in it's desciption + /// of the boosting algo! + virtual score_t sum_bscore(const behavioral_score &) const; + + /// Reset the weights to a uniform distribution. + virtual void reset_weights(); + + /// A vector of per-bscore weights, used to tote up the behavioral + /// score into a single number. + // XXX TODO should be a std::valarray not a vector. + virtual void update_weights(const std::vector &); + + /// Return the amount by which the bscore differs from a perfect + /// score. This is used by the boosting algorithm to weight the + /// a scored combo tree. + /// + /// The returned value must be normalized so that 0.0 stands for + /// a perfect score (all answers are the best possible), a value + /// of 0.5 corresponds to "random guessing", and 1.0 corresponds + /// to a worst-possible score (all answers are the worst possible.) 
+ /// This error amount does not have to be a metric or distance + /// measure, nor does it have to be linear; however, boosting will + /// probably work better if the error is vaguely metric-like and + /// quasi-linear. + /// + /// See the notes below, for the CTable sccorer, for special + /// considerations that CTable-based scorers must make. + virtual score_t get_error(const behavioral_score &) const; + + virtual score_t get_error(const combo_tree &) const; + + /// Indicate a set of features that should be ignored during scoring, + /// The features are indicated as indexes, starting from 0. + /// + /// The primary intended use of this function is to improve + /// performance by avoiding evaluation of the ignored features. + /// At this time, the only users of this method are the table-based + /// scorers. By ignoring most columns, the table can typically be + /// significantly compressed, thus reducing evaluation time. + /// + /// It is important that the combo trees to be scored do not use + /// any of the ignored indexes, as otherwise, a faulty scoring will + /// result. Thus, the typical use case is to remove all columns + /// that do not appear in a knob-decorated combo tree. The resulting + /// table is then safe to use during instance scoring, because no + /// instance could ever reference one of the ignored columns. + /// + /// Note that the best_possible_score may depend on the set of + /// ignored features. Thus, the best_possible_score() method should + /// be called only after this method. + /// + /// This method may be called multiple times; with each call, the + /// previously-ignored features will first be restored, before the + /// new index set is ignored. Thus, calling this with the empty set + /// will have the effect of restoring all columns that were previously + /// ignored. 
+ virtual void ignore_cols(const std::set &) const + {} + + /// In case one wants to evaluate the fitness on a subset of the + /// data, one can provide a set of row indexes to ignore. + /// + /// This method may be called multiple times. With each call, the + /// previously ignored rows will be restored, before the + /// newly-specified rows are removed. Thus, calling this with the + /// empty set has the effect of restoring all ignored rows. + virtual void ignore_rows(const std::set &) const + {} + + // Like ignore_rows but consider timestamps instead of indexes + virtual void ignore_rows_at_times(const std::set &) const + {} + + // Return the uncompressed size of the CTable + virtual unsigned get_ctable_usize() const + { + OC_ASSERT(false, "You must implement me in the derived class"); + return 0U; + } + + // Return the original CTable + virtual const CTable &get_ctable() const + { + static const CTable empty_ctable; + OC_ASSERT(false, "You must implement me in the derived class"); + return empty_ctable; + } + + /// Get the appropriate complexity measure for the indicated combo + /// tree. By default, this is the tree complexity, although it may + /// depend on the scorer. + virtual complexity_t get_complexity(const combo_tree &tr) const + { + return tree_complexity(tr); + } + + virtual complexity_t get_complexity(const scored_combo_tree_set &) const; + + virtual complexity_t get_complexity(const Handle &handle) const { return atomese_complexity(handle); } - /// Return the complexity coefficient. This is used to obtain the - /// complexity penalty for the score, which is meant to be computed - /// as penalty = get_complexity_coef() * get_complexity(tree); - /// This is done in two steps like this, because different scorers - /// use a different scale, and so the complexity needs to be rescaled. - /// Furthermore, different scorers also have a different notion of - /// complexity. This is the opportunity to make adjustments for each - /// case. 
- virtual score_t get_complexity_coef() const { return _complexity_coef; } - - /// Store a complexity coefficient with the scorerer. This is - /// done to work around the fact that different kinds of scorers - /// normalize their scores in different ways, and so the way that - /// the penalties are scaled should differ as well. - /// Strictly speaking, the user could just specify a different - /// scale on the moses command line, but perhaps this inflicts too - /// much effort on the user. Thus, we maintain a "suggested" scaling - /// here. - virtual void set_complexity_coef(score_t complexity_ratio); - virtual void set_complexity_coef(unsigned alphabet_size, float p); + + /// Return the complexity coefficient. This is used to obtain the + /// complexity penalty for the score, which is meant to be computed + /// as penalty = get_complexity_coef() * get_complexity(tree); + /// This is done in two steps like this, because different scorers + /// use a different scale, and so the complexity needs to be rescaled. + /// Furthermore, different scorers also have a different notion of + /// complexity. This is the opportunity to make adjustments for each + /// case. + virtual score_t get_complexity_coef() const + { return _complexity_coef; } + + /// Store a complexity coefficient with the scorerer. This is + /// done to work around the fact that different kinds of scorers + /// normalize their scores in different ways, and so the way that + /// the penalties are scaled should differ as well. + /// Strictly speaking, the user could just specify a different + /// scale on the moses command line, but perhaps this inflicts too + /// much effort on the user. Thus, we maintain a "suggested" scaling + /// here. 
+ virtual void set_complexity_coef(score_t complexity_ratio); + + virtual void set_complexity_coef(unsigned alphabet_size, float p); protected: - mutable bool _return_weighted_score; - score_t _complexity_coef; - mutable size_t _size; // mutable to work around const bugs - std::vector _weights; + mutable bool _return_weighted_score; + score_t _complexity_coef; + mutable size_t _size; // mutable to work around const bugs + std::vector _weights; }; /// Base class for fitness functions that use a ctable. Provides useful /// table compression. struct bscore_ctable_base : public bscore_base { - bscore_ctable_base(const CTable&); - - /// Indicate a set of features that should be ignored during scoring, - /// The features are indicated as indexes, starting from 0. - /// - /// This function is used to improve performance by avoiding - /// evaluation of the ignored features. By ignoring most columns, - /// the table can typically be significantly compressed, thus - /// reducing evaluation time. For tables with tens of thousands of - /// uncompressed rows, this can provide a significant speedup when - /// evaluating a large number of instances. - void ignore_cols(const std::set&) const; - - /// In case one wants to evaluate the fitness on a subset of the - /// data, one can provide a set of row indexes to ignore. - void ignore_rows(const std::set&) const; - - /// Like ignore_rows but consider timestamps instead of indexes - void ignore_rows_at_times(const std::set&) const; - - /// Return the uncompressed size of the CTable - unsigned get_ctable_usize() const; - - /// Return the original CTable - const CTable& get_ctable() const; - - /// Implementing get_error() for CTables-based scorers requires some - /// special consideration. First, the length of the behavioral - /// score is needed, for normalization. The correct "length" is - /// kind-of tricky to understand when a table has weighted rows, - /// or when it is degenerate. 
In the degenerate case, no matter - /// what selection is made, some rows will be wrong. - /// - /// We explicitly review these cases here: the table may have - /// degenerate or non-degenerate rows, and these may be weighted or - /// non-weighted. Here, the "weights" are not the boosting weights, - /// but the user-specified row weights. - /// - /// non-degenerate, non weighted: - /// (each row has defacto weight of 1.0) - /// best score = 0.0 so err = score / num rows; - /// - /// non-degenerate, weighted: - /// best score = 0.0 so err = score / weighted num rows; - /// since the score is a sum of weighted rows. - /// - /// e.g. two rows with user-specified weights: - /// 0.1 - /// 2.3 - /// so if first row is wrong, then err = 0.1/2.4 - /// and if second row is wrong, err = 2.3/2.4 - /// - /// degenerate, non-weighted: - /// best score > 0.0 err = (score - best_score) / eff_num_rows; - /// - /// where eff_num_rows = sum_row fabs(up-count - down-count) - /// is the "effective" number of rows, as opposing rows - /// effectively cancel each-other out. This is also the - /// "worst possible score", what would be returned if every - /// row was marked wrong. - /// - /// e.g. table five uncompressed rows: - /// up:1 input-a - /// dn:2 input-a - /// up:2 input-b - /// best score is -1 (i.e. is 4-5 where 4 = 2+2). - /// so if first row is wrong, then err = (1-1)/5 = 0/3 - /// so if second row is wrong, then err = (2-1)/5 = 1/3 - /// so if third & first is wrong, then err = (3-1)/3 = 2/3 - /// so if third & second is wrong, then err = (4-1)/3 = 3/3 - /// - /// Thus, the "effective_length" is (minus) the worst possible score. - /// - /// The subtraction (score - best_score) needs to be done in the - /// by the get_error() method, and not somewhere else: that's - /// because the boost row weighting must be performed on this - /// difference, so that only the rows that are far away from their - /// best-possible values get boosted. 
- /// - // score_t get_error(const behavioral_score&) const; + bscore_ctable_base(const CTable &); + + /// Indicate a set of features that should be ignored during scoring, + /// The features are indicated as indexes, starting from 0. + /// + /// This function is used to improve performance by avoiding + /// evaluation of the ignored features. By ignoring most columns, + /// the table can typically be significantly compressed, thus + /// reducing evaluation time. For tables with tens of thousands of + /// uncompressed rows, this can provide a significant speedup when + /// evaluating a large number of instances. + void ignore_cols(const std::set &) const; + + /// In case one wants to evaluate the fitness on a subset of the + /// data, one can provide a set of row indexes to ignore. + void ignore_rows(const std::set &) const; + + /// Like ignore_rows but consider timestamps instead of indexes + void ignore_rows_at_times(const std::set &) const; + + /// Return the uncompressed size of the CTable + unsigned get_ctable_usize() const; + + /// Return the original CTable + const CTable &get_ctable() const; + + /// Implementing get_error() for CTables-based scorers requires some + /// special consideration. First, the length of the behavioral + /// score is needed, for normalization. The correct "length" is + /// kind-of tricky to understand when a table has weighted rows, + /// or when it is degenerate. In the degenerate case, no matter + /// what selection is made, some rows will be wrong. + /// + /// We explicitly review these cases here: the table may have + /// degenerate or non-degenerate rows, and these may be weighted or + /// non-weighted. Here, the "weights" are not the boosting weights, + /// but the user-specified row weights. 
+ /// + /// non-degenerate, non weighted: + /// (each row has defacto weight of 1.0) + /// best score = 0.0 so err = score / num rows; + /// + /// non-degenerate, weighted: + /// best score = 0.0 so err = score / weighted num rows; + /// since the score is a sum of weighted rows. + /// + /// e.g. two rows with user-specified weights: + /// 0.1 + /// 2.3 + /// so if first row is wrong, then err = 0.1/2.4 + /// and if second row is wrong, err = 2.3/2.4 + /// + /// degenerate, non-weighted: + /// best score > 0.0 err = (score - best_score) / eff_num_rows; + /// + /// where eff_num_rows = sum_row fabs(up-count - down-count) + /// is the "effective" number of rows, as opposing rows + /// effectively cancel each-other out. This is also the + /// "worst possible score", what would be returned if every + /// row was marked wrong. + /// + /// e.g. table five uncompressed rows: + /// up:1 input-a + /// dn:2 input-a + /// up:2 input-b + /// best score is -1 (i.e. is 4-5 where 4 = 2+2). + /// so if first row is wrong, then err = (1-1)/5 = 0/3 + /// so if second row is wrong, then err = (2-1)/5 = 1/3 + /// so if third & first is wrong, then err = (3-1)/3 = 2/3 + /// so if third & second is wrong, then err = (4-1)/3 = 3/3 + /// + /// Thus, the "effective_length" is (minus) the worst possible score. + /// + /// The subtraction (score - best_score) needs to be done in the + /// by the get_error() method, and not somewhere else: that's + /// because the boost row weighting must be performed on this + /// difference, so that only the rows that are far away from their + /// best-possible values get boosted. + /// + // score_t get_error(const behavioral_score&) const; protected: - const CTable& _orig_ctable; // Reference to the original table. + const CTable &_orig_ctable; // Reference to the original table. - // The table that is actually used for the evaluation. This is the - // the compressed table that results after ignore_cols() and - // ignore_rows() have been applied. 
Must be mutable to avoid the - // const-ness of tables in general. - mutable CTable _wrk_ctable; + // The table that is actually used for the evaluation. This is the + // the compressed table that results after ignore_cols() and + // ignore_rows() have been applied. Must be mutable to avoid the + // const-ness of tables in general. + mutable CTable _wrk_ctable; - // A copy of wrk_ctable prior to ignore_rows() being applied. This - // allows ignore_rows() to be called multiple times, without forcing - // a complete recalculation. - mutable CTable _all_rows_wrk_ctable; // mutable to work around const bugs. + // A copy of wrk_ctable prior to ignore_rows() being applied. This + // allows ignore_rows() to be called multiple times, without forcing + // a complete recalculation. + mutable CTable _all_rows_wrk_ctable; // mutable to work around const bugs. - mutable size_t _ctable_usize; // uncompressed size of ctable - mutable count_t _ctable_weight; // Total weight of all rows in table. + mutable size_t _ctable_usize; // uncompressed size of ctable + mutable count_t _ctable_weight; // Total weight of all rows in table. 
- void recompute_weight() const; // recompute _ctable_weight + void recompute_weight() const; // recompute _ctable_weight }; // helper to log a combo_tree and its behavioral score -static inline void log_candidate_bscore(const combo_tree& tr, - const behavioral_score& bs) +static inline void log_candidate_bscore(const combo_tree &tr, + const behavioral_score &bs) { - if (logger().is_fine_enabled()) - logger().fine() << "Evaluate candidate: " << tr << "\n" - << "\tBScore size=" << bs.size() - << " bscore: " << bs; + if (logger().is_fine_enabled()) + logger().fine() << "Evaluate candidate: " << tr << "\n" + << "\tBScore size=" << bs.size() + << " bscore: " << bs; } } //~namespace moses From 756a83cd329a73010ddb5f5784bf51897ac8104a Mon Sep 17 00:00:00 2001 From: kasim Date: Fri, 7 Sep 2018 12:04:52 +0300 Subject: [PATCH 09/17] =?UTF-8?q?Fix=20=E2=80=98result=5Ftype=E2=80=99=20d?= =?UTF-8?q?oes=20not=20name=20a=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- moses/moses/scoring/scoring_base.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/moses/moses/scoring/scoring_base.h b/moses/moses/scoring/scoring_base.h index 2e317372cb..32518419cd 100644 --- a/moses/moses/scoring/scoring_base.h +++ b/moses/moses/scoring/scoring_base.h @@ -59,6 +59,8 @@ score_t contin_complexity_coef(unsigned alphabet_size, double stdev); /// A behavioral score is a vector of scores, one per sample of a dataset. 
struct bscore_base { + typedef behavioral_score result_type; + bscore_base() : _return_weighted_score(false), _complexity_coef(0.0), _size(0) {}; virtual ~bscore_base() From ef97cb3a50c053df4757c758d4b5745d3da25cfb Mon Sep 17 00:00:00 2001 From: kasim Date: Sun, 16 Sep 2018 17:40:55 +0300 Subject: [PATCH 10/17] Reformat table files --- moses/comboreduct/table/table.cc | 1197 ++++++++-------- moses/comboreduct/table/table.h | 2278 ++++++++++++++++-------------- 2 files changed, 1836 insertions(+), 1639 deletions(-) diff --git a/moses/comboreduct/table/table.cc b/moses/comboreduct/table/table.cc index 4bd6b61a9b..a0a949dc0a 100644 --- a/moses/comboreduct/table/table.cc +++ b/moses/comboreduct/table/table.cc @@ -45,512 +45,515 @@ #include "table.h" #include "table_io.h" -namespace opencog { namespace combo { +namespace opencog +{ +namespace combo +{ using namespace std; using namespace boost; using namespace boost::adaptors; -string table_fmt_builtin_to_str(const builtin& b) +string table_fmt_builtin_to_str(const builtin &b) { - stringstream ss; - if (is_boolean(b)) - ss << builtin_to_bool(b); - else - ss << b; - return ss.str(); + stringstream ss; + if (is_boolean(b)) + ss << builtin_to_bool(b); + else + ss << b; + return ss.str(); } -string table_fmt_vertex_to_str(const vertex& v) + +string table_fmt_vertex_to_str(const vertex &v) { - stringstream ss; - if (is_boolean(v)) - ss << vertex_to_bool(v); - else - ss << v; - return ss.str(); + stringstream ss; + if (is_boolean(v)) + ss << vertex_to_bool(v); + else + ss << v; + return ss.str(); } - + // ------------------------------------------------------- -ITable::ITable() {} +ITable::ITable() +{} -ITable::ITable(const vector& ts, const vector& il) - : types(ts), labels(il) {} +ITable::ITable(const vector &ts, const vector &il) + : types(ts), labels(il) +{} -ITable::ITable(const ITable::super& mat, const vector& il) - : super(mat), labels(il) {} +ITable::ITable(const ITable::super &mat, const vector &il) + : super(mat), 
labels(il) +{} /// Construct an ITable holding a single column, the column from the OTable. -ITable::ITable(const OTable& ot) +ITable::ITable(const OTable &ot) { - insert_col(ot.get_label(), ot); + insert_col(ot.get_label(), ot); - type_seq typs; - typs.push_back(ot.get_type()); - set_types(typs); + type_seq typs; + typs.push_back(ot.get_type()); + set_types(typs); } -ITable::ITable(const type_tree& tt, int nsamples, +ITable::ITable(const type_tree &tt, int nsamples, contin_t min_contin, contin_t max_contin) { - arity_t barity = boolean_arity(tt), carity = contin_arity(tt); - - if (nsamples < 0) - nsamples = std::max(pow2(barity), sample_count(carity)); - - // in that case the boolean inputs are not picked randomly but - // instead are enumerated - bool comp_tt = nsamples == (int)pow2(barity); - - // Populate the matrix. - auto root = tt.begin(); - for (int i = 0; i < nsamples; ++i) { - size_t bidx = 0; // counter used to enumerate all - // booleans - vertex_seq vs; - for (auto it = root.begin(); it != root.last_child(); ++it) - if (*it == id::boolean_type) - vs.push_back(bool_to_vertex(comp_tt? 
- i & (1 << bidx++) - : randGen().randint(2))); - else if (*it == id::contin_type) - vs.push_back((max_contin - min_contin) - * randGen().randdouble() + min_contin); - else if (*it == id::enum_type) - vs.push_back(enum_t::get_random_enum()); - else if (*it == id::unknown_type) - vs.push_back(vertex()); // push default vertex - else - OC_ASSERT(false, "Not implemented yet"); - - // input vector - push_back(vs); - } -} - -bool ITable::operator==(const ITable& rhs) const -{ - // return - bool super_eq = static_cast(*this) == static_cast(rhs); - bool labels_eq = get_labels() == rhs.get_labels(); - bool types_eq = get_types() == rhs.get_types(); - return super_eq && labels_eq && types_eq; + arity_t barity = boolean_arity(tt), carity = contin_arity(tt); + + if (nsamples < 0) + nsamples = std::max(pow2(barity), sample_count(carity)); + + // in that case the boolean inputs are not picked randomly but + // instead are enumerated + bool comp_tt = nsamples == (int) pow2(barity); + + // Populate the matrix. + auto root = tt.begin(); + for (int i = 0; i < nsamples; ++i) { + size_t bidx = 0; // counter used to enumerate all + // booleans + vertex_seq vs; + for (auto it = root.begin(); it != root.last_child(); ++it) + if (*it == id::boolean_type) + vs.push_back(bool_to_vertex(comp_tt ? 
+ i & (1 << bidx++) + : randGen().randint(2))); + else if (*it == id::contin_type) + vs.push_back((max_contin - min_contin) + * randGen().randdouble() + min_contin); + else if (*it == id::enum_type) + vs.push_back(enum_t::get_random_enum()); + else if (*it == id::unknown_type) + vs.push_back(vertex()); // push default vertex + else OC_ASSERT(false, "Not implemented yet"); + + // input vector + push_back(vs); + } +} + +bool ITable::operator==(const ITable &rhs) const +{ + // return + bool super_eq = static_cast(*this) == static_cast(rhs); + bool labels_eq = get_labels() == rhs.get_labels(); + bool types_eq = get_types() == rhs.get_types(); + return super_eq && labels_eq && types_eq; } // ------------------------------------------------------- -void ITable::set_labels(const vector& il) +void ITable::set_labels(const vector &il) { - labels = il; + labels = il; } static const std::string default_input_label("i"); vector ITable::get_default_labels() const { - string_seq res; - for (arity_t i = 1; i <= get_arity(); ++i) - res.push_back(default_input_label - + boost::lexical_cast(i)); - return res; + string_seq res; + for (arity_t i = 1; i <= get_arity(); ++i) + res.push_back(default_input_label + + boost::lexical_cast(i)); + return res; } -const vector& ITable::get_labels() const +const vector &ITable::get_labels() const { - if (labels.empty() and !super::empty()) // return default labels - labels = get_default_labels(); - return labels; + if (labels.empty() and !super::empty()) // return default labels + labels = get_default_labels(); + return labels; } -void ITable::set_types(const vector& il) +void ITable::set_types(const vector &il) { - types = il; + types = il; } -const vector& ITable::get_types() const +const vector &ITable::get_types() const { - if (types.empty() and !super::empty()) { - arity_t arity = get_arity(); - types.resize(arity); - for (arity_t i=0; i 1 ? 
1 : 0; - type_tree col_tt = get_type_tree(col[idx]); - type_node col_type = get_type_node(col_tt); - types.insert(off >= 0 ? types.begin() + off : types.end(), col_type); + // Infer the column type + // If it exists use the second row, just in case the first holds labels... + unsigned idx = col.size() > 1 ? 1 : 0; + type_tree col_tt = get_type_tree(col[idx]); + type_node col_type = get_type_node(col_tt); + types.insert(off >= 0 ? types.begin() + off : types.end(), col_type); - // Insert label - labels.insert(off >= 0 ? labels.begin() + off : labels.end(), clab); + // Insert label + labels.insert(off >= 0 ? labels.begin() + off : labels.end(), clab); - // Insert values - if (empty()) { - OC_ASSERT(off < 0); - for (const auto& v : col) - push_back({v}); - return; - } + // Insert values + if (empty()) { + OC_ASSERT(off < 0); + for (const auto &v : col) + push_back({v}); + return; + } - OC_ASSERT (col.size() == size(), "Incorrect column length!"); - for (unsigned i = 0; i < col.size(); i++) { - auto& row = (*this)[i]; + OC_ASSERT (col.size() == size(), "Incorrect column length!"); + for (unsigned i = 0; i < col.size(); i++) { + auto &row = (*this)[i]; - // convert row into vertex_seq - vertex_seq vs; - for (unsigned j = 0; j < row.size(); ++j) - vs.push_back(row.get_at(j)); - row = vs; + // convert row into vertex_seq + vertex_seq vs; + for (unsigned j = 0; j < row.size(); ++j) + vs.push_back(row.get_at(j)); + row = vs; - // insert the value from col at off - row.insert_at(off, col[i]); - } + // insert the value from col at off + row.insert_at(off, col[i]); + } } -int ITable::get_column_offset(const std::string& name) const +int ITable::get_column_offset(const std::string &name) const { - // If the name is empty, get column zero. - if (name.empty()) - return 0; + // If the name is empty, get column zero. + if (name.empty()) + return 0; - // If the name is numeric, then assume its a column number - // starting at column 1 for the leftmost column. - // i.e. 
subtract one to get number. - if (isdigit(name.c_str()[0])) - return atoi(name.c_str()) - 1; + // If the name is numeric, then assume its a column number + // starting at column 1 for the leftmost column. + // i.e. subtract one to get number. + if (isdigit(name.c_str()[0])) + return atoi(name.c_str()) - 1; - auto pos = std::find(labels.begin(), labels.end(), name); - if (pos == labels.end()) - return -1; - return distance(labels.begin(), pos); + auto pos = std::find(labels.begin(), labels.end(), name); + if (pos == labels.end()) + return -1; + return distance(labels.begin(), pos); } vertex_seq ITable::get_column_data(int offset) const { - // @todo it outputs vertex_seq, it's not very general - - vertex_seq col; + // @todo it outputs vertex_seq, it's not very general + + vertex_seq col; - if (-1 == offset) - return col; + if (-1 == offset) + return col; - get_at_visitor gvav(offset); - auto agva = boost::apply_visitor(gvav); - for (const auto& row : *this) { - col.push_back(agva(row.get_variant())); - } - return col; + get_at_visitor gvav(offset); + auto agva = boost::apply_visitor(gvav); + for (const auto &row : *this) { + col.push_back(agva(row.get_variant())); + } + return col; } -vertex_seq ITable::get_column_data(const std::string& name) const +vertex_seq ITable::get_column_data(const std::string &name) const { - return get_column_data(get_column_offset(name)); + return get_column_data(get_column_offset(name)); } -string ITable::delete_column(const string& name) +string ITable::delete_column(const string &name) { - int off = get_column_offset(name); - if (-1 == off) - throw IndexErrorException(TRACE_INFO, - "Can't delete, unknown column name: %s", name.c_str()); + int off = get_column_offset(name); + if (-1 == off) + throw IndexErrorException(TRACE_INFO, + "Can't delete, unknown column name: %s", name.c_str()); - // Delete the column - for (multi_type_seq& row : *this) - row.erase_at(off); + // Delete the column + for (multi_type_seq &row : *this) + 
row.erase_at(off); - // Delete the label as well. - string rv; - if (not labels.empty()) { - rv = *(labels.begin() + off); - labels.erase(labels.begin() + off); - } + // Delete the label as well. + string rv; + if (not labels.empty()) { + rv = *(labels.begin() + off); + labels.erase(labels.begin() + off); + } - if (not types.empty()) - types.erase(types.begin() + off); + if (not types.empty()) + types.erase(types.begin() + off); - return rv; + return rv; } -void ITable::delete_columns(const vector& ignore_features) +void ITable::delete_columns(const vector &ignore_features) { - for (const string& feat : ignore_features) - delete_column(feat); + for (const string &feat : ignore_features) + delete_column(feat); } //////////// // OTable // //////////// -OTable::OTable(const string& ol) - : label(ol), type(id::unknown_type) {} +OTable::OTable(const string &ol) + : label(ol), type(id::unknown_type) +{} -OTable::OTable(const super& ot, const string& ol) - : super(ot), label(ol) +OTable::OTable(const super &ot, const string &ol) + : super(ot), label(ol) { - // Be sure to set the column type as well ... - type = get_type_node(get_type_tree((*this)[0])); + // Be sure to set the column type as well ... + type = get_type_node(get_type_tree((*this)[0])); } -OTable::OTable(const combo_tree& tr, const ITable& itable, const string& ol) - : label(ol) +OTable::OTable(const combo_tree &tr, const ITable &itable, const string &ol) + : label(ol) { - OC_ASSERT(not tr.empty()); - if (is_ann_type(*tr.begin())) { - // we treat ANN differently because they must be decoded - // before being evaluated. 
Also note that if there are memory - // neurones then the state of the network is evolving at each - // input, so the order within itable does matter - ann net = tree_transform().decodify_tree(tr); - int depth = net.feedforward_depth(); - for (const multi_type_seq& vv : itable) { - contin_seq tmp = vv.get_seq(); - tmp.push_back(1.0); // net uses that in case the function - // to learn needs some kind of offset - net.load_inputs(tmp); - dorepeat(depth) - net.propagate(); - push_back(net.outputs[0]->activation); - } - } else { - interpreter_visitor iv(tr); - auto ai = boost::apply_visitor(iv); - for (const multi_type_seq& vs : itable) - push_back(ai(vs.get_variant())); - } + OC_ASSERT(not tr.empty()); + if (is_ann_type(*tr.begin())) { + // we treat ANN differently because they must be decoded + // before being evaluated. Also note that if there are memory + // neurones then the state of the network is evolving at each + // input, so the order within itable does matter + ann net = tree_transform().decodify_tree(tr); + int depth = net.feedforward_depth(); + for (const multi_type_seq &vv : itable) { + contin_seq tmp = vv.get_seq(); + tmp.push_back(1.0); // net uses that in case the function + // to learn needs some kind of offset + net.load_inputs(tmp); + dorepeat(depth)net.propagate(); + push_back(net.outputs[0]->activation); + } + } else { + interpreter_visitor iv(tr); + auto ai = boost::apply_visitor(iv); + for (const multi_type_seq &vs : itable) + push_back(ai(vs.get_variant())); + } - // Be sure to set the column type as well ... - type = get_type_node(get_type_tree((*this)[0])); + // Be sure to set the column type as well ... 
+ type = get_type_node(get_type_tree((*this)[0])); } -OTable::OTable(const combo_tree& tr, const CTable& ctable, const string& ol) - : label(ol) +OTable::OTable(const combo_tree &tr, const CTable &ctable, const string &ol) + : label(ol) { - arity_set as = get_argument_abs_idx_set(tr); - interpreter_visitor iv(tr); - auto ai = boost::apply_visitor(iv); - for_each(ctable | map_keys, [&](const multi_type_seq& mts) { - this->push_back(ai(mts.get_variant())); - }); + arity_set as = get_argument_abs_idx_set(tr); + interpreter_visitor iv(tr); + auto ai = boost::apply_visitor(iv); + for_each(ctable | map_keys, [&](const multi_type_seq &mts) { + this->push_back(ai(mts.get_variant())); + }); - // Be sure to set the column type as well ... - type = get_type_node(get_type_tree((*this)[0])); + // Be sure to set the column type as well ... + type = get_type_node(get_type_tree((*this)[0])); } -void OTable::set_label(const string& ol) +void OTable::set_label(const string &ol) { - if (ol.empty()) - label = default_output_label; - else - label = ol; + if (ol.empty()) + label = default_output_label; + else + label = ol; } -const string& OTable::get_label() const +const string &OTable::get_label() const { - return label; + return label; } void OTable::set_type(type_node t) { - type = t; + type = t; } type_node OTable::get_type() const { - return type; + return type; } // ------------------------------------------------------- -bool OTable::operator==(const OTable& rhs) const +bool OTable::operator==(const OTable &rhs) const { - const static contin_t epsilon = 1e-12; - for (auto lit = begin(), rit = rhs.begin(); lit != end(); ++lit, ++rit) { - if (is_contin(*lit) && is_contin(*rit)) { - if (!is_approx_eq(get_contin(*lit), get_contin(*rit), epsilon)) - return false; - } - else if (*lit != *rit) - return false; - } - return rhs.get_label() == label; + const static contin_t epsilon = 1e-12; + for (auto lit = begin(), rit = rhs.begin(); lit != end(); ++lit, ++rit) { + if (is_contin(*lit) 
&& is_contin(*rit)) { + if (!is_approx_eq(get_contin(*lit), get_contin(*rit), epsilon)) + return false; + } else if (*lit != *rit) + return false; + } + return rhs.get_label() == label; } // XXX TODO replace this by the util p_norm function. -contin_t OTable::abs_distance(const OTable& ot) const -{ - OC_ASSERT(ot.size() == size()); - contin_t res = 0; - if (id::contin_type == type and id::contin_type == ot.type) { - for (const_iterator x = begin(), y = ot.begin(); x != end();) - res += fabs(get_contin(*(x++)) - get_contin(*(y++))); - } - else - if (id::boolean_type == type and id::boolean_type == ot.type) { - for (const_iterator x = begin(), y = ot.begin(); x != end();) - res += (contin_t) (get_builtin(*(x++)) != get_builtin(*(y++))); - } - else - if (id::enum_type == type and id::enum_type == ot.type) { - for (const_iterator x = begin(), y = ot.begin(); x != end();) - res += (contin_t) (get_enum_type(*(x++)) != get_enum_type(*(y++))); - } - else - throw InconsistenceException(TRACE_INFO, - "Can't compare, mismatched column types."); - return res; +contin_t OTable::abs_distance(const OTable &ot) const +{ + OC_ASSERT(ot.size() == size()); + contin_t res = 0; + if (id::contin_type == type and id::contin_type == ot.type) { + for (const_iterator x = begin(), y = ot.begin(); x != end();) + res += fabs(get_contin(*(x++)) - get_contin(*(y++))); + } else if (id::boolean_type == type and id::boolean_type == ot.type) { + for (const_iterator x = begin(), y = ot.begin(); x != end();) + res += (contin_t) (get_builtin(*(x++)) != get_builtin(*(y++))); + } else if (id::enum_type == type and id::enum_type == ot.type) { + for (const_iterator x = begin(), y = ot.begin(); x != end();) + res += (contin_t) (get_enum_type(*(x++)) != get_enum_type(*(y++))); + } else + throw InconsistenceException(TRACE_INFO, + "Can't compare, mismatched column types."); + return res; } // XXX TODO replace this by the util p_norm function. 
-contin_t OTable::sum_squared_error(const OTable& ot) const +contin_t OTable::sum_squared_error(const OTable &ot) const { - OC_ASSERT(ot.size() == size()); - contin_t res = 0; - for (const_iterator x = begin(), y = ot.begin(); x != end();) - res += sq(get_contin(*(x++)) - get_contin(*(y++))); - return res; + OC_ASSERT(ot.size() == size()); + contin_t res = 0; + for (const_iterator x = begin(), y = ot.begin(); x != end();) + res += sq(get_contin(*(x++)) - get_contin(*(y++))); + return res; } -contin_t OTable::mean_squared_error(const OTable& ot) const +contin_t OTable::mean_squared_error(const OTable &ot) const { - OC_ASSERT(ot.size() == size() && size() > 0); - return sum_squared_error(ot) / ot.size(); + OC_ASSERT(ot.size() == size() && size() > 0); + return sum_squared_error(ot) / ot.size(); } -contin_t OTable::root_mean_square_error(const OTable& ot) const +contin_t OTable::root_mean_square_error(const OTable &ot) const { - OC_ASSERT(ot.size() == size() && size() > 0); - return sqrt(mean_squared_error(ot)); + OC_ASSERT(ot.size() == size() && size() > 0); + return sqrt(mean_squared_error(ot)); } //////////// // TTable // //////////// -TTable::TTable(const string& tl) - : label(tl) {} +TTable::TTable(const string &tl) + : label(tl) +{} -TTable::TTable(const super& tt, const string& tl) - : super(tt), label(tl) {} +TTable::TTable(const super &tt, const string &tl) + : super(tt), label(tl) +{} -void TTable::set_label(const string& tl) +void TTable::set_label(const string &tl) { - label = tl; + label = tl; } -const string& TTable::get_label() const +const string &TTable::get_label() const { - return label; + return label; } -TTable::value_type TTable::from_string(const std::string& timestamp_str) { - return boost::gregorian::from_string(timestamp_str); +TTable::value_type TTable::from_string(const std::string ×tamp_str) +{ + return boost::gregorian::from_string(timestamp_str); } -std::string TTable::to_string(const TTable::value_type& timestamp) { - return 
boost::gregorian::to_iso_extended_string(timestamp); +std::string TTable::to_string(const TTable::value_type ×tamp) +{ + return boost::gregorian::to_iso_extended_string(timestamp); } /////////// // Table // /////////// -Table::Table() : target_pos(0), timestamp_pos(0) {} +Table::Table() : target_pos(0), timestamp_pos(0) +{} -Table::Table(const OTable& otable_, const ITable& itable_) - : itable(itable_), otable(otable_), target_pos(0), timestamp_pos(0) {} +Table::Table(const OTable &otable_, const ITable &itable_) + : itable(itable_), otable(otable_), target_pos(0), timestamp_pos(0) +{} -Table::Table(const combo_tree& tr, int nsamples, +Table::Table(const combo_tree &tr, int nsamples, contin_t min_contin, contin_t max_contin) : - itable(infer_type_tree(tr), nsamples, min_contin, max_contin), - otable(tr, itable), target_pos(0), timestamp_pos(0) {} + itable(infer_type_tree(tr), nsamples, min_contin, max_contin), + otable(tr, itable), target_pos(0), timestamp_pos(0) +{} vector Table::get_labels() const { - vector labels = itable.get_labels(); - labels.insert(labels.begin(), otable.get_label()); - return labels; + vector labels = itable.get_labels(); + labels.insert(labels.begin(), otable.get_label()); + return labels; } // ------------------------------------------------------- CTable Table::compressed(const std::string weight_col) const { - logger().debug("Compress the dataset, current size is %d", itable.size()); - - // If no weight column, then its straight-forward - if (weight_col.empty()) { - CTable res(otable.get_label(), itable.get_labels(), get_signature()); - - ITable::const_iterator in_it = itable.begin(); - OTable::const_iterator out_it = otable.begin(); - if (ttable.empty()) - for(; in_it != itable.end(); ++in_it, ++out_it) - ++res[*in_it][TimedValue(*out_it)]; - else { - TTable::const_iterator time_it = ttable.begin(); - for(; in_it != itable.end(); ++in_it, ++out_it, ++time_it) - ++res[*in_it][TimedValue(*out_it, *time_it)]; - } - logger().debug("Size of 
the compressed dataset is %u", res.size()); - return res; - } - else { - // Else, remove the weight column from the input; - // we don't want to use it as an independent feature. - ITable trimmed(itable); - trimmed.delete_column(weight_col); - - CTable res(otable.get_label(), trimmed.get_labels(), get_signature()); - - size_t widx = itable.get_column_offset(weight_col); - ITable::const_iterator w_it = itable.begin(); - ITable::const_iterator in_it = trimmed.begin(); - OTable::const_iterator out_it = otable.begin(); - if (ttable.empty()) { - for (; in_it != trimmed.end(); ++in_it, ++out_it, ++w_it) - { - vertex v = w_it->get_at(widx); - contin_t weight = get_contin(v); - res[*in_it][TimedValue(*out_it)] += weight; - } - } - else { - TTable::const_iterator time_it = ttable.begin(); - for (; in_it != trimmed.end(); ++in_it, ++out_it, ++w_it, ++time_it) - { - vertex v = w_it->get_at(widx); - contin_t weight = get_contin(v); - res[*in_it][TimedValue(*out_it, *time_it)] += weight; - } - } - logger().debug("Size of the compressed dataset is %d", res.size()); - return res; - } + logger().debug("Compress the dataset, current size is %d", itable.size()); + + // If no weight column, then its straight-forward + if (weight_col.empty()) { + CTable res(otable.get_label(), itable.get_labels(), get_signature()); + + ITable::const_iterator in_it = itable.begin(); + OTable::const_iterator out_it = otable.begin(); + if (ttable.empty()) + for (; in_it != itable.end(); ++in_it, ++out_it) + ++res[*in_it][TimedValue(*out_it)]; + else { + TTable::const_iterator time_it = ttable.begin(); + for (; in_it != itable.end(); ++in_it, ++out_it, ++time_it) + ++res[*in_it][TimedValue(*out_it, *time_it)]; + } + logger().debug("Size of the compressed dataset is %u", res.size()); + return res; + } else { + // Else, remove the weight column from the input; + // we don't want to use it as an independent feature. 
+ ITable trimmed(itable); + trimmed.delete_column(weight_col); + + CTable res(otable.get_label(), trimmed.get_labels(), get_signature()); + + size_t widx = itable.get_column_offset(weight_col); + ITable::const_iterator w_it = itable.begin(); + ITable::const_iterator in_it = trimmed.begin(); + OTable::const_iterator out_it = otable.begin(); + if (ttable.empty()) { + for (; in_it != trimmed.end(); ++in_it, ++out_it, ++w_it) { + vertex v = w_it->get_at(widx); + contin_t weight = get_contin(v); + res[*in_it][TimedValue(*out_it)] += weight; + } + } else { + TTable::const_iterator time_it = ttable.begin(); + for (; in_it != trimmed.end(); ++in_it, ++out_it, ++w_it, ++time_it) { + vertex v = w_it->get_at(widx); + contin_t weight = get_contin(v); + res[*in_it][TimedValue(*out_it, *time_it)] += weight; + } + } + logger().debug("Size of the compressed dataset is %d", res.size()); + return res; + } } // ------------------------------------------------------- @@ -560,308 +563,312 @@ CTable Table::compressed(const std::string weight_col) const * header. The labels can be sequenced in any order, it will always * return the order consistent with the header. 
*/ -vector get_indices(const vector& labels, - const vector& header) +vector get_indices(const vector &labels, + const vector &header) { - vector res; - for (unsigned i = 0; i < header.size(); ++i) - if (std::find(labels.begin(), labels.end(), header[i]) != labels.end()) - res.push_back(i); - return res; + vector res; + for (unsigned i = 0; i < header.size(); ++i) + if (std::find(labels.begin(), labels.end(), header[i]) != labels.end()) + res.push_back(i); + return res; } std::vector discretize_contin_feature(contin_t min, contin_t max) { - std::vector res; - contin_t interval = (max - min)/TARGET_DISCRETIZED_BINS_NUM; - for (unsigned i = 0; i < TARGET_DISCRETIZED_BINS_NUM; ++i) - res.push_back(min+i*interval); - return res; + std::vector res; + contin_t interval = (max - min) / TARGET_DISCRETIZED_BINS_NUM; + for (unsigned i = 0; i < TARGET_DISCRETIZED_BINS_NUM; ++i) + res.push_back(min + i * interval); + return res; } builtin get_discrete_bin(std::vector disc_intvs, contin_t val) { - unsigned i; - for (i = 1; i < TARGET_DISCRETIZED_BINS_NUM; i++) - { - if (val < disc_intvs[i]) - break; - } - return (builtin)i; + unsigned i; + for (i = 1; i < TARGET_DISCRETIZED_BINS_NUM; i++) { + if (val < disc_intvs[i]) + break; + } + return (builtin) i; } -unsigned get_index(const string& label, const vector& header) +unsigned get_index(const string &label, const vector &header) { - return std::distance(header.begin(), std::find(header.begin(), header.end(), label)); + return std::distance(header.begin(), std::find(header.begin(), header.end(), label)); } -bool Table::operator==(const Table& rhs) const { - return itable == rhs.itable and otable == rhs.otable and ttable == rhs.ttable - and target_pos == rhs.target_pos and timestamp_pos == rhs.timestamp_pos; +bool Table::operator==(const Table &rhs) const +{ + return itable == rhs.itable and otable == rhs.otable and ttable == rhs.ttable + and target_pos == rhs.target_pos and timestamp_pos == rhs.timestamp_pos; } - + 
///////////////// // TimeCounter // ///////////////// -count_t TimedCounter::get(const vertex& v) const { - count_t res = 0; - for (const auto& vtc : *this) - if (vtc.first.value == v) - res += vtc.second; - return res; +count_t TimedCounter::get(const vertex &v) const +{ + count_t res = 0; + for (const auto &vtc : *this) + if (vtc.first.value == v) + res += vtc.second; + return res; } -Counter TimedCounter::untimedCounter() const { - Counter vc; - for (const auto& vtc : *this) - vc[vtc.first.value] += vtc.second; - return vc; +Counter TimedCounter::untimedCounter() const +{ + Counter vc; + for (const auto &vtc : *this) + vc[vtc.first.value] += vtc.second; + return vc; } -vertex TimedCounter::mode() const { - return untimedCounter().mode(); +vertex TimedCounter::mode() const +{ + return untimedCounter().mode(); } //////////// // CTable // //////////// -CTable::CTable(const std::string& _olabel) - : olabel(_olabel) {} +CTable::CTable(const std::string &_olabel) + : olabel(_olabel) +{} -CTable::CTable(const string_seq& labs, const type_tree& tt) - : tsig(tt), olabel(labs[0]), ilabels(labs) +CTable::CTable(const string_seq &labs, const type_tree &tt) + : tsig(tt), olabel(labs[0]), ilabels(labs) { - ilabels.erase(ilabels.begin()); + ilabels.erase(ilabels.begin()); } -CTable::CTable(const std::string& _olabel, const string_seq& _ilabels, - const type_tree& tt) - : tsig(tt), olabel(_olabel), ilabels(_ilabels) +CTable::CTable(const std::string &_olabel, const string_seq &_ilabels, + const type_tree &tt) + : tsig(tt), olabel(_olabel), ilabels(_ilabels) {} -void CTable::remove_rows(const set& idxs) +void CTable::remove_rows(const set &idxs) { - // iterator of the set of row indexes to remove - auto idx_it = idxs.begin(); + // iterator of the set of row indexes to remove + auto idx_it = idxs.begin(); - // iterator index of the CTable from the perspective of an - // uncompressed table - unsigned i = 0; + // iterator index of the CTable from the perspective of an + // 
uncompressed table + unsigned i = 0; - // For each row check if some indexes are within uncompressed - // begining and end of that row and decrease their counts if - // so - for (auto row_it = begin(); - row_it != end() and idx_it != idxs.end();) { - auto& outputs = row_it->second; + // For each row check if some indexes are within uncompressed + // begining and end of that row and decrease their counts if + // so + for (auto row_it = begin(); + row_it != end() and idx_it != idxs.end();) { + auto &outputs = row_it->second; // XXX this cannot possibly be correct, the total count is in general // a fraction, not an integer; it is merely the sum of the weights // of the rows. It is NOT equal to the toal number of rows! // I cannot figure out what this algo is trying to do, so I can't // actually fix it :-( - count_t total_row_weights = outputs.total_count(); - count_t truncate = floor(total_row_weights); -OC_ASSERT(0.0 == (total_row_weights - truncate), "This algo is broken!"); - unsigned i_end = i + ((unsigned) truncate); - if (i <= *idx_it and *idx_it < i_end) { - for (auto v_it = outputs.begin(); - v_it != outputs.end() and idx_it != idxs.end();) { - OC_ASSERT(i <= *idx_it, "There must be a bug"); - // Remove all overlapping indexes with v and - // advance idx_it of that amount - unsigned i_v_end = i + v_it->second; - while (idx_it != idxs.end() and *idx_it < i_v_end) { - OC_ASSERT(v_it->second > 0, "There must be a bug"); - --v_it->second; - ++idx_it; - } - - // Increment i with the count of that value - i = i_v_end; - - // Check if the count went to zero, if so remove - // v_it entirely - if (v_it->second == 0) - v_it = outputs.erase(v_it); - else - ++v_it; - } - - // Check if the output is empty, and if so remove the - // row entirely - if (row_it->second.empty()) - row_it = erase(row_it); - else - ++row_it; - } else { - ++row_it; - } - i = i_end; - } -} - -void CTable::remove_rows_at_times(const set& timestamps) -{ - for (const TTable::value_type& timestamp : 
timestamps) - remove_rows_at_time(timestamp); -} - -void CTable::remove_rows_at_time(const TTable::value_type& timestamp) -{ - for (auto row_it = begin(); row_it != end();) { - auto& outputs = row_it->second; - - // Remove all output values at timestamp - for (auto v_it = outputs.begin(); v_it != outputs.end();) { - if (v_it->first.timestamp == timestamp) - v_it = outputs.erase(v_it); - else - ++v_it; - } - - // Check if the output is empty, and if so remove the - // row entirely - if (row_it->second.empty()) - row_it = erase(row_it); - else - ++row_it; - } + count_t total_row_weights = outputs.total_count(); + count_t truncate = floor(total_row_weights); + OC_ASSERT(0.0 == (total_row_weights - truncate), "This algo is broken!"); + unsigned i_end = i + ((unsigned) truncate); + if (i <= *idx_it and *idx_it < i_end) { + for (auto v_it = outputs.begin(); + v_it != outputs.end() and idx_it != idxs.end();) { + OC_ASSERT(i <= *idx_it, "There must be a bug"); + // Remove all overlapping indexes with v and + // advance idx_it of that amount + unsigned i_v_end = i + v_it->second; + while (idx_it != idxs.end() and *idx_it < i_v_end) { + OC_ASSERT(v_it->second > 0, "There must be a bug"); + --v_it->second; + ++idx_it; + } + + // Increment i with the count of that value + i = i_v_end; + + // Check if the count went to zero, if so remove + // v_it entirely + if (v_it->second == 0) + v_it = outputs.erase(v_it); + else + ++v_it; + } + + // Check if the output is empty, and if so remove the + // row entirely + if (row_it->second.empty()) + row_it = erase(row_it); + else + ++row_it; + } else { + ++row_it; + } + i = i_end; + } +} + +void CTable::remove_rows_at_times(const set ×tamps) +{ + for (const TTable::value_type ×tamp : timestamps) + remove_rows_at_time(timestamp); +} + +void CTable::remove_rows_at_time(const TTable::value_type ×tamp) +{ + for (auto row_it = begin(); row_it != end();) { + auto &outputs = row_it->second; + + // Remove all output values at timestamp + for (auto 
v_it = outputs.begin(); v_it != outputs.end();) { + if (v_it->first.timestamp == timestamp) + v_it = outputs.erase(v_it); + else + ++v_it; + } + + // Check if the output is empty, and if so remove the + // row entirely + if (row_it->second.empty()) + row_it = erase(row_it); + else + ++row_it; + } } set CTable::get_timestamps() const { - set res; - for (const CTable::value_type& row : *this) - for (const auto& vtc : row.second) - if (vtc.first.timestamp != boost::gregorian::date()) - res.insert(vtc.first.timestamp); + set res; + for (const CTable::value_type &row : *this) + for (const auto &vtc : row.second) + if (vtc.first.timestamp != boost::gregorian::date()) + res.insert(vtc.first.timestamp); - return res; + return res; } -void CTable::set_labels(const vector& labels) +void CTable::set_labels(const vector &labels) { - olabel = labels.front(); - ilabels.clear(); - ilabels.insert(ilabels.begin(), labels.begin() + 1, labels.end()); + olabel = labels.front(); + ilabels.clear(); + ilabels.insert(ilabels.begin(), labels.begin() + 1, labels.end()); } vector CTable::get_labels() const { - vector labels = ilabels; - labels.insert(labels.begin(), olabel); - return labels; + vector labels = ilabels; + labels.insert(labels.begin(), olabel); + return labels; } -const string& CTable::get_output_label() const +const string &CTable::get_output_label() const { - return olabel; + return olabel; } -const string_seq& CTable::get_input_labels() const +const string_seq &CTable::get_input_labels() const { - return ilabels; + return ilabels; } -void CTable::set_signature(const type_tree& tt) +void CTable::set_signature(const type_tree &tt) { - tsig = tt; + tsig = tt; } -const type_tree& CTable::get_signature() const +const type_tree &CTable::get_signature() const { - return tsig; + return tsig; } count_t CTable::uncompressed_size() const { - count_t res = 0.0; - for (const value_type& v : *this) { - res += v.second.total_count(); - } - return res; + count_t res = 0.0; + for (const 
value_type &v : *this) { + res += v.second.total_count(); + } + return res; } type_node CTable::get_output_type() const { - return get_type_node(get_signature_output(tsig)); + return get_type_node(get_signature_output(tsig)); } CTableTime CTable::ordered_by_time() const { - // Turn the input to timestamped output map into timetamp to - // output map - CTableTime res; - for (const auto& v : *this) - for (const auto& tcv : v.second) - res[tcv.first.timestamp] += - Counter({{tcv.first.value, tcv.second}}); - return res; + // Turn the input to timestamped output map into timetamp to + // output map + CTableTime res; + for (const auto &v : *this) + for (const auto &tcv : v.second) + res[tcv.first.timestamp] += + Counter({{tcv.first.value, tcv.second}}); + return res; } void CTable::balance() { - type_node otype = get_output_type(); - if (otype == id::boolean_type or otype == id::enum_type) { - // Get total count for each class (called Ni in comment below) - Counter class_count; - for (auto iorow : *this) - class_count += iorow.second.untimedCounter(); - - count_t usize = uncompressed_size(), n = class_count.size(); - // N1 + ... + Nn = usize - // - // where Ni = ci1 + ... cim, with cij the count of each row j - // of class i. - // - // We want Ni' = usize / n, the new count of class i. - // - // where Ni' = ci1' + ... + cim', with cij' the count of each - // row of class i. - // - // cij' = ci * cij - // - // ci * ci1 + ... 
+ ci * cim = usize / n - // ci = (usize/n) / Ni - for (auto iorow : *this) - for (auto tvc : iorow.second) - tvc.second *= (usize / n) / class_count[tvc.first.value]; - } else { - logger().warn() << "CTable::balance() - " - << "cannot balance non discrete output type " - << otype; - } + type_node otype = get_output_type(); + if (otype == id::boolean_type or otype == id::enum_type) { + // Get total count for each class (called Ni in comment below) + Counter class_count; + for (auto iorow : *this) + class_count += iorow.second.untimedCounter(); + + count_t usize = uncompressed_size(), n = class_count.size(); + // N1 + ... + Nn = usize + // + // where Ni = ci1 + ... cim, with cij the count of each row j + // of class i. + // + // We want Ni' = usize / n, the new count of class i. + // + // where Ni' = ci1' + ... + cim', with cij' the count of each + // row of class i. + // + // cij' = ci * cij + // + // ci * ci1 + ... + ci * cim = usize / n + // ci = (usize/n) / Ni + for (auto iorow : *this) + for (auto tvc : iorow.second) + tvc.second *= (usize / n) / class_count[tvc.first.value]; + } else { + logger().warn() << "CTable::balance() - " + << "cannot balance non discrete output type " + << otype; + } } // ------------------------------------------------------- // XXX TODO replace this by the util p_norm function. 
complete_truth_table::size_type -complete_truth_table::hamming_distance(const complete_truth_table& other) const +complete_truth_table::hamming_distance(const complete_truth_table &other) const { - OC_ASSERT(other.size() == size(), - "complete_truth_tables size should be the same."); + OC_ASSERT(other.size() == size(), + "complete_truth_tables size should be the same."); - size_type res = 0; - for (const_iterator x = begin(), y = other.begin();x != end();) - res += (*x++ != *y++); - return res; + size_type res = 0; + for (const_iterator x = begin(), y = other.begin(); x != end();) + res += (*x++ != *y++); + return res; } -bool complete_truth_table::same_complete_truth_table(const combo_tree& tr) const +bool complete_truth_table::same_complete_truth_table(const combo_tree &tr) const { - const_iterator cit = begin(); - for (int i = 0; cit != end(); ++i, ++cit) { - for (int j = 0; j < _arity; ++j) - inputs[j] = bool_to_builtin((i >> j) % 2); - if (*cit != builtin_to_bool(boolean_interpreter(inputs)(tr))) - return false; - } - return true; + const_iterator cit = begin(); + for (int i = 0; cit != end(); ++i, ++cit) { + for (int j = 0; j < _arity; ++j) + inputs[j] = bool_to_builtin((i >> j) % 2); + if (*cit != builtin_to_bool(boolean_interpreter(inputs)(tr))) + return false; + } + return true; } ///////////////////// @@ -869,60 +876,60 @@ bool complete_truth_table::same_complete_truth_table(const combo_tree& tr) const ///////////////////// // Remove enough rows randomly so that the table has only nrows -void subsampleTable(unsigned nrows, ITable& it, OTable& ot, TTable& tt) -{ - OC_ASSERT(it.empty() || ot.empty() || it.size() == ot.size()); - OC_ASSERT(ot.empty() || tt.empty() || ot.size() == tt.size()); - OC_ASSERT(tt.empty() || it.empty() || tt.size() == it.size()); - unsigned size = std::max(it.size(), std::max(ot.size(), tt.size())); - if(nrows < size) { - unsigned nremove = size - nrows; - dorepeat(nremove) { - unsigned int ridx = randGen().randint(size); - if 
(!it.empty()) - it.erase(it.begin()+ridx); - if (!ot.empty()) - ot.erase(ot.begin()+ridx); - if (!tt.empty()) - tt.erase(tt.begin()+ridx); - } - } -} - -void subsampleTable(float ratio, Table& table) -{ - OC_ASSERT(0.0 <= ratio and ratio <= 1.0, - "Ratio must be in [0.0, 1.0], but is %f", ratio); - subsampleTable(ratio * table.size(), table.itable, table.otable, table.ttable); -} - -void subsampleCTable(float ratio, CTable& ctable) -{ - OC_ASSERT(0.0 <= ratio and ratio <= 1.0, - "Ratio must be in [0.0, 1.0], but is %f", ratio); - std::set rm_row_idxs; - unsigned ctable_usize = ctable.uncompressed_size(), - nremove = (1.0 - ratio) * ctable_usize; - lazy_random_selector rm_selector(ctable_usize); - dorepeat(nremove) - rm_row_idxs.insert(rm_selector()); - ctable.remove_rows(rm_row_idxs); +void subsampleTable(unsigned nrows, ITable &it, OTable &ot, TTable &tt) +{ + OC_ASSERT(it.empty() || ot.empty() || it.size() == ot.size()); + OC_ASSERT(ot.empty() || tt.empty() || ot.size() == tt.size()); + OC_ASSERT(tt.empty() || it.empty() || tt.size() == it.size()); + unsigned size = std::max(it.size(), std::max(ot.size(), tt.size())); + if (nrows < size) { + unsigned nremove = size - nrows; + dorepeat(nremove) { + unsigned int ridx = randGen().randint(size); + if (!it.empty()) + it.erase(it.begin() + ridx); + if (!ot.empty()) + ot.erase(ot.begin() + ridx); + if (!tt.empty()) + tt.erase(tt.begin() + ridx); + } + } +} + +void subsampleTable(float ratio, Table &table) +{ + OC_ASSERT(0.0 <= ratio and ratio <= 1.0, + "Ratio must be in [0.0, 1.0], but is %f", ratio); + subsampleTable(ratio * table.size(), table.itable, table.otable, table.ttable); +} + +void subsampleCTable(float ratio, CTable &ctable) +{ + OC_ASSERT(0.0 <= ratio and ratio <= 1.0, + "Ratio must be in [0.0, 1.0], but is %f", ratio); + std::set rm_row_idxs; + unsigned ctable_usize = ctable.uncompressed_size(), + nremove = (1.0 - ratio) * ctable_usize; + lazy_random_selector rm_selector(ctable_usize); + 
dorepeat(nremove)rm_row_idxs.insert(rm_selector()); + ctable.remove_rows(rm_row_idxs); } //////////////////////// // Mutual Information // //////////////////////// -double OTEntropy(const OTable& ot) +double OTEntropy(const OTable &ot) { - // Compute the probability distributions - Counter counter(ot); - vector py(counter.size()); - double total = ot.size(); - transform(counter | map_values, py.begin(), - [&](count_t c) { return c/total; }); - // Compute the entropy - return entropy(py); + // Compute the probability distributions + Counter counter(ot); + vector py(counter.size()); + double total = ot.size(); + transform(counter | map_values, py.begin(), + [&](count_t c) { return c / total; }); + // Compute the entropy + return entropy(py); } -}} // ~namespaces combo opencog +} +} // ~namespaces combo opencog diff --git a/moses/comboreduct/table/table.h b/moses/comboreduct/table/table.h index 1bb7ebc027..c45502026b 100644 --- a/moses/comboreduct/table/table.h +++ b/moses/comboreduct/table/table.h @@ -50,13 +50,16 @@ #include "../combo/common_def.h" #define COEF_SAMPLE_COUNT 20.0 // involved in the formula that counts - // the number of trials needed to check - // a formula +// the number of trials needed to check +// a formula #define TARGET_DISCRETIZED_BINS_NUM 5 // discretize contin type target - // into # bins +// into # bins -namespace opencog { namespace combo { +namespace opencog +{ +namespace combo +{ std::vector discretize_contin_feature(contin_t min, contin_t max); @@ -69,8 +72,8 @@ builtin get_discrete_bin(std::vector disc_intvs, contin_t val); * header. The labels can be sequenced in any order, it will always * return the order consistent with the header. 
*/ -std::vector get_indices(const std::vector& labels, - const std::vector& header); +std::vector get_indices(const std::vector &labels, + const std::vector &header); /////////////////// // Generic table // @@ -106,27 +109,38 @@ typedef std::vector string_seq; template struct push_back_visitor : public boost::static_visitor<> { - push_back_visitor(const T& value) : _value(value) {} - void operator()(std::vector& seq) const { - seq.push_back(_value); - } - void operator()(vertex_seq& seq) const { - seq.push_back(_value); - } - template void operator()(Seq& seq) const { - std::stringstream ss; - ss << "You can't push_back " << _value << " in container "; - ostream_container(ss, seq); - OC_ASSERT(false, ss.str()); - } - const T& _value; + push_back_visitor(const T &value) : _value(value) + {} + + void operator()(std::vector &seq) const + { + seq.push_back(_value); + } + + void operator()(vertex_seq &seq) const + { + seq.push_back(_value); + } + + template + void operator()(Seq &seq) const + { + std::stringstream ss; + ss << "You can't push_back " << _value << " in container "; + ostream_container(ss, seq); + OC_ASSERT(false, ss.str()); + } + + const T &_value; }; struct pop_back_visitor : public boost::static_visitor<> { - template void operator()(Seq& seq) const { - seq.pop_back(); - } + template + void operator()(Seq &seq) const + { + seq.pop_back(); + } }; /** @@ -134,97 +148,140 @@ struct pop_back_visitor : public boost::static_visitor<> */ struct init_at_visitor : public boost::static_visitor<> { - init_at_visitor(size_t pos) : _pos(pos) {} - template - void operator()(Seq& seq) const { - typedef typename Seq::value_type vt; - seq[_pos] = vt(); - } - size_t _pos; + init_at_visitor(size_t pos) : _pos(pos) + {} + + template + void operator()(Seq &seq) const + { + typedef typename Seq::value_type vt; + seq[_pos] = vt(); + } + + size_t _pos; }; template struct get_at_visitor : public boost::static_visitor { - get_at_visitor(size_t pos) : _pos(pos) {} - T 
operator()(const std::vector& seq) const { - return seq[_pos]; - } - T operator()(const vertex_seq& seq) const { - return boost::get(seq[_pos]); - } - T operator()(const combo_tree_seq& seq) const { - return boost::get(*seq[_pos].begin()); - } - template T operator()(const Seq& seq) const { - OC_ASSERT(false, "Impossible operation"); - return T(); - } - size_t _pos; + get_at_visitor(size_t pos) : _pos(pos) + {} + + T operator()(const std::vector &seq) const + { + return seq[_pos]; + } + + T operator()(const vertex_seq &seq) const + { + return boost::get(seq[_pos]); + } + + T operator()(const combo_tree_seq &seq) const + { + return boost::get(*seq[_pos].begin()); + } + + template + T operator()(const Seq &seq) const + { + OC_ASSERT(false, "Impossible operation"); + return T(); + } + + size_t _pos; }; template<> struct get_at_visitor : public boost::static_visitor { - get_at_visitor(size_t pos) : _pos(pos) {} - vertex operator()(const combo_tree_seq& seq) const { - return *seq[_pos].begin(); - } - template vertex operator()(const Seq& seq) const { - return seq[_pos]; - } - size_t _pos; + get_at_visitor(size_t pos) : _pos(pos) + {} + + vertex operator()(const combo_tree_seq &seq) const + { + return *seq[_pos].begin(); + } + + template + vertex operator()(const Seq &seq) const + { + return seq[_pos]; + } + + size_t _pos; }; template<> struct get_at_visitor : public boost::static_visitor { - get_at_visitor(size_t pos) : _pos(pos) {} - template combo_tree operator()(const Seq& seq) const { - return seq[_pos]; - } - size_t _pos; + get_at_visitor(size_t pos) : _pos(pos) + {} + + template + combo_tree operator()(const Seq &seq) const + { + return seq[_pos]; + } + + size_t _pos; }; struct erase_at_visitor : public boost::static_visitor<> { - erase_at_visitor(size_t pos) : _pos(pos) {} - template void operator()(Seq& seq) const { - seq.erase(seq.begin() + _pos); - } - size_t _pos; + erase_at_visitor(size_t pos) : _pos(pos) + {} + + template + void operator()(Seq &seq) const + 
{ + seq.erase(seq.begin() + _pos); + } + + size_t _pos; }; template struct insert_at_visitor : public boost::static_visitor<> { - // if pos is negative then it inserts at the end - insert_at_visitor(int pos, const T v) : _pos(pos), _v(v) {} - void operator()(std::vector& seq) const { - seq.insert(_pos >= 0 ? seq.begin() + _pos : seq.end(), _v); - } - template void operator()(Seq& seq) const { - std::stringstream ss; - ss << "You can't insert " << _v << " at " << _pos << " in container "; - ostream_container(ss, seq); - OC_ASSERT(false, ss.str()); - } - int _pos; - const T& _v; + // if pos is negative then it inserts at the end + insert_at_visitor(int pos, const T v) : _pos(pos), _v(v) + {} + + void operator()(std::vector &seq) const + { + seq.insert(_pos >= 0 ? seq.begin() + _pos : seq.end(), _v); + } + + template + void operator()(Seq &seq) const + { + std::stringstream ss; + ss << "You can't insert " << _v << " at " << _pos << " in container "; + ostream_container(ss, seq); + OC_ASSERT(false, ss.str()); + } + + int _pos; + const T &_v; }; struct size_visitor : public boost::static_visitor { - template size_t operator()(const Seq& seq) { - return seq.size(); - } + template + size_t operator()(const Seq &seq) + { + return seq.size(); + } }; struct empty_visitor : public boost::static_visitor { - template bool operator()(const Seq& seq) { - return seq.empty(); - } + template + bool operator()(const Seq &seq) + { + return seq.empty(); + } }; /** @@ -236,64 +293,95 @@ struct equal_visitor : public boost::static_visitor bool operator()(const seql_t& l, const seqr_t& r) const { \ return false; \ } - __FALSE_EQ__(builtin_seq, contin_seq); - __FALSE_EQ__(builtin_seq, string_seq); - __FALSE_EQ__(builtin_seq, combo_tree_seq); - __FALSE_EQ__(contin_seq, builtin_seq); - __FALSE_EQ__(contin_seq, string_seq); - __FALSE_EQ__(contin_seq, combo_tree_seq); - __FALSE_EQ__(string_seq, builtin_seq); - __FALSE_EQ__(string_seq, contin_seq); - __FALSE_EQ__(string_seq, combo_tree_seq); - 
__FALSE_EQ__(combo_tree_seq, builtin_seq); - __FALSE_EQ__(combo_tree_seq, contin_seq); - __FALSE_EQ__(combo_tree_seq, string_seq); - __FALSE_EQ__(combo_tree_seq, vertex_seq); - __FALSE_EQ__(vertex_seq, combo_tree_seq); + + __FALSE_EQ__(builtin_seq, contin_seq); + + __FALSE_EQ__(builtin_seq, string_seq); + + __FALSE_EQ__(builtin_seq, combo_tree_seq); + + __FALSE_EQ__(contin_seq, builtin_seq); + + __FALSE_EQ__(contin_seq, string_seq); + + __FALSE_EQ__(contin_seq, combo_tree_seq); + + __FALSE_EQ__(string_seq, builtin_seq); + + __FALSE_EQ__(string_seq, contin_seq); + + __FALSE_EQ__(string_seq, combo_tree_seq); + + __FALSE_EQ__(combo_tree_seq, builtin_seq); + + __FALSE_EQ__(combo_tree_seq, contin_seq); + + __FALSE_EQ__(combo_tree_seq, string_seq); + + __FALSE_EQ__(combo_tree_seq, vertex_seq); + + __FALSE_EQ__(vertex_seq, combo_tree_seq); #undef __FALSE_EQ__ - template - bool operator()(const SeqL& l, const SeqR& r) const { - return boost::equal(l, r); - } + + template + bool operator()(const SeqL &l, const SeqR &r) const + { + return boost::equal(l, r); + } }; // function specifically for output table -std::string table_fmt_vertex_to_str(const vertex& v); -std::string table_fmt_builtin_to_str(const builtin& b); +std::string table_fmt_vertex_to_str(const vertex &v); + +std::string table_fmt_builtin_to_str(const builtin &b); + struct to_strings_visitor : public boost::static_visitor { - string_seq operator()(const string_seq& seq) { - return seq; - } - string_seq operator()(const vertex_seq& seq) { - string_seq res; - boost::transform(seq, back_inserter(res), table_fmt_vertex_to_str); - return res; - } - string_seq operator()(const builtin_seq& seq) { - string_seq res; - boost::transform(seq, back_inserter(res), table_fmt_builtin_to_str); - return res; - } - template string_seq operator()(const Seq& seq) { - string_seq res; - boost::transform(seq, back_inserter(res), - [](const typename Seq::value_type& v) { - std::stringstream ss; - ss << v; - return ss.str(); - }); - 
return res; - } + string_seq operator()(const string_seq &seq) + { + return seq; + } + + string_seq operator()(const vertex_seq &seq) + { + string_seq res; + boost::transform(seq, back_inserter(res), table_fmt_vertex_to_str); + return res; + } + + string_seq operator()(const builtin_seq &seq) + { + string_seq res; + boost::transform(seq, back_inserter(res), table_fmt_builtin_to_str); + return res; + } + + template + string_seq operator()(const Seq &seq) + { + string_seq res; + boost::transform(seq, back_inserter(res), + [](const typename Seq::value_type &v) { + std::stringstream ss; + ss << v; + return ss.str(); + }); + return res; + } }; struct get_type_tree_at_visitor : public boost::static_visitor { - get_type_tree_at_visitor(size_t pos) : _pos(pos) {} - template type_tree operator()(const Seq& seq) { - return get_type_tree(seq[_pos]); - } - size_t _pos; + get_type_tree_at_visitor(size_t pos) : _pos(pos) + {} + + template + type_tree operator()(const Seq &seq) + { + return get_type_tree(seq[_pos]); + } + + size_t _pos; }; /** @@ -303,51 +391,61 @@ struct get_type_tree_at_visitor : public boost::static_visitor */ struct interpreter_visitor : public boost::static_visitor { - interpreter_visitor(const combo_tree& tr) : _it(tr.begin()) - { - // If any of the vertexes in the tree are contin-type, - // then the mixed interpreter will have to be used. - mixed = false; - combo_tree::iterator mit = tr.begin(); - combo_tree::iterator mend = tr.end(); - for (; mit != mend; ++mit ) { - mixed = is_contin_expr(*mit); - if (mixed) break; - mixed = (id::greater_than_zero == *mit); - if (mixed) break; - } - } + interpreter_visitor(const combo_tree &tr) : _it(tr.begin()) + { + // If any of the vertexes in the tree are contin-type, + // then the mixed interpreter will have to be used. 
+ mixed = false; + combo_tree::iterator mit = tr.begin(); + combo_tree::iterator mend = tr.end(); + for (; mit != mend; ++mit) { + mixed = is_contin_expr(*mit); + if (mixed) break; + mixed = (id::greater_than_zero == *mit); + if (mixed) break; + } + } - interpreter_visitor(const combo_tree::iterator& it) : _it(it) - { - // Cannot check the entire tree with this ctor. - mixed = is_contin_expr(*_it); - if (not mixed) mixed = (id::greater_than_zero == *_it); - } + interpreter_visitor(const combo_tree::iterator &it) : _it(it) + { + // Cannot check the entire tree with this ctor. + mixed = is_contin_expr(*_it); + if (not mixed) mixed = (id::greater_than_zero == *_it); + } - vertex operator()(const std::vector& inputs) { - if (mixed) return mixed_interpreter(inputs)(_it); - return boolean_interpreter(inputs)(_it); - } - vertex operator()(const std::vector& inputs) { - // Can't use contin, since the output might be non-contin, - // e.g. a boolean, or an enum. - // return contin_interpreter(inputs)(_it); - return mixed_interpreter(inputs)(_it); - } - vertex operator()(const std::vector& inputs) { - return mixed_interpreter(inputs)(_it); - } - vertex operator()(const string_seq& inputs) { - OC_ASSERT(false, "Not implemented"); - return vertex(); - } - vertex operator()(const std::vector& inputs) { - OC_ASSERT(false, "Not implemented"); - return vertex(); - } - combo_tree::iterator _it; - bool mixed; + vertex operator()(const std::vector &inputs) + { + if (mixed) return mixed_interpreter(inputs)(_it); + return boolean_interpreter(inputs)(_it); + } + + vertex operator()(const std::vector &inputs) + { + // Can't use contin, since the output might be non-contin, + // e.g. a boolean, or an enum. 
+ // return contin_interpreter(inputs)(_it); + return mixed_interpreter(inputs)(_it); + } + + vertex operator()(const std::vector &inputs) + { + return mixed_interpreter(inputs)(_it); + } + + vertex operator()(const string_seq &inputs) + { + OC_ASSERT(false, "Not implemented"); + return vertex(); + } + + vertex operator()(const std::vector &inputs) + { + OC_ASSERT(false, "Not implemented"); + return vertex(); + } + + combo_tree::iterator _it; + bool mixed; }; /** @@ -361,88 +459,132 @@ struct interpreter_visitor : public boost::static_visitor struct multi_type_seq : public boost::less_than_comparable, public boost::equality_comparable { - typedef boost::variant multi_type_variant; - multi_type_seq() { - // logger().debug("sizeof(builtin) = %u", sizeof(builtin)); - // logger().debug("sizeof(vertex) = %u", sizeof(vertex)); - } - template multi_type_seq(const std::initializer_list& il) - : _variant(std::vector(il)) {} - template multi_type_seq(const T& v) : _variant(v) {} - template void push_back(const T& e) { - boost::apply_visitor(push_back_visitor(e), _variant); - } - void pop_back() { - pop_back_visitor popbv; - boost::apply_visitor(popbv, _variant); - } - bool operator<(const multi_type_seq& r) const { - return get_variant() < r.get_variant(); - } - bool operator==(const multi_type_seq& r) const { - equal_visitor ev; - return boost::apply_visitor(ev, get_variant(), r.get_variant()); - // return get_variant() == r.get_variant(); - } - size_t size() const { - size_visitor sv; - return boost::apply_visitor(sv, _variant); - } - bool empty() const { - empty_visitor ev; - return boost::apply_visitor(ev, _variant); - } - void erase_at(size_t pos) { - boost::apply_visitor(erase_at_visitor(pos), _variant); - } - void init_at(size_t pos) { - boost::apply_visitor(init_at_visitor(pos), _variant); - } - template T get_at(size_t pos) const { - return boost::apply_visitor(get_at_visitor(pos), _variant); - } - template void insert_at(int pos, const T& v) { - 
boost::apply_visitor(insert_at_visitor(pos, v), _variant); - } - std::vector to_strings() const { - to_strings_visitor tsv; - return boost::apply_visitor(tsv, _variant); - } + typedef boost::variant multi_type_variant; + + multi_type_seq() + { + // logger().debug("sizeof(builtin) = %u", sizeof(builtin)); + // logger().debug("sizeof(vertex) = %u", sizeof(vertex)); + } - multi_type_variant& get_variant() { return _variant; } - const multi_type_variant& get_variant() const { return _variant; } + template + multi_type_seq(const std::initializer_list &il) + : _variant(std::vector(il)) + {} - // variant helpers - template - std::vector& get_seq() { - return boost::get>(_variant); - } - template - const std::vector& get_seq() const { - return boost::get>(_variant); - } + template + multi_type_seq(const T &v) : _variant(v) + {} + + template + void push_back(const T &e) + { + boost::apply_visitor(push_back_visitor(e), _variant); + } + + void pop_back() + { + pop_back_visitor popbv; + boost::apply_visitor(popbv, _variant); + } + + bool operator<(const multi_type_seq &r) const + { + return get_variant() < r.get_variant(); + } + + bool operator==(const multi_type_seq &r) const + { + equal_visitor ev; + return boost::apply_visitor(ev, get_variant(), r.get_variant()); + // return get_variant() == r.get_variant(); + } + + size_t size() const + { + size_visitor sv; + return boost::apply_visitor(sv, _variant); + } + + bool empty() const + { + empty_visitor ev; + return boost::apply_visitor(ev, _variant); + } + + void erase_at(size_t pos) + { + boost::apply_visitor(erase_at_visitor(pos), _variant); + } + + void init_at(size_t pos) + { + boost::apply_visitor(init_at_visitor(pos), _variant); + } + + template + T get_at(size_t pos) const + { + return boost::apply_visitor(get_at_visitor(pos), _variant); + } + + template + void insert_at(int pos, const T &v) + { + boost::apply_visitor(insert_at_visitor(pos, v), _variant); + } + + std::vector to_strings() const + { + to_strings_visitor 
tsv; + return boost::apply_visitor(tsv, _variant); + } + + multi_type_variant &get_variant() + { return _variant; } + + const multi_type_variant &get_variant() const + { return _variant; } + + // variant helpers + template + std::vector &get_seq() + { + return boost::get>(_variant); + } + + template + const std::vector &get_seq() const + { + return boost::get>(_variant); + } protected: - // I set it as mutable because the FUCKING boost::variant - // apply_visitor doesn't allow to deal with const variants. For - // the same reason I cannot define multi_type_seq as an inherited - // class from multi_type_variant (boost::variant kinda sucks!). - mutable multi_type_variant _variant; + // I set it as mutable because the FUCKING boost::variant + // apply_visitor doesn't allow to deal with const variants. For + // the same reason I cannot define multi_type_seq as an inherited + // class from multi_type_variant (boost::variant kinda sucks!). + mutable multi_type_variant _variant; }; // Filter a multi_type_seq template struct seq_filtered_visitor : public boost::static_visitor { - seq_filtered_visitor(const F& filter) : _filter(filter) {} - template multi_type_seq operator()(const Seq& seq) { - return seq_filtered(seq, _filter); - } - const F& _filter; + seq_filtered_visitor(const F &filter) : _filter(filter) + {} + + template + multi_type_seq operator()(const Seq &seq) + { + return seq_filtered(seq, _filter); + } + + const F &_filter; }; static const std::string default_timestamp_label("timestamp"); @@ -453,39 +595,47 @@ static const std::string default_timestamp_label("timestamp"); */ struct TTable : public std::vector { - typedef std::vector super; + typedef std::vector super; public: - typedef boost::gregorian::date value_type; + typedef boost::gregorian::date value_type; + + TTable(const std::string &tl = default_timestamp_label); + + TTable(const super &tt, const std::string &tl = default_timestamp_label); - TTable(const std::string& tl = default_timestamp_label); - 
TTable(const super& tt, const std::string& tl = default_timestamp_label); - void set_label(const std::string&); - const std::string& get_label() const; + void set_label(const std::string &); - static TTable::value_type from_string(const std::string& timestamp_str); - static std::string to_string(const TTable::value_type& timestamp); + const std::string &get_label() const; + + static TTable::value_type from_string(const std::string ×tamp_str); + + static std::string to_string(const TTable::value_type ×tamp); protected: - std::string label; + std::string label; }; struct TimedValue : - public boost::less_than_comparable, - public boost::equality_comparable + public boost::less_than_comparable, + public boost::equality_comparable { - TimedValue(const vertex v, - const TTable::value_type t = TTable::value_type()) - : value(v), timestamp(t) {} - vertex value; - TTable::value_type timestamp; - - bool operator<(const TimedValue &r) const { - return (value < r.value) || (timestamp < r.timestamp); - } + TimedValue(const vertex v, + const TTable::value_type t = TTable::value_type()) + : value(v), timestamp(t) + {} - bool operator==(const TimedValue& r) const { - return (value == r.value) && (timestamp == r.timestamp); - } + vertex value; + TTable::value_type timestamp; + + bool operator<(const TimedValue &r) const + { + return (value < r.value) || (timestamp < r.timestamp); + } + + bool operator==(const TimedValue &r) const + { + return (value == r.value) && (timestamp == r.timestamp); + } }; @@ -493,18 +643,19 @@ struct TimedValue : // work with weighted features. typedef double count_t; -struct TimedCounter : public Counter { - // Overload get(const TimedValue&) to work with a vertex v, in - // that case it returns the sum of that counter across all - // timestamps with vertex equal to v. 
- count_t get(const vertex& v) const; +struct TimedCounter : public Counter +{ + // Overload get(const TimedValue&) to work with a vertex v, in + // that case it returns the sum of that counter across all + // timestamps with vertex equal to v. + count_t get(const vertex &v) const; - // Return a counter without timestamps - Counter untimedCounter() const; + // Return a counter without timestamps + Counter untimedCounter() const; - // Overload mode() so that it returns the most frequent - // vertex over all timestamps. - vertex mode() const; + // Overload mode() so that it returns the most frequent + // vertex over all timestamps. + vertex mode() const; }; /** @@ -548,184 +699,190 @@ typedef std::map> CTableTime; /// combo program on duplicated inputs. // class CTable : public std::map - // , public boost::equality_comparable + // , public boost::equality_comparable { public: - typedef multi_type_seq key_type; - typedef TimedCounter mapped_type; - typedef TimedCounter counter_t; - typedef std::map super; - typedef typename super::value_type value_type; - typedef std::vector string_seq; - - // Definition is delayed until after Table, as it uses Table. - template - CTable(const Func& func, arity_t arity, int nsamples = -1); - - CTable(const std::string& _olabel = "output"); - - CTable(const string_seq& labs, const type_tree& tt); - - CTable(const std::string& _olabel, const string_seq& _ilabels, - const type_tree& tt); - - arity_t get_arity() const { return ilabels.size(); } - - /// Return the total number of observations. - /// This will equal to the size of the corresponding uncompressed - /// when all row weights are equal to 1.0; otherwise, this is the - /// sum of all the row weights. - count_t uncompressed_size() const; - - /// Create a new table from this one, which contains only those - /// columns specified by the filter. 
The filter is assumed to be - /// either a set or a vector (or an iterable, in general) that - /// holds the index numbers of the columns to be kept. - /// - /// Note that the filtered CTable can typically be further - /// compressed, and so the compressed size will be smaller. This - /// can be exploted to obtain performance gains. - template - CTable filtered(const F& filter) const - { - typedef type_tree::iterator pre_it; - typedef type_tree::sibling_iterator sib_it; - - // Filter the type signature tree - // copy head - type_tree fsig; - pre_it head_src = tsig.begin(); - OC_ASSERT(*head_src == id::lambda_type); - OC_ASSERT((int)tsig.number_of_children(head_src) == get_arity() + 1); - pre_it head_dst = fsig.set_head(*head_src); - // copy filtered input types - sib_it sib_src = head_src.begin(); - arity_t a_pre = 0; - for (arity_t a : filter) { - std::advance(sib_src, a - a_pre); - a_pre = a; - fsig.replace(fsig.append_child(head_dst), sib_src); - } - - // copy output type - fsig.replace(fsig.append_child(head_dst), head_src.last_child()); - - // Filter the labels - CTable res(olabel, seq_filtered(ilabels, filter), fsig); - - // Filter the content - seq_filtered_visitor sfv(filter); - auto asfv = boost::apply_visitor(sfv); - for (const CTable::value_type v : *this) - res[asfv(v.first.get_variant())] += v.second; - - // return the filtered CTable - return res; - } + typedef multi_type_seq key_type; + typedef TimedCounter mapped_type; + typedef TimedCounter counter_t; + typedef std::map super; + typedef typename super::value_type value_type; + typedef std::vector string_seq; + + // Definition is delayed until after Table, as it uses Table. 
+ template + CTable(const Func &func, arity_t arity, int nsamples = -1); + + CTable(const std::string &_olabel = "output"); + + CTable(const string_seq &labs, const type_tree &tt); + + CTable(const std::string &_olabel, const string_seq &_ilabels, + const type_tree &tt); + + arity_t get_arity() const + { return ilabels.size(); } + + /// Return the total number of observations. + /// This will equal to the size of the corresponding uncompressed + /// when all row weights are equal to 1.0; otherwise, this is the + /// sum of all the row weights. + count_t uncompressed_size() const; + + /// Create a new table from this one, which contains only those + /// columns specified by the filter. The filter is assumed to be + /// either a set or a vector (or an iterable, in general) that + /// holds the index numbers of the columns to be kept. + /// + /// Note that the filtered CTable can typically be further + /// compressed, and so the compressed size will be smaller. This + /// can be exploted to obtain performance gains. 
+ template + CTable filtered(const F &filter) const + { + typedef type_tree::iterator pre_it; + typedef type_tree::sibling_iterator sib_it; + + // Filter the type signature tree + // copy head + type_tree fsig; + pre_it head_src = tsig.begin(); + OC_ASSERT(*head_src == id::lambda_type); + OC_ASSERT((int) tsig.number_of_children(head_src) == get_arity() + 1); + pre_it head_dst = fsig.set_head(*head_src); + // copy filtered input types + sib_it sib_src = head_src.begin(); + arity_t a_pre = 0; + for (arity_t a : filter) { + std::advance(sib_src, a - a_pre); + a_pre = a; + fsig.replace(fsig.append_child(head_dst), sib_src); + } + + // copy output type + fsig.replace(fsig.append_child(head_dst), head_src.last_child()); + + // Filter the labels + CTable res(olabel, seq_filtered(ilabels, filter), fsig); + + // Filter the content + seq_filtered_visitor sfv(filter); + auto asfv = boost::apply_visitor(sfv); + for (const CTable::value_type v : *this) + res[asfv(v.first.get_variant())] += v.second; + + // return the filtered CTable + return res; + } - template - multi_type_seq filtered_preserve_idxs(const F& filter, - const multi_type_seq& seq) const - { - multi_type_seq res; - auto it = filter.cbegin(); - for (unsigned i = 0; i < seq.size(); ++i) { - if (it != filter.cend() && (typename F::value_type)i == *it) { - // XXX TODO WARNING ERROR: builtin hardcoded shit!!! - res.push_back(seq.get_at(i)); - ++it; - } else { - // XXX TODO WARNING ERROR: builtin hardcoded shit!!! - res.push_back(id::null_vertex); - } - } - return res; - } + template + multi_type_seq filtered_preserve_idxs(const F &filter, + const multi_type_seq &seq) const + { + multi_type_seq res; + auto it = filter.cbegin(); + for (unsigned i = 0; i < seq.size(); ++i) { + if (it != filter.cend() && (typename F::value_type) i == *it) { + // XXX TODO WARNING ERROR: builtin hardcoded shit!!! + res.push_back(seq.get_at(i)); + ++it; + } else { + // XXX TODO WARNING ERROR: builtin hardcoded shit!!! 
+ res.push_back(id::null_vertex); + } + } + return res; + } - /** - * Create a new table from this one, with all column values not in - * the filtered set being replaced by id::null_vertex. This is - * similar to the filtered() method above, except that the total - * number of columns remains unchanged. The table signature is also - * left unchanged. - * - * The filter should be an iterable (set or vector) containing - * column index numbers. The specified columns are kept; all others - * are blanked. - * - * Note that the filtered CTable can typically be further - * compressed, and so the compressed size will be smaller. This - * can be exploted to obtain performance gains. - */ - template - CTable filtered_preserve_idxs(const F& filter) const - { - // Set new CTable - CTable res(olabel, ilabels, tsig); - - // Filter the rows (replace filtered out values by id::null_vertex) - for (const CTable::value_type v : *this) - res[filtered_preserve_idxs(filter, v.first)] += v.second; - - // return the filtered CTable - return res; - } + /** + * Create a new table from this one, with all column values not in + * the filtered set being replaced by id::null_vertex. This is + * similar to the filtered() method above, except that the total + * number of columns remains unchanged. The table signature is also + * left unchanged. + * + * The filter should be an iterable (set or vector) containing + * column index numbers. The specified columns are kept; all others + * are blanked. + * + * Note that the filtered CTable can typically be further + * compressed, and so the compressed size will be smaller. This + * can be exploted to obtain performance gains. + */ + template + CTable filtered_preserve_idxs(const F &filter) const + { + // Set new CTable + CTable res(olabel, ilabels, tsig); - /** - * Remove the rows of the ctable. It treats rows as if they were - * uncompressed. 
The indexes follow the order set by the input - * rows, and then the output values, so for instance if a - * compressed row has N 0s and M 1s output values, it will treat - * that as N+M rows, where the rows ending by 0s precedes the ones - * ending by 1s (since 0 < 1). - */ - void remove_rows(const std::set& idxs); - - /** - * Similar to above but remove rows matching a set of dates. - */ - void remove_rows_at_times(const std::set& timestamps); - - /** - * Remove rows timestamped timestamp. - */ - void remove_rows_at_time(const TTable::value_type& timestamp); - - /** - * Get the set of timestamps in the data (if any) - */ - std::set get_timestamps() const; - - // return the output label + list of input labels - void set_labels(const string_seq& labels); - string_seq get_labels() const; - const std::string& get_output_label() const; - const string_seq& get_input_labels() const; - void set_signature(const type_tree& tt); - const type_tree& get_signature() const; - type_node get_output_type() const; - - CTableTime ordered_by_time() const; - - // Balance the ctable, so that, in case the output type is - // discrete, all class counts are equal, but the uncompressed size - // is till the same. - void balance(); - - // hmmm, it doesn't compile, I give up - // bool operator==(const CTable& r) const { - // return super::operator==(static_cast(r)) - // && get_labels() == r.get_labels() - // && get_signature() == r.get_signature(); - // } + // Filter the rows (replace filtered out values by id::null_vertex) + for (const CTable::value_type v : *this) + res[filtered_preserve_idxs(filter, v.first)] += v.second; + + // return the filtered CTable + return res; + } + + /** + * Remove the rows of the ctable. It treats rows as if they were + * uncompressed. 
The indexes follow the order set by the input + * rows, and then the output values, so for instance if a + * compressed row has N 0s and M 1s output values, it will treat + * that as N+M rows, where the rows ending by 0s precedes the ones + * ending by 1s (since 0 < 1). + */ + void remove_rows(const std::set &idxs); + + /** + * Similar to above but remove rows matching a set of dates. + */ + void remove_rows_at_times(const std::set ×tamps); + + /** + * Remove rows timestamped timestamp. + */ + void remove_rows_at_time(const TTable::value_type ×tamp); + + /** + * Get the set of timestamps in the data (if any) + */ + std::set get_timestamps() const; + + // return the output label + list of input labels + void set_labels(const string_seq &labels); + + string_seq get_labels() const; + + const std::string &get_output_label() const; + + const string_seq &get_input_labels() const; + + void set_signature(const type_tree &tt); + + const type_tree &get_signature() const; + + type_node get_output_type() const; + + CTableTime ordered_by_time() const; + + // Balance the ctable, so that, in case the output type is + // discrete, all class counts are equal, but the uncompressed size + // is till the same. + void balance(); + + // hmmm, it doesn't compile, I give up + // bool operator==(const CTable& r) const { + // return super::operator==(static_cast(r)) + // && get_labels() == r.get_labels() + // && get_signature() == r.get_signature(); + // } protected: - type_tree tsig; // table signature - std::string olabel; // output label - string_seq ilabels; // list of input labels + type_tree tsig; // table signature + std::string olabel; // output label + string_seq ilabels; // list of input labels }; - /** * Input table of vertexes. * Rows represent data samples. @@ -735,123 +892,134 @@ class CTable : public std::map * Each entry in the vector is a row. 
*/ class OTable; + class ITable : public std::vector { public: - typedef std::vector super; - typedef super::value_type value_type; - typedef std::vector string_seq; - typedef std::vector type_seq; - ITable(); - ITable(const type_seq& ts, const string_seq& il=string_seq()); - ITable(const super& mat, const string_seq& il=string_seq()); - ITable(const OTable&); - /** - * generate an input table according to the signature tt. - * - * @param tt signature of the table to generate. - * @param nsamples sample size, if negative then the sample - size is automatically determined. - * @param min_contin minimum contin value. - * @param max_contin maximum contin value. - * - * It only works for contin-boolean signatures - */ - // min_contin and max_contin are used in case tt has contin inputs - ITable(const type_tree& tt, int nsamples=-1, - contin_t min_contin=-1.0, contin_t max_contin=1.0); - - arity_t get_arity() const { - return super::front().size(); - } + typedef std::vector super; + typedef super::value_type value_type; + typedef std::vector string_seq; + typedef std::vector type_seq; + + ITable(); + + ITable(const type_seq &ts, const string_seq &il = string_seq()); + + ITable(const super &mat, const string_seq &il = string_seq()); + + ITable(const OTable &); + /** + * generate an input table according to the signature tt. + * + * @param tt signature of the table to generate. + * @param nsamples sample size, if negative then the sample + size is automatically determined. + * @param min_contin minimum contin value. + * @param max_contin maximum contin value. 
+ * + * It only works for contin-boolean signatures + */ + // min_contin and max_contin are used in case tt has contin inputs + ITable(const type_tree &tt, int nsamples = -1, + contin_t min_contin = -1.0, contin_t max_contin = 1.0); + + arity_t get_arity() const + { + return super::front().size(); + } - bool operator==(const ITable& rhs) const; - - // set input labels - void set_labels(const string_seq&); - const string_seq& get_labels() const; - - void set_types(const type_seq&); - const type_seq& get_types() const; - type_node get_type(const std::string&) const; - - /** - * Insert a column 'col', named 'clab', after position 'off'. - * If off is negative, then insert after the last column. - * - * TODO: we really should use iterators here, not column numbers. - * - * TODO: should be generalized for multi_type_seq rather than - * vertex_seq - * - * WARNING: this function is automatically converting the ITable's - * rows into vertex_seq (this is also a hack till it handles - * multi_type_seq). - */ - void insert_col(const std::string& clab, - const vertex_seq& col, - int off=-1); - - /** - * Delete the named feature from the input table. - * If the feature is the empty string, then column zero is deleted. - * The returned value is the name of the column. - */ - std::string delete_column(const std::string& feature); - void delete_columns(const string_seq& ignore_features); - - /** - * Get the column, given its offset or label - */ - vertex_seq get_column_data(const std::string& name) const; - vertex_seq get_column_data(int offset) const; - - /// Return a copy of the input table filtered according to a given - /// container of arity_t. Each value of that container corresponds - /// to the column index of the ITable (starting from 0). 
- template - ITable filtered(const F& filter) const - { - ITable res; - - // filter labels - res.set_labels(seq_filtered(get_labels(), filter)); - - // filter types - res.set_types(seq_filtered(get_types(), filter)); - - // filter content - seq_filtered_visitor sfv(filter); - auto asf = boost::apply_visitor(sfv); - for (const value_type& row : *this) - res.push_back(asf(row.get_variant())); - - return res; - } + bool operator==(const ITable &rhs) const; + + // set input labels + void set_labels(const string_seq &); + + const string_seq &get_labels() const; + + void set_types(const type_seq &); + + const type_seq &get_types() const; + + type_node get_type(const std::string &) const; + + /** + * Insert a column 'col', named 'clab', after position 'off'. + * If off is negative, then insert after the last column. + * + * TODO: we really should use iterators here, not column numbers. + * + * TODO: should be generalized for multi_type_seq rather than + * vertex_seq + * + * WARNING: this function is automatically converting the ITable's + * rows into vertex_seq (this is also a hack till it handles + * multi_type_seq). + */ + void insert_col(const std::string &clab, + const vertex_seq &col, + int off = -1); + + /** + * Delete the named feature from the input table. + * If the feature is the empty string, then column zero is deleted. + * The returned value is the name of the column. + */ + std::string delete_column(const std::string &feature); + + void delete_columns(const string_seq &ignore_features); + + /** + * Get the column, given its offset or label + */ + vertex_seq get_column_data(const std::string &name) const; + + vertex_seq get_column_data(int offset) const; + + /// Return a copy of the input table filtered according to a given + /// container of arity_t. Each value of that container corresponds + /// to the column index of the ITable (starting from 0). 
+ template + ITable filtered(const F &filter) const + { + ITable res; + + // filter labels + res.set_labels(seq_filtered(get_labels(), filter)); + + // filter types + res.set_types(seq_filtered(get_types(), filter)); + + // filter content + seq_filtered_visitor sfv(filter); + auto asf = boost::apply_visitor(sfv); + for (const value_type &row : *this) + res.push_back(asf(row.get_variant())); + + return res; + } - int get_column_offset(const std::string& col_name) const; + int get_column_offset(const std::string &col_name) const; protected: - mutable type_seq types; // list of types of the columns - mutable string_seq labels; // list of input labels + mutable type_seq types; // list of types of the columns + mutable string_seq labels; // list of input labels private: - string_seq get_default_labels() const; - - /** - * this function take an arity in input and returns in output the - * number of samples that would be appropriate to check the semantics - * of its associated tree. - * - * Note : could take the two trees to checking and according to their - * arity structure, whatever, find an appropriate number. - */ - unsigned sample_count(arity_t contin_arity) - { - if (contin_arity == 0) - return 1; - else return COEF_SAMPLE_COUNT*log(contin_arity + EXPONENTIAL); - } + string_seq get_default_labels() const; + + /** + * this function take an arity in input and returns in output the + * number of samples that would be appropriate to check the semantics + * of its associated tree. + * + * Note : could take the two trees to checking and according to their + * arity structure, whatever, find an appropriate number. 
+ */ + unsigned sample_count(arity_t contin_arity) + { + if (contin_arity == 0) + return 1; + else return COEF_SAMPLE_COUNT * log(contin_arity + EXPONENTIAL); + } }; @@ -865,48 +1033,57 @@ static const std::string default_output_label("output"); */ class OTable : public vertex_seq { - typedef vertex_seq super; + typedef vertex_seq super; public: - typedef vertex value_type; - - OTable(const std::string& ol=default_output_label); - OTable(const super& ot, const std::string& ol=default_output_label); - - /// Construct the OTable by evaluating the combo tree @tr for each - /// row in the input ITable. - OTable(const combo_tree& tr, const ITable& itable, - const std::string& ol=default_output_label); - - /// Construct the OTable by evaluating the combo tree @tr for each - /// row in the input CTable. - OTable(const combo_tree& tr, const CTable& ctable, - const std::string& ol=default_output_label); - - template - OTable(const Func& f, const ITable& it, - const std::string& ol=default_output_label) - : label(ol) - { - for (const multi_type_seq& vs : it) - push_back(f(vs.get_seq().begin(), - vs.get_seq().end())); - } + typedef vertex value_type; + + OTable(const std::string &ol = default_output_label); + + OTable(const super &ot, const std::string &ol = default_output_label); + + /// Construct the OTable by evaluating the combo tree @tr for each + /// row in the input ITable. + OTable(const combo_tree &tr, const ITable &itable, + const std::string &ol = default_output_label); + + /// Construct the OTable by evaluating the combo tree @tr for each + /// row in the input CTable. 
+ OTable(const combo_tree &tr, const CTable &ctable, + const std::string &ol = default_output_label); + + template + OTable(const Func &f, const ITable &it, + const std::string &ol = default_output_label) + : label(ol) + { + for (const multi_type_seq &vs : it) + push_back(f(vs.get_seq().begin(), + vs.get_seq().end())); + } + + void set_label(const std::string &); + + const std::string &get_label() const; + + void set_type(type_node); + + type_node get_type() const; - void set_label(const std::string&); - const std::string& get_label() const; - void set_type(type_node); - type_node get_type() const; - bool operator==(const OTable& rhs) const; - contin_t abs_distance(const OTable&) const; - contin_t sum_squared_error(const OTable&) const; - contin_t mean_squared_error(const OTable&) const; - contin_t root_mean_square_error(const OTable&) const; + bool operator==(const OTable &rhs) const; - vertex get_enum_vertex(const std::string& token); + contin_t abs_distance(const OTable &) const; + + contin_t sum_squared_error(const OTable &) const; + + contin_t mean_squared_error(const OTable &) const; + + contin_t root_mean_square_error(const OTable &) const; + + vertex get_enum_vertex(const std::string &token); protected: - std::string label; // output label - type_node type; // the type of the column data. + std::string label; // output label + type_node type; // the type of the column data. 
}; /** @@ -917,112 +1094,120 @@ class OTable : public vertex_seq */ struct Table : public boost::equality_comparable { - typedef std::vector string_seq; - typedef vertex value_type; - - Table(); - - Table(const OTable& otable_, const ITable& itable_); - - template - Table(const Func& func, arity_t a, int nsamples=-1) : - itable(gen_signature(type_node_of(), - type_node_of(), a)), - otable(func, itable), target_pos(0), timestamp_pos(0) {} - - Table(const combo_tree& tr, int nsamples=-1, - contin_t min_contin=-1.0, contin_t max_contin=1.0); - - size_t size() const { return itable.size(); } - - arity_t get_arity() const { return itable.get_arity(); } - - // Return the types of the columns in the table. - // The type is returned as a lambda(input col types) -> output col type. - // This is computed on the fly each time, instead ov being - // stored with the object, so that RAM isn't wasted holding this - // infrequently-needed info. - type_tree get_signature() const - { - type_tree tt(id::lambda_type); - auto root = tt.begin(); - for (type_node tn : itable.get_types()) - tt.append_child(root, tn); - tt.append_child(root, otable.get_type()); - return tt; - } + typedef std::vector string_seq; + typedef vertex value_type; - // return a string with the io labels, the output label comes first - string_seq get_labels() const; - - const std::string& get_target() const { return otable.get_label(); } - - // Useful for filtered (see below), return some column position - // after a filter has been applied - template unsigned update_pos(unsigned pos, const F& f) const { - unsigned filtered_out_count = 0, - last = 0; - for (unsigned v : f) { - if (v < pos) - filtered_out_count += v - last; - else { - filtered_out_count += pos - last; - break; - } - last = v; - } - return pos - filtered_out_count; - } + Table(); - // Filter in, according to a container of arity_t. Each value of - // that container corresponds to the column index of the ITable - // (starting from 0). 
- template Table filtered(const F& f) const { - Table res; + Table(const OTable &otable_, const ITable &itable_); - // filter input table - res.itable = itable.filtered(f); + template + Table(const Func &func, arity_t a, int nsamples = -1) : + itable(gen_signature(type_node_of(), + type_node_of(), a)), + otable(func, itable), target_pos(0), timestamp_pos(0) + {} - // set output table - res.otable = otable; + Table(const combo_tree &tr, int nsamples = -1, + contin_t min_contin = -1.0, contin_t max_contin = 1.0); - // set timestamp table - res.ttable = ttable; + size_t size() const + { return itable.size(); } - // update target_pos - res.target_pos = update_pos(target_pos, f); + arity_t get_arity() const + { return itable.get_arity(); } - // update timestamp_pos - if (!ttable.empty()) - res.timestamp_pos = update_pos(timestamp_pos, f); + // Return the types of the columns in the table. + // The type is returned as a lambda(input col types) -> output col type. + // This is computed on the fly each time, instead ov being + // stored with the object, so that RAM isn't wasted holding this + // infrequently-needed info. 
+ type_tree get_signature() const + { + type_tree tt(id::lambda_type); + auto root = tt.begin(); + for (type_node tn : itable.get_types()) + tt.append_child(root, tn); + tt.append_child(root, otable.get_type()); + return tt; + } - return res; - } + // return a string with the io labels, the output label comes first + string_seq get_labels() const; + + const std::string &get_target() const + { return otable.get_label(); } + + // Useful for filtered (see below), return some column position + // after a filter has been applied + template + unsigned update_pos(unsigned pos, const F &f) const + { + unsigned filtered_out_count = 0, + last = 0; + for (unsigned v : f) { + if (v < pos) + filtered_out_count += v - last; + else { + filtered_out_count += pos - last; + break; + } + last = v; + } + return pos - filtered_out_count; + } + + // Filter in, according to a container of arity_t. Each value of + // that container corresponds to the column index of the ITable + // (starting from 0). + template + Table filtered(const F &f) const + { + Table res; + + // filter input table + res.itable = itable.filtered(f); + + // set output table + res.otable = otable; + + // set timestamp table + res.ttable = ttable; + + // update target_pos + res.target_pos = update_pos(target_pos, f); + + // update timestamp_pos + if (!ttable.empty()) + res.timestamp_pos = update_pos(timestamp_pos, f); - /// Return the corresponding compressed table. - /// The named column, if not empty, will be used to provide weights - /// for each row, during compression. - CTable compressed(const std::string="") const; + return res; + } + + /// Return the corresponding compressed table. + /// The named column, if not empty, will be used to provide weights + /// for each row, during compression. 
+ CTable compressed(const std::string= "") const; - ITable itable; - OTable otable; - TTable ttable; + ITable itable; + OTable otable; + TTable ttable; - // Position of the target, useful for writing the table - unsigned target_pos; + // Position of the target, useful for writing the table + unsigned target_pos; - // Position of the timestamp feature, useful for writing the - // table. If the timestamp feature is empty then it is irrelevant. - unsigned timestamp_pos; + // Position of the timestamp feature, useful for writing the + // table. If the timestamp feature is empty then it is irrelevant. + unsigned timestamp_pos; - bool operator==(const Table& rhs) const; + bool operator==(const Table &rhs) const; }; template -CTable::CTable(const Func& func, arity_t arity, int nsamples) +CTable::CTable(const Func &func, arity_t arity, int nsamples) { - Table table(func, arity, nsamples); - *this = table.compressed(); + Table table(func, arity, nsamples); + *this = table.compressed(); } ///////////////////// @@ -1030,8 +1215,9 @@ CTable::CTable(const Func& func, arity_t arity, int nsamples) ///////////////////// // Randomly remove rows so that the new size is ratio * table size -void subsampleTable(float ratio, Table& table); -void subsampleCTable(float ratio, CTable& ctable); +void subsampleTable(float ratio, Table &table); + +void subsampleCTable(float ratio, CTable &ctable); //////////////////////// // Mutual Information // @@ -1041,7 +1227,7 @@ void subsampleCTable(float ratio, CTable& ctable); * Compute the joint entropy H(Y) of an output table. It assumes the data * are discretized. (?) */ -double OTEntropy(const OTable& ot); +double OTEntropy(const OTable &ot); /** * Compute the mutual information between a set of independent features @@ -1071,55 +1257,55 @@ double OTEntropy(const OTable& ot); * important should not contribute to the MI. 
*/ template -double mutualInformation(const ITable& it, const OTable& ot, const FeatureSet& fs) +double mutualInformation(const ITable &it, const OTable &ot, const FeatureSet &fs) { - // XXX TODO to implement enum support, cut-n-paste from CTable - // mutual info code, below. - type_node otype = ot.get_type(); - OC_ASSERT(id::boolean_type == otype, "Only boolean types supported"); - - // declare useful visitors - seq_filtered_visitor sfv(fs); - auto asf = boost::apply_visitor(sfv); - - // Let X1, ..., Xn be the input columns on the table, and - // Y be the output column. We need to compute the joint entropies - // H(Y, X1, ..., Xn) and H(X1, ..., Xn) - // To do this, we need to count how often the vertex sequence - // (X1, ..., Xn) occurs. This count is kept in "ic". Likewise, the - // "ioc" counter counts how often the vertex_seq (Y, X1, ..., Xn) - // occurs. - typedef Counter VSCounter; - VSCounter ic, // for H(X1, ..., Xn) - ioc; // for H(Y, X1, ..., Xn) - ITable::const_iterator i_it = it.begin(); - OTable::const_iterator o_it = ot.begin(); - - for (; i_it != it.end(); ++i_it, ++o_it) { - multi_type_seq ic_vec = asf(i_it->get_variant()); - ic[ic_vec] += 1.0; - multi_type_seq ioc_vec(ic_vec); - ioc_vec.push_back(get_builtin(*o_it)); - ioc[ioc_vec] += 1.0; - } + // XXX TODO to implement enum support, cut-n-paste from CTable + // mutual info code, below. + type_node otype = ot.get_type(); + OC_ASSERT(id::boolean_type == otype, "Only boolean types supported"); + + // declare useful visitors + seq_filtered_visitor sfv(fs); + auto asf = boost::apply_visitor(sfv); + + // Let X1, ..., Xn be the input columns on the table, and + // Y be the output column. We need to compute the joint entropies + // H(Y, X1, ..., Xn) and H(X1, ..., Xn) + // To do this, we need to count how often the vertex sequence + // (X1, ..., Xn) occurs. This count is kept in "ic". Likewise, the + // "ioc" counter counts how often the vertex_seq (Y, X1, ..., Xn) + // occurs. 
+ typedef Counter VSCounter; + VSCounter ic, // for H(X1, ..., Xn) + ioc; // for H(Y, X1, ..., Xn) + ITable::const_iterator i_it = it.begin(); + OTable::const_iterator o_it = ot.begin(); + + for (; i_it != it.end(); ++i_it, ++o_it) { + multi_type_seq ic_vec = asf(i_it->get_variant()); + ic[ic_vec] += 1.0; + multi_type_seq ioc_vec(ic_vec); + ioc_vec.push_back(get_builtin(*o_it)); + ioc[ioc_vec] += 1.0; + } - // Compute the probability distributions - std::vector ip(ic.size()), iop(ioc.size()); - double total = it.size(); - auto div_total = [&](count_t c) { return c/total; }; - transform(ic | map_values, ip.begin(), div_total); - transform(ioc | map_values, iop.begin(), div_total); + // Compute the probability distributions + std::vector ip(ic.size()), iop(ioc.size()); + double total = it.size(); + auto div_total = [&](count_t c) { return c / total; }; + transform(ic | map_values, ip.begin(), div_total); + transform(ioc | map_values, iop.begin(), div_total); - // Compute the joint entropies - return entropy(ip) + OTEntropy(ot) - entropy(iop); + // Compute the joint entropies + return entropy(ip) + OTEntropy(ot) - entropy(iop); } // Like the above, but taking a table in argument instead of // input and output tables template -double mutualInformation(const Table& table, const FeatureSet& fs) +double mutualInformation(const Table &table, const FeatureSet &fs) { - return mutualInformation(table.itable, table.otable, fs); + return mutualInformation(table.itable, table.otable, fs); } /** @@ -1129,178 +1315,172 @@ double mutualInformation(const Table& table, const FeatureSet& fs) * correct, we really should use Fisher information. @todo this). 
*/ template -double mutualInformation(const CTable& ctable, const FeatureSet& fs) +double mutualInformation(const CTable &ctable, const FeatureSet &fs) { - // declare useful visitors - seq_filtered_visitor sfv(fs); - auto asf = boost::apply_visitor(sfv); - type_node otype = ctable.get_output_type(); - - const type_tree& tsig = ctable.get_signature(); - bool all_discrete_inputs = true; - for (const type_tree& in_tt : get_signature_inputs(tsig)) { - type_node tn = get_type_node(in_tt); - if (tn != id::boolean_type and tn != id::enum_type) { - all_discrete_inputs = false; - break; - } - } + // declare useful visitors + seq_filtered_visitor sfv(fs); + auto asf = boost::apply_visitor(sfv); + type_node otype = ctable.get_output_type(); + + const type_tree &tsig = ctable.get_signature(); + bool all_discrete_inputs = true; + for (const type_tree &in_tt : get_signature_inputs(tsig)) { + type_node tn = get_type_node(in_tt); + if (tn != id::boolean_type and tn != id::enum_type) { + all_discrete_inputs = false; + break; + } + } - ///////////////////// - // discrete inputs // - ///////////////////// - if (all_discrete_inputs - and (id::enum_type == otype - or id::boolean_type == otype - or id::contin_type == otype)) - { - // Let X1, ..., Xn be the input columns on the table (as given by fs), - // and Y be the output column. We need to compute the joint entropies - // H(Y, X1, ..., Xn) and H(X1, ..., Xn) - // To do this, we need to count how often the vertex sequence - // (X1, ..., Xn) occurs. This count is kept in "ic". Likewise, the - // "ioc" counter counts how often the vertex_seq (Y, X1, ..., Xn) - // occurs. 
- typedef Counter VSCounter; - VSCounter ic; // for H(X1, ..., Xn) - VSCounter ioc; // for H(Y, X1, ..., Xn) - double total = 0.0; - - std::vector disc_intvs; - - if (id::contin_type == otype) - { - contin_t min = 100000.0; - contin_t max = 0.0; - - for (const auto& row : ctable) - { - for (const auto& val_pair : row.second) { - const vertex& v = val_pair.first.value; // key of map - if (get_contin(v) < min) - min = get_contin(v); - - if (get_contin(v) > max) - max = get_contin(v); - } - } - disc_intvs = discretize_contin_feature(min, max); - } - - // Count the total number of times an enum appears in the table - Counter ycount; - - for (const auto& row : ctable) - { - // Create the filtered row. - CTable::key_type vec = asf(row.first.get_variant()); - - // update ic (input counter) - count_t row_total = row.second.total_count(); - ic[vec] += row_total; - - // for each enum type counted in the row, - for (const auto& val_pair : row.second) { - const vertex& v = val_pair.first.value; // counter key value - - count_t count = row.second.get(v); - - builtin b; - - // update ioc == "input output counter" - switch (otype) { - case id::enum_type: vec.push_back(get_enum_type(v)); - ycount[v] += count; - break; - case id::boolean_type: vec.push_back(get_builtin(v)); - ycount[v] += count; - break; - case id::contin_type: - b = get_discrete_bin(disc_intvs,get_contin(v)); - vec.push_back(b); - ycount[b] += count; - break; - default: - OC_ASSERT(false, "case not implemented"); - } - ioc[vec] += count; - vec.pop_back(); - } - - // update total numer of data points - total += row_total; - } - - // Compute the probability distributions; viz divide count by total. 
- // "c" == count, "p" == probability - std::vector yprob(ycount.size()), ip(ic.size()), iop(ioc.size()); - auto div_total = [&](count_t c) { return c/total; }; - transform(ycount | map_values, yprob.begin(), div_total); - transform(ic | map_values, ip.begin(), div_total); - transform(ioc | map_values, iop.begin(), div_total); - - // Compute the entropies - return entropy(ip) + entropy(yprob) - entropy(iop); - } + ///////////////////// + // discrete inputs // + ///////////////////// + if (all_discrete_inputs + and (id::enum_type == otype + or id::boolean_type == otype + or id::contin_type == otype)) { + // Let X1, ..., Xn be the input columns on the table (as given by fs), + // and Y be the output column. We need to compute the joint entropies + // H(Y, X1, ..., Xn) and H(X1, ..., Xn) + // To do this, we need to count how often the vertex sequence + // (X1, ..., Xn) occurs. This count is kept in "ic". Likewise, the + // "ioc" counter counts how often the vertex_seq (Y, X1, ..., Xn) + // occurs. + typedef Counter VSCounter; + VSCounter ic; // for H(X1, ..., Xn) + VSCounter ioc; // for H(Y, X1, ..., Xn) + double total = 0.0; + + std::vector disc_intvs; + + if (id::contin_type == otype) { + contin_t min = 100000.0; + contin_t max = 0.0; + + for (const auto &row : ctable) { + for (const auto &val_pair : row.second) { + const vertex &v = val_pair.first.value; // key of map + if (get_contin(v) < min) + min = get_contin(v); + + if (get_contin(v) > max) + max = get_contin(v); + } + } + disc_intvs = discretize_contin_feature(min, max); + } + + // Count the total number of times an enum appears in the table + Counter ycount; + + for (const auto &row : ctable) { + // Create the filtered row. 
+ CTable::key_type vec = asf(row.first.get_variant()); + + // update ic (input counter) + count_t row_total = row.second.total_count(); + ic[vec] += row_total; + + // for each enum type counted in the row, + for (const auto &val_pair : row.second) { + const vertex &v = val_pair.first.value; // counter key value + + count_t count = row.second.get(v); + + builtin b; + + // update ioc == "input output counter" + switch (otype) { + case id::enum_type: + vec.push_back(get_enum_type(v)); + ycount[v] += count; + break; + case id::boolean_type: + vec.push_back(get_builtin(v)); + ycount[v] += count; + break; + case id::contin_type: + b = get_discrete_bin(disc_intvs, get_contin(v)); + vec.push_back(b); + ycount[b] += count; + break; + default: OC_ASSERT(false, "case not implemented"); + } + ioc[vec] += count; + vec.pop_back(); + } + + // update total numer of data points + total += row_total; + } + + // Compute the probability distributions; viz divide count by total. + // "c" == count, "p" == probability + std::vector yprob(ycount.size()), ip(ic.size()), iop(ioc.size()); + auto div_total = [&](count_t c) { return c / total; }; + transform(ycount | map_values, yprob.begin(), div_total); + transform(ic | map_values, ip.begin(), div_total); + transform(ioc | map_values, iop.begin(), div_total); + + // Compute the entropies + return entropy(ip) + entropy(yprob) - entropy(iop); + } - ///////////////////// - // continuous case // - ///////////////////// - else if (id::contin_type == otype) - { - if (1 < fs.size()) { - OC_ASSERT(0, "Contin MI currently supports only 1 feature."); - } - std::multimap sorted_list; - for (const auto& row : ctable) - { - CTable::key_type vec = asf(row.first.get_variant()); - contin_t x = vec.get_at(0); - - // for each contin counted in the row, - for (const auto& val_pair : row.second) { - const auto& v = val_pair.first.value; // counter key value - contin_t y = get_contin(v); // typecast - - unsigned flt_count = val_pair.second; - dorepeat(flt_count) 
{ - auto pr = std::make_pair(x,y); - sorted_list.insert(pr); - } - } - } - - // XXX TODO, it would be easier if KLD took a sorted list - // as the argument. - std::vector p, q; - for (auto pr : sorted_list) { - p.push_back(pr.first); - q.push_back(pr.second); - } - - // KLD is negative; we want the IC to be postive. - // XXX review this, is this really correct? At any rate, - // feature selection utterly fails with negative IC. - // Also a problem, this is returning values greater than 1.0; - // I thought that IC was supposed to max out at 1.0 !? - contin_t ic = - KLD(p,q); - // XXX TODO remove this print, for better performance. - unsigned idx = *(fs.begin()); - logger().debug() <<"Contin MI for feat=" << idx << " ic=" << ic; - return ic; - } + ///////////////////// + // continuous case // + ///////////////////// + else if (id::contin_type == otype) { + if (1 < fs.size()) { + OC_ASSERT(0, "Contin MI currently supports only 1 feature."); + } + std::multimap sorted_list; + for (const auto &row : ctable) { + CTable::key_type vec = asf(row.first.get_variant()); + contin_t x = vec.get_at(0); + + // for each contin counted in the row, + for (const auto &val_pair : row.second) { + const auto &v = val_pair.first.value; // counter key value + contin_t y = get_contin(v); // typecast + + unsigned flt_count = val_pair.second; + dorepeat(flt_count) { + auto pr = std::make_pair(x, y); + sorted_list.insert(pr); + } + } + } + + // XXX TODO, it would be easier if KLD took a sorted list + // as the argument. + std::vector p, q; + for (auto pr : sorted_list) { + p.push_back(pr.first); + q.push_back(pr.second); + } + + // KLD is negative; we want the IC to be postive. + // XXX review this, is this really correct? At any rate, + // feature selection utterly fails with negative IC. + // Also a problem, this is returning values greater than 1.0; + // I thought that IC was supposed to max out at 1.0 !? + contin_t ic = -KLD(p, q); + // XXX TODO remove this print, for better performance. 
+ unsigned idx = *(fs.begin()); + logger().debug() << "Contin MI for feat=" << idx << " ic=" << ic; + return ic; + } - ////////////////////////////////// - // Other non implemented cases // - ////////////////////////////////// - else - { - std::stringstream ss; - ss << "Type " << otype << " is not supported for mutual information"; - OC_ASSERT(0, ss.str()); - return 0.0; - } + ////////////////////////////////// + // Other non implemented cases // + ////////////////////////////////// + else { + std::stringstream ss; + ss << "Type " << otype << " is not supported for mutual information"; + OC_ASSERT(0, ss.str()); + return 0.0; + } } /** @@ -1308,108 +1488,107 @@ double mutualInformation(const CTable& ctable, const FeatureSet& fs) * discrete types are supported. */ template -double mutualInformationBtwSets(const CTable& ctable, - const FeatureSet& fs_l, - const FeatureSet& fs_r) { - // get union of fs_l and fs_r - FeatureSet fs_u = set_union(fs_l, fs_r); - - // Check that the arities are within permitted range - OC_ASSERT(all_of(fs_u.begin(), fs_u.end(), - [&](const typename FeatureSet::value_type& f) { - return f < ctable.get_arity();})); - - // declare useful visitors - seq_filtered_visitor sfv_u(fs_u), sfv_l(fs_l), sfv_r(fs_r); - auto asf_u = boost::apply_visitor(sfv_u), - asf_l = boost::apply_visitor(sfv_l), - asf_r = boost::apply_visitor(sfv_r); - type_node otype = ctable.get_output_type(); - - /////////////////// - // discrete case // - /////////////////// - if (id::enum_type == otype or id::boolean_type == otype or id::contin_type) - { - // Let U1, ..., Un the features resulting from the union - // between fs_l and fs_r. 
- // - // Let L1, ..., Lm the features of fs_l - // - // Let R1, ..., Rl the features of fs_r - // - // We need to compute the entropies - // - // H(U1, ..., Un) - // H(L1, ..., Lm) - // H(R1, ..., Rl) - // - // Then the mutual information is - // - // MI(fs_l, fs_r) = H(L1, ..., Lm) + H(R1, ..., Rl) - H(U1, ..., Un) - // - // To do this, we need to count how often those events occurs. - typedef Counter VSCounter; - VSCounter - uc, // for H(U1, ..., Un) - lc, // for H(L1, ..., Lm) - rc; // for H(R1, ..., Rl) - double total = 0.0; - - for (const auto& row : ctable) - { - // Create the filtered row. - CTable::key_type vec_u = asf_u(row.first.get_variant()), - vec_l = asf_l(row.first.get_variant()), - vec_r = asf_r(row.first.get_variant()); - count_t row_total = row.second.total_count(); - - // update uc, lc and rc - uc[vec_u] += row_total; - lc[vec_l] += row_total; - rc[vec_r] += row_total; - - // update total numer of data points - total += row_total; - } - - // Compute the probability distributions; viz divide count by total. 
- // "c" == count, "p" == probability - std::vector up(uc.size()), lp(lc.size()), rp(rc.size()); - auto div_total = [&](count_t c) { return c/total; }; - transform(uc | map_values, up.begin(), div_total); - transform(lc | map_values, lp.begin(), div_total); - transform(rc | map_values, rp.begin(), div_total); - - // Compute the entropies - return entropy(lp) + entropy(rp) - entropy(up); - } +double mutualInformationBtwSets(const CTable &ctable, + const FeatureSet &fs_l, + const FeatureSet &fs_r) +{ + // get union of fs_l and fs_r + FeatureSet fs_u = set_union(fs_l, fs_r); + + // Check that the arities are within permitted range + OC_ASSERT(all_of(fs_u.begin(), fs_u.end(), + [&](const typename FeatureSet::value_type &f) { + return f < ctable.get_arity(); + })); + + // declare useful visitors + seq_filtered_visitor sfv_u(fs_u), sfv_l(fs_l), sfv_r(fs_r); + auto asf_u = boost::apply_visitor(sfv_u), + asf_l = boost::apply_visitor(sfv_l), + asf_r = boost::apply_visitor(sfv_r); + type_node otype = ctable.get_output_type(); + + /////////////////// + // discrete case // + /////////////////// + if (id::enum_type == otype or id::boolean_type == otype or id::contin_type) { + // Let U1, ..., Un the features resulting from the union + // between fs_l and fs_r. + // + // Let L1, ..., Lm the features of fs_l + // + // Let R1, ..., Rl the features of fs_r + // + // We need to compute the entropies + // + // H(U1, ..., Un) + // H(L1, ..., Lm) + // H(R1, ..., Rl) + // + // Then the mutual information is + // + // MI(fs_l, fs_r) = H(L1, ..., Lm) + H(R1, ..., Rl) - H(U1, ..., Un) + // + // To do this, we need to count how often those events occurs. + typedef Counter VSCounter; + VSCounter + uc, // for H(U1, ..., Un) + lc, // for H(L1, ..., Lm) + rc; // for H(R1, ..., Rl) + double total = 0.0; + + for (const auto &row : ctable) { + // Create the filtered row. 
+ CTable::key_type vec_u = asf_u(row.first.get_variant()), + vec_l = asf_l(row.first.get_variant()), + vec_r = asf_r(row.first.get_variant()); + count_t row_total = row.second.total_count(); + + // update uc, lc and rc + uc[vec_u] += row_total; + lc[vec_l] += row_total; + rc[vec_r] += row_total; + + // update total numer of data points + total += row_total; + } + + // Compute the probability distributions; viz divide count by total. + // "c" == count, "p" == probability + std::vector up(uc.size()), lp(lc.size()), rp(rc.size()); + auto div_total = [&](count_t c) { return c / total; }; + transform(uc | map_values, up.begin(), div_total); + transform(lc | map_values, lp.begin(), div_total); + transform(rc | map_values, rp.begin(), div_total); + + // Compute the entropies + return entropy(lp) + entropy(rp) - entropy(up); + } - ////////////////////////////////// - // Other non implemented cases // - ////////////////////////////////// - else - { - OC_ASSERT(0, "Unsupported type for mutual information"); - return 0.0; - } + ////////////////////////////////// + // Other non implemented cases // + ////////////////////////////////// + else { + OC_ASSERT(0, "Unsupported type for mutual information"); + return 0.0; + } } /** * template to subsample input and output tables, after subsampling * the table have size min(nsamples, *table.size()) */ -void subsampleTable(ITable& it, OTable& ot, unsigned nsamples); +void subsampleTable(ITable &it, OTable &ot, unsigned nsamples); /** * Like above on Table instead of ITable and OTable */ -void subsampleTable(Table& table, unsigned nsamples); +void subsampleTable(Table &table, unsigned nsamples); /** * like above but subsample only the input table */ -void subsampleTable(ITable& it, unsigned nsamples); +void subsampleTable(ITable &it, unsigned nsamples); ///////////////// // Truth table // @@ -1433,89 +1612,100 @@ void subsampleTable(ITable& it, unsigned nsamples); * +-----------------------+--+--+ */ typedef std::vector bool_seq; + 
class complete_truth_table : public bool_seq { public: - typedef bool_seq super; - - complete_truth_table() {} - template - complete_truth_table(It from, It to) : super(from, to) {} - template - complete_truth_table(const tree& tr, arity_t arity) - : super(pow2(arity)), _arity(arity) - { - populate(tr); - } - template - complete_truth_table(const tree& tr) - { - _arity = arity(tr); - this->resize(pow2(_arity)); - populate(tr); - } + typedef bool_seq super; - complete_truth_table(const Handle&) + complete_truth_table() + {} + + template + complete_truth_table(It from, It to) : super(from, to) + {} + + template + complete_truth_table(const tree &tr, arity_t arity) + : super(pow2(arity)), _arity(arity) + { + populate(tr); + } + + template + complete_truth_table(const tree &tr) + { + _arity = arity(tr); + this->resize(pow2(_arity)); + populate(tr); + } + + complete_truth_table(const Handle &) { OC_ASSERT(false, "Truth table from Handle not implemented yet"); } - complete_truth_table(const Handle&, arity_t arity) + complete_truth_table(const Handle &, arity_t arity) { OC_ASSERT(false, "Truth table from Handle not implemented yet"); } - template - complete_truth_table(const Func& f, arity_t arity) - : super(pow2(arity)), _arity(arity) { - iterator it = begin(); - for (int i = 0; it != end(); ++i, ++it) { - bool_seq v(_arity); - for (arity_t j = 0;j < _arity; ++j) - v[j] = (i >> j) % 2; // j'th bit of i - (*it) = f(v.begin(), v.end()); - } - } + template + complete_truth_table(const Func &f, arity_t arity) + : super(pow2(arity)), _arity(arity) + { + iterator it = begin(); + for (int i = 0; it != end(); ++i, ++it) { + bool_seq v(_arity); + for (arity_t j = 0; j < _arity; ++j) + v[j] = (i >> j) % 2; // j'th bit of i + (*it) = f(v.begin(), v.end()); + } + } - /* - this operator allows to access quickly to the results of a - complete_truth_table. 
[from, to) points toward a chain of - boolean describing the inputs of the function coded into the - complete_truth_table and the operator returns the results. - */ - template - bool operator()(It from, It to) { - const_iterator it = begin(); - for (int i = 1; from != to; ++from, i = i << 1) - if (*from) - it += i; - return *it; - } + /* + this operator allows to access quickly to the results of a + complete_truth_table. [from, to) points toward a chain of + boolean describing the inputs of the function coded into the + complete_truth_table and the operator returns the results. + */ + template + bool operator()(It from, It to) + { + const_iterator it = begin(); + for (int i = 1; from != to; ++from, i = i << 1) + if (*from) + it += i; + return *it; + } + + size_type hamming_distance(const complete_truth_table &other) const; - size_type hamming_distance(const complete_truth_table& other) const; + /** + * compute the truth table of tr and compare it to self. This + * method is optimized so that if there are not equal it can be + * detected before calculating the entire table. + */ + bool same_complete_truth_table(const combo_tree &tr) const; - /** - * compute the truth table of tr and compare it to self. This - * method is optimized so that if there are not equal it can be - * detected before calculating the entire table. 
- */ - bool same_complete_truth_table(const combo_tree& tr) const; protected: - template - void populate(const tree& tr) - { - inputs.resize(_arity); - iterator it = begin(); - for (int i = 0; it != end(); ++i, ++it) { - for (int j = 0; j < _arity; ++j) - inputs[j] = bool_to_builtin((i >> j) % 2); // j'th bit of i - *it = builtin_to_bool(boolean_interpreter(inputs)(tr)); - } - } - arity_t _arity; - mutable builtin_seq inputs; + template + void populate(const tree &tr) + { + inputs.resize(_arity); + iterator it = begin(); + for (int i = 0; it != end(); ++i, ++it) { + for (int j = 0; j < _arity; ++j) + inputs[j] = bool_to_builtin((i >> j) % 2); // j'th bit of i + *it = builtin_to_bool(boolean_interpreter(inputs)(tr)); + } + } + + arity_t _arity; + mutable builtin_seq inputs; }; -}} // ~namespaces combo opencog +} +} // ~namespaces combo opencog #endif // _OPENCOG_TABLE_H From d881d268e8387bce580dcba832bd2115cb389e13 Mon Sep 17 00:00:00 2001 From: kasim Date: Sun, 16 Sep 2018 18:03:49 +0300 Subject: [PATCH 11/17] Add Complete truth table for atomese --- moses/comboreduct/table/table.cc | 38 +++++++++++++++++++++++++++++ moses/comboreduct/table/table.h | 42 ++++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/moses/comboreduct/table/table.cc b/moses/comboreduct/table/table.cc index a0a949dc0a..b31f55e2c8 100644 --- a/moses/comboreduct/table/table.cc +++ b/moses/comboreduct/table/table.cc @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include "../combo/ann.h" #include "../combo/simple_nn.h" @@ -871,6 +873,42 @@ bool complete_truth_table::same_complete_truth_table(const combo_tree &tr) const return true; } +void complete_truth_table::populate(const Handle &handle) +{ + // create a vector containing values for each feature or arity. + // this will contain the inputs of the truth table. + std::vector features(_arity); + populate_features(features); + + // map the values of inputs to the program. 
+ setup_features(handle, features.begin(), features.end()); + + atomese::Interpreter interpreter(key); + std::vector result = LinkValueCast(interpreter(handle))->value(); + + // convert Links in the result of the interpreter to bool, + // and store it to the truth table. + std::transform(result.begin(), result.end(), begin(), [](ProtoAtomPtr p){ + return HandleCast(p)->get_type() == TRUE_LINK; + }); +} + +void complete_truth_table::populate_features(std::vector &features) +{ + auto it = begin(); + for (int i = 0; it != end(); ++i, ++it) { + ProtoAtomPtrVec row; + for (int j = 0; j < _arity; ++j) { + ProtoAtomPtr v; + if ((i >> j) % 2) + v = ProtoAtomPtr(createLink(TRUE_LINK)); + else v = ProtoAtomPtr(createLink(FALSE_LINK)); + row.push_back(v); + } + features.push_back(row); + } +} + ///////////////////// // Subsample table // ///////////////////// diff --git a/moses/comboreduct/table/table.h b/moses/comboreduct/table/table.h index c45502026b..bb890c29a4 100644 --- a/moses/comboreduct/table/table.h +++ b/moses/comboreduct/table/table.h @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include "../type_checker/type_tree.h" #include "../interpreter/eval.h" @@ -1612,6 +1615,7 @@ void subsampleTable(ITable &it, unsigned nsamples); * +-----------------------+--+--+ */ typedef std::vector bool_seq; +typedef std::vector ProtoAtomPtrVec; class complete_truth_table : public bool_seq { @@ -1645,9 +1649,10 @@ class complete_truth_table : public bool_seq OC_ASSERT(false, "Truth table from Handle not implemented yet"); } - complete_truth_table(const Handle &, arity_t arity) + complete_truth_table(const Handle &handle, arity_t arity) + : super(pow2(arity)), _arity(arity) { - OC_ASSERT(false, "Truth table from Handle not implemented yet"); + populate(handle); } template @@ -1701,8 +1706,41 @@ class complete_truth_table : public bool_seq } } + /** + * Sets the values of each predicateNode[Variables] of the program in order to + * get evaluated by the 
interpreter. + * + * @param handle contains the program to get its variables populated. + * @param It from beginning iterator of the vector containing values of variables. + * @param It to end iterator of vector containing values of variables. + */ + template + void setup_features(const Handle &handle, It from, It to) + { + if (from == to) + return; + + if (PREDICATE_NODE == handle->get_type()) { + handle->setValue(createNode(NODE, "*-AS-MOSES:SchemaValuesKey-*"), + ProtoAtomPtr(new LinkValue(*from))); + ++from; + return; + } + + if (handle->is_link()) { + for (Handle h : handle->getOutgoingSet()) { + setup_features(h, from, to); + } + } + } + + void populate(const Handle &handle); + + void populate_features(std::vector &features); + arity_t _arity; mutable builtin_seq inputs; + const Handle key = createNode(NODE, "*-AS-MOSES:SchemaValuesKey-*"); }; } From fa4e49da0caffabc2ef879ebd32d3bdbde424bbc Mon Sep 17 00:00:00 2001 From: kasim Date: Mon, 17 Sep 2018 16:03:53 +0300 Subject: [PATCH 12/17] Fix undefined ref --- moses/CMakeLists.txt | 2 +- moses/atomese/CMakeLists.txt | 4 +--- moses/atomese/representation/CMakeLists.txt | 9 -------- moses/comboreduct/CMakeLists.txt | 1 + moses/data/CMakeLists.txt | 21 +++++++++++++++++++ moses/data/representation/CMakeLists.txt | 9 ++++++++ .../representation/load_table.cc | 0 .../representation/load_table.h | 0 tests/CMakeLists.txt | 1 + tests/atomese/CMakeLists.txt | 1 - tests/data/CMakeLists.txt | 1 + .../representation/CMakeLists.txt | 2 +- .../representation/boolean_data_result.scm | 0 .../representation/boolean_data_test.csv | 0 .../commented_dataset_result.scm | 0 .../representation/load_tableUTest.cxxtest | 4 ++-- .../representation/real_data_result.scm | 0 .../representation/real_data_result2.scm | 0 .../representation/real_data_test.csv | 0 .../representation/real_data_test2.csv | 0 20 files changed, 38 insertions(+), 17 deletions(-) delete mode 100644 moses/atomese/representation/CMakeLists.txt create mode 100644 
moses/data/CMakeLists.txt create mode 100644 moses/data/representation/CMakeLists.txt rename moses/{atomese => data}/representation/load_table.cc (100%) rename moses/{atomese => data}/representation/load_table.h (100%) create mode 100644 tests/data/CMakeLists.txt rename tests/{atomese => data}/representation/CMakeLists.txt (90%) rename tests/{atomese => data}/representation/boolean_data_result.scm (100%) rename tests/{atomese => data}/representation/boolean_data_test.csv (100%) rename tests/{atomese => data}/representation/commented_dataset_result.scm (100%) rename tests/{atomese => data}/representation/load_tableUTest.cxxtest (98%) rename tests/{atomese => data}/representation/real_data_result.scm (100%) rename tests/{atomese => data}/representation/real_data_result2.scm (100%) rename tests/{atomese => data}/representation/real_data_test.csv (100%) rename tests/{atomese => data}/representation/real_data_test2.csv (100%) diff --git a/moses/CMakeLists.txt b/moses/CMakeLists.txt index 4d5d7f0566..84bc104474 100644 --- a/moses/CMakeLists.txt +++ b/moses/CMakeLists.txt @@ -1,8 +1,8 @@ - ADD_SUBDIRECTORY (comboreduct) ADD_SUBDIRECTORY (moses) ADD_SUBDIRECTORY (feature-selection) ADD_SUBDIRECTORY (atomese) +ADD_SUBDIRECTORY (data) # Currently, the pleasure code does not build, as it uses various # obsolete combo types and routines and etc. 
It needs to be ported diff --git a/moses/atomese/CMakeLists.txt b/moses/atomese/CMakeLists.txt index 6703ce4546..539ccdea51 100644 --- a/moses/atomese/CMakeLists.txt +++ b/moses/atomese/CMakeLists.txt @@ -1,13 +1,10 @@ -ADD_SUBDIRECTORY(representation) ADD_SUBDIRECTORY(interpreter) ADD_LIBRARY(atomese SHARED - representation/load_table interpreter/Interpreter ) TARGET_LINK_LIBRARIES (atomese - ascomboreduct ${COGUTIL_LIBRARY} ${ATOMSPACE_LIBRARIES} ) @@ -17,6 +14,7 @@ IF (WIN32) INSTALL(TARGETS atomese DESTINATION "lib${LIB_DIR_SUFFIX}/moses") ELSE (WIN32) INSTALL(TARGETS atomese + EXPORT ASMosesTargets LIBRARY DESTINATION "lib${LIB_DIR_SUFFIX}" # lib*.so files ARCHIVE DESTINATION "lib${LIB_DIR_SUFFIX}") # lib*.a files ENDIF (WIN32) diff --git a/moses/atomese/representation/CMakeLists.txt b/moses/atomese/representation/CMakeLists.txt deleted file mode 100644 index 382cf8deb6..0000000000 --- a/moses/atomese/representation/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -#install header files -INSTALL(FILES - - load_table.h - - DESTINATION - - "include/${PROJECT_NAME}/atomese/representation" - ) diff --git a/moses/comboreduct/CMakeLists.txt b/moses/comboreduct/CMakeLists.txt index 8bae2d97f9..50da0af903 100644 --- a/moses/comboreduct/CMakeLists.txt +++ b/moses/comboreduct/CMakeLists.txt @@ -65,6 +65,7 @@ ADD_LIBRARY(ascomboreduct SHARED ) TARGET_LINK_LIBRARIES (ascomboreduct + atomese ${COGUTIL_LIBRARY} ${Boost_DATE_TIME_LIBRARY} ${Boost_THREAD_LIBRARY} diff --git a/moses/data/CMakeLists.txt b/moses/data/CMakeLists.txt new file mode 100644 index 0000000000..97fbd12ebc --- /dev/null +++ b/moses/data/CMakeLists.txt @@ -0,0 +1,21 @@ +ADD_SUBDIRECTORY(representation) + +ADD_LIBRARY(data SHARED + representation/load_table + ) + +TARGET_LINK_LIBRARIES (data + ascomboreduct + ${COGUTIL_LIBRARY} + ${ATOMSPACE_LIBRARIES} + ) + +#install library +IF (WIN32) + INSTALL(TARGETS data DESTINATION "lib${LIB_DIR_SUFFIX}/moses") +ELSE (WIN32) + INSTALL(TARGETS data + EXPORT 
ASMosesTargets + LIBRARY DESTINATION "lib${LIB_DIR_SUFFIX}" # lib*.so files + ARCHIVE DESTINATION "lib${LIB_DIR_SUFFIX}") # lib*.a files +ENDIF (WIN32) diff --git a/moses/data/representation/CMakeLists.txt b/moses/data/representation/CMakeLists.txt new file mode 100644 index 0000000000..29cf38db09 --- /dev/null +++ b/moses/data/representation/CMakeLists.txt @@ -0,0 +1,9 @@ +#install header files +INSTALL(FILES + + load_table.h + + DESTINATION + + "include/${PROJECT_NAME}/data/representation" + ) diff --git a/moses/atomese/representation/load_table.cc b/moses/data/representation/load_table.cc similarity index 100% rename from moses/atomese/representation/load_table.cc rename to moses/data/representation/load_table.cc diff --git a/moses/atomese/representation/load_table.h b/moses/data/representation/load_table.h similarity index 100% rename from moses/atomese/representation/load_table.h rename to moses/data/representation/load_table.h diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4317db1d9e..954162e24d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -26,6 +26,7 @@ IF (CXXTEST_FOUND) ADD_SUBDIRECTORY (moses) ADD_SUBDIRECTORY (feature-selection) ADD_SUBDIRECTORY (atomese) + ADD_SUBDIRECTORY (data) ENDIF (CXXTEST_FOUND) diff --git a/tests/atomese/CMakeLists.txt b/tests/atomese/CMakeLists.txt index a698586824..0c45d55874 100644 --- a/tests/atomese/CMakeLists.txt +++ b/tests/atomese/CMakeLists.txt @@ -1,2 +1 @@ -ADD_SUBDIRECTORY (representation) ADD_SUBDIRECTORY (interpreter) diff --git a/tests/data/CMakeLists.txt b/tests/data/CMakeLists.txt new file mode 100644 index 0000000000..e3a846ccb3 --- /dev/null +++ b/tests/data/CMakeLists.txt @@ -0,0 +1 @@ +ADD_SUBDIRECTORY (representation) diff --git a/tests/atomese/representation/CMakeLists.txt b/tests/data/representation/CMakeLists.txt similarity index 90% rename from tests/atomese/representation/CMakeLists.txt rename to tests/data/representation/CMakeLists.txt index 879b3574ef..fc19768fde 
100644 --- a/tests/atomese/representation/CMakeLists.txt +++ b/tests/data/representation/CMakeLists.txt @@ -1,7 +1,7 @@ IF (HAVE_GUILE) ADD_CXXTEST(load_tableUTest) TARGET_LINK_LIBRARIES(load_tableUTest - atomese + data ${ATOMSPACE_LIBRARIES} ${GUILE_LIBRARIES} ) diff --git a/tests/atomese/representation/boolean_data_result.scm b/tests/data/representation/boolean_data_result.scm similarity index 100% rename from tests/atomese/representation/boolean_data_result.scm rename to tests/data/representation/boolean_data_result.scm diff --git a/tests/atomese/representation/boolean_data_test.csv b/tests/data/representation/boolean_data_test.csv similarity index 100% rename from tests/atomese/representation/boolean_data_test.csv rename to tests/data/representation/boolean_data_test.csv diff --git a/tests/atomese/representation/commented_dataset_result.scm b/tests/data/representation/commented_dataset_result.scm similarity index 100% rename from tests/atomese/representation/commented_dataset_result.scm rename to tests/data/representation/commented_dataset_result.scm diff --git a/tests/atomese/representation/load_tableUTest.cxxtest b/tests/data/representation/load_tableUTest.cxxtest similarity index 98% rename from tests/atomese/representation/load_tableUTest.cxxtest rename to tests/data/representation/load_tableUTest.cxxtest index 08d205a416..42273cebf9 100644 --- a/tests/atomese/representation/load_tableUTest.cxxtest +++ b/tests/data/representation/load_tableUTest.cxxtest @@ -2,7 +2,7 @@ #include #include #include -#include "moses/atomese/representation/load_table.h" +#include "moses/data/representation/load_table.h" using namespace opencog; using namespace std; @@ -12,7 +12,7 @@ private: AtomSpace *as; SchemeEval *eval; const string load_tableUTest_dir = - string(PROJECT_SOURCE_DIR) + "/tests/atomese/representation/"; + string(PROJECT_SOURCE_DIR) + "/tests/data/representation/"; const string tableUTest_dir = string(PROJECT_SOURCE_DIR) + "/tests/comboreduct/table/"; diff 
--git a/tests/atomese/representation/real_data_result.scm b/tests/data/representation/real_data_result.scm similarity index 100% rename from tests/atomese/representation/real_data_result.scm rename to tests/data/representation/real_data_result.scm diff --git a/tests/atomese/representation/real_data_result2.scm b/tests/data/representation/real_data_result2.scm similarity index 100% rename from tests/atomese/representation/real_data_result2.scm rename to tests/data/representation/real_data_result2.scm diff --git a/tests/atomese/representation/real_data_test.csv b/tests/data/representation/real_data_test.csv similarity index 100% rename from tests/atomese/representation/real_data_test.csv rename to tests/data/representation/real_data_test.csv diff --git a/tests/atomese/representation/real_data_test2.csv b/tests/data/representation/real_data_test2.csv similarity index 100% rename from tests/atomese/representation/real_data_test2.csv rename to tests/data/representation/real_data_test2.csv From 2d3ef4d0ebbafc56f478dfa24ce5abff51245b18 Mon Sep 17 00:00:00 2001 From: kasim Date: Tue, 18 Sep 2018 14:37:48 +0300 Subject: [PATCH 13/17] Add complete truth table test --- tests/comboreduct/table/tableUTest.cxxtest | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/comboreduct/table/tableUTest.cxxtest b/tests/comboreduct/table/tableUTest.cxxtest index e618c9b733..ebc2a6d375 100644 --- a/tests/comboreduct/table/tableUTest.cxxtest +++ b/tests/comboreduct/table/tableUTest.cxxtest @@ -21,6 +21,7 @@ */ #include #include +#include using namespace opencog; using namespace combo; @@ -434,4 +435,16 @@ public: "0,7,2012-03-23\n") } } + void test_complete_truth_table() + { + HandleSeq seq_1 = {createNode(PREDICATE_NODE, "f1"), createNode(PREDICATE_NODE, "f2")}; + Handle or_link = createLink(seq_1, OR_LINK); + HandleSeq seq_2 = {createNode(PREDICATE_NODE, "f3"), or_link}; + Handle program = createLink(seq_2, AND_LINK); + + combo::complete_truth_table ctable(program, 3); + 
+ bool_seq expected = {false, false, false, true, false, true, false, true}; + TS_ASSERT(ctable == expected); + } }; From ca109845a541306c62d40971a113771eca8dc5cb Mon Sep 17 00:00:00 2001 From: kasim Date: Tue, 18 Sep 2018 14:38:41 +0300 Subject: [PATCH 14/17] Reformat tableUTest --- tests/comboreduct/table/tableUTest.cxxtest | 802 +++++++++++---------- 1 file changed, 403 insertions(+), 399 deletions(-) diff --git a/tests/comboreduct/table/tableUTest.cxxtest b/tests/comboreduct/table/tableUTest.cxxtest index ebc2a6d375..92cfd753b5 100644 --- a/tests/comboreduct/table/tableUTest.cxxtest +++ b/tests/comboreduct/table/tableUTest.cxxtest @@ -35,406 +35,410 @@ const string tableUTest_dir = string(PROJECT_SOURCE_DIR) + "/tests/comboreduct/t class tableUTest : public CxxTest::TestSuite { private: - void testIStreamTable(const string& ts, const Table& expected_table, - const string& target_feature = string(), - const string& timestamp_feature = string(), - const vector& ignore_features = vector()) - { - std::cout << "\n----------------------------" << std::endl; - - stringstream ss(ts); - Table table; - istreamTable(ss, table, target_feature, - timestamp_feature, ignore_features); - - std::cout << "table test =" << std::endl; - std::cout << table << std::endl; - - // Due to some weird ambiguity between CxxTest::equals and - // boost::algorithm::equals we avoid using TS_ASSERT_EQUALS - TS_ASSERT(table == expected_table); - } + void testIStreamTable(const string &ts, const Table &expected_table, + const string &target_feature = string(), + const string ×tamp_feature = string(), + const vector &ignore_features = vector()) + { + std::cout << "\n----------------------------" << std::endl; + + stringstream ss(ts); + Table table; + istreamTable(ss, table, target_feature, + timestamp_feature, ignore_features); + + std::cout << "table test =" << std::endl; + std::cout << table << std::endl; + + // Due to some weird ambiguity between CxxTest::equals and + // 
boost::algorithm::equals we avoid using TS_ASSERT_EQUALS + TS_ASSERT(table == expected_table); + } + public: - - tableUTest() - { - logger().set_print_to_stdout_flag(true); - logger().set_level(Logger::DEBUG); - } - - void test_TruthTable() - { - vertex_seq o{id::logical_true, id::logical_false}; - OTable ot(o); - ITable it(vector(2, id::boolean_type)); - it.push_back({id::logical_true, id::logical_false}); - it.push_back({id::logical_false, id::logical_true}); - Table table(ot, it); - testIStreamTable("1,1,0\n" - "0,0,1\n", - table); - } - - void test_ContinTable() - { - vertex_seq o{10.1, 6.5}; - OTable ot(o); - ITable it(vector(2, id::contin_type)); - it.push_back({0.4, 244.2}); - it.push_back({9.5, 4.2}); - Table table(ot, it); - testIStreamTable("10.1,0.4,244.2\n" - "6.5,9.5,4.2\n", table); - } - - void test_ContinTable_type_inference() - { - vertex_seq o{0, 1, 10.1, 6.5}; - OTable ot(o); - ITable it(vector(2, id::contin_type)); - it.push_back({1.0, 1.0}); - it.push_back({0.0, 1.0}); - it.push_back({0.4, 244.2}); - it.push_back({9.5, 4.2}); - Table table(ot, it); - testIStreamTable("0,1,1\n" - "1,0,1\n" - "10.1,0.4,244.2\n" - "6.5,9.5,4.2\n", table); - } - - void test_ContinTableLabels() - { - vertex_seq o{0, 10.1, 6.5}; - OTable ot(o, olabel); - ITable it(vector(2, id::contin_type)); - it.push_back({1.0, 1.0}); - it.push_back({0.4, 244.2}); - it.push_back({9.5, 4.2}); - vector il{label1, label2}; - it.set_labels(il); - Table table(ot, it); - testIStreamTable("happiness,price,temperature\n" - "0,1,1\n" - "10.1,0.4,244.2\n" - "6.5,9.5,4.2\n", table); - } - - void test_EnumTableLabels() - { - vertex_seq o{enum_t("sad"), enum_t("bored"), enum_t("super-happy")}; - OTable ot(o, olabel); - vector itypes({id::contin_type, id::contin_type, id::boolean_type}); - vector il{label1, label2, "truth"}; - ITable it(itypes, il); - it.push_back(vertex_seq({1, 1, id::logical_false})); - it.push_back(vertex_seq({0.44, 244.2, id::logical_true})); - it.push_back(vertex_seq({9.5, 
4.2, id::logical_false})); - Table table(ot, it); - testIStreamTable("happiness,price,temperature,truth\n" - "sad,1,1,0\n" - "bored,0.44,244.2,T\n" - "super-happy,9.5,4.2,F\n", table); - } - - void test_filterITable() - { - std::cout << "\n----------------------------" << std::endl; - vector lbs{"truc" ,"trac"}; - ITable it(vector(2, id::boolean_type), lbs); - it.push_back({id::logical_true, id::logical_false}); - it.push_back({id::logical_false, id::logical_true}); - vertex_seq o{id::logical_true, id::logical_false,}; - // ignore second argument trac - vector args{0}; - ITable fit_it = it.filtered(args); - OTable ot(o, "troc"); - Table table(ot, fit_it); - stringstream ss; - ostreamTable(ss, table); - cout << "result: filterITable" << endl; - cout << ss.str(); - cout << "fit_it:" << endl; - cout << fit_it; - cout << "ot:" << endl; - cout << ot; - TS_ASSERT_EQUALS(ss.str(), - "troc,truc\n" - "1,1\n" - "0,0\n"); - } - - void test_filterTable() - { - string input_file = tableUTest_dir + "timestamp_table2.csv"; - Table table = loadTable(input_file, "target", "time"); - Table filtered_table = table.filtered(std::set({1})); - - stringstream ss; - ostreamTable(ss, filtered_table); - TS_ASSERT_EQUALS(ss.str(), - "input1,time,target\n" - "1,2012-03-11,6\n" - "0,2012-03-23,7\n") - } - - void test_filterCTable() - { - std::cout << "\n----------------------------" << std::endl; - vector ts(2, id::boolean_type); - vector lbs{"truc" ,"trac"}; - ITable it(ts, lbs); - it.push_back({id::logical_true, id::logical_false}); - it.push_back({id::logical_false, id::logical_true}); - vertex_seq o{id::logical_true, id::logical_false,}; - OTable ot(o, "troc"); - Table table(ot, it); - // Generate CTable - CTable ctable = table.compressed(); - // ignore second argument trac - vector args{0}; - CTable filtered_ctable = ctable.filtered(args); - stringstream ss; - ostreamCTable(ss, filtered_ctable); - cout << "test_filterCTable result:" << endl; - cout << ss.str(); - TS_ASSERT_EQUALS(ss.str(), 
- "troc,truc\n" - "{1:1},1\n" - "{0:1},0\n"); - } - void test_weightCTable() - { - std::cout << "\n----------------------------" << std::endl; - string target_feature("r"); - string timestamp_feature(""); - vector ignore_features; - Table tt = loadTable(tableUTest_dir + "weighted.csv", - target_feature, timestamp_feature, - ignore_features); - CTable ctt = tt.compressed("wgt"); - - cout << "weighted result:\n" << ctt << endl; - stringstream ss; - ss << ctt; - - TS_ASSERT_EQUALS(ss.str(), - "r,a,b,c,d,e\n" - "{1:3.5},1,1,1,1,1\n" - "{1:2.66},1,1,1,0,0\n" - "{0:2.5},0,1,1,1,1\n" - "{1:0.4,0:0.8},0,0,0,0,0\n"); - } - - // Set target Price, and ignore column Temperature - void test_target_col_ignore_col() - { - vertex_seq o{0.4, 9.5}; // price column - OTable ot(o, label1); - vector il{olabel}; - ITable it(vector({id::contin_type}), il); - it.push_back({10.1}); - it.push_back({6.5}); - Table table(ot, it); - table.target_pos = 1; - - testIStreamTable("happiness,price,temperature\n" - "10.1,0.4,244.2\n" - "6.5,9.5,4.2\n", table, - "price", "", {"temperature"}); - } - - void test_ignore_col_complicated() - { - vertex_seq o{1, 0.44, 9.5}; // price column - OTable ot(o, label1); - vector il{olabel, "truth"}; - vector itypes({id::enum_type, id::boolean_type}); - ITable it(itypes, il); - it.push_back(vertex_seq({enum_t("sad"), id::logical_false})); - it.push_back(vertex_seq({enum_t("bored"), id::logical_true})); - it.push_back(vertex_seq({enum_t("super-happy"), id::logical_false})); - Table table(ot, it); - table.target_pos = 1; - - testIStreamTable("happiness,price,temperature,truth\n" - "sad,1,1,0\n" - "bored,0.44,244.2,T\n" - "super-happy,9.5,4.2,F\n", table, - "price", "", {"temperature"}); - } - - // // This is no longer supported - // void test_ignore_col_numeric() - // { - // vertex_seq o{1, 0.44, 9.5}; // price column - // OTable ot(o, label1); - // vector itypes({id::enum_type, id::boolean_type}); - // vector il{olabel, "truth"}; - // ITable it(itypes, il); - // 
it.push_back(vertex_seq({enum_t("sad"), id::logical_false})); - // it.push_back(vertex_seq({enum_t("bored"), id::logical_true})); - // it.push_back(vertex_seq({enum_t("super-happy"), id::logical_false})); - // testIStreamTable("happiness,price,temperature,truth\n" - // "sad,1,1,0\n" - // "bored,0.44,244.2,T\n" - // "super-happy,9.5,4.2,F\n", it, ot, true, - // // numeric values for these columns; the left-most is column 1 - // // so price is column 2... - // // "price", {"temperature"}); - // "2", "", {"3"}); - // } - - void test_sparseFile1() - { - std::cout << "\n----------------------------" << std::endl; - string target_feature = "target1"; - string timestamp_feature = ""; - vector ignore_features = {"a", "b", "c", "target2"}; - Table tt = loadTable(tableUTest_dir + "sparse.data", - target_feature, timestamp_feature, - ignore_features); - - testIStreamTable("target1,date,d,e,f,g,x,y,z\n" - "thing,12-04-2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0\n" - "some,12-05-2012,4.0,0.0,6.0,0.0,0.0,0.0,0.0\n" - "other,12-06-2012T14:35:02-0500,0.0,5.0,0.0,7.0,0.0,0.0,0.0\n" - "last,12-07-2012T16:00:00-0500,4.0,5.0,0.0,0.0,24.0,25.0,26.0\n", - tt); - } - - void test_sparseFile2() - { - std::cout << "\n----------------------------" << std::endl; - string target_feature = "target2"; - string timestamp_feature = ""; - vector ignore_features = {"a", "b", "c", "target1"}; - Table tt = loadTable(tableUTest_dir + "sparse.data", - target_feature, timestamp_feature, - ignore_features); - - testIStreamTable("target2,date,d,e,f,g,x,y,z\n" - "3.0,12-04-2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0\n" - "4.4,12-05-2012,4.0,0.0,6.0,0.0,0.0,0.0,0.0\n" - "5.5,12-06-2012T14:35:02-0500,0.0,5.0,0.0,7.0,0.0,0.0,0.0\n" - "6.6,12-07-2012T16:00:00-0500,4.0,5.0,0.0,0.0,24.0,25.0,26.0\n", - tt); - } - - void test_mutualInformation() - { - std::cout << "\n----------------------------" << std::endl; - string target_feature; - string timestamps_feature; - Table tt = loadTable(tableUTest_dir + "dataset.csv", - target_feature, 
timestamps_feature); - set fs{0, 1}; - CTable ctt = tt.compressed(); - double mi = mutualInformation(ctt, fs); - cout << "mi = " << mi << endl; - TS_ASSERT_DELTA(0.419973, mi, 0.0001); - } - - void test_mutualInformationBtwSets() - { - std::cout << "\n----------------------------" << std::endl; - string target_feature; - vector ignore_features; - Table tt = loadTable(tableUTest_dir + "dataset.csv"); - - std::cout << "tt = " << tt << std::endl; - set fs1{0}; - set fs2{1}; - CTable ctt = tt.compressed(); - double mi = mutualInformationBtwSets(ctt, fs1, fs2); - cout << "mi = " << mi << endl; - TS_ASSERT_DELTA(0.3219, mi, 0.0001); - } - - void test_insert_get_pos() - { - std::cout << "\n----------------------------" << std::endl; - ITable it; - vertex_seq col1 = {id::logical_true, id::logical_false}; - it.insert_col("label1", col1); - { - stringstream ss_res; - ss_res << it; - TS_ASSERT_EQUALS(ss_res.str(), string("label1\nboolean\n1\n0\n")); - } - vertex_seq col2 = {id::logical_false, id::logical_true}; - it.insert_col("label2", col2); - { - stringstream ss_res; - ss_res << it; - TS_ASSERT_EQUALS(ss_res.str(), string("label1,label2\nboolean,boolean\n1,0\n0,1\n")); - } - auto lc = it.get_column_data("label2"); - it.insert_col("label3", lc, 0); - { - stringstream ss_res; - ss_res << it; - TS_ASSERT_EQUALS(ss_res.str(), string("label3,label1,label2\nboolean,boolean,boolean\n0,1,0\n1,0,1\n")); - } - } - - void test_loadCTable() - { - string input_file = tableUTest_dir + "dataset.ctable"; - CTable ct = loadCTable(input_file); - - stringstream ss; - ss << ct; - - TS_ASSERT_EQUALS(ss.str(), string("target,f1,f2\n" - "{1:10,0:3},1,0\n" - "{1:4,0:5},0,1\n")); - } - - void test_timestamp_feature() - { - string input_file = tableUTest_dir + "timestamp_table.csv"; - Table table = loadTable(input_file, "length", "time"); - - std::vector dates_str = {"2012-Jan-11", - "2012-Jan-13", - "2012-Feb-18"}; - std::vector dates; - for (const auto& ds : dates_str) - 
dates.push_back(TTable::from_string(ds)); - TTable expected_ttable(dates, "time"); - - // Due to some weird ambiguity between CxxTest::equals and - // boost::algorithm::equals we avoid using TS_ASSERT_EQUALS - TS_ASSERT(table.ttable == dates); - } - - void test_ostream_timestamp_table() { - { // timestamp feature before target and ignore the second feature - string input_file = tableUTest_dir + "timestamp_table2.csv"; - // vector ignore_features = ; - - Table table = loadTable(input_file, "target", "time", {"input1"}); - - stringstream ss; - ostreamTable(ss, table); - TS_ASSERT_EQUALS(ss.str(), - "input0,time,target\n" - "0,2012-03-11,6\n" - "1,2012-03-23,7\n") - } - { // timestamp feature after target - string input_file = tableUTest_dir + "timestamp_table3.csv"; - Table table = loadTable(input_file, "target", "time"); - - stringstream ss; - ostreamTable(ss, table); - TS_ASSERT_EQUALS(ss.str(), - "input1,target,time\n" - "1,6,2012-03-11\n" - "0,7,2012-03-23\n") - } - } + + tableUTest() + { + logger().set_print_to_stdout_flag(true); + logger().set_level(Logger::DEBUG); + } + + void test_TruthTable() + { + vertex_seq o{id::logical_true, id::logical_false}; + OTable ot(o); + ITable it(vector(2, id::boolean_type)); + it.push_back({id::logical_true, id::logical_false}); + it.push_back({id::logical_false, id::logical_true}); + Table table(ot, it); + testIStreamTable("1,1,0\n" + "0,0,1\n", + table); + } + + void test_ContinTable() + { + vertex_seq o{10.1, 6.5}; + OTable ot(o); + ITable it(vector(2, id::contin_type)); + it.push_back({0.4, 244.2}); + it.push_back({9.5, 4.2}); + Table table(ot, it); + testIStreamTable("10.1,0.4,244.2\n" + "6.5,9.5,4.2\n", table); + } + + void test_ContinTable_type_inference() + { + vertex_seq o{0, 1, 10.1, 6.5}; + OTable ot(o); + ITable it(vector(2, id::contin_type)); + it.push_back({1.0, 1.0}); + it.push_back({0.0, 1.0}); + it.push_back({0.4, 244.2}); + it.push_back({9.5, 4.2}); + Table table(ot, it); + testIStreamTable("0,1,1\n" + 
"1,0,1\n" + "10.1,0.4,244.2\n" + "6.5,9.5,4.2\n", table); + } + + void test_ContinTableLabels() + { + vertex_seq o{0, 10.1, 6.5}; + OTable ot(o, olabel); + ITable it(vector(2, id::contin_type)); + it.push_back({1.0, 1.0}); + it.push_back({0.4, 244.2}); + it.push_back({9.5, 4.2}); + vector il{label1, label2}; + it.set_labels(il); + Table table(ot, it); + testIStreamTable("happiness,price,temperature\n" + "0,1,1\n" + "10.1,0.4,244.2\n" + "6.5,9.5,4.2\n", table); + } + + void test_EnumTableLabels() + { + vertex_seq o{enum_t("sad"), enum_t("bored"), enum_t("super-happy")}; + OTable ot(o, olabel); + vector itypes({id::contin_type, id::contin_type, id::boolean_type}); + vector il{label1, label2, "truth"}; + ITable it(itypes, il); + it.push_back(vertex_seq({1, 1, id::logical_false})); + it.push_back(vertex_seq({0.44, 244.2, id::logical_true})); + it.push_back(vertex_seq({9.5, 4.2, id::logical_false})); + Table table(ot, it); + testIStreamTable("happiness,price,temperature,truth\n" + "sad,1,1,0\n" + "bored,0.44,244.2,T\n" + "super-happy,9.5,4.2,F\n", table); + } + + void test_filterITable() + { + std::cout << "\n----------------------------" << std::endl; + vector lbs{"truc", "trac"}; + ITable it(vector(2, id::boolean_type), lbs); + it.push_back({id::logical_true, id::logical_false}); + it.push_back({id::logical_false, id::logical_true}); + vertex_seq o{id::logical_true, id::logical_false,}; + // ignore second argument trac + vector args{0}; + ITable fit_it = it.filtered(args); + OTable ot(o, "troc"); + Table table(ot, fit_it); + stringstream ss; + ostreamTable(ss, table); + cout << "result: filterITable" << endl; + cout << ss.str(); + cout << "fit_it:" << endl; + cout << fit_it; + cout << "ot:" << endl; + cout << ot; + TS_ASSERT_EQUALS(ss.str(), + "troc,truc\n" + "1,1\n" + "0,0\n"); + } + + void test_filterTable() + { + string input_file = tableUTest_dir + "timestamp_table2.csv"; + Table table = loadTable(input_file, "target", "time"); + Table filtered_table = 
table.filtered(std::set({1})); + + stringstream ss; + ostreamTable(ss, filtered_table); + TS_ASSERT_EQUALS(ss.str(), + "input1,time,target\n" + "1,2012-03-11,6\n" + "0,2012-03-23,7\n") + } + + void test_filterCTable() + { + std::cout << "\n----------------------------" << std::endl; + vector ts(2, id::boolean_type); + vector lbs{"truc", "trac"}; + ITable it(ts, lbs); + it.push_back({id::logical_true, id::logical_false}); + it.push_back({id::logical_false, id::logical_true}); + vertex_seq o{id::logical_true, id::logical_false,}; + OTable ot(o, "troc"); + Table table(ot, it); + // Generate CTable + CTable ctable = table.compressed(); + // ignore second argument trac + vector args{0}; + CTable filtered_ctable = ctable.filtered(args); + stringstream ss; + ostreamCTable(ss, filtered_ctable); + cout << "test_filterCTable result:" << endl; + cout << ss.str(); + TS_ASSERT_EQUALS(ss.str(), + "troc,truc\n" + "{1:1},1\n" + "{0:1},0\n"); + } + + void test_weightCTable() + { + std::cout << "\n----------------------------" << std::endl; + string target_feature("r"); + string timestamp_feature(""); + vector ignore_features; + Table tt = loadTable(tableUTest_dir + "weighted.csv", + target_feature, timestamp_feature, + ignore_features); + CTable ctt = tt.compressed("wgt"); + + cout << "weighted result:\n" << ctt << endl; + stringstream ss; + ss << ctt; + + TS_ASSERT_EQUALS(ss.str(), + "r,a,b,c,d,e\n" + "{1:3.5},1,1,1,1,1\n" + "{1:2.66},1,1,1,0,0\n" + "{0:2.5},0,1,1,1,1\n" + "{1:0.4,0:0.8},0,0,0,0,0\n"); + } + + // Set target Price, and ignore column Temperature + void test_target_col_ignore_col() + { + vertex_seq o{0.4, 9.5}; // price column + OTable ot(o, label1); + vector il{olabel}; + ITable it(vector({id::contin_type}), il); + it.push_back({10.1}); + it.push_back({6.5}); + Table table(ot, it); + table.target_pos = 1; + + testIStreamTable("happiness,price,temperature\n" + "10.1,0.4,244.2\n" + "6.5,9.5,4.2\n", table, + "price", "", {"temperature"}); + } + + void 
test_ignore_col_complicated() + { + vertex_seq o{1, 0.44, 9.5}; // price column + OTable ot(o, label1); + vector il{olabel, "truth"}; + vector itypes({id::enum_type, id::boolean_type}); + ITable it(itypes, il); + it.push_back(vertex_seq({enum_t("sad"), id::logical_false})); + it.push_back(vertex_seq({enum_t("bored"), id::logical_true})); + it.push_back(vertex_seq({enum_t("super-happy"), id::logical_false})); + Table table(ot, it); + table.target_pos = 1; + + testIStreamTable("happiness,price,temperature,truth\n" + "sad,1,1,0\n" + "bored,0.44,244.2,T\n" + "super-happy,9.5,4.2,F\n", table, + "price", "", {"temperature"}); + } + + // // This is no longer supported + // void test_ignore_col_numeric() + // { + // vertex_seq o{1, 0.44, 9.5}; // price column + // OTable ot(o, label1); + // vector itypes({id::enum_type, id::boolean_type}); + // vector il{olabel, "truth"}; + // ITable it(itypes, il); + // it.push_back(vertex_seq({enum_t("sad"), id::logical_false})); + // it.push_back(vertex_seq({enum_t("bored"), id::logical_true})); + // it.push_back(vertex_seq({enum_t("super-happy"), id::logical_false})); + // testIStreamTable("happiness,price,temperature,truth\n" + // "sad,1,1,0\n" + // "bored,0.44,244.2,T\n" + // "super-happy,9.5,4.2,F\n", it, ot, true, + // // numeric values for these columns; the left-most is column 1 + // // so price is column 2... 
+ // // "price", {"temperature"}); + // "2", "", {"3"}); + // } + + void test_sparseFile1() + { + std::cout << "\n----------------------------" << std::endl; + string target_feature = "target1"; + string timestamp_feature = ""; + vector ignore_features = {"a", "b", "c", "target2"}; + Table tt = loadTable(tableUTest_dir + "sparse.data", + target_feature, timestamp_feature, + ignore_features); + + testIStreamTable("target1,date,d,e,f,g,x,y,z\n" + "thing,12-04-2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0\n" + "some,12-05-2012,4.0,0.0,6.0,0.0,0.0,0.0,0.0\n" + "other,12-06-2012T14:35:02-0500,0.0,5.0,0.0,7.0,0.0,0.0,0.0\n" + "last,12-07-2012T16:00:00-0500,4.0,5.0,0.0,0.0,24.0,25.0,26.0\n", + tt); + } + + void test_sparseFile2() + { + std::cout << "\n----------------------------" << std::endl; + string target_feature = "target2"; + string timestamp_feature = ""; + vector ignore_features = {"a", "b", "c", "target1"}; + Table tt = loadTable(tableUTest_dir + "sparse.data", + target_feature, timestamp_feature, + ignore_features); + + testIStreamTable("target2,date,d,e,f,g,x,y,z\n" + "3.0,12-04-2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0\n" + "4.4,12-05-2012,4.0,0.0,6.0,0.0,0.0,0.0,0.0\n" + "5.5,12-06-2012T14:35:02-0500,0.0,5.0,0.0,7.0,0.0,0.0,0.0\n" + "6.6,12-07-2012T16:00:00-0500,4.0,5.0,0.0,0.0,24.0,25.0,26.0\n", + tt); + } + + void test_mutualInformation() + { + std::cout << "\n----------------------------" << std::endl; + string target_feature; + string timestamps_feature; + Table tt = loadTable(tableUTest_dir + "dataset.csv", + target_feature, timestamps_feature); + set fs{0, 1}; + CTable ctt = tt.compressed(); + double mi = mutualInformation(ctt, fs); + cout << "mi = " << mi << endl; + TS_ASSERT_DELTA(0.419973, mi, 0.0001); + } + + void test_mutualInformationBtwSets() + { + std::cout << "\n----------------------------" << std::endl; + string target_feature; + vector ignore_features; + Table tt = loadTable(tableUTest_dir + "dataset.csv"); + + std::cout << "tt = " << tt << std::endl; + set 
fs1{0}; + set fs2{1}; + CTable ctt = tt.compressed(); + double mi = mutualInformationBtwSets(ctt, fs1, fs2); + cout << "mi = " << mi << endl; + TS_ASSERT_DELTA(0.3219, mi, 0.0001); + } + + void test_insert_get_pos() + { + std::cout << "\n----------------------------" << std::endl; + ITable it; + vertex_seq col1 = {id::logical_true, id::logical_false}; + it.insert_col("label1", col1); + { + stringstream ss_res; + ss_res << it; + TS_ASSERT_EQUALS(ss_res.str(), string("label1\nboolean\n1\n0\n")); + } + vertex_seq col2 = {id::logical_false, id::logical_true}; + it.insert_col("label2", col2); + { + stringstream ss_res; + ss_res << it; + TS_ASSERT_EQUALS(ss_res.str(), string("label1,label2\nboolean,boolean\n1,0\n0,1\n")); + } + auto lc = it.get_column_data("label2"); + it.insert_col("label3", lc, 0); + { + stringstream ss_res; + ss_res << it; + TS_ASSERT_EQUALS(ss_res.str(), string("label3,label1,label2\nboolean,boolean,boolean\n0,1,0\n1,0,1\n")); + } + } + + void test_loadCTable() + { + string input_file = tableUTest_dir + "dataset.ctable"; + CTable ct = loadCTable(input_file); + + stringstream ss; + ss << ct; + + TS_ASSERT_EQUALS(ss.str(), string("target,f1,f2\n" + "{1:10,0:3},1,0\n" + "{1:4,0:5},0,1\n")); + } + + void test_timestamp_feature() + { + string input_file = tableUTest_dir + "timestamp_table.csv"; + Table table = loadTable(input_file, "length", "time"); + + std::vector dates_str = {"2012-Jan-11", + "2012-Jan-13", + "2012-Feb-18"}; + std::vector dates; + for (const auto &ds : dates_str) + dates.push_back(TTable::from_string(ds)); + TTable expected_ttable(dates, "time"); + + // Due to some weird ambiguity between CxxTest::equals and + // boost::algorithm::equals we avoid using TS_ASSERT_EQUALS + TS_ASSERT(table.ttable == dates); + } + + void test_ostream_timestamp_table() + { + { // timestamp feature before target and ignore the second feature + string input_file = tableUTest_dir + "timestamp_table2.csv"; + // vector ignore_features = ; + + Table table = 
loadTable(input_file, "target", "time", {"input1"}); + + stringstream ss; + ostreamTable(ss, table); + TS_ASSERT_EQUALS(ss.str(), + "input0,time,target\n" + "0,2012-03-11,6\n" + "1,2012-03-23,7\n") + } + { // timestamp feature after target + string input_file = tableUTest_dir + "timestamp_table3.csv"; + Table table = loadTable(input_file, "target", "time"); + + stringstream ss; + ostreamTable(ss, table); + TS_ASSERT_EQUALS(ss.str(), + "input1,target,time\n" + "1,6,2012-03-11\n" + "0,7,2012-03-23\n") + } + } + void test_complete_truth_table() { HandleSeq seq_1 = {createNode(PREDICATE_NODE, "f1"), createNode(PREDICATE_NODE, "f2")}; From 2bf9c63e86daee7e1f65db5bb604bc22e319d5c8 Mon Sep 17 00:00:00 2001 From: kasim Date: Tue, 18 Sep 2018 16:06:57 +0300 Subject: [PATCH 15/17] Fix failed test case --- moses/comboreduct/table/table.cc | 7 +++---- moses/comboreduct/table/table.h | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/moses/comboreduct/table/table.cc b/moses/comboreduct/table/table.cc index b31f55e2c8..85338cbbde 100644 --- a/moses/comboreduct/table/table.cc +++ b/moses/comboreduct/table/table.cc @@ -881,7 +881,8 @@ void complete_truth_table::populate(const Handle &handle) populate_features(features); // map the values of inputs to the program. 
- setup_features(handle, features.begin(), features.end()); + auto bg = features.begin(); + setup_features(handle, bg, features.end()); atomese::Interpreter interpreter(key); std::vector result = LinkValueCast(interpreter(handle))->value(); @@ -897,15 +898,13 @@ void complete_truth_table::populate_features(std::vector &featu { auto it = begin(); for (int i = 0; it != end(); ++i, ++it) { - ProtoAtomPtrVec row; for (int j = 0; j < _arity; ++j) { ProtoAtomPtr v; if ((i >> j) % 2) v = ProtoAtomPtr(createLink(TRUE_LINK)); else v = ProtoAtomPtr(createLink(FALSE_LINK)); - row.push_back(v); + features[j].push_back(v); } - features.push_back(row); } } diff --git a/moses/comboreduct/table/table.h b/moses/comboreduct/table/table.h index bb890c29a4..c8e562db02 100644 --- a/moses/comboreduct/table/table.h +++ b/moses/comboreduct/table/table.h @@ -1715,15 +1715,14 @@ class complete_truth_table : public bool_seq * @param It to end iterator of vector containing values of variables. */ template - void setup_features(const Handle &handle, It from, It to) + void setup_features(const Handle &handle, It& from, It to) { if (from == to) return; if (PREDICATE_NODE == handle->get_type()) { - handle->setValue(createNode(NODE, "*-AS-MOSES:SchemaValuesKey-*"), - ProtoAtomPtr(new LinkValue(*from))); - ++from; + handle->setValue(key, ProtoAtomPtr(new LinkValue(*from))); + from++; return; } From 00cf7ab5c7dfd28f5c6f29f1806a24fa0a6bf30c Mon Sep 17 00:00:00 2001 From: kasim Date: Sat, 22 Sep 2018 11:18:38 +0300 Subject: [PATCH 16/17] Fix feature_setup logic error --- moses/comboreduct/table/table.cc | 23 ++++++++++++++++++-- moses/comboreduct/table/table.h | 19 +--------------- tests/comboreduct/table/tableUTest.cxxtest | 25 ++++++++++++++++++---- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/moses/comboreduct/table/table.cc b/moses/comboreduct/table/table.cc index 85338cbbde..1a6c62193d 100644 --- a/moses/comboreduct/table/table.cc +++ b/moses/comboreduct/table/table.cc @@ 
-881,8 +881,7 @@ void complete_truth_table::populate(const Handle &handle) populate_features(features); // map the values of inputs to the program. - auto bg = features.begin(); - setup_features(handle, bg, features.end()); + setup_features(handle, features); atomese::Interpreter interpreter(key); std::vector result = LinkValueCast(interpreter(handle))->value(); @@ -908,6 +907,26 @@ void complete_truth_table::populate_features(std::vector &featu } } +void complete_truth_table::setup_features(const Handle &handle, const std::vector &features) +{ + if (PREDICATE_NODE == handle->get_type()) { + // We extract the index of the feature from the name of the Predicate Node. + // the assumption is the Predicate nodes have names in [$#] format. and this + // convention is adopted from the combo counterpart. + const std::string h_name = handle->get_name(); + ProtoAtomPtrVec value = features[std::stoi(h_name.substr(h_name.find("$")+1))-1]; + + handle->setValue(key, ProtoAtomPtr(new LinkValue(value))); + return; + } + + if (handle->is_link()) { + for (Handle h : handle->getOutgoingSet()) { + setup_features(h, features); + } + } +} + ///////////////////// // Subsample table // ///////////////////// diff --git a/moses/comboreduct/table/table.h b/moses/comboreduct/table/table.h index c8e562db02..7a8ea7d259 100644 --- a/moses/comboreduct/table/table.h +++ b/moses/comboreduct/table/table.h @@ -1714,24 +1714,7 @@ class complete_truth_table : public bool_seq * @param It from beginning iterator of the vector containing values of variables. * @param It to end iterator of vector containing values of variables. 
*/ - template - void setup_features(const Handle &handle, It& from, It to) - { - if (from == to) - return; - - if (PREDICATE_NODE == handle->get_type()) { - handle->setValue(key, ProtoAtomPtr(new LinkValue(*from))); - from++; - return; - } - - if (handle->is_link()) { - for (Handle h : handle->getOutgoingSet()) { - setup_features(h, from, to); - } - } - } + void setup_features(const Handle &handle, const std::vector& features); void populate(const Handle &handle); diff --git a/tests/comboreduct/table/tableUTest.cxxtest b/tests/comboreduct/table/tableUTest.cxxtest index 92cfd753b5..0b2a8572d0 100644 --- a/tests/comboreduct/table/tableUTest.cxxtest +++ b/tests/comboreduct/table/tableUTest.cxxtest @@ -441,14 +441,31 @@ public: void test_complete_truth_table() { - HandleSeq seq_1 = {createNode(PREDICATE_NODE, "f1"), createNode(PREDICATE_NODE, "f2")}; + HandleSeq seq_1 = {createNode(PREDICATE_NODE, "$1"), createNode(PREDICATE_NODE, "$2")}; Handle or_link = createLink(seq_1, OR_LINK); - HandleSeq seq_2 = {createNode(PREDICATE_NODE, "f3"), or_link}; + HandleSeq seq_2 = {createNode(PREDICATE_NODE, "$3"), or_link}; Handle program = createLink(seq_2, AND_LINK); combo::complete_truth_table ctable(program, 3); - bool_seq expected = {false, false, false, true, false, true, false, true}; - TS_ASSERT(ctable == expected); + combo_tree c_program = combo::str2combo_tree("and(or($f1 $f2) $f3)", {"f1","f2","f3"}); + complete_truth_table c_ctable(c_program, 3); + + TS_ASSERT(ctable == c_ctable); + } + + void test_complete_truth_table_2() + { + HandleSeq seq_1 = {createNode(PREDICATE_NODE, "$3"), createNode(PREDICATE_NODE, "$2")}; + Handle or_link = createLink(seq_1, OR_LINK); + HandleSeq seq_2 = {createNode(PREDICATE_NODE, "$1"), or_link}; + Handle program = createLink(seq_2, AND_LINK); + + combo::complete_truth_table ctable(program, 3); + + combo_tree c_program = combo::str2combo_tree("and(or($f3 $f2) $f1)", {"f1","f2","f3"}); + complete_truth_table c_ctable(c_program, 3); + + 
TS_ASSERT(ctable == c_ctable); } }; From 8f4bf62f650990b791a6fc3f136729f2ec2759ad Mon Sep 17 00:00:00 2001 From: kasim Date: Tue, 25 Sep 2018 10:29:45 +0300 Subject: [PATCH 17/17] Add Comment --- moses/comboreduct/table/table.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/moses/comboreduct/table/table.h b/moses/comboreduct/table/table.h index 7a8ea7d259..ef35f931ac 100644 --- a/moses/comboreduct/table/table.h +++ b/moses/comboreduct/table/table.h @@ -1649,6 +1649,11 @@ class complete_truth_table : public bool_seq OC_ASSERT(false, "Truth table from Handle not implemented yet"); } + /** + * This constructor assumes that the program [handle] has its features named + * '$1' to '$[arity]'. This convention is required by [setup_features] in order + * to map each feature to its respective value. + * */ complete_truth_table(const Handle &handle, arity_t arity) : super(pow2(arity)), _arity(arity) {