diff --git a/extras/benchmark.cpp b/extras/benchmark.cpp index fd7788610..915dcc3c0 100644 --- a/extras/benchmark.cpp +++ b/extras/benchmark.cpp @@ -85,10 +85,6 @@ std::set BuildAllSubsplits(size_t n) { } } // namespace SubsplitSetBuilder -void Test1(int argc, char* argv[]) {} - -void Test2(int argc, char* argv[]) {} - int main(int argc, char* argv[]) { Stopwatch timer(true, Stopwatch::TimeScale::SecondScale); diff --git a/extras/benchmark.cpp.bak b/extras/benchmark.cpp.bak deleted file mode 100644 index 7b2539773..000000000 --- a/extras/benchmark.cpp.bak +++ /dev/null @@ -1,142 +0,0 @@ -#include "stopwatch.hpp" -#include "gp_instance.hpp" -#include "rooted_sbn_instance.hpp" -#include "unrooted_sbn_instance.hpp" -#include "rooted_tree_collection.hpp" - -// This is just a place to muck around, and check out performance. - -auto now = std::chrono::high_resolution_clock::now; - -// To valgrind (you can pip install gprof2dot): -// valgrind --tool=callgrind ./_build/noodle -// gprof2dot -f callgrind callgrind.out.16763 | dot -Tpng -o ~/output.png - -auto GetArgVec(int argc, char* argv[]) { - std::string current_exec_name = argv[0]; // Name of the current exec program - std::vector all_args; - if (argc > 1) { - all_args.assign(argv + 1, argv + argc); - } - return all_args; -} - -auto MakeGPInstanceFromFiles(std::string& fasta_path, std::string& newick_path) { - GPInstance inst("../_ignore/mmap.data"); - inst.ReadFastaFile(fasta_path); - inst.ReadNewickFile(newick_path); - return inst; -} - -void BuildAllSubsplitsRecurse(std::vector& subsplit_assign, size_t n, - std::set& results) { - if (n > 0) { - for (size_t i = 0; i < 3; i++) { - subsplit_assign[subsplit_assign.size() - 1 - n] = i; - BuildAllSubsplitsRecurse(subsplit_assign, n - 1, results); - } - return; - } - if (n == 0) { - Bitset clade_left(subsplit_assign.size(), false); - Bitset clade_right(subsplit_assign.size(), false); - for (size_t i = 0; i < subsplit_assign.size(); i++) { - if (subsplit_assign[i] == 0) { - clade_left.set(i); - } - if (subsplit_assign[i] == 1) { - clade_right.set(i); - } - } - if ((clade_left.Count() == 0) && (clade_right.Count() == 0)) { - return; - } - if ((clade_left.Count() == 0) && (clade_right.Count() > 1)) { - return; - } - if ((clade_left.Count() > 1) && (clade_right.Count() == 0)) { - return; - } - Bitset subsplit = Bitset::Subsplit(clade_left, clade_right); - results.insert(subsplit); - return; - } -} - -std::set BuildAllSubsplits(size_t n) { - std::vector subsplit_assign(n); - std::set all_subsplits; - BuildAllSubsplitsRecurse(subsplit_assign, n, all_subsplits); - std::cout << "all_subsplits: " << n << " " << all_subsplits.size() << " " - << all_subsplits << std::endl; - // return all_subsplits; -} - -int main(int argc, char* argv[]) { - auto args = GetArgVec(argc, argv); - if (args.size() != 2) { - std::cout << "usage: " << std::endl; - exit(0); - } - - auto fasta_path = args[0]; - auto newick_path = args[1]; - bool use_gp = true; - - std::cout << "Fasta: " << fasta_path << std::endl; - std::cout << "Newick: " << newick_path << std::endl; - - auto inst = MakeGPInstanceFromFiles(fasta_path, newick_path); - inst.MakeDAG(); - auto& dag = inst.GetDAG(); - inst.MakeGPEngine(); - inst.MakeTPEngine(); - inst.MakeNNIEngine(); - auto& nni_engine = inst.GetNNIEngine(); - - // inst.TakeFirstBranchLength(); - auto& gp_engine = inst.GetGPEngine(); - inst.EstimateBranchLengths(1e-5, 5); - inst.TPEngineSetChoiceMapByTakingFirst(); - auto& tp_engine = inst.GetTPEngine(); - tp_engine.OptimizeBranchLengths(false); - - nni_engine.SetGPLikelihoodCutoffFilteringScheme(0.0); - // nni_engine.SetTPLikelihoodCutoffFilteringScheme(0.0); - nni_engine.SetTopNScoreFilteringScheme(5); - - std::cout << "Initial DAG: " << dag.TaxonCount() << " " << dag.NodeCount() << " " - << dag.EdgeCountWithLeafSubsplits() << std::endl; - // BuildAllSubsplits(dag.TaxonCount()); - std::cout - << "TPEngine::BranchLengths: " - << tp_engine.GetLikelihoodEvalEngine().GetDAGBranchHandler().GetBranchLengthData() - << std::endl; - std::cout << "GPEngine::BranchLengths: " - << gp_engine.GetBranchLengthHandler().GetBranchLengthData() << std::endl; - - size_t iter_max = 2; - nni_engine.RunInit(false); - for (size_t iter = 0; iter < iter_max; iter++) { - auto llhs = gp_engine.GetPerGPCSPLogLikelihoods(); - std::cout << "llhs: " << llhs << std::endl; - // if (use_gp) { - // inst.MakeGPEngine(); - // GPEngine* gp_engine = &inst.GetGPEngine(); - // nni_engine.MakeGPEvalEngine(gp_engine); - // inst.EstimateBranchLengths(1e-5, 5); - // } - std::cout << "Iteration: " << iter << " of " << iter_max << std::endl; - std::cout << "DAG: " << dag.NodeCount() << " " << dag.EdgeCountWithLeafSubsplits() - << std::endl; - std::cout << "Adjacent NNIs: " << nni_engine.GetAdjacentNNICount() << std::endl; - nni_engine.RunMainLoop(false); - std::cout << "Accepted NNIs: " << nni_engine.GetAcceptedNNICount() << std::endl; - std::cout << "Scored NNIs: " << nni_engine.GetScoredNNIs() << std::endl; - nni_engine.RunPostLoop(false); - nni_engine.SyncAdjacentNNIsWithDAG(); - } - std::cout << "FINAL: " << std::endl; - std::cout << "DAG: " << dag.NodeCount() << " " << dag.EdgeCountWithLeafSubsplits() - << std::endl; -} diff --git a/src/gp_doctest.cpp b/src/gp_doctest.cpp index a96356f5f..7f52b072b 100644 --- a/src/gp_doctest.cpp +++ b/src/gp_doctest.cpp @@ -1745,7 +1745,7 @@ TEST_CASE("NNIEngine: Resize and Reindex GPEngine after AddNodePair") { GPDAG& pre_dag, GPEngine& pre_gpengine) { bool passes_resized = true; bool passes_plv_reindexed = true; - bool passes_gpcsp_reindexed = true; + bool passes_branch_reindexed = true; // Check resizing GPEngine properly. passes_resized &= (gpengine.GetNodeCount() == dag.NodeCountWithoutDAGRoot()); passes_resized &= (gpengine.GetGPCSPCount() == dag.EdgeCountWithLeafSubsplits()); @@ -1757,7 +1757,8 @@ TEST_CASE("NNIEngine: Resize and Reindex GPEngine after AddNodePair") { for (const auto plv_type : PLVTypeEnum::Iterator()) { const auto& plv_a = gpengine.GetPLVHandler().GetPV(plv_type, node_id); const auto& plv_b = pre_gpengine.GetPLVHandler().GetPV(plv_type, pre_node_id); - if (PLVNodeHandler::MaxDifference(plv_a, plv_b) > 1e-3) { + auto max_diff = PLVNodeHandler::MaxDifference(plv_a, plv_b); + if (max_diff > 1e-3) { passes_plv_reindexed = false; } } @@ -1769,10 +1770,10 @@ TEST_CASE("NNIEngine: Resize and Reindex GPEngine after AddNodePair") { const auto branch_a = branch_lengths.Get(edge_id); const auto branch_b = pre_branch_lengths.Get(pre_edge_id); if (abs(branch_a - branch_b) > 1e-3) { - passes_gpcsp_reindexed = false; + passes_branch_reindexed = false; } } - return passes_resized && passes_plv_reindexed && passes_gpcsp_reindexed; + return passes_resized && passes_plv_reindexed && passes_branch_reindexed; }; // Test that adds nodes to DAG, resizes and reindexes GPEngine and checks that // GPEngine reindexed correctly. @@ -1905,7 +1906,7 @@ TEST_CASE("NNIEngine: Resize and Reindex GPEngine after AddNodePair") { // a single node pair to DAG. auto test_2 = ResizeAndReindexGPEngineTest(10, 1, true, false); CHECK_FALSE_MESSAGE(test_2, - "TEST_2: Resize and reindex GPEngine is not incorrect when not " + "TEST_2: Resize and reindex GPEngine is not incorrect when NOT " "reindexing after single AddNodePair to DAG."); // TEST_3: Test resize and reindex GPEngine works when adding a many node pairs, // performing resizing and reindexing for each modification of DAG. @@ -1929,7 +1930,7 @@ TEST_CASE("NNIEngine: Resize and Reindex GPEngine after AddNodePair") { "TEST_5: Resized GPEngine with unmodified DAG changed results of GP Run."); }; -// This test compares NNI likelihoods computed by two different GPInstances. +// Compares NNI likelihoods computed by two different GPInstances. // - The true GPInstance adds each NNI individually to the DAG, resizes the GPEngine, // repopulates all PLVs in DAG, then recomputes the likelihoods for each NNI. // - The NNIEngine version of the GPInstance adds all NNIs to the GraftDAG, then resizes @@ -2156,6 +2157,143 @@ TEST_CASE("NNIEngine via GPEngine: Proposed NNI vs DAG NNI GPLikelihoods") { "Test_1c: Six Taxon (with optimized branch lengths) failed."); } +// This checks that potential parent or child nodes found via subsplit map lookup match +// those found via brute force (linear scan) after adding or grafting nodes to DAG. +// Repeats tests after adding or grafting NNIs to DAG. +TEST_CASE("NNIEngine: Finding Parent and Child Nodes After Adding/Grafting Nodes") { + bool is_quiet = false; + std::stringstream dev_null; + std::ostream& os = (is_quiet ? dev_null : std::cerr); + + const std::string fasta_path = "data/five_taxon.fasta"; + const std::string newick_path_1 = "data/five_taxon_rooted.nwk"; + auto inst_1 = GPInstanceOfFiles(fasta_path, newick_path_1, "_ignore/mmapped_pv.data"); + inst_1.MakeTPEngine(); + inst_1.MakeNNIEngine(); + auto& dag_1 = inst_1.GetDAG(); + auto& nniengine_1 = inst_1.GetNNIEngine(); + auto& graftdag_1 = nniengine_1.GetGraftDAG(); + nniengine_1.SetTPLikelihoodCutoffFilteringScheme(0.0); + nniengine_1.SetTopNScoreFilteringScheme(2); + // nniengine_1.SetNoFilter(); + nniengine_1.RunInit(); + + auto VectorToSet = [](const std::pair& node_id_vector) + -> std::pair, std::set> { + auto [left, right] = node_id_vector; + std::set left_set(left.begin(), left.end()); + std::set right_set(right.begin(), right.end()); + return {left_set, right_set}; + }; + + auto TestDAGFindNodeIdsViaMapVSViaScan = [&os, &dag_1, + &VectorToSet](const Bitset& subsplit) { + bool test_passed = true; + { + auto [left, right] = VectorToSet(dag_1.FindParentNodeIds(subsplit)); + auto [left_map, right_map] = VectorToSet(dag_1.FindParentNodeIdsViaMap(subsplit)); + auto [left_scan, right_scan] = + VectorToSet(dag_1.FindParentNodeIdsViaScan(subsplit)); + bool results_match = (left == left_scan and left_map == left_scan) and + (right == right_scan and right_map == right_scan); + if (!results_match) { + os << "DAG_FIND_PARENT_NODES: " << (results_match ? "PASS" : "FAIL") + << std::endl; + os << "LEFT: " << left_map << " " << left_scan << std::endl; + os << "RIGHT: " << right_map << " " << right_scan << std::endl; + } + test_passed &= results_match; + } + { + auto [left, right] = VectorToSet(dag_1.FindChildNodeIds(subsplit)); + auto [left_map, right_map] = VectorToSet(dag_1.FindChildNodeIdsViaMap(subsplit)); + auto [left_scan, right_scan] = + VectorToSet(dag_1.FindChildNodeIdsViaScan(subsplit)); + bool results_match = (left == left_scan and left_map == left_scan) and + (right == right_scan and right_map == right_scan); + if (!results_match) { + os << "DAG_FIND_CHILD_NODES: " << (results_match ? "PASS" : "FAIL") + << std::endl; + os << "LEFT: " << left_map << " " << left_scan << std::endl; + os << "RIGHT: " << right_map << " " << right_scan << std::endl; + } + test_passed &= results_match; + } + return test_passed; + }; + + auto TestGraftDAGFindNodeIdsViaMapVSViaScan = [&os, &graftdag_1, + &VectorToSet](const Bitset& subsplit) { + bool test_passed = true; + { + auto [left, right] = VectorToSet(graftdag_1.FindParentNodeIds(subsplit)); + auto [left_map, right_map] = + VectorToSet(graftdag_1.FindParentNodeIdsViaMap(subsplit)); + auto [left_scan, right_scan] = + VectorToSet(graftdag_1.FindParentNodeIdsViaScan(subsplit)); + bool results_match = (left == left_scan and left_map == left_scan) and + (right == right_scan and right_map == right_scan); + if (!results_match) { + os << "GRAFTDAG_FIND_PARENT_NODES: " << (results_match ? "PASS" : "FAIL") + << std::endl; + os << "LEFT: " << left_map << " " << left_scan << std::endl; + os << "RIGHT: " << right_map << " " << right_scan << std::endl; + } + test_passed &= results_match; + } + { + auto [left, right] = VectorToSet(graftdag_1.FindChildNodeIds(subsplit)); + auto [left_map, right_map] = + VectorToSet(graftdag_1.FindChildNodeIdsViaMap(subsplit)); + auto [left_scan, right_scan] = + VectorToSet(graftdag_1.FindChildNodeIdsViaScan(subsplit)); + bool results_match = (left == left_scan and left_map == left_scan) and + (right == right_scan and right_map == right_scan); + if (!results_match) { + os << "GRAFTDAG_FIND_CHILD_NODES: " << (results_match ? "PASS" : "FAIL") + << std::endl; + os << "LEFT: " << left_map << " " << left_scan << std::endl; + os << "RIGHT: " << right_map << " " << right_scan << std::endl; + } + test_passed &= results_match; + } + return test_passed; + }; + + for (NodeId node_id{0}; node_id < dag_1.NodeCount(); node_id++) { + auto subsplit = dag_1.GetDAGNodeBitset(node_id); + CHECK_MESSAGE(TestDAGFindNodeIdsViaMapVSViaScan(subsplit), + "Finding nodes via map does not match finding nodes via scan (before " + "adding NNIs)."); + } + + size_t max_iter = 10; + for (size_t iter = 0; iter < max_iter; iter++) { + nniengine_1.GraftAdjacentNNIsToDAG(); + for (NodeId node_id{0}; node_id < graftdag_1.NodeCount(); node_id++) { + auto subsplit = graftdag_1.GetDAGNodeBitset(node_id); + CHECK_MESSAGE( + TestGraftDAGFindNodeIdsViaMapVSViaScan(subsplit), + "Finding nodes via map does not match finding nodes via scan (before " + "adding NNIs)."); + } + nniengine_1.FilterPreUpdate(); + nniengine_1.FilterEvaluateAdjacentNNIs(); + nniengine_1.FilterPostUpdate(); + nniengine_1.FilterProcessAdjacentNNIs(); + nniengine_1.RemoveAllGraftedNNIsFromDAG(); + nniengine_1.AddAcceptedNNIsToDAG(); + for (NodeId node_id{0}; node_id < dag_1.NodeCount(); node_id++) { + auto subsplit = dag_1.GetDAGNodeBitset(node_id); + CHECK_MESSAGE( + TestDAGFindNodeIdsViaMapVSViaScan(subsplit), + "Finding nodes via map does not match finding nodes via scan (before " + "adding NNIs)."); + } + nniengine_1.RunPostLoop(); + } +} + // ** TPEngine tests ** // Makes and returns an SBNInstance. Used for truth test vs TPEngine likelihoods. @@ -3012,6 +3150,13 @@ TEST_CASE("TPEngine: Branch Length Optimization") { CHECK_LT(max_diff, tol); } +// Exports Newicks from DAG and TPEngine. DAG exports a covering set of trees, which +// should contain every node and edge from the DAG in a (unproven) minimum set of trees. +// TPEngine exports an ordered vector of trees which contains a covering set of trees +// that maintains the relative priority of edges according to the TPEngine's choice map. +// Tests this by using the exported trees to build a new DAG and TPEngine, and checks +// that new DAG and TPEngine match the old DAG and TPEngine. Repeats tests after adding +// NNIs to the DAG. TEST_CASE("TPEngine: Exporting Newicks") { const std::string fasta_path = "data/five_taxon.fasta"; const std::string newick_path_1 = "data/five_taxon_rooted.nwk"; @@ -3094,20 +3239,7 @@ TEST_CASE("TPEngine: Exporting Newicks") { bool newicks_equal = (newick_1 == newick_2); if (!newicks_equal) { std::cerr << "ERROR: Newicks do not match." << std::endl; - auto tree_map_1 = inst_1.GetTPEngine().BuildMapOfTreeIdToTopTopologies(); - std::cerr << "TREE_MAP_1: " << std::endl; - for (const auto& [tree_id, topos] : tree_map_1) { - for (const auto& topo : topos) { - std::cerr << "\tTree" << tree_id << ": " << topo->Newick() << std::endl; - } - } - auto tree_edge_map_1 = inst_1.GetTPEngine().BuildMapOfEdgeIdToTopTopologies(); - std::cerr << "TREE_EDGE_MAP_1: " << std::endl; - for (const auto& [edge_ids, topo] : tree_edge_map_1) { - std::cerr << "\tEdge " << edge_ids << ": " << topo->Newick() << std::endl; - } std::cerr << "NEWICK_1: " << std::endl << newick_1 << std::endl; - std::cerr << "NEWICK_2: " << std::endl << newick_2 << std::endl; } @@ -3177,6 +3309,60 @@ TEST_CASE("TPEngine: Exporting Newicks") { } } +// PVHandler can be reindexed using two methods. Either via move-copy, where PVs are +// moved so that the PV vector is ordered according to the order of the node_ids they +// represent in the DAG, or via remapping, where a PV map is updated to point at the +// correct PV when given a node_id, which avoids copying. Test checks that both +// methods have PV with same values. Repeats tests after adding NNIs to DAG. +TEST_CASE("TPEngine: Resize and Reindex PV Handler") { + const std::string fasta_path = "data/five_taxon.fasta"; + const std::string newick_path_1 = "data/five_taxon_rooted.nwk"; + auto inst_1 = + GPInstanceOfFiles(fasta_path, newick_path_1, "_ignore/mmapped_pv.data1"); + auto inst_2 = + GPInstanceOfFiles(fasta_path, newick_path_1, "_ignore/mmapped_pv.data2"); + // NNI Engine that uses remapping for reindexing PVs. + inst_1.MakeTPEngine(); + inst_1.MakeNNIEngine(); + auto& tpengine_1 = inst_1.GetTPEngine(); + auto& nniengine_1 = inst_1.GetNNIEngine(); + auto& pvs_1 = tpengine_1.GetLikelihoodEvalEngine().GetPVs(); + pvs_1.SetUseRemapping(false); + nniengine_1.SetTPLikelihoodCutoffFilteringScheme(0.0); + nniengine_1.SetTopNScoreFilteringScheme(1); + nniengine_1.RunInit(); + // NNI Engine that does NOT use remapping for reindexing PVs. + inst_2.MakeTPEngine(); + inst_2.MakeNNIEngine(); + auto& tpengine_2 = inst_2.GetTPEngine(); + auto& nniengine_2 = inst_2.GetNNIEngine(); + auto& pvs_2 = tpengine_2.GetLikelihoodEvalEngine().GetPVs(); + pvs_2.SetUseRemapping(false); + nniengine_2.SetTPLikelihoodCutoffFilteringScheme(0.0); + nniengine_2.SetTopNScoreFilteringScheme(1); + nniengine_2.RunInit(); + + CHECK_MESSAGE(nniengine_1.GetDAG() == nniengine_2.GetDAG(), + "DAGs do not match (before adding NNIs)."); + auto pvs_match = pvs_1.Compare(pvs_1, pvs_2, false); + CHECK_MESSAGE(pvs_match, "PVs do not match (before adding NNIs)."); + + size_t max_iter = 10; + for (size_t iter = 0; iter < max_iter; iter++) { + bool pvs_match; + nniengine_1.RunMainLoop(); + nniengine_1.RunPostLoop(); + pvs_match = pvs_1.Compare(pvs_1, pvs_2, true); + CHECK_FALSE_MESSAGE( + pvs_match, "PVs incorrectly match (after adding NNIs to only one NNIEngine)."); + + nniengine_2.RunMainLoop(); + nniengine_2.RunPostLoop(); + pvs_match = pvs_1.Compare(pvs_1, pvs_2, false); + CHECK_MESSAGE(pvs_match, "PVs do not match (after adding NNIs)."); + } +} + // ** DAGData tests ** // Builds DAGData vectors from the nodes and edges of a DAG. Checks that data is diff --git a/src/nni_engine.cpp b/src/nni_engine.cpp index 14048d23c..5510b8f9b 100644 --- a/src/nni_engine.cpp +++ b/src/nni_engine.cpp @@ -180,7 +180,8 @@ void NNIEngine::UpdateEvalEngineAfterModifyingDAG( void NNIEngine::ScoreAdjacentNNIs() { // Split scored NNIs from ones we can just fetch previous computations and NNIs we // need to freshly compute. - NNISet old_nnis, new_nnis; + new_nni_count_ = old_nni_count_ = 0; + NNISet new_nnis; for (const auto &nni : GetAdjacentNNIs()) { bool is_new_nni = (GetPastScoredNNIs().find(nni) == GetPastScoredNNIs().end()); bool do_rescore_nni = GetRescoreRejectedNNIs(); @@ -188,9 +189,13 @@ void NNIEngine::ScoreAdjacentNNIs() { if (is_new_nni or do_rescore_nni) { new_nnis.insert(nni); } else if (do_reeval_nni) { - old_nnis.insert(nni); GetScoredNNIs()[nni] = GetPastScoredNNIs().find(nni)->second; } + if (is_new_nni) { + new_nni_count_++; + } else { + old_nni_count_++; + } } if (IsEvalEngineInUse(NNIEvalEngineType::GPEvalEngine)) { @@ -659,6 +664,7 @@ void NNIEngine::AddAcceptedNNIsToDAG(const bool is_quiet) { // auto mods = GetDAG().AddNodes(nodes_to_add); // node_reindexer_ = mods.node_reindexer; // edge_reindexer_ = mods.edge_reindexer; + RemoveAllGraftedNNIsFromDAG(); os << "AddAcceptedNNIsToDAG::AddAll: " << timer.Lap() << std::endl; GrowEvalEngineForDAG(node_reindexer_, edge_reindexer_); diff --git a/src/nni_engine.hpp b/src/nni_engine.hpp index 8a13cff54..6dc5281c6 100644 --- a/src/nni_engine.hpp +++ b/src/nni_engine.hpp @@ -163,6 +163,9 @@ class NNIEngine { return scores; } + size_t GetNewNNICount() const { return new_nni_count_; } + size_t GetOldNNICount() const { return old_nni_count_; } + // Get node reindexer const Reindexer &GetNodeReindexer() const { return node_reindexer_; } // Get edge reindexer @@ -460,6 +463,9 @@ class NNIEngine { NNIDoubleMap scored_nnis_; // Map of previous rejected NNIs to their score. NNIDoubleMap scored_past_nnis_; + // Holds the counts of new NNIs (NNI not found in previous iterations) and old NNIs. + size_t new_nni_count_ = 0; + size_t old_nni_count_ = 0; // Steps of filtering scheme. StaticFilterInitFunction filter_init_fn_ = nullptr; diff --git a/src/pv_handler.cpp b/src/pv_handler.cpp index d25f8cd3b..3e613873b 100644 --- a/src/pv_handler.cpp +++ b/src/pv_handler.cpp @@ -29,13 +29,33 @@ void PartialVectorHandler::Resize( for (size_t i = old_pv_count; i < GetPaddedPVCount(); i++) { pvs_.at(i).setZero(); } + pv_reindexer_.Resize(GetPaddedPVCount()); } template void PartialVectorHandler::Reindex( const Reindexer pv_reindexer) { - Reindexer::ReindexInPlace(pvs_, pv_reindexer, GetPVCount(), GetPV(GetPVCount()), + ReindexViaRemap(pv_reindexer); + if (!(pv_reindexer.size() < (reindexer_init_size_ * 1.5)) or !use_remapping_) { + ReindexViaMoveCopy(pv_reindexer); + } +} + +template +void PartialVectorHandler::ReindexViaMoveCopy( + const Reindexer pv_reindexer) { + Reindexer::ReindexInPlace(pvs_, pv_reindexer_, GetPVCount(), GetPV(GetPVCount()), GetPV(GetPVCount() + 1)); + pv_reindexer_ = Reindexer::IdentityReindexer(GetPaddedPVCount()); + reindexer_init_size_ = pv_reindexer_.size(); +} + +template +void PartialVectorHandler::ReindexViaRemap( + const Reindexer pv_reindexer) { + pv_reindexer_.Resize(pv_reindexer.size()); + pv_reindexer_ = pv_reindexer_.ComposeWith(pv_reindexer); + pv_reindexer_.Resize(GetPaddedPVCount()); } template diff --git a/src/pv_handler.hpp b/src/pv_handler.hpp index 09406313c..0bbcac933 100644 --- a/src/pv_handler.hpp +++ b/src/pv_handler.hpp @@ -121,7 +121,10 @@ class PartialVectorHandler { mmapped_master_pvs_(mmap_file_path_, (elem_count + element_spare_count_) * pv_count_per_element_ * size_t(resizing_factor_) * - pattern_count) {} + pattern_count) { + pv_reindexer_ = Reindexer::IdentityReindexer(GetPaddedPVCount()); + reindexer_init_size_ = pv_reindexer_.size(); + } // ** Counts @@ -152,6 +155,10 @@ class PartialVectorHandler { std::optional new_element_spare = std::nullopt); // Reindex PV according to pv_reindexer. void Reindex(const Reindexer pv_reindexer); + // Reindex PVs by moving data to align with reindexer by copying. + void ReindexViaMoveCopy(const Reindexer pv_reindexer); + // Reindex PVs by updating the map from pv_id to data index. + void ReindexViaRemap(const Reindexer pv_reindexer); // Expand element_reindexer into pv_reindexer. Reindexer BuildPVReindexer(const Reindexer &element_reindexer, const size_t old_elem_count, const size_t new_elem_count); @@ -162,9 +169,15 @@ class PartialVectorHandler { NucleotidePLVRefVector &GetPVs() { return pvs_; } const NucleotidePLVRefVector &GetPVs() const { return pvs_; } // Get PV by absolute index from the vector of Partial Vectors. - NucleotidePLVRef &GetPV(const PVId pv_id) { return pvs_.at(pv_id.value_); } + NucleotidePLVRef &GetPV(const PVId pv_id) { + auto &pv = pvs_.at(pv_reindexer_.GetOldIndexByNewIndex(pv_id.value_)); + Assert(pv_id < GetAllocatedPVCount(), "pv_id outside valid range."); + return pv; + } const NucleotidePLVRef &GetPV(const PVId pv_id) const { - return pvs_.at(pv_id.value_); + auto &pv = pvs_.at(pv_reindexer_.GetOldIndexByNewIndex(pv_id.value_)); + Assert(pv_id < GetAllocatedPVCount(), "pv_id outside valid range."); + return pv; } NucleotidePLVRef &operator()(const PVId pv_id) { return GetPV(pv_id); } const NucleotidePLVRef &operator()(const PVId pv_id) const { return GetPV(pv_id); } @@ -233,6 +246,9 @@ class PartialVectorHandler { return pv_ids; } + // PV Reindexer, which serves as the data map to sort PV data. + const Reindexer &GetPVReindexer() const { return pv_reindexer_; } + // ** PV Operations std::pair ValueRange(const PVId pvid) const { @@ -310,6 +326,12 @@ class PartialVectorHandler { }; return ApplyBinaryOperation(dest_pvid, src_pvid_a, src_pvid_b, SubtractFunc); } + void AbsDiff(const PVId dest_pvid, const PVId src_pvid_a, const PVId src_pvid_b) { + auto AbsDiffFunc = [](const double src_a, const double src_b) { + return abs(src_a - src_b); + }; + return ApplyBinaryOperation(dest_pvid, src_pvid_a, src_pvid_b, AbsDiffFunc); + } void Multiply(const PVId dest_pvid, const PVId src_pvid_a, const PVId src_pvid_b) { auto MultiplyFunc = [](const double src_a, const double src_b) { return (src_a * src_b); @@ -400,6 +422,34 @@ class PartialVectorHandler { return pvid_map; } + void SetUseRemapping(bool use_remapping) { use_remapping_ = use_remapping; } + bool GetUseRemapping() const { return use_remapping_; } + + static int Compare(const PartialVectorHandler &pv_lhs, + const PartialVectorHandler &pv_rhs, + const bool is_quiet = true) { + std::stringstream dev_null; + std::ostream &os = (is_quiet ? dev_null : std::cerr); + + bool pv_count_match = (pv_lhs.GetPVCount() == pv_rhs.GetPVCount()); + if (!pv_count_match) { + os << "PVHandler::Compare: PV Counts do not match." << std::endl; + return false; + } + bool pvs_match = true; + for (PVId pv_id(0); pv_id < pv_lhs.GetPVCount(); pv_id++) { + bool pv_match = (pv_lhs.GetPV(pv_id) == pv_rhs.GetPV(pv_id)); + pvs_match &= pv_match; + if (!pvs_match) { + os << "PVHandler::Compare: PVs do not match at PV" << pv_id << std::endl; + os << "PV_LHS:" << std::endl << pv_lhs.ToString(pv_id, true) << std::endl; + os << "PV_RHS:" << std::endl << pv_rhs.ToString(pv_id, true) << std::endl; + return false; + } + } + return true; + } + protected: // Get total offset into PVs. static PVId GetPVIndex(const size_t pv_type_id, const DAGElementId elem_id, @@ -445,6 +495,12 @@ class PartialVectorHandler { // - [4*num_nodes, 5*num_nodes): r(s_right). // - [5*num_nodes, 6*num_nodes): r(s_left). NucleotidePLVRefVector pvs_; + + // Reindex map for finding pv locations. + Reindexer pv_reindexer_; + size_t reindexer_init_size_ = 0; + // Whether to use remapping to reindex PLVs, otherwise only use + bool use_remapping_ = true; }; // PLVHandler: Partial Likelihood Vector Handler diff --git a/src/reindexer.cpp b/src/reindexer.cpp index 5d0d63b11..ed1c8857e 100644 --- a/src/reindexer.cpp +++ b/src/reindexer.cpp @@ -26,6 +26,14 @@ bool Reindexer::IsValid(std::optional length) const { // ** Modification Operations +void Reindexer::Resize(size_t new_size) { + const auto old_size = GetData().size(); + GetData().resize(new_size); + for (size_t i = old_size; i < new_size; i++) { + GetData()[i] = i; + } +} + // Gets the inverse of a given reindexer. Reindexer Reindexer::InvertReindexer() const { Assert(IsValid(), "Reindexer must be valid in Reindexer::InvertedReindexer."); diff --git a/src/reindexer.hpp b/src/reindexer.hpp index b1d463d64..3104c7885 100644 --- a/src/reindexer.hpp +++ b/src/reindexer.hpp @@ -27,6 +27,7 @@ class Reindexer { Reindexer(SizeVector data) : data_(std::move(data)){}; // ** Special Constructors + // For each position in a identity reindexer, reindexer[`i`] = `i`. // E.g. for size = 5, reindexer = [0, 1, 2, 3, 4]. static Reindexer IdentityReindexer(const size_t size); @@ -65,6 +66,9 @@ class Reindexer { // ** Modification Operations + // Resize reindexer. If new indices are created, pad them with identity. + void Resize(size_t new_size); + // Builds new inverse reindexer of a given reindexer, such that input->output becomes // output->input. Reindexer InvertReindexer() const; diff --git a/src/subsplit_dag.cpp b/src/subsplit_dag.cpp index a0d625689..d7837b5b9 100644 --- a/src/subsplit_dag.cpp +++ b/src/subsplit_dag.cpp @@ -42,6 +42,8 @@ SubsplitDAG::SubsplitDAG(SubsplitDAG &host_dag, HostDispatchTag) : storage_{host_dag.storage_, HostDispatchTag()}, dag_taxa_{host_dag.dag_taxa_}, subsplit_to_id_{host_dag.subsplit_to_id_}, + subsplit_union_{host_dag.subsplit_union_}, + subsplit_clade_{host_dag.subsplit_clade_}, parent_to_child_range_{host_dag.parent_to_child_range_}, taxon_count_{host_dag.taxon_count_}, edge_count_without_leaf_subsplits_{host_dag.edge_count_without_leaf_subsplits_}, @@ -57,6 +59,11 @@ void SubsplitDAG::ResetHostDAG(SubsplitDAG &host_dag) { edge_count_without_leaf_subsplits_ = host_dag.edge_count_without_leaf_subsplits_; topology_count_ = host_dag.topology_count_; topology_count_below_ = host_dag.topology_count_below_; + + subsplit_union_ = host_dag.subsplit_union_; + subsplit_clade_ = host_dag.subsplit_clade_; + subsplit_union_graft_.clear(); + subsplit_clade_graft_.clear(); } // ** Comparators @@ -380,6 +387,22 @@ SizeBitsetMap SubsplitDAG::BuildInverseEdgeIndexer() const { // ** Access +const BitsetNodeIdSetMap &SubsplitDAG::GetSubsplitUnionMap() const { + return subsplit_union_; +} + +const BitsetNodeIdSetMap &SubsplitDAG::GetSubsplitUnionGraftMap() const { + return subsplit_union_graft_; +} + +const BitsetNodeIdSetMap &SubsplitDAG::GetSubsplitCladeMap() const { + return subsplit_clade_; +} + +const BitsetNodeIdSetMap &SubsplitDAG::GetSubsplitCladeGraftMap() const { + return subsplit_clade_graft_; +} + TaxonIdVector SubsplitDAG::GetTaxonIds() const { TaxonIdVector taxon_ids; for (TaxonId taxon_id = 0; taxon_id < TaxonCount(); taxon_id++) { @@ -411,7 +434,7 @@ Bitset SubsplitDAG::GetDAGNodeBitset(const NodeId node_id) const { NodeId SubsplitDAG::GetDAGNodeId(const Bitset &subsplit) const { Assert(ContainsNode(subsplit), "Node with the given subsplit does not exist in DAG."); - if (storage_.HaveHost()) { + if (ContainsGraft()) { auto node = storage_.FindVertex(subsplit); return NodeId(node->get().GetId()); } @@ -1110,29 +1133,35 @@ NodeId SubsplitDAG::CreateAndInsertNode(const Bitset &subsplit) { storage_.AddVertex({node_id, subsplit}); SafeInsert(subsplit_to_id_, subsplit, node_id); - if (!storage_.HaveHost()) { + for (const auto contains_graft : {false, true}) { + // Determine to add node to map as a real node or as a graft. + if (contains_graft != ContainsGraft()) continue; + auto &subsplit_union_map = + !ContainsGraft() ? subsplit_union_ : subsplit_union_graft_; + auto &subsplit_clade_map = + !ContainsGraft() ? subsplit_clade_ : subsplit_clade_graft_; // Add Node to adjacency maps. if (!subsplit.SubsplitIsUCA()) { // Add clade union to map. const auto subsplit_union = subsplit.SubsplitCladeUnion(); - if (subsplit_union_.find(subsplit_union) == subsplit_union_.end()) { - subsplit_union_[subsplit_union] = NodeIdSet(); + if (subsplit_union_map.find(subsplit_union) == subsplit_union_map.end()) { + subsplit_union_map[subsplit_union] = NodeIdSet(); } - subsplit_union_[subsplit_union].insert(node_id); + subsplit_union_map[subsplit_union].insert(node_id); } if (!subsplit.SubsplitIsLeaf()) { // Add left clade to map. const auto subsplit_left = subsplit.SubsplitGetClade(SubsplitClade::Left); - if (subsplit_clade_.find(subsplit_left) == subsplit_clade_.end()) { - subsplit_clade_[subsplit_left] = NodeIdSet(); + if (subsplit_clade_map.find(subsplit_left) == subsplit_clade_map.end()) { + subsplit_clade_map[subsplit_left] = NodeIdSet(); } - subsplit_clade_[subsplit_left].insert(node_id); + subsplit_clade_map[subsplit_left].insert(node_id); // Add right clade to map. const auto subsplit_right = subsplit.SubsplitGetClade(SubsplitClade::Right); - if (subsplit_clade_.find(subsplit_right) == subsplit_clade_.end()) { - subsplit_clade_[subsplit_right] = NodeIdSet(); + if (subsplit_clade_map.find(subsplit_right) == subsplit_clade_map.end()) { + subsplit_clade_map[subsplit_right] = NodeIdSet(); } - subsplit_clade_[subsplit_right].insert(node_id); + subsplit_clade_map[subsplit_right].insert(node_id); } } @@ -1447,12 +1476,14 @@ TagStringMap SubsplitDAG::BuildDummyTagTaxonMap(const size_t taxon_count) { // ** Query DAG +bool SubsplitDAG::ContainsGraft() const { return storage_.HaveHost(); } + bool SubsplitDAG::ContainsTaxon(const std::string &name) const { return dag_taxa_.find(name) != dag_taxa_.end(); } bool SubsplitDAG::ContainsNode(const Bitset &subsplit) const { - if (storage_.HaveHost()) { + if (ContainsGraft()) { if (storage_.FindVertex(subsplit).has_value()) return true; } return subsplit_to_id_.find(subsplit) != subsplit_to_id_.end(); @@ -1645,18 +1676,20 @@ NodeIdVectorPair SubsplitDAG::FindParentNodeIdsViaMap(const Bitset &subsplit) co return {left_parents, right_parents}; } const auto subsplit_union = subsplit.SubsplitCladeUnion(); - if (subsplit_clade_.find(subsplit_union) != subsplit_clade_.end()) { - const auto &parents = subsplit_clade_.find(subsplit_union)->second; - for (const auto parent_id : parents) { - const auto parent_subsplit = GetDAGNodeBitset(parent_id); - // Sort results in left and right clades. - for (const auto clade : SubsplitCladeEnum::Iterator()) { - const auto parent_clade = parent_subsplit.SubsplitGetClade(clade); - if (parent_clade == subsplit_union) { - if (clade == SubsplitClade::Left) { - left_parents.push_back(parent_id); - } else { - right_parents.push_back(parent_id); + for (const auto &subsplit_map : {subsplit_clade_, subsplit_clade_graft_}) { + if (subsplit_map.find(subsplit_union) != subsplit_map.end()) { + const auto &parents = subsplit_map.find(subsplit_union)->second; + for (const auto parent_id : parents) { + const auto parent_subsplit = GetDAGNodeBitset(parent_id); + // Sort results in left and right clades. + for (const auto clade : SubsplitCladeEnum::Iterator()) { + const auto parent_clade = parent_subsplit.SubsplitGetClade(clade); + if (parent_clade == subsplit_union) { + if (clade == SubsplitClade::Left) { + left_parents.push_back(parent_id); + } else { + right_parents.push_back(parent_id); + } } } } @@ -1672,15 +1705,19 @@ NodeIdVectorPair SubsplitDAG::FindChildNodeIdsViaMap(const Bitset &subsplit) con if (subsplit.SubsplitIsLeaf()) { return {left_children, right_children}; } - const auto subsplit_left = subsplit.SubsplitGetClade(SubsplitClade::Left); - if (subsplit_union_.find(subsplit_left) != subsplit_union_.end()) { - const auto &left_children_input = subsplit_union_.find(subsplit_left)->second; - left_children.assign(left_children_input.begin(), left_children_input.end()); - } - const auto subsplit_right = subsplit.SubsplitGetClade(SubsplitClade::Right); - if (subsplit_union_.find(subsplit_right) != subsplit_union_.end()) { - const auto &right_children_input = subsplit_union_.find(subsplit_right)->second; - right_children.assign(right_children_input.begin(), right_children_input.end()); + for (const auto &subsplit_map : {subsplit_union_, subsplit_union_graft_}) { + const auto subsplit_left = subsplit.SubsplitGetClade(SubsplitClade::Left); + if (subsplit_map.find(subsplit_left) != subsplit_map.end()) { + const auto &new_left_children = subsplit_map.find(subsplit_left)->second; + left_children.insert(left_children.end(), new_left_children.begin(), + new_left_children.end()); + } + const auto subsplit_right = subsplit.SubsplitGetClade(SubsplitClade::Right); + if (subsplit_map.find(subsplit_right) != subsplit_map.end()) { + const auto &new_right_children = subsplit_map.find(subsplit_right)->second; + right_children.insert(right_children.end(), new_right_children.begin(), + new_right_children.end()); + } } return {left_children, right_children}; } @@ -1690,7 +1727,7 @@ NodeIdVectorPair SubsplitDAG::FindParentNodeIdsViaScan(const Bitset &subsplit, // Linear search for all parents. NodeIdVector left_parents, right_parents; for (const auto &[potential_parent_subsplit, node_id] : subsplit_to_id_) { - if (graft_nodes_only && (node_id >= NodeCount())) continue; + if (graft_nodes_only && (node_id < NodeCount())) continue; if (subsplit.SubsplitIsLeftChildOf(potential_parent_subsplit)) { left_parents.push_back(node_id); } else if (subsplit.SubsplitIsRightChildOf(potential_parent_subsplit)) { @@ -1705,7 +1742,7 @@ NodeIdVectorPair SubsplitDAG::FindChildNodeIdsViaScan(const Bitset &subsplit, // Linear search for all children. NodeIdVector left_children, right_children; for (const auto &[potential_child_subsplit, node_id] : subsplit_to_id_) { - if (graft_nodes_only && (node_id >= NodeCount())) continue; + if (graft_nodes_only && (node_id < NodeCount())) continue; if (potential_child_subsplit.SubsplitIsLeftChildOf(subsplit)) { left_children.push_back(node_id); } else if (potential_child_subsplit.SubsplitIsRightChildOf(subsplit)) { @@ -1716,21 +1753,11 @@ NodeIdVectorPair SubsplitDAG::FindChildNodeIdsViaScan(const Bitset &subsplit, } NodeIdVectorPair SubsplitDAG::FindParentNodeIds(const Bitset &subsplit) const { - auto [left, right] = FindParentNodeIdsViaMap(subsplit); - // If DAG contains grafted nodes, linearly scan for grafted nodes parents. - if (storage_.HaveHost()) { - return FindParentNodeIdsViaScan(subsplit); - } - return {left, right}; + return FindParentNodeIdsViaMap(subsplit); } NodeIdVectorPair SubsplitDAG::FindChildNodeIds(const Bitset &subsplit) const { - auto [left, right] = FindChildNodeIdsViaMap(subsplit); - // If DAG contains grafted nodes, linearly scan for grafted nodes children. - if (storage_.HaveHost()) { - return FindChildNodeIdsViaScan(subsplit); - } - return {left, right}; + return FindChildNodeIdsViaMap(subsplit); } NodeId SubsplitDAG::FindFirstParentNodeId(const Bitset &subsplit) const { @@ -1759,7 +1786,6 @@ void SubsplitDAG::ConnectChildToAllChildren(const Bitset &child_subsplit, EdgeIdVector &added_edge_idxs) { const auto [left_leafward_of_child, right_leafward_of_child] = FindChildNodeIds(child_subsplit); - for (const auto &[children_of_child, rotated] : std::vector>{{left_leafward_of_child, true}, {right_leafward_of_child, false}}) { @@ -1847,7 +1873,7 @@ SubsplitDAG::ModificationResult SubsplitDAG::AddNodePair(const Bitset &parent_su "The given pair of nodes is incompatible with DAG in " "SubsplitDAG::AddNodePair."); // Perform add node pair operation. - auto results = AddNodePairInternals(parent_subsplit, child_subsplit); + auto mods = AddNodePairInternals(parent_subsplit, child_subsplit); // Check that node pair was added correctly. if (!ContainsEdge(parent_subsplit, child_subsplit)) { std::cerr << "contains_parent: " << ContainsNode(parent_subsplit) << std::endl; @@ -1855,7 +1881,7 @@ SubsplitDAG::ModificationResult SubsplitDAG::AddNodePair(const Bitset &parent_su } Assert(ContainsEdge(parent_subsplit, child_subsplit), "AddNodePair failed to add given node pair."); - return results; + return mods; } SubsplitDAG::ModificationResult SubsplitDAG::AddNodes( @@ -1872,7 +1898,6 @@ SubsplitDAG::ModificationResult SubsplitDAG::AddNodePairInternals( SubsplitDAG::ModificationResult SubsplitDAG::AddNodePairInternals( const std::vector> &node_subsplit_pairs) { - Stopwatch timer(true, Stopwatch::TimeScale::SecondScale); // Initialize output vectors. ModificationResult mods; // Note: `prev_node_count` acts as a place marker. We know what the DAG root @@ -1908,7 +1933,7 @@ SubsplitDAG::ModificationResult SubsplitDAG::AddNodePairInternals( AddNodePairInternalsWithoutReindexing(node_subsplit_pairs, mods); // If GraftDAG, does not perform reindexing. - if (!storage_.HaveHost()) { + if (!ContainsGraft()) { // Create reindexers. mods.node_reindexer = BuildNodeReindexer(mods.prv_node_count); mods.edge_reindexer = BuildEdgeReindexer(mods.prv_edge_count); diff --git a/src/subsplit_dag.hpp b/src/subsplit_dag.hpp index a8117e97b..2b2ab2f25 100644 --- a/src/subsplit_dag.hpp +++ b/src/subsplit_dag.hpp @@ -221,6 +221,12 @@ class SubsplitDAG { // Get reference to parent_node -> child_edge_range map. const NodeIdEdgeIdPairMap &GetParentNodeToChildEdgeRangeMap() const; + // Maps for finding potential node neighbors. + const BitsetNodeIdSetMap &GetSubsplitUnionMap() const; + const BitsetNodeIdSetMap &GetSubsplitUnionGraftMap() const; + const BitsetNodeIdSetMap &GetSubsplitCladeMap() const; + const BitsetNodeIdSetMap &GetSubsplitCladeGraftMap() const; + // ** DAG Lambda Iterators // These methods iterate over the nodes and take lambda functions with arguments // relative to current node. @@ -384,6 +390,8 @@ class SubsplitDAG { // ** Query DAG + // Does DAG have attached graftDAG? + bool ContainsGraft() const; // Does a taxon with the given name exist in DAG? bool ContainsTaxon(const std::string &name) const; // Does a node with the given subsplit exist in DAG? @@ -729,10 +737,12 @@ class SubsplitDAG { // The subsplit_union_ map contains a map from every clade union in the DAG to the set // of all node_ids which contain that clade union. BitsetNodeIdSetMap subsplit_union_; + BitsetNodeIdSetMap subsplit_union_graft_; // The subsplit_clade_ map ontains a map from every left and right clades of every // subsplit in the DAG to the set of all node_ids which contain that clade as one of // its sides. BitsetNodeIdSetMap subsplit_clade_; + BitsetNodeIdSetMap subsplit_clade_graft_; // - Map of all DAG Nodes: // - [ Node Subsplit (Bitset) => (begin, end) Range of Child Ids ]