diff --git a/Makefile b/Makefile index dd6c1965098..166c2ce5378 100644 --- a/Makefile +++ b/Makefile @@ -521,7 +521,7 @@ $(LIB_DIR)/cleaned_old_boost: $(wildcard $(LIB_DIR)/libboost_*) $(wildcard $(INC +rm -Rf $(INC_DIR)/boost +touch $(LIB_DIR)/cleaned_old_boost -$(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp +$(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ +. ./source_me.sh && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build && mkdir build && cd build && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=$(CWD) -DCMAKE_LIBRARY_PATH=$(CWD)/$(LIB_DIR) -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install diff --git a/deps/libvgio b/deps/libvgio index 92182e38025..ecf5c600294 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit 92182e38025c5c0feab5e883331d9b33feb4c22b +Subproject commit ecf5c60029448dc1a837cfbcddfe1b34722f9dd1 diff --git a/src/algorithms/back_translate.cpp b/src/algorithms/back_translate.cpp index cbf3e36fb7c..ef07863d255 100644 --- a/src/algorithms/back_translate.cpp +++ b/src/algorithms/back_translate.cpp @@ -4,12 +4,40 @@ #include "back_translate.hpp" #include "../path.hpp" +#include "../snarls.hpp" namespace vg { namespace algorithms { using namespace std; +/** + * Erase the items at the given indices in the given Protobuf repeated pointer field. + */ +template +static void erase_at(RepeatedPtrField* field, const vector& sorted_indices_to_remove) { + if (sorted_indices_to_remove.empty()) { + // We don't need to do anything. + return; + } + + // This is how many slots ahead we are reading to write the current slot. + // It's also (1+) the cursor in sorted_indices_to_remove + size_t slots_ahead = 1; + for (size_t i = sorted_indices_to_remove[slots_ahead - 1]; i + sorted_indices_to_remove.size() < field->size(); i++) { + // We're at a slot that will need to be filled in the output. + while (slots_ahead < sorted_indices_to_remove.size() && i + slots_ahead == sorted_indices_to_remove[slots_ahead]) { + // Slide copy source ahead until it stops being the next thing to skip. + slots_ahead++; + } + // Overwrite the item here with the item that is supposed to be here. + // We use a swap so we don't have to actually copy any item data, just pointers. + field->SwapElements(i, i + slots_ahead); + } + // Now we've bumped all the unused items to the end so delete them. + field->DeleteSubrange(field->size() - sorted_indices_to_remove.size(), sorted_indices_to_remove.size()); +} + void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& path) { // For combining mappings that can combine, we need to track where the previous mapping ended. nid_t prev_segment_number = numeric_limits::max(); @@ -35,7 +63,7 @@ void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& if (translated.size() != 1) { // TODO: Implement translations that split graph nodes into multiple segments. - throw std::runtime_error("Translated range on node " + to_string(mapping->position().node_id()) + + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + " to " + to_string(translated.size()) + " named segment ranges, but complex translations like this are not yet implemented"); } @@ -43,7 +71,7 @@ void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& auto& translated_range = translated[0]; if (get<1>(translated_range) != get<1>(source_range)) { // TODO: Implement translations that flip orientations. - throw std::runtime_error("Translated range on node " + to_string(mapping->position().node_id()) + + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + " ended up on the opposite strand; complex translations like this are not yet implemented"); } @@ -94,28 +122,90 @@ void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& } } - if (mapping_indices_to_remove.empty()) { - // We don't need to fix up the mapping list - return; + // We need to batch-erase all these indices. + erase_at(path.mutable_mapping(), mapping_indices_to_remove); +} + + + +void back_translate_in_place(const NamedNodeBackTranslation* translation, Snarl& snarl) { + // To translate a snarl, you translate its bounding visits + back_translate_in_place(translation, *snarl.mutable_start()); + back_translate_in_place(translation, *snarl.mutable_end()); +} + +void back_translate_in_place(const NamedNodeBackTranslation* translation, SnarlTraversal& traversal) { + vector visit_indices_to_remove; + // We need to keep track of the last visit actually kept, for coalescing + // over dropped snarls that are the artifacts of segment splitting. + size_t last_kept_visit = numeric_limits::max(); + for (size_t i = 0; i < traversal.visit_size(); i++) { + // Translate every visit + Visit& here = *(traversal.mutable_visit(i)); + back_translate_in_place(translation, here); + if (here.has_snarl()) { + // We're a visit to a snarl, so we should be elided if we're a + // trivial snarl made by breaking a segment. + // TODO: We don't account for edits to the graph to insert in the + // middle of segments. + if (here.snarl().start() == here.snarl().end()) { + // This visit must be to a trivial snarl created by breaking the segment, so elide it. + visit_indices_to_remove.push_back(i); + continue; + } + } else { + // We're a visit to a segment, so we should be elided if we're just + // the same visit to a segment as the last thing that isn't getting + // dropped. + if (last_kept_visit != numeric_limits::max()) { + const Visit& prev = traversal.visit(last_kept_visit); + if (here == prev) { + // These visits can coalesce because they are to the same + // segment and orientation. + visit_indices_to_remove.push_back(i); + continue; + } + } + } + // If we aren't elided, we are a kept visit. + last_kept_visit = i; } - // Otherwise, we need to batch-erase all these indices. - - // This is how many slots ahead we are reading to write the current slot. - // It's also (1+) the cursor in mapping_indices_to_remove - size_t slots_ahead = 1; - for (size_t i = mapping_indices_to_remove[slots_ahead - 1]; i + mapping_indices_to_remove.size() < path.mapping_size(); i++) { - // We're at a slot that will need to be filled in the output. - while (slots_ahead < mapping_indices_to_remove.size() && i + slots_ahead == mapping_indices_to_remove[slots_ahead]) { - // Slide copy source ahead until it stops being the next thing to skip. - slots_ahead++; + // Get rid of visits that coalesced away + erase_at(traversal.mutable_visit(), visit_indices_to_remove); +} + +void back_translate_in_place(const NamedNodeBackTranslation* translation, Visit& visit) { + if (visit.has_snarl()) { + // A visit with a snarl is translated by translating its snarl + back_translate_in_place(translation, *visit.mutable_snarl()); + } else { + // Otherwise, translate its node ID to a segment name. Use made-up boundaries on the node. + // TODO: Can we have an easy way to say "whole node"? + oriented_node_range_t source_range(visit.node_id(), visit.backward(), 0, 1); + + // Translate it + vector translated = translation->translate_back(source_range); + + if (translated.size() != 1) { + // TODO: Implement translations that split graph nodes into multiple segments. + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + + " to " + to_string(translated.size()) + + " named segment ranges, but complex translations like this are not yet implemented"); } - // Overwrite the item here with the item that is supposed to be here. - // We use a swap so we don't have to actually copy any mapping data, just pointers. - path.mutable_mapping()->SwapElements(i, i + slots_ahead); + + auto& translated_range = translated[0]; + if (get<1>(translated_range) != get<1>(source_range)) { + // TODO: Implement translations that flip orientations. + throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) + + " ended up on the opposite strand; complex translations like this are not yet implemented"); + } + + // Save the change to the visit. + visit.clear_node_id(); + visit.set_name(translation->get_back_graph_node_name(get<0>(translated_range))); + // Ignore the offset and length info. } - // Now we've bumped all the unused Mappings to the end so delete them. - path.mutable_mapping()->DeleteSubrange(path.mapping_size() - mapping_indices_to_remove.size(), mapping_indices_to_remove.size()); } } diff --git a/src/algorithms/back_translate.hpp b/src/algorithms/back_translate.hpp index e8545d3bf4c..ce03c51213a 100644 --- a/src/algorithms/back_translate.hpp +++ b/src/algorithms/back_translate.hpp @@ -19,6 +19,28 @@ using namespace std; */ void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& path); +/** + * Translate the given Snarl in place from node ID space to named segment space. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, Snarl& snarl); + +/** + * Translate the given SnarlTraversal in place from node ID space to named + * segment space. Visits that end up being to snarls where both boundaries are + * from the same orientation of the same segment will be removed. Multiple + * visits in a row to the same orientation of the same segment will be elided. + * + * TODO: Work out a way to preserve traversals around cycles while not + * repeating ourselves for visits to diffrent pieces of the same chopped + * segment. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, SnarlTraversal& traversal); + +/** + * Translate the given Visit in place from node ID space to named segment space. + */ +void back_translate_in_place(const NamedNodeBackTranslation* translation, Visit& visit); + } } diff --git a/src/algorithms/find_translation.cpp b/src/algorithms/find_translation.cpp index 48007eb72e5..c964823abc6 100644 --- a/src/algorithms/find_translation.cpp +++ b/src/algorithms/find_translation.cpp @@ -5,6 +5,7 @@ #include "find_translation.hpp" #include "../io/save_handle_graph.hpp" +#include "../gbzgraph.hpp" namespace vg { namespace algorithms { @@ -19,9 +20,14 @@ const NamedNodeBackTranslation* find_translation(const HandleGraph* graph) { return dynamic_cast(graph); } if (dynamic_cast(graph)) { - // If we loaded the graph fropm a GFA we would have attached this translation. + // If we loaded the graph from a GFA we would have attached this translation. return &dynamic_cast(graph)->gfa_id_space; } + if (dynamic_cast(graph)) { + // If we loaded the graph from a GBZ we can use the GBWTGraph even though the back-translation stuff isn't proxied. + // TODO: proxy it? + return &dynamic_cast(graph)->gbz.graph; + } // Otherwise there's no applicable translation return nullptr; } diff --git a/src/algorithms/gfa_to_handle.hpp b/src/algorithms/gfa_to_handle.hpp index 6038eba8887..bcfedf3c79a 100644 --- a/src/algorithms/gfa_to_handle.hpp +++ b/src/algorithms/gfa_to_handle.hpp @@ -43,7 +43,7 @@ struct GFAIDMapInfo : public NamedNodeBackTranslation { unique_ptr> id_to_name; /** - * Prepare thr backing data structures for get_back_graph_node_name(). Call after name_to_id is complete. + * Prepare the backing data structures for get_back_graph_node_name(). Call after name_to_id is complete. */ void invert_translation(); diff --git a/src/gbzgraph.hpp b/src/gbzgraph.hpp index 38a728b0a89..78d6b13e896 100644 --- a/src/gbzgraph.hpp +++ b/src/gbzgraph.hpp @@ -20,7 +20,7 @@ namespace vg { * * Should be removed if/when GBZ implements PathHandleGraph. */ -class GBZGraph : bdsg::PathHandleGraphProxy { +class GBZGraph : public bdsg::PathHandleGraphProxy { public: /// This is the GBZ object we own that actually holds the graph and GBWT /// data. diff --git a/src/io/register_loader_saver_gfa.cpp b/src/io/register_loader_saver_gfa.cpp index f376018e78a..7006bf4b475 100644 --- a/src/io/register_loader_saver_gfa.cpp +++ b/src/io/register_loader_saver_gfa.cpp @@ -106,6 +106,8 @@ void register_loader_saver_gfa() { // Load it from the stream, falling back to temp file if necessary algorithms::gfa_to_path_handle_graph_stream(input, gfa_graph, &gfa_graph->gfa_id_space); } + // Make sure the node ID to sequence space translation is ready if anybody wants it. + gfa_graph->gfa_id_space.invert_translation(); } catch (algorithms::GFAFormatError& e) { // There is something wrong with the input GFA file. diff --git a/src/snarls.cpp b/src/snarls.cpp index 28e7e6b4af5..abaa5076815 100644 --- a/src/snarls.cpp +++ b/src/snarls.cpp @@ -2404,6 +2404,7 @@ bool operator==(const Visit& a, const Visit& b) { // IDs and orientations have to match, and nobody has a snarl or the // snarls match. return a.node_id() == b.node_id() && + a.name() == b.name() && a.backward() == b.backward() && ((!a.has_snarl() && !b.has_snarl()) || a.snarl() == b.snarl()); @@ -2416,17 +2417,22 @@ bool operator!=(const Visit& a, const Visit& b) { bool operator<(const Visit& a, const Visit& b) { if (!a.has_snarl() && !b.has_snarl()) { // Compare everything but the snarl - return make_tuple(a.node_id(), a.backward()) < make_tuple(b.node_id(), b.backward()); + return make_tuple(a.node_id(), a.backward(), a.name()) < make_tuple(b.node_id(), b.backward(), b.name()); } else { // Compare including the snarl - return make_tuple(a.node_id(), a.snarl(), a.backward()) < make_tuple(b.node_id(), b.snarl(), b.backward()); + return make_tuple(a.node_id(), a.snarl(), a.backward(), a.name()) < make_tuple(b.node_id(), b.snarl(), b.backward(), b.name()); } } ostream& operator<<(ostream& out, const Visit& visit) { if (!visit.has_snarl()) { - // Use the node ID - out << visit.node_id(); + if (visit.name().empty()) { + // Use the node ID + out << visit.node_id(); + } else { + // Use the name + out << visit.name(); + } } else { // Use the snarl out << visit.snarl(); diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index c8f085efc83..2a4b5b57af8 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include "subcommand.hpp" @@ -112,6 +113,8 @@ void help_autoindex(char** argv) { << " -T, --tmp-dir DIR temporary directory to use for intermediate files" << endl << " -M, --target-mem MEM target max memory usage (not exact, formatted INT[kMG])" << endl << " (default: 1/2 of available)" << endl + << " --gbwt-buffer-size NUM GBWT construction buffer size in millions of nodes; may need to be" << endl + << " increased for graphs with long haplotypes (default: " << IndexingParameters::gbwt_insert_batch_size / gbwt::MILLION << ")" << endl << " -t, --threads NUM number of threads (default: all available)" << endl << " -V, --verbosity NUM log to stderr (0 = none, 1 = basic, 2 = debug; default " << (int) IndexingParameters::verbosity << ")" << endl //<< " -d, --dot print the dot-formatted graph of index recipes and exit" << endl @@ -128,6 +131,7 @@ int main_autoindex(int argc, char** argv) { #define OPT_KEEP_INTERMEDIATE 1000 #define OPT_FORCE_UNPHASED 1001 #define OPT_FORCE_PHASED 1002 +#define OPT_GBWT_BUFFER_SIZE 1003 // load the registry IndexRegistry registry = VGIndexes::get_vg_index_registry(); @@ -155,6 +159,7 @@ int main_autoindex(int argc, char** argv) { {"provide", required_argument, 0, 'P'}, {"request", required_argument, 0, 'R'}, {"target-mem", required_argument, 0, 'M'}, + {"gbwt-buffer-size", required_argument, 0, OPT_GBWT_BUFFER_SIZE}, {"tmp-dir", required_argument, 0, 'T'}, {"threads", required_argument, 0, 't'}, {"verbosity", required_argument, 0, 'V'}, @@ -238,6 +243,9 @@ int main_autoindex(int argc, char** argv) { case 'M': target_mem_usage = parse_memory_usage(optarg); break; + case OPT_GBWT_BUFFER_SIZE: + IndexingParameters::gbwt_insert_batch_size = std::max(parse(optarg), 1ul) * gbwt::MILLION; + break; case 'T': temp_file::set_dir(optarg); break; diff --git a/src/subcommand/snarls_main.cpp b/src/subcommand/snarls_main.cpp index d05982c8a99..8587e93d2d2 100644 --- a/src/subcommand/snarls_main.cpp +++ b/src/subcommand/snarls_main.cpp @@ -16,6 +16,8 @@ #include "../cactus_snarl_finder.hpp" #include "../integrated_snarl_finder.hpp" #include "../gbwtgraph_helper.hpp" +#include "../algorithms/find_translation.hpp" +#include "../algorithms/back_translate.hpp" #include #include @@ -29,20 +31,21 @@ void help_snarl(char** argv) { cerr << "usage: " << argv[0] << " snarls [options] graph > snarls.pb" << endl << " By default, a list of protobuf Snarls is written" << endl << "options:" << endl - << " -A, --algorithm NAME compute snarls using 'cactus' or 'integrated' algorithms (default: integrated)" << endl - << " -p, --pathnames output variant paths as SnarlTraversals to STDOUT" << endl - << " -r, --traversals FILE output SnarlTraversals for ultrabubbles." << endl - << " -e, --path-traversals only consider traversals that correspond to paths in the graph. (-m ignored)" << endl - << " -l, --leaf-only restrict traversals to leaf ultrabubbles." << endl - << " -o, --top-level restrict traversals to top level ultrabubbles" << endl - << " -a, --any-snarl-type compute traversals for any snarl type (not limiting to ultrabubbles)" << endl - << " -m, --max-nodes N only compute traversals for snarls with <= N nodes (with degree > 1) [10]" << endl - << " -T, --include-trivial report snarls that consist of a single edge" << endl - << " -s, --sort-snarls return snarls in sorted order by node ID (for topologically ordered graphs)" << endl - << " -v, --vcf FILE use vcf-based instead of exhaustive traversal finder with -r" << endl - << " -f --fasta FILE reference in FASTA format (required for SVs by -v)" << endl - << " -i --ins-fasta FILE insertion sequences in FASTA format (required for SVs by -v)" << endl - << " -t, --threads N number of threads to use [all available]" << endl; + << " -A, --algorithm NAME compute snarls using 'cactus' or 'integrated' algorithms (default: integrated)" << endl + << " -p, --pathnames output variant paths as SnarlTraversals to STDOUT" << endl + << " -r, --traversals FILE output SnarlTraversals for ultrabubbles." << endl + << " -e, --path-traversals only consider traversals that correspond to paths in the graph. (-m ignored)" << endl + << " -l, --leaf-only restrict traversals to leaf ultrabubbles." << endl + << " -o, --top-level restrict traversals to top level ultrabubbles" << endl + << " -a, --any-snarl-type compute traversals for any snarl type (not limiting to ultrabubbles)" << endl + << " -m, --max-nodes N only compute traversals for snarls with <= N nodes (with degree > 1) [10]" << endl + << " -n, --named-coordinates produce snarl and traversal outputs in named-segment (GFA) space" << endl + << " -T, --include-trivial report snarls that consist of a single edge" << endl + << " -s, --sort-snarls return snarls in sorted order by node ID (for topologically ordered graphs)" << endl + << " -v, --vcf FILE use vcf-based instead of exhaustive traversal finder with -r" << endl + << " -f --fasta FILE reference in FASTA format (required for SVs by -v)" << endl + << " -i --ins-fasta FILE insertion sequences in FASTA format (required for SVs by -v)" << endl + << " -t, --threads N number of threads to use [all available]" << endl; } int main_snarl(int argc, char** argv) { @@ -60,6 +63,7 @@ int main_snarl(int argc, char** argv) { bool top_level_only = false; bool ultrabubble_only = true; int max_nodes = 10; + bool named_coordinates = false; bool filter_trivial_snarls = true; bool sort_snarls = false; bool fill_path_names = false; @@ -80,6 +84,7 @@ int main_snarl(int argc, char** argv) { {"top-level", no_argument, 0, 'o'}, {"any-snarl-type", no_argument, 0, 'a'}, {"max-nodes", required_argument, 0, 'm'}, + {"named-coordinates", no_argument, 0, 'n'}, {"include-trivial", no_argument, 0, 'T'}, {"sort-snarls", no_argument, 0, 's'}, {"vcf", required_argument, 0, 'v'}, @@ -92,7 +97,7 @@ int main_snarl(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "A:sr:laTopm:v:f:i:eh?t:", + c = getopt_long (argc, argv, "A:sr:laTopm:nv:f:i:eh?t:", long_options, &option_index); /* Detect the end of the options. */ @@ -126,6 +131,10 @@ int main_snarl(int argc, char** argv) { max_nodes = parse(optarg); break; + case 'n': + named_coordinates = true; + break; + case 'T': filter_trivial_snarls = false; break; @@ -187,6 +196,16 @@ int main_snarl(int argc, char** argv) { string graph_filename = get_input_file_name(optind, argc, argv); unique_ptr graph = vg::io::VPKG::load_one(graph_filename); + // Determine what translation we should apply, if any, to our output coordinates. + const NamedNodeBackTranslation* translation = nullptr; + if (named_coordinates) { + translation = vg::algorithms::find_translation(graph.get()); + if (!translation) { + cerr << "error:[vg snarls] Named coordinate output (-n) was requested, but the graph does not come with a named coordinate space." << endl; + return 1; + } + } + // TODO: Everything but Cactus and the path-related options can work with a // non-path HandleGraph, but we don't really have any of those implemented // anymore, so we don't bother supporting them. @@ -266,6 +285,12 @@ int main_snarl(int argc, char** argv) { } } vector travs = trav_finder->find_traversals(*snarl); + if (translation) { + for (auto& trav : travs) { + // Bring all the output traversals into named segment space. + algorithms::back_translate_in_place(translation, trav); + } + } vg::io::write_buffered(cout, travs, 0); } @@ -346,6 +371,10 @@ int main_snarl(int argc, char** argv) { // Write our snarl tree snarl_buffer.push_back(*snarl); + if (translation) { + // Bring all the output snarls into named segment space. + algorithms::back_translate_in_place(translation, snarl_buffer.back()); + } vg::io::write_buffered(cout, snarl_buffer, buffer_size); auto check_max_nodes = [&graph, &max_nodes](const unordered_set& nodeset) { @@ -377,6 +406,13 @@ int main_snarl(int argc, char** argv) { cerr << "Found " << travs.size() << endl; #endif + if (translation) { + for (auto& trav : travs) { + // Bring all the output traversals into named segment space. + algorithms::back_translate_in_place(translation, trav); + } + } + traversal_buffer.insert(traversal_buffer.end(), travs.begin(), travs.end()); vg::io::write_buffered(trav_stream, traversal_buffer, buffer_size); } diff --git a/test/graphs/big_snarl_named.gfa b/test/graphs/big_snarl_named.gfa new file mode 100644 index 00000000000..ee9493b78a3 --- /dev/null +++ b/test/graphs/big_snarl_named.gfa @@ -0,0 +1,7 @@ +H VN:Z:1.0 +S left CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGATT +S right ACACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC +S middle AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +L left + right + * +L left + middle + * +L middle + right + * diff --git a/test/graphs/components_walks_named.gfa b/test/graphs/components_walks_named.gfa new file mode 100644 index 00000000000..a7b3d5ecf8a --- /dev/null +++ b/test/graphs/components_walks_named.gfa @@ -0,0 +1,30 @@ +H VN:Z:1.0 +S cats G +S dogs A +S wombats T +S goldfish T +S chinchillas A +S cows C +S sparrows A +S oxen G +S pigs A +S squirrels T +S rabbits T +S tuna A +L cats + dogs + * +L cats + wombats + * +L dogs + goldfish + * +L wombats + goldfish + * +L goldfish + chinchillas + * +L goldfish + cows + * +L chinchillas + sparrows + * +L cows + sparrows + * +L oxen + pigs + * +L oxen + squirrels + * +L pigs + rabbits + * +L squirrels + rabbits - * +L rabbits + tuna + * +W sample 1 A 0 5 >cats>dogs>goldfish>chinchillas>sparrows +W sample 2 A 0 5 >cats>wombats>goldfish>cows>sparrows +W sample 1 B 0 5 >oxen>pigs>rabbitsoxen>pigs>rabbits>tuna diff --git a/test/t/32_vg_snarls.t b/test/t/32_vg_snarls.t index 7bb155febe6..46376660ee4 100644 --- a/test/t/32_vg_snarls.t +++ b/test/t/32_vg_snarls.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 14 +plan tests 19 vg view -J -v snarls/snarls.json > snarls.vg vg snarls -t 1 snarls.vg -r st.pb > snarls.pb @@ -84,3 +84,20 @@ grep $GOT possibilities.txt >/dev/null is $? 0 "vg snarls made snarls in the right order when dealing with nested chains" rm -f nested.vg nested.snarls possibilities.txt +# Find snarls from a GFA in GFA space +is "$(vg snarls --named-coordinates graphs/components_walks_named.gfa | vg view -Rj - | jq -c '[.start.name, .start.backward, .end.name, .end.backward]' | sort)" \ + "$(printf '["cats",null,"goldfish",null]\n["goldfish",null,"sparrows",null]\n["pigs",true,"squirrels",null]\n["squirrels",null,"rabbits",true]\n')" \ + "Correct snarls are found in GFA space" + +# Find traversals from a chopped GBZ-ified GFA in node ID space +vg autoindex -p index -g graphs/big_snarl_named.gfa -w giraffe +vg snarls index.giraffe.gbz --traversals traversals.dat >/dev/null +is "$(vg view -Ej traversals.dat | jq -c 'select(.visit | length > 2) | .visit[] | select(.node_id // .name)' | wc -l)" "7" "In node ID space traversal visits 7 nodes" +is "$(vg view -Ej traversals.dat | jq -c 'select(.visit | length > 2) | .visit[] | select(.snarl)' | wc -l)" "4" "In node ID space traversal visits 4 trivial snarls" + +# And in GFA space +vg snarls index.giraffe.gbz --traversals traversals.dat --named-coordinates >/dev/null +is "$(vg view -Ej traversals.dat | jq -c 'select(.visit | length > 2) | .visit[] | select(.node_id // .name)' | wc -l)" "3" "In segment name space traversal visits 3 segments" +is "$(vg view -Ej traversals.dat | jq -c 'select(.visit | length > 2) | .visit[] | select(.snarl)' | wc -l)" "0" "In segment name space traversal visits 0 trivial snarls" + +rm -f index.* traversals.dat