Skip to content

Commit

Permalink
Merge pull request #3524 from vgteam/gfa-space
Browse files Browse the repository at this point in the history
GFA space output for `vg snarls`
  • Loading branch information
adamnovak authored Jan 31, 2022
2 parents 27f3388 + a8a120a commit 4f14d05
Show file tree
Hide file tree
Showing 14 changed files with 269 additions and 45 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ $(LIB_DIR)/cleaned_old_boost: $(wildcard $(LIB_DIR)/libboost_*) $(wildcard $(INC
+rm -Rf $(INC_DIR)/boost
+touch $(LIB_DIR)/cleaned_old_boost

$(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp
$(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto
+rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h
+rm -Rf $(CWD)/$(INC_DIR)/vg/io/
+. ./source_me.sh && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build && mkdir build && cd build && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH=$(CWD) -DCMAKE_LIBRARY_PATH=$(CWD)/$(LIB_DIR) -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install
Expand Down
2 changes: 1 addition & 1 deletion deps/libvgio
Submodule libvgio updated 1 files
+1 −0 deps/vg.proto
130 changes: 110 additions & 20 deletions src/algorithms/back_translate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,40 @@

#include "back_translate.hpp"
#include "../path.hpp"
#include "../snarls.hpp"

namespace vg {
namespace algorithms {

using namespace std;

/**
* Erase the items at the given indices in the given Protobuf repeated pointer field.
*/
template<typename RepeatedPtrField>
static void erase_at(RepeatedPtrField* field, const vector<size_t>& sorted_indices_to_remove) {
if (sorted_indices_to_remove.empty()) {
// We don't need to do anything.
return;
}

// This is how many slots ahead we are reading to write the current slot.
// It's also (1+) the cursor in sorted_indices_to_remove
size_t slots_ahead = 1;
for (size_t i = sorted_indices_to_remove[slots_ahead - 1]; i + sorted_indices_to_remove.size() < field->size(); i++) {
// We're at a slot that will need to be filled in the output.
while (slots_ahead < sorted_indices_to_remove.size() && i + slots_ahead == sorted_indices_to_remove[slots_ahead]) {
// Slide copy source ahead until it stops being the next thing to skip.
slots_ahead++;
}
// Overwrite the item here with the item that is supposed to be here.
// We use a swap so we don't have to actually copy any item data, just pointers.
field->SwapElements(i, i + slots_ahead);
}
// Now we've bumped all the unused items to the end so delete them.
field->DeleteSubrange(field->size() - sorted_indices_to_remove.size(), sorted_indices_to_remove.size());
}

void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& path) {
// For combining mappings that can combine, we need to track where the previous mapping ended.
nid_t prev_segment_number = numeric_limits<nid_t>::max();
Expand All @@ -35,15 +63,15 @@ void back_translate_in_place(const NamedNodeBackTranslation* translation, Path&

if (translated.size() != 1) {
// TODO: Implement translations that split graph nodes into multiple segments.
throw std::runtime_error("Translated range on node " + to_string(mapping->position().node_id()) +
throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) +
" to " + to_string(translated.size()) +
" named segment ranges, but complex translations like this are not yet implemented");
}

auto& translated_range = translated[0];
if (get<1>(translated_range) != get<1>(source_range)) {
// TODO: Implement translations that flip orientations.
throw std::runtime_error("Translated range on node " + to_string(mapping->position().node_id()) +
throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) +
" ended up on the opposite strand; complex translations like this are not yet implemented");
}

Expand Down Expand Up @@ -94,28 +122,90 @@ void back_translate_in_place(const NamedNodeBackTranslation* translation, Path&
}
}

if (mapping_indices_to_remove.empty()) {
// We don't need to fix up the mapping list
return;
// We need to batch-erase all these indices.
erase_at(path.mutable_mapping(), mapping_indices_to_remove);
}



void back_translate_in_place(const NamedNodeBackTranslation* translation, Snarl& snarl) {
// To translate a snarl, you translate its bounding visits
back_translate_in_place(translation, *snarl.mutable_start());
back_translate_in_place(translation, *snarl.mutable_end());
}

void back_translate_in_place(const NamedNodeBackTranslation* translation, SnarlTraversal& traversal) {
vector<size_t> visit_indices_to_remove;
// We need to keep track of the last visit actually kept, for coalescing
// over dropped snarls that are the artifacts of segment splitting.
size_t last_kept_visit = numeric_limits<size_t>::max();
for (size_t i = 0; i < traversal.visit_size(); i++) {
// Translate every visit
Visit& here = *(traversal.mutable_visit(i));
back_translate_in_place(translation, here);
if (here.has_snarl()) {
// We're a visit to a snarl, so we should be elided if we're a
// trivial snarl made by breaking a segment.
// TODO: We don't account for edits to the graph to insert in the
// middle of segments.
if (here.snarl().start() == here.snarl().end()) {
// This visit must be to a trivial snarl created by breaking the segment, so elide it.
visit_indices_to_remove.push_back(i);
continue;
}
} else {
// We're a visit to a segment, so we should be elided if we're just
// the same visit to a segment as the last thing that isn't getting
// dropped.
if (last_kept_visit != numeric_limits<size_t>::max()) {
const Visit& prev = traversal.visit(last_kept_visit);
if (here == prev) {
// These visits can coalesce because they are to the same
// segment and orientation.
visit_indices_to_remove.push_back(i);
continue;
}
}
}
// If we aren't elided, we are a kept visit.
last_kept_visit = i;
}

// Otherwise, we need to batch-erase all these indices.

// This is how many slots ahead we are reading to write the current slot.
// It's also (1+) the cursor in mapping_indices_to_remove
size_t slots_ahead = 1;
for (size_t i = mapping_indices_to_remove[slots_ahead - 1]; i + mapping_indices_to_remove.size() < path.mapping_size(); i++) {
// We're at a slot that will need to be filled in the output.
while (slots_ahead < mapping_indices_to_remove.size() && i + slots_ahead == mapping_indices_to_remove[slots_ahead]) {
// Slide copy source ahead until it stops being the next thing to skip.
slots_ahead++;
// Get rid of visits that coalesced away
erase_at(traversal.mutable_visit(), visit_indices_to_remove);
}

void back_translate_in_place(const NamedNodeBackTranslation* translation, Visit& visit) {
if (visit.has_snarl()) {
// A visit with a snarl is translated by translating its snarl
back_translate_in_place(translation, *visit.mutable_snarl());
} else {
// Otherwise, translate its node ID to a segment name. Use made-up boundaries on the node.
// TODO: Can we have an easy way to say "whole node"?
oriented_node_range_t source_range(visit.node_id(), visit.backward(), 0, 1);

// Translate it
vector<oriented_node_range_t> translated = translation->translate_back(source_range);

if (translated.size() != 1) {
// TODO: Implement translations that split graph nodes into multiple segments.
throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) +
" to " + to_string(translated.size()) +
" named segment ranges, but complex translations like this are not yet implemented");
}
// Overwrite the item here with the item that is supposed to be here.
// We use a swap so we don't have to actually copy any mapping data, just pointers.
path.mutable_mapping()->SwapElements(i, i + slots_ahead);

auto& translated_range = translated[0];
if (get<1>(translated_range) != get<1>(source_range)) {
// TODO: Implement translations that flip orientations.
throw std::runtime_error("Translated range on node " + to_string(get<0>(source_range)) +
" ended up on the opposite strand; complex translations like this are not yet implemented");
}

// Save the change to the visit.
visit.clear_node_id();
visit.set_name(translation->get_back_graph_node_name(get<0>(translated_range)));
// Ignore the offset and length info.
}
// Now we've bumped all the unused Mappings to the end so delete them.
path.mutable_mapping()->DeleteSubrange(path.mapping_size() - mapping_indices_to_remove.size(), mapping_indices_to_remove.size());
}

}
Expand Down
22 changes: 22 additions & 0 deletions src/algorithms/back_translate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,28 @@ using namespace std;
*/
void back_translate_in_place(const NamedNodeBackTranslation* translation, Path& path);

/**
* Translate the given Snarl in place from node ID space to named segment space.
*/
void back_translate_in_place(const NamedNodeBackTranslation* translation, Snarl& snarl);

/**
* Translate the given SnarlTraversal in place from node ID space to named
* segment space. Visits that end up being to snarls where both boundaries are
* from the same orientation of the same segment will be removed. Multiple
* visits in a row to the same orientation of the same segment will be elided.
*
* TODO: Work out a way to preserve traversals around cycles while not
* repeating ourselves for visits to diffrent pieces of the same chopped
* segment.
*/
void back_translate_in_place(const NamedNodeBackTranslation* translation, SnarlTraversal& traversal);

/**
* Translate the given Visit in place from node ID space to named segment space.
*/
void back_translate_in_place(const NamedNodeBackTranslation* translation, Visit& visit);

}
}

Expand Down
8 changes: 7 additions & 1 deletion src/algorithms/find_translation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "find_translation.hpp"

#include "../io/save_handle_graph.hpp"
#include "../gbzgraph.hpp"

namespace vg {
namespace algorithms {
Expand All @@ -19,9 +20,14 @@ const NamedNodeBackTranslation* find_translation(const HandleGraph* graph) {
return dynamic_cast<const NamedNodeBackTranslation*>(graph);
}
if (dynamic_cast<const GFAHandleGraph*>(graph)) {
// If we loaded the graph fropm a GFA we would have attached this translation.
// If we loaded the graph from a GFA we would have attached this translation.
return &dynamic_cast<const GFAHandleGraph*>(graph)->gfa_id_space;
}
if (dynamic_cast<const GBZGraph*>(graph)) {
// If we loaded the graph from a GBZ we can use the GBWTGraph even though the back-translation stuff isn't proxied.
// TODO: proxy it?
return &dynamic_cast<const GBZGraph*>(graph)->gbz.graph;
}
// Otherwise there's no applicable translation
return nullptr;
}
Expand Down
2 changes: 1 addition & 1 deletion src/algorithms/gfa_to_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ struct GFAIDMapInfo : public NamedNodeBackTranslation {
unique_ptr<unordered_map<nid_t, const std::string*>> id_to_name;

/**
* Prepare thr backing data structures for get_back_graph_node_name(). Call after name_to_id is complete.
* Prepare the backing data structures for get_back_graph_node_name(). Call after name_to_id is complete.
*/
void invert_translation();

Expand Down
2 changes: 1 addition & 1 deletion src/gbzgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace vg {
*
* Should be removed if/when GBZ implements PathHandleGraph.
*/
class GBZGraph : bdsg::PathHandleGraphProxy<gbwtgraph::GBWTGraph> {
class GBZGraph : public bdsg::PathHandleGraphProxy<gbwtgraph::GBWTGraph> {
public:
/// This is the GBZ object we own that actually holds the graph and GBWT
/// data.
Expand Down
2 changes: 2 additions & 0 deletions src/io/register_loader_saver_gfa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ void register_loader_saver_gfa() {
// Load it from the stream, falling back to temp file if necessary
algorithms::gfa_to_path_handle_graph_stream(input, gfa_graph, &gfa_graph->gfa_id_space);
}
// Make sure the node ID to sequence space translation is ready if anybody wants it.
gfa_graph->gfa_id_space.invert_translation();

} catch (algorithms::GFAFormatError& e) {
// There is something wrong with the input GFA file.
Expand Down
14 changes: 10 additions & 4 deletions src/snarls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2404,6 +2404,7 @@ bool operator==(const Visit& a, const Visit& b) {
// IDs and orientations have to match, and nobody has a snarl or the
// snarls match.
return a.node_id() == b.node_id() &&
a.name() == b.name() &&
a.backward() == b.backward() &&
((!a.has_snarl() && !b.has_snarl()) ||
a.snarl() == b.snarl());
Expand All @@ -2416,17 +2417,22 @@ bool operator!=(const Visit& a, const Visit& b) {
bool operator<(const Visit& a, const Visit& b) {
if (!a.has_snarl() && !b.has_snarl()) {
// Compare everything but the snarl
return make_tuple(a.node_id(), a.backward()) < make_tuple(b.node_id(), b.backward());
return make_tuple(a.node_id(), a.backward(), a.name()) < make_tuple(b.node_id(), b.backward(), b.name());
} else {
// Compare including the snarl
return make_tuple(a.node_id(), a.snarl(), a.backward()) < make_tuple(b.node_id(), b.snarl(), b.backward());
return make_tuple(a.node_id(), a.snarl(), a.backward(), a.name()) < make_tuple(b.node_id(), b.snarl(), b.backward(), b.name());
}
}

ostream& operator<<(ostream& out, const Visit& visit) {
if (!visit.has_snarl()) {
// Use the node ID
out << visit.node_id();
if (visit.name().empty()) {
// Use the node ID
out << visit.node_id();
} else {
// Use the name
out << visit.name();
}
} else {
// Use the snarl
out << visit.snarl();
Expand Down
8 changes: 8 additions & 0 deletions src/subcommand/autoindex_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <htslib/hts.h>
#include <htslib/vcf.h>
#include <gbwt/utils.h>
#include <omp.h>

#include "subcommand.hpp"
Expand Down Expand Up @@ -112,6 +113,8 @@ void help_autoindex(char** argv) {
<< " -T, --tmp-dir DIR temporary directory to use for intermediate files" << endl
<< " -M, --target-mem MEM target max memory usage (not exact, formatted INT[kMG])" << endl
<< " (default: 1/2 of available)" << endl
<< " --gbwt-buffer-size NUM GBWT construction buffer size in millions of nodes; may need to be" << endl
<< " increased for graphs with long haplotypes (default: " << IndexingParameters::gbwt_insert_batch_size / gbwt::MILLION << ")" << endl
<< " -t, --threads NUM number of threads (default: all available)" << endl
<< " -V, --verbosity NUM log to stderr (0 = none, 1 = basic, 2 = debug; default " << (int) IndexingParameters::verbosity << ")" << endl
//<< " -d, --dot print the dot-formatted graph of index recipes and exit" << endl
Expand All @@ -128,6 +131,7 @@ int main_autoindex(int argc, char** argv) {
#define OPT_KEEP_INTERMEDIATE 1000
#define OPT_FORCE_UNPHASED 1001
#define OPT_FORCE_PHASED 1002
#define OPT_GBWT_BUFFER_SIZE 1003

// load the registry
IndexRegistry registry = VGIndexes::get_vg_index_registry();
Expand Down Expand Up @@ -155,6 +159,7 @@ int main_autoindex(int argc, char** argv) {
{"provide", required_argument, 0, 'P'},
{"request", required_argument, 0, 'R'},
{"target-mem", required_argument, 0, 'M'},
{"gbwt-buffer-size", required_argument, 0, OPT_GBWT_BUFFER_SIZE},
{"tmp-dir", required_argument, 0, 'T'},
{"threads", required_argument, 0, 't'},
{"verbosity", required_argument, 0, 'V'},
Expand Down Expand Up @@ -238,6 +243,9 @@ int main_autoindex(int argc, char** argv) {
case 'M':
target_mem_usage = parse_memory_usage(optarg);
break;
case OPT_GBWT_BUFFER_SIZE:
IndexingParameters::gbwt_insert_batch_size = std::max(parse<size_t>(optarg), 1ul) * gbwt::MILLION;
break;
case 'T':
temp_file::set_dir(optarg);
break;
Expand Down
Loading

0 comments on commit 4f14d05

Please sign in to comment.