Skip to content

Commit

Permalink
Merge pull request #3294 from vgteam/deconstruct
Browse files Browse the repository at this point in the history
deconstruct usability improvements
  • Loading branch information
glennhickey authored May 18, 2021
2 parents 9b0c959 + 231f57c commit f52d3a2
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 18 deletions.
47 changes: 37 additions & 10 deletions src/deconstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,10 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) {
<< " trav=" << pb2json(path_travs.first[i]) << endl;
}
#endif
tuple<bool, string, size_t, size_t> subpath_parse = Paths::parse_subpath_name(path_trav_name);
if (get<0>(subpath_parse)) {
path_trav_name = get<1>(subpath_parse);
}
if (ref_paths.count(path_trav_name) &&
(ref_trav_name.empty() || path_trav_name < ref_trav_name)) {
ref_trav_name = path_trav_name;
Expand All @@ -314,50 +318,66 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) {
// remember all the reference traversals (there can be more than one only in the case of a
// cycle in the reference path
vector<int> ref_travs;
// hacky subpath support -- gets added to variant on output
vector<int64_t> ref_offsets;
if (!ref_trav_name.empty()) {
for (int i = 0; i < path_travs.first.size(); ++i) {
string path_trav_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first));
tuple<bool, string, size_t, size_t> subpath_parse = Paths::parse_subpath_name(path_trav_name);
int64_t sub_offset = 0;
if (get<0>(subpath_parse)) {
path_trav_name = get<1>(subpath_parse);
sub_offset = (int64_t)get<2>(subpath_parse);
}
if (path_trav_name == ref_trav_name) {
ref_travs.push_back(i);
ref_offsets.push_back(sub_offset);
}
}
}

// add in the gbwt traversals if we can
size_t first_gbwt_trav_idx = path_travs.first.size();
vector<gbwt::size_type> trav_thread_ids(first_gbwt_trav_idx, numeric_limits<gbwt::size_type>::max());
vector<int64_t> gbwt_trav_offsets;
if (gbwt_trav_finder.get() != nullptr) {
pair<vector<SnarlTraversal>, vector<gbwt::size_type>> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl);
for (int i = 0; i < thread_travs.first.size(); ++i) {
string gbwt_sample_name = thread_sample(gbwt_trav_finder->get_gbwt(), gbwt::Path::id(thread_travs.second[i]));
// we count on convention of reference as embedded path above, so ignore it here
// todo: would be nice to be more flexible...
if (gbwt_sample_name != gbwtgraph::REFERENCE_PATH_SAMPLE_NAME) {
string name = thread_name(gbwt_trav_finder->get_gbwt(), gbwt::Path::id(thread_travs.second[i]));
string name = thread_name(gbwt_trav_finder->get_gbwt(), gbwt::Path::id(thread_travs.second[i]), true);

path_trav_names.push_back(name);
path_travs.first.push_back(thread_travs.first[i]);
// dummy handles so we can use the same code as the named path traversals above
path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t()));
// but we keep the thread id for later
trav_thread_ids.push_back(thread_travs.second[i]);
// keep the offset (which is stored in the contig field)
gbwt_trav_offsets.push_back(thread_count(gbwt_trav_finder->get_gbwt(), gbwt::Path::id(thread_travs.second[i])));
}
}
}

// if there's no reference traversal, go fishing in the gbwt
if (ref_travs.empty() && gbwt_trav_finder.get()) {
int gbwt_ref_trav = -1;
int64_t gbwt_ref_offset = 0;
for (int i = first_gbwt_trav_idx; i < path_travs.first.size(); ++i) {
if (ref_paths.count(path_trav_names[i]) &&
(gbwt_ref_trav < 0 || path_trav_names[i] < path_trav_names[gbwt_ref_trav])) {
gbwt_ref_trav = i;
}
gbwt_ref_offset = gbwt_trav_offsets.at(i - first_gbwt_trav_idx);
}
}
if (gbwt_ref_trav >= 0) {
ref_travs.push_back(gbwt_ref_trav);
ref_offsets.push_back(gbwt_ref_offset);
string& name = path_trav_names[gbwt_ref_trav];
assert(name.compare(0, 8, "_thread_") == 0);
ref_trav_name = name.substr(8);
assert(name.compare(0, 8, "_thread_") != 0);
ref_trav_name = name;
}
}

Expand Down Expand Up @@ -410,7 +430,9 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) {
}

// we write a variant for every reference traversal
for (auto ref_trav_idx : ref_travs) {
for (size_t i = 0; i < ref_travs.size(); ++i) {
auto& ref_trav_idx = ref_travs[i];
auto& ref_trav_offset = ref_offsets[i];

const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx];

Expand Down Expand Up @@ -458,7 +480,7 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) {
// shift from 0-based to 1-based for VCF
first_path_pos += 1;

v.position = first_path_pos;
v.position = first_path_pos + ref_trav_offset;

v.id = snarl_name(snarl);

Expand Down Expand Up @@ -601,13 +623,18 @@ void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHand
}
}
if (!gbwt_ref_paths.empty()) {
unordered_map<string, gbwt::size_type> gbwt_name_to_id;
unordered_map<string, vector<gbwt::size_type>> gbwt_name_to_ids;
for (size_t i = 0; i < gbwt->metadata.paths(); i++) {
gbwt_name_to_id[thread_name(*gbwt, i)] = i;
gbwt_name_to_ids[thread_name(*gbwt, i, true)].push_back(i);
}
for (const string& refpath : gbwt_ref_paths) {
gbwt::size_type thread_id = gbwt_name_to_id.at(refpath);
size_t path_len = path_to_length(extract_gbwt_path(*graph, *gbwt, thread_id));
vector<gbwt::size_type>& thread_ids = gbwt_name_to_ids.at(refpath);
size_t path_len = 0;
for (gbwt::size_type thread_id : thread_ids) {
size_t offset = thread_count(*gbwt, thread_id);
size_t len = path_to_length(extract_gbwt_path(*graph, *gbwt, thread_id));
path_len = std::max(path_len, offset + len);
}
stream << "##contig=<ID=" << refpath << ",length=" << path_len << ">" << endl;
}
}
Expand Down
18 changes: 15 additions & 3 deletions src/gbwt_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,14 +283,16 @@ Path extract_gbwt_path(const HandleGraph& graph, const gbwt::GBWT& gbwt_index, g
return result;
}

std::string thread_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id) {
std::string thread_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id, bool short_name) {
if (!gbwt_index.hasMetadata() || !gbwt_index.metadata.hasPathNames() || id >= gbwt_index.metadata.paths()) {
return "";
}

const gbwt::PathName& path = gbwt_index.metadata.path(id);
std::stringstream stream;
stream << "_thread_";
if (!short_name) {
stream << "_thread_";
}
if (gbwt_index.metadata.hasSampleNames()) {
stream << gbwt_index.metadata.sample(path.sample);
} else {
Expand All @@ -302,7 +304,9 @@ std::string thread_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id) {
} else {
stream << path.contig;
}
stream << "_" << path.phase << "_" << path.count;
if (!short_name) {
stream << "_" << path.phase << "_" << path.count;
}
return stream.str();
}

Expand Down Expand Up @@ -330,6 +334,14 @@ int thread_phase(const gbwt::GBWT& gbwt_index, gbwt::size_type id) {
return path.phase;
}

gbwt::size_type thread_count(const gbwt::GBWT& gbwt_index, gbwt::size_type id) {
if (!gbwt_index.hasMetadata() || !gbwt_index.metadata.hasPathNames() || id >= gbwt_index.metadata.paths()) {
return 0;
}

const gbwt::PathName& path = gbwt_index.metadata.path(id);
return path.count;
}

//------------------------------------------------------------------------------

Expand Down
7 changes: 6 additions & 1 deletion src/gbwt_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,9 @@ std::string insert_gbwt_path(MutablePathHandleGraph& graph, const gbwt::GBWT& gb
Path extract_gbwt_path(const HandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id);

/// Get a string representation of a thread name stored in GBWT metadata.
/// When short_name is true, return a short name made of just the sample and contig
/// NOTE: id is a gbwt path id, not a gbwt sequence id.
std::string thread_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id);
std::string thread_name(const gbwt::GBWT& gbwt_index, gbwt::size_type id, bool short_name = false);

/// Get a sample name of a thread stored in GBWT metadata.
/// NOTE: id is a gbwt path id, not a gbwt sequence id.
Expand All @@ -155,6 +156,10 @@ std::string thread_sample(const gbwt::GBWT& gbwt_index, gbwt::size_type id);
/// NOTE: id is a gbwt path id, not a gbwt sequence id.
int thread_phase(const gbwt::GBWT& gbwt_index, gbwt::size_type id);

/// Get count of a thread stored in GBWT metadata.
/// NOTE: id is a gbwt path id, not a gbwt sequence id.
gbwt::size_type thread_count(const gbwt::GBWT& gbwt_index, gbwt::size_type id);

//------------------------------------------------------------------------------

/// Transform the paths into a GBWT index. Primarily for testing.
Expand Down
17 changes: 13 additions & 4 deletions src/subcommand/deconstruct_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ int main_deconstruct(int argc, char** argv){
// Add GBWT threads if no reference paths found or we're running with -a
if (gbwt_index.get() && (all_snarls || refpaths.empty())) {
for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) {
refpaths.push_back(thread_name(*gbwt_index, i));
refpaths.push_back(thread_name(*gbwt_index, i, true));
}
}
}
Expand Down Expand Up @@ -254,7 +254,7 @@ int main_deconstruct(int argc, char** argv){
});
if (gbwt_index.get()) {
for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) {
std::string path_name = thread_name(*gbwt_index, i);
std::string path_name = thread_name(*gbwt_index, i, true);
for (auto& prefix : refpath_prefixes) {
if (path_name.compare(0, prefix.size(), prefix) == 0) {
refpaths.push_back(path_name);
Expand All @@ -264,14 +264,23 @@ int main_deconstruct(int argc, char** argv){
}
}

if (gbwt_index.get() && !altpath_prefixes.empty()) {
if (gbwt_index.get()) {
for (size_t i = 0; i < gbwt_index->metadata.paths(); i++) {
string sample_name = thread_sample(*gbwt_index.get(), i);
// if no prefixes were specified, we add them all. this way
// we make sure we only use gbwt alts. an option could be added
// to incorporate graph paths by toggling this (ie only adding to
// the map if prefixes given)
bool add_prefix = altpath_prefixes.empty();
for (auto& prefix : altpath_prefixes) {
if (sample_name.compare(0, prefix.size(), prefix) == 0) {
alt_path_to_prefix[sample_name] = sample_name;
add_prefix = true;
break;
}
}
if (add_prefix) {
alt_path_to_prefix[sample_name] = sample_name;
}
}
}
}
Expand Down

1 comment on commit f52d3a2

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 11207 seconds

Please sign in to comment.