Skip to content

Commit

Permalink
Merge pull request #3724 from vgteam/depth
Browse files Browse the repository at this point in the history
Add some subpath and cycle support to vg depth
  • Loading branch information
glennhickey authored Aug 29, 2022
2 parents 5989598 + b8a1a25 commit de51eb5
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 56 deletions.
72 changes: 51 additions & 21 deletions src/algorithms/coverage_depth.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov
step_handle_t start_step = graph.path_begin(path_handle);
step_handle_t end_step = graph.path_end(path_handle);
Position cur_pos;
size_t path_offset = 1;
tuple<bool, string, size_t, size_t> subpath_parse = Paths::parse_subpath_name(path_name);
const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name;
size_t path_offset = 1 + get<2>(subpath_parse);

for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) {
handle_t cur_handle = graph.get_handle_of_step(cur_step);
nid_t cur_id = graph.get_id(cur_handle);
Expand All @@ -24,7 +27,7 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov
cur_pos.set_offset(i);
size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos));
if (pos_coverage >= min_coverage) {
out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n";
out_stream << base_name << "\t" << path_offset << "\t" << pos_coverage << "\n";
}
++path_offset;
}
Expand Down Expand Up @@ -292,34 +295,43 @@ pair<double, double> sample_gam_depth(const HandleGraph& graph, const vector<Ali
return combine_and_average_node_coverages(graph, node_coverages, min_coverage);
}

void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, ostream& out_stream) {
void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream) {
assert(graph.has_path(path_name));

path_handle_t path_handle = graph.get_path_handle(path_name);
size_t offset = 0;
// big speedup
unordered_map<path_handle_t, string> path_to_name;

tuple<bool, string, size_t, size_t> subpath_parse = Paths::parse_subpath_name(path_name);
const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name;
size_t offset = 1 + get<2>(subpath_parse);

graph.for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
unordered_set<string> path_set;
size_t step_count = 0;
handle_t handle = graph.get_handle_of_step(step_handle);
graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2);
auto it = path_to_name.find(step_path_handle);
if (it == path_to_name.end()) {
string step_path_name = graph.get_path_name(step_path_handle);
// disregard subpath tags when counting
auto subpath = Paths::parse_subpath_name(step_path_name);
string& parsed_name = get<0>(subpath) ? get<1>(subpath) : step_path_name;
it = path_to_name.insert(make_pair(step_path_handle, parsed_name)).first;
if (count_cycles) {
++step_count;
} else {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2);
auto it = path_to_name.find(step_path_handle);
if (it == path_to_name.end()) {
string step_path_name = graph.get_path_name(step_path_handle);
// disregard subpath tags when counting
auto subpath = Paths::parse_subpath_name(step_path_name);
string& parsed_name = get<0>(subpath) ? get<1>(subpath) : step_path_name;
it = path_to_name.insert(make_pair(step_path_handle, parsed_name)).first;
}
path_set.insert(it->second);
}
path_set.insert(it->second);
});
size_t coverage = path_set.size() - 1;
size_t coverage = (count_cycles ? step_count : path_set.size()) - 1;
size_t node_len = graph.get_length(handle);
if (coverage >= min_coverage) {
for (size_t i = 0; i < node_len; ++i) {
if (coverage >= min_coverage) {
out_stream << path_name << "\t" << (offset + i) << "\t" << coverage << "\n";
out_stream << base_name << "\t" << (offset + i) << "\t" << coverage << "\n";
}
}
}
Expand All @@ -329,23 +341,40 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m

pair<double, double> path_depth_of_bin(const PathHandleGraph& graph,
step_handle_t start_step, step_handle_t end_plus_one_step,
size_t min_coverage) {
size_t min_coverage, bool count_cycles) {

// compute the mean and variance of our base coverage across the bin
size_t bin_length = 0;
double mean = 0.0;
double M2 = 0.0;

// big speedup
unordered_map<path_handle_t, string> path_to_name;

for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) {
handle_t cur_handle = graph.get_handle_of_step(cur_step);
nid_t cur_id = graph.get_id(cur_handle);
size_t cur_len = graph.get_length(cur_handle);

unordered_set<path_handle_t> path_set;
unordered_set<string> path_set;
size_t step_count = 0;
graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) {
path_set.insert(graph.get_path_handle_of_step(step_handle));
if (count_cycles) {
++step_count;
} else {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle);
auto it = path_to_name.find(step_path_handle);
if (it == path_to_name.end()) {
string step_path_name = graph.get_path_name(step_path_handle);
// disregard subpath tags when counting
auto subpath = Paths::parse_subpath_name(step_path_name);
string& parsed_name = get<0>(subpath) ? get<1>(subpath) : step_path_name;
it = path_to_name.insert(make_pair(step_path_handle, parsed_name)).first;
}
path_set.insert(it->second);
}
});
size_t coverage = path_set.size() - 1;
size_t coverage = (count_cycles ? step_count : path_set.size()) - 1;

if (coverage >= min_coverage) {
// todo: iteration here copied from packer implementation, not necessary
Expand All @@ -360,7 +389,8 @@ pair<double, double> path_depth_of_bin(const PathHandleGraph& graph,
vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandleGraph& graph,
const string& path_name,
size_t bin_size,
size_t min_coverage) {
size_t min_coverage,
bool count_cycles) {

path_handle_t path_handle = graph.get_path_handle(path_name);

Expand Down Expand Up @@ -388,7 +418,7 @@ vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandle
step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step;
size_t bin_start = bins[i].first;
size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset;
pair<double, double> coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage);
pair<double, double> coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles);
binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second);
}

Expand Down
6 changes: 3 additions & 3 deletions src/algorithms/coverage_depth.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ pair<double, double> sample_mapping_depth(const HandleGraph& graph, const vector
/// print path-name offset base-coverage for every base on a path (just like samtools depth)
/// ignoring things below min_coverage. offsets are 1-based in output stream
/// coverage here is the number of steps from (unique) other paths
void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, ostream& out_stream);
void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream);

/// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage
pair<double, double> path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step,
size_t min_coverage);
size_t min_coverage, bool count_cycles);

vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size,
size_t min_coverage);
size_t min_coverage, bool count_cycles);

}
}
Expand Down
82 changes: 52 additions & 30 deletions src/subcommand/depth_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ using namespace vg::subcommand;
void help_depth(char** argv) {
cerr << "usage: " << argv[0] << " depth [options] <graph>" << endl
<< "options:" << endl
<< " packed coverage depth (print positional depths along path):" << endl
<< " packed coverage depth (print 1-based positional depths along path):" << endl
<< " -k, --pack FILE supports created from vg pack for given input graph" << endl
<< " -d, --count-dels count deletion edges within the bin as covering reference positions" << endl
<< " GAM/GAF coverage depth (print <mean> <stddev> for depth):" << endl
Expand All @@ -37,13 +37,14 @@ void help_depth(char** argv) {
<< " -n, --max-nodes N maximum nodes to consider [1000000]" << endl
<< " -s, --random-seed N random seed for sampling nodes to consider" << endl
<< " -Q, --min-mapq N ignore alignments with mapping quality < N [0]" << endl
<< " path coverage depth (print positional depths along path):" << endl
<< " path coverage depth (print 1-based positional depths along path):" << endl
<< " activate by specifiying -p without -k" << endl
<< " -c, --count-cycles count each time a path steps on a position (by default paths are only counted once)" << endl
<< " common options:" << endl
<< " -p, --ref-path NAME reference path to call on (multipile allowed. defaults to all paths)" << endl
<< " -P, --paths-by STR select the paths with the given name prefix" << endl
<< " -b, --bin-size N bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl
<< " -m, --min-coverage N ignore nodes with less than N coverage [1]" << endl
<< " -m, --min-coverage N ignore nodes with less than N coverage depth [1]" << endl
<< " -t, --threads N number of threads to use [all available]" << endl;
}

Expand All @@ -55,7 +56,7 @@ int main_depth(int argc, char** argv) {
}

string pack_filename;
vector<string> ref_paths;
unordered_set<string> ref_paths_input_set;
vector<string> path_prefixes;
size_t bin_size = 1;
bool count_dels = false;
Expand All @@ -65,6 +66,7 @@ int main_depth(int argc, char** argv) {
size_t max_nodes = 1000000;
int random_seed = time(NULL);
size_t min_mapq = 0;
bool count_cycles = false;

size_t min_coverage = 1;

Expand All @@ -84,13 +86,14 @@ int main_depth(int argc, char** argv) {
{"random-seed", required_argument, 0, 's'},
{"min-mapq", required_argument, 0, 'Q'},
{"min-coverage", required_argument, 0, 'm'},
{"count-cycles", no_argument, 0, 'c'},
{"threads", required_argument, 0, 't'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};

int option_index = 0;
c = getopt_long (argc, argv, "hk:p:P:c:b:dg:a:n:s:m:t:",
c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ct:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -103,7 +106,7 @@ int main_depth(int argc, char** argv) {
pack_filename = optarg;
break;
case 'p':
ref_paths.push_back(optarg);
ref_paths_input_set.insert(optarg);
break;
case 'P':
path_prefixes.push_back(optarg);
Expand Down Expand Up @@ -132,6 +135,9 @@ int main_depth(int argc, char** argv) {
case 'm':
min_coverage = parse<size_t>(optarg);
break;
case 'c':
count_cycles = true;
break;
case 't':
{
int num_threads = parse<int>(optarg);
Expand Down Expand Up @@ -188,50 +194,66 @@ int main_depth(int argc, char** argv) {
packer->load_from_file(pack_filename);
}

// All paths if none given
if (ref_paths.empty() || !path_prefixes.empty()) {
graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
// just take anything if no selection
bool use_it = !Paths::is_alt(path_name) && path_prefixes.empty();
// otherwise look for a prefix match
for (size_t i = 0; i < path_prefixes.size() && !use_it; ++i) {
if (path_name.substr(0, path_prefixes[i].length()) == path_prefixes[i] &&
std::find(ref_paths.begin(), ref_paths.end(), path_name) == ref_paths.end()) {
use_it = true;
}
}
if (use_it) {
ref_paths.push_back(path_name);
// we want our paths sorted by the subpath parse so the output is sorted
map<pair<string, int64_t>, string> ref_paths;
unordered_set<string> base_path_set;

graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
tuple<bool, string, size_t, size_t> subpath_parse = Paths::parse_subpath_name(path_name);
const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name;
base_path_set.insert(base_name);
// just take anything if no selection
bool use_it = !Paths::is_alt(path_name) && path_prefixes.empty() && ref_paths_input_set.empty();

// then look in the input paths -p
if (!use_it && ref_paths_input_set.count(base_name)) {
use_it = true;
}

// then look in the prefixes
for (size_t i = 0; i < path_prefixes.size() && !use_it; ++i) {
if (path_name.substr(0, path_prefixes[i].length()) == path_prefixes[i]) {
use_it = true;
}
});
}
for (const string& ref_name : ref_paths) {
if (!graph->has_path(ref_name)) {
}
if (use_it) {
auto coord = make_pair(base_name, get<2>(subpath_parse));
assert(!ref_paths.count(coord));
ref_paths[coord] = path_name;
}
});

for (const auto& ref_name : ref_paths_input_set) {
if (!base_path_set.count(ref_name)) {
cerr << "error:[vg depth] Path \"" << ref_name << "\" not found in graph" << endl;
}
}

for (const string& ref_path : ref_paths) {

for (const auto& ref_coord_path : ref_paths) {
const string& ref_path = ref_coord_path.second;
const string& base_path = ref_coord_path.first.first;
const size_t subpath_offset = ref_coord_path.first.second;

if (bin_size > 1) {
vector<tuple<size_t, size_t, double, double>> binned_depth;
if (!pack_filename.empty()) {
binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels);
} else {
binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage);
binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles);
}
for (auto& bin_cov : binned_depth) {
// bins can ben nan if min_coverage filters everything out. just skip
if (!isnan(get<3>(bin_cov))) {
cout << ref_path << "\t" << (get<0>(bin_cov) + 1)<< "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov)
cout << base_path << "\t" << (get<0>(bin_cov) + 1 + subpath_offset)<< "\t" << (get<1>(bin_cov) + 1 + subpath_offset) << "\t" << get<2>(bin_cov)
<< "\t" << sqrt(get<3>(bin_cov)) << endl;
}
}
} else {
if (!pack_filename.empty()) {
algorithms::packed_depths(*packer, ref_path, min_coverage, cout);
} else {
algorithms::path_depths(*graph, ref_path, min_coverage, cout);
algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, cout);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions test/t/17_vg_augment.t
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,13 @@ rm -rf x.vg x.xg x.gcsa x.reads x.gam x.mod.vg x.trans
vg construct -m 1000 -r tiny/tiny.fa >flat.vg
vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg
vg index -x 2snp.xg 2snp.vg
vg sim -l 30 -x 2snp.xg -n 30 -a >2snp.sim
vg sim -s 2323 -l 30 -x 2snp.xg -n 30 -a >2snp.sim
vg index -x flat.xg -g flat.gcsa -k 16 flat.vg
vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam
is $(vg augment flat.vg 2snp.gam -i -S | vg paths -d -v - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes"

vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg
vg sim -l 30 -x 2err.vg -n 10 -a >2err.sim
vg sim -s 2323 -l 30 -x 2err.vg -n 10 -a >2err.sim
vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam
cat 2snp.gam 2err.gam > 4edits.gam
vg augment flat.vg 2snp.gam -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes
Expand Down

2 comments on commit de51eb5

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 12921 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.43.0. View the full report here.

1 tests passed, 0 tests failed and 0 tests skipped in 2172 seconds

Please sign in to comment.