From a7315b88a12873a389dcc5f24cefc33820ce6532 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 25 Aug 2022 11:26:38 -0400 Subject: [PATCH 1/5] support subpath name on ref; add -c option to count cycles --- src/algorithms/coverage_depth.cpp | 72 ++++++++++++++++++++-------- src/algorithms/coverage_depth.hpp | 6 +-- src/subcommand/depth_main.cpp | 80 ++++++++++++++++++++----------- 3 files changed, 105 insertions(+), 53 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index 6653989ea76..16d6b3eadf7 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -13,7 +13,10 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov step_handle_t start_step = graph.path_begin(path_handle); step_handle_t end_step = graph.path_end(path_handle); Position cur_pos; - size_t path_offset = 1; + tuple subpath_parse = Paths::parse_subpath_name(path_name); + const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name; + size_t path_offset = 1 + get<2>(subpath_parse); + for (step_handle_t cur_step = start_step; cur_step != end_step; cur_step = graph.get_next_step(cur_step)) { handle_t cur_handle = graph.get_handle_of_step(cur_step); nid_t cur_id = graph.get_id(cur_handle); @@ -24,7 +27,7 @@ void packed_depths(const Packer& packer, const string& path_name, size_t min_cov cur_pos.set_offset(i); size_t pos_coverage = packer.coverage_at_position(packer.position_in_basis(cur_pos)); if (pos_coverage >= min_coverage) { - out_stream << path_name << "\t" << path_offset << "\t" << pos_coverage << "\n"; + out_stream << base_name << "\t" << path_offset << "\t" << pos_coverage << "\n"; } ++path_offset; } @@ -292,34 +295,43 @@ pair sample_gam_depth(const HandleGraph& graph, const vector path_to_name; + + tuple subpath_parse = Paths::parse_subpath_name(path_name); + const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name; + size_t offset = get<2>(subpath_parse); + graph.for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { unordered_set path_set; + size_t step_count = 0; handle_t handle = graph.get_handle_of_step(step_handle); graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); - auto it = path_to_name.find(step_path_handle); - if (it == path_to_name.end()) { - string step_path_name = graph.get_path_name(step_path_handle); - // disregard subpath tags when counting - auto subpath = Paths::parse_subpath_name(step_path_name); - string& parsed_name = get<0>(subpath) ? get<1>(subpath) : step_path_name; - it = path_to_name.insert(make_pair(step_path_handle, parsed_name)).first; + if (count_cycles) { + ++step_count; + } else { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + auto it = path_to_name.find(step_path_handle); + if (it == path_to_name.end()) { + string step_path_name = graph.get_path_name(step_path_handle); + // disregard subpath tags when counting + auto subpath = Paths::parse_subpath_name(step_path_name); + string& parsed_name = get<0>(subpath) ? get<1>(subpath) : step_path_name; + it = path_to_name.insert(make_pair(step_path_handle, parsed_name)).first; + } + path_set.insert(it->second); } - path_set.insert(it->second); }); - size_t coverage = path_set.size() - 1; + size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; size_t node_len = graph.get_length(handle); if (coverage >= min_coverage) { for (size_t i = 0; i < node_len; ++i) { if (coverage >= min_coverage) { - out_stream << path_name << "\t" << (offset + i) << "\t" << coverage << "\n"; + out_stream << base_name << "\t" << (offset + i) << "\t" << coverage << "\n"; } } } @@ -329,23 +341,40 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage) { + size_t min_coverage, bool count_cycles) { // compute the mean and variance of our base coverage across the bin size_t bin_length = 0; double mean = 0.0; double M2 = 0.0; + // big speedup + unordered_map path_to_name; + for (step_handle_t cur_step = start_step; cur_step != end_plus_one_step; cur_step = graph.get_next_step(cur_step)) { handle_t cur_handle = graph.get_handle_of_step(cur_step); nid_t cur_id = graph.get_id(cur_handle); size_t cur_len = graph.get_length(cur_handle); - unordered_set path_set; + unordered_set path_set; + size_t step_count = 0; graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) { - path_set.insert(graph.get_path_handle_of_step(step_handle)); + if (count_cycles) { + ++step_count; + } else { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); + auto it = path_to_name.find(step_path_handle); + if (it == path_to_name.end()) { + string step_path_name = graph.get_path_name(step_path_handle); + // disregard subpath tags when counting + auto subpath = Paths::parse_subpath_name(step_path_name); + string& parsed_name = get<0>(subpath) ? get<1>(subpath) : step_path_name; + it = path_to_name.insert(make_pair(step_path_handle, parsed_name)).first; + } + path_set.insert(it->second); + } }); - size_t coverage = path_set.size() - 1; + size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; if (coverage >= min_coverage) { // todo: iteration here copied from packer implementation, not necessary @@ -360,7 +389,8 @@ pair path_depth_of_bin(const PathHandleGraph& graph, vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size, - size_t min_coverage) { + size_t min_coverage, + bool count_cycles) { path_handle_t path_handle = graph.get_path_handle(path_name); @@ -388,7 +418,7 @@ vector> binned_path_depth(const PathHandle step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; size_t bin_start = bins[i].first; size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; - pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage); + pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles); binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); } diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 087949d6f29..021cc57fa52 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -67,14 +67,14 @@ pair sample_mapping_depth(const HandleGraph& graph, const vector /// print path-name offset base-coverage for every base on a path (just like samtools depth) /// ignoring things below min_coverage. offsets are 1-based in output stream /// coverage here is the number of steps from (unique) other paths -void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, ostream& out_stream); +void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream); /// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage); + size_t min_coverage, bool count_cycles); vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size, - size_t min_coverage); + size_t min_coverage, bool count_cycles); } } diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp index b5ee580ff34..cb85bce8d6c 100644 --- a/src/subcommand/depth_main.cpp +++ b/src/subcommand/depth_main.cpp @@ -28,7 +28,7 @@ using namespace vg::subcommand; void help_depth(char** argv) { cerr << "usage: " << argv[0] << " depth [options] " << endl << "options:" << endl - << " packed coverage depth (print positional depths along path):" << endl + << " packed coverage depth (print 1-based positional depths along path):" << endl << " -k, --pack FILE supports created from vg pack for given input graph" << endl << " -d, --count-dels count deletion edges within the bin as covering reference positions" << endl << " GAM/GAF coverage depth (print for depth):" << endl @@ -37,8 +37,9 @@ void help_depth(char** argv) { << " -n, --max-nodes N maximum nodes to consider [1000000]" << endl << " -s, --random-seed N random seed for sampling nodes to consider" << endl << " -Q, --min-mapq N ignore alignments with mapping quality < N [0]" << endl - << " path coverage depth (print positional depths along path):" << endl + << " path coverage depth (print 1-based positional depths along path):" << endl << " activate by specifiying -p without -k" << endl + << " -c, --count-cycles count each time a path steps on a position (by default paths are only counted once)" << endl << " common options:" << endl << " -p, --ref-path NAME reference path to call on (multipile allowed. defaults to all paths)" << endl << " -P, --paths-by STR select the paths with the given name prefix" << endl @@ -55,7 +56,7 @@ int main_depth(int argc, char** argv) { } string pack_filename; - vector ref_paths; + unordered_set ref_paths_input_set; vector path_prefixes; size_t bin_size = 1; bool count_dels = false; @@ -65,6 +66,7 @@ int main_depth(int argc, char** argv) { size_t max_nodes = 1000000; int random_seed = time(NULL); size_t min_mapq = 0; + bool count_cycles = false; size_t min_coverage = 1; @@ -84,13 +86,14 @@ int main_depth(int argc, char** argv) { {"random-seed", required_argument, 0, 's'}, {"min-mapq", required_argument, 0, 'Q'}, {"min-coverage", required_argument, 0, 'm'}, + {"count-cycles", no_argument, 0, 'c'}, {"threads", required_argument, 0, 't'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:p:P:c:b:dg:a:n:s:m:t:", + c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ct:", long_options, &option_index); // Detect the end of the options. @@ -103,7 +106,7 @@ int main_depth(int argc, char** argv) { pack_filename = optarg; break; case 'p': - ref_paths.push_back(optarg); + ref_paths_input_set.insert(optarg); break; case 'P': path_prefixes.push_back(optarg); @@ -132,6 +135,9 @@ int main_depth(int argc, char** argv) { case 'm': min_coverage = parse(optarg); break; + case 'c': + count_cycles = true; + break; case 't': { int num_threads = parse(optarg); @@ -188,42 +194,58 @@ int main_depth(int argc, char** argv) { packer->load_from_file(pack_filename); } - // All paths if none given - if (ref_paths.empty() || !path_prefixes.empty()) { - graph->for_each_path_handle([&](path_handle_t path_handle) { - string path_name = graph->get_path_name(path_handle); - // just take anything if no selection - bool use_it = !Paths::is_alt(path_name) && path_prefixes.empty(); - // otherwise look for a prefix match - for (size_t i = 0; i < path_prefixes.size() && !use_it; ++i) { - if (path_name.substr(0, path_prefixes[i].length()) == path_prefixes[i] && - std::find(ref_paths.begin(), ref_paths.end(), path_name) == ref_paths.end()) { - use_it = true; - } - } - if (use_it) { - ref_paths.push_back(path_name); + // we want our paths sorted by the subpath parse so the output is sorted + map, string> ref_paths; + unordered_set base_path_set; + + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + tuple subpath_parse = Paths::parse_subpath_name(path_name); + const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name; + base_path_set.insert(base_name); + // just take anything if no selection + bool use_it = !Paths::is_alt(path_name) && path_prefixes.empty() && ref_paths_input_set.empty(); + + // then look in the input paths -p + if (!use_it && ref_paths_input_set.count(base_name)) { + use_it = true; + } + + // then look in the prefixes + for (size_t i = 0; i < path_prefixes.size() && !use_it; ++i) { + if (path_name.substr(0, path_prefixes[i].length()) == path_prefixes[i]) { + use_it = true; } - }); - } - for (const string& ref_name : ref_paths) { - if (!graph->has_path(ref_name)) { + } + if (use_it) { + auto coord = make_pair(base_name, get<2>(subpath_parse)); + assert(!ref_paths.count(coord)); + ref_paths[coord] = path_name; + } + }); + + for (const auto& ref_name : ref_paths_input_set) { + if (!base_path_set.count(ref_name)) { cerr << "error:[vg depth] Path \"" << ref_name << "\" not found in graph" << endl; } } - - for (const string& ref_path : ref_paths) { + + for (const auto& ref_coord_path : ref_paths) { + const string& ref_path = ref_coord_path.second; + const string& base_path = ref_coord_path.first.first; + const size_t subpath_offset = ref_coord_path.first.second; + if (bin_size > 1) { vector> binned_depth; if (!pack_filename.empty()) { binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels); } else { - binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage); + binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles); } for (auto& bin_cov : binned_depth) { // bins can ben nan if min_coverage filters everything out. just skip if (!isnan(get<3>(bin_cov))) { - cout << ref_path << "\t" << (get<0>(bin_cov) + 1)<< "\t" << (get<1>(bin_cov) + 1) << "\t" << get<2>(bin_cov) + cout << base_path << "\t" << (get<0>(bin_cov) + 1 + subpath_offset)<< "\t" << (get<1>(bin_cov) + 1 + subpath_offset) << "\t" << get<2>(bin_cov) << "\t" << sqrt(get<3>(bin_cov)) << endl; } } @@ -231,7 +253,7 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { algorithms::packed_depths(*packer, ref_path, min_coverage, cout); } else { - algorithms::path_depths(*graph, ref_path, min_coverage, cout); + algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, cout); } } } From cc4685f8c23eda22d30cbae8f9499bb9aa0ee898 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 25 Aug 2022 12:38:26 -0400 Subject: [PATCH 2/5] use 1-based offset consistently --- src/algorithms/coverage_depth.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index 16d6b3eadf7..c3c0b6d94ec 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -304,7 +304,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m tuple subpath_parse = Paths::parse_subpath_name(path_name); const string& base_name = get<0>(subpath_parse) ? get<1>(subpath_parse) : path_name; - size_t offset = get<2>(subpath_parse); + size_t offset = 1 + get<2>(subpath_parse); graph.for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { unordered_set path_set; From 0d815a3b633f2d4cae037d6034907f00358f60ce Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Aug 2022 08:16:30 -0400 Subject: [PATCH 3/5] trigger CI --- src/subcommand/depth_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp index cb85bce8d6c..1f584c4173d 100644 --- a/src/subcommand/depth_main.cpp +++ b/src/subcommand/depth_main.cpp @@ -44,7 +44,7 @@ void help_depth(char** argv) { << " -p, --ref-path NAME reference path to call on (multipile allowed. defaults to all paths)" << endl << " -P, --paths-by STR select the paths with the given name prefix" << endl << " -b, --bin-size N bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl - << " -m, --min-coverage N ignore nodes with less than N coverage [1]" << endl + << " -m, --min-coverage N ignore nodes with less than N coverage depth [1]" << endl << " -t, --threads N number of threads to use [all available]" << endl; } From 9674c58f4d3a6e6bcbbbabd22c594517bc872493 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Aug 2022 09:38:50 -0400 Subject: [PATCH 4/5] specifiy vg sim seed in augment test --- test/t/17_vg_augment.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index 693093bfd8a..88962afbe4d 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -86,13 +86,13 @@ rm -rf x.vg x.xg x.gcsa x.reads x.gam x.mod.vg x.trans vg construct -m 1000 -r tiny/tiny.fa >flat.vg vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg vg index -x 2snp.xg 2snp.vg -vg sim -l 30 -x 2snp.xg -n 30 -a >2snp.sim +vg sim -s 0 -l 30 -x 2snp.xg -n 30 -a >2snp.sim vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam is $(vg augment flat.vg 2snp.gam -i -S | vg paths -d -v - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg -vg sim -l 30 -x 2err.vg -n 10 -a >2err.sim +vg sim -s 0 -l 30 -x 2err.vg -n 10 -a >2err.sim vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam cat 2snp.gam 2err.gam > 4edits.gam vg augment flat.vg 2snp.gam -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes From b8a1a25ff2915ca44de64f43f5319f0c8b8282b1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Aug 2022 10:18:34 -0400 Subject: [PATCH 5/5] use nonzero seed --- test/t/17_vg_augment.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t index 88962afbe4d..65927df37dc 100644 --- a/test/t/17_vg_augment.t +++ b/test/t/17_vg_augment.t @@ -86,13 +86,13 @@ rm -rf x.vg x.xg x.gcsa x.reads x.gam x.mod.vg x.trans vg construct -m 1000 -r tiny/tiny.fa >flat.vg vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTTTCTGGAGATCTATTATACTCCAACTCTCTG/' | vg view -Fv - >2snp.vg vg index -x 2snp.xg 2snp.vg -vg sim -s 0 -l 30 -x 2snp.xg -n 30 -a >2snp.sim +vg sim -s 2323 -l 30 -x 2snp.xg -n 30 -a >2snp.sim vg index -x flat.xg -g flat.gcsa -k 16 flat.vg vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam is $(vg augment flat.vg 2snp.gam -i -S | vg paths -d -v - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes" vg view flat.vg| sed 's/CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTG/CAAATAAGGCTTGGAAATTATCTGGAGTTCTATTATATCCCAACTCTCTG/' | vg view -Fv - >2err.vg -vg sim -s 0 -l 30 -x 2err.vg -n 10 -a >2err.sim +vg sim -s 2323 -l 30 -x 2err.vg -n 10 -a >2err.sim vg map -g flat.gcsa -x flat.xg -G 2err.sim -k 8 >2err.gam cat 2snp.gam 2err.gam > 4edits.gam vg augment flat.vg 2snp.gam -S | vg view - | grep S | awk '{print $3}' | sort > 2snp_default.nodes