Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to output closest relative(s) per sample #218

Merged
merged 4 commits into from
Mar 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 48 additions & 1 deletion src/matUtils/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ po::variables_map parse_extract_command(po::parsed_options parsed) {
"Add exactly W samples at random to your selection. Affected by -Z and overridden by -z.")
("select-nearest,Y", po::value<size_t>()->default_value(0),
"Set to add to the sample selection the y nearest samples to each of your samples, without duplicates.")
("closest-relatives,V", po::value<std::string>()->default_value(""),
"Write a tsv file of the closest relative(s) (in mutations) of each selected sample to the indicated file. All equidistant closest samples are included unless --break-ties is set.")
("break-ties,q", po::bool_switch(),
"Only output one closest relative per sample (used only with --closest-relatives). If multiple closest relatives are equidistant, the lexicographically smallest sample ID is chosen.")
("dump-metadata,Q", po::value<std::string>()->default_value(""),
"Set to write all final stored metadata to a tsv.")
("whitelist,L", po::value<std::string>()->default_value(""),
Expand Down Expand Up @@ -158,6 +162,7 @@ void extract_main (po::parsed_options parsed) {
size_t add_random = vm["add-random"].as<size_t>();
size_t select_nearest = vm["select-nearest"].as<size_t>();
float x_scale = vm["x-scale"].as<float>();
bool break_ties = vm["break-ties"].as<bool>();
bool include_nt = vm["include-nt"].as<bool>();

boost::filesystem::path path(dir_prefix);
Expand All @@ -172,6 +177,7 @@ void extract_main (po::parsed_options parsed) {
std::string sample_path_filename = dir_prefix + vm["sample-paths"].as<std::string>();
std::string clade_path_filename = dir_prefix + vm["clade-paths"].as<std::string>();
std::string all_path_filename = dir_prefix + vm["all-paths"].as<std::string>();
std::string closest_relatives_filename = dir_prefix + vm["closest-relatives"].as<std::string>();
std::string tree_filename = dir_prefix + vm["write-tree"].as<std::string>();
std::string vcf_filename = dir_prefix + vm["write-vcf"].as<std::string>();
std::string output_mat_filename = dir_prefix + vm["write-mat"].as<std::string>();
Expand All @@ -184,6 +190,7 @@ void extract_main (po::parsed_options parsed) {
std::string gtf_filename = dir_prefix + vm["input-gtf"].as<std::string>();
std::string fasta_filename = dir_prefix + vm["input-fasta"].as<std::string>();
std::string dump_metadata = dir_prefix + vm["dump-metadata"].as<std::string>();


std::vector<std::string> additional_meta_fields;
MAT::string_split(raw_meta_fields, ',', additional_meta_fields);
Expand All @@ -198,7 +205,7 @@ void extract_main (po::parsed_options parsed) {
return f != dir_prefix;
}) &&
usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
if (nearest_k_batch_file == "") {
if (nearest_k_batch_file == "" && closest_relatives_filename == dir_prefix) {
fprintf(stderr, "ERROR: No output files requested!\n");
exit(1);
}
Expand Down Expand Up @@ -674,6 +681,45 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
}, ap) ;
fprintf(stderr, "%ld batch sample jsons written in %ld msec.\n\n", batch_samples.size(), timer.Stop());
}
if (closest_relatives_filename != dir_prefix) {
fprintf(stderr, "Per-sample closest relative(s) requested. Computing...\n");
if (break_ties) {
fprintf(stderr, "Storing one closest relative per sample.\n");
}
std::ofstream out(closest_relatives_filename);

timer.Start();
std::vector<std::string> closest_relatives;

for (std::string sample : samples) {
std::pair<std::vector<std::string>, size_t> closest_relatives_pair = get_closest_samples(&T, sample);
std::vector<std::string> closest_relatives = closest_relatives_pair.first;
size_t dist = closest_relatives_pair.second;
if (closest_relatives.size() > 0) {
closest_relatives = closest_relatives_pair.first;
std::string s = "";
s += sample + '\t';
std::string lex_smallest_sample = closest_relatives[0];
for (std::string relative : closest_relatives) {
if (break_ties) {
if (relative < lex_smallest_sample) {
lex_smallest_sample = relative;
}
} else {
s += relative + ',';
}
}
if (break_ties) {
s += lex_smallest_sample;
} else {
s = s.substr(0, s.size() - 1);
}
s += '\t' + std::to_string(dist);
out << s << "\n";
}
}
fprintf(stderr, "TSV of closest relative written to %s in %ld msec.\n\n", closest_relatives_filename.c_str(), timer.Stop());
}
//if json output AND sample context is requested, add an additional metadata column which simply indicates the focal sample versus context
if ((json_filename != "") && (nearest_k != "")) {
std::unordered_map<std::string,std::string> conmap;
Expand Down Expand Up @@ -704,6 +750,7 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
submet["selected_for"] = ranmap;
catmeta.emplace_back(submet);
}

//last step is to convert the subtree to other file formats
if (vcf_filename != dir_prefix) {
fprintf(stderr, "Generating VCF of final tree\n");
Expand Down
92 changes: 92 additions & 0 deletions src/matUtils/select.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,3 +428,95 @@ std::vector<std::string> fill_random_samples(MAT::Tree* T, std::vector<std::stri
}
return filled_samples;
}


void closest_samples_dfs(MAT::Node *node, MAT::Node *target, size_t path_length, size_t max_path_length, std::vector<std::pair<MAT::Node *, size_t>> &leaves) {
if (path_length > max_path_length) {
return;
}
for (auto child : node->children) {
if (child->is_leaf()) {
leaves.push_back(std::make_pair(child, path_length + child->branch_length));
} else {
closest_samples_dfs(child, target, path_length + child->branch_length, max_path_length, leaves);
}
}
}
std::pair<std::vector<std::string>, size_t> get_closest_samples(MAT::Tree* T, std::string nid) {
// Returns a pair with (1) a vector of closest nodes to a target and (2) the distance from the target node
std::pair<std::vector<std::string>, size_t> closest_samples;

MAT::Node *target = T->get_node(nid);
MAT::Node *curr_target = T->get_node(nid);

if (!target) {
fprintf(stderr, "WARNING: Node %s not found in tree\n", nid.c_str());
return closest_samples;
}
MAT::Node *parent = target->parent;

size_t min_dist = std::numeric_limits<size_t>::max();
size_t dist_to_orig_parent = 0; // cumulative distance to the parent of the initial target

bool go_up = true;
while (go_up && parent) {

size_t parent_branch_length = parent->branch_length + dist_to_orig_parent;
// make a vector of siblings of the current target.
// for siblings that are internal nodes, add leaves in the descendant subtree
// as pseudo-children if they are close enough
std::vector<std::pair<MAT::Node *, size_t>> children_and_distances;

size_t min_of_sibling_leaves = std::numeric_limits<size_t>::max();
for (auto child : parent->children) {
if (child->is_leaf()) {
if (child->identifier == curr_target->identifier) {
continue; // skip the target node
}
size_t child_branch_length = child->branch_length;
if (child_branch_length < min_of_sibling_leaves) {
min_of_sibling_leaves = child_branch_length;
}
}
}
for (auto child : parent->children) {
if (child->identifier == curr_target->identifier) {
continue; // skip the target node
}

if (!child->is_leaf()) {
// for internal nodes, descend the tree, adding leaves as they are
// encountered, restricting path lengths to less than the minimum of
// the sibling leaves at the current level
size_t dist_so_far = child->branch_length;
closest_samples_dfs(child, target, dist_so_far, min_of_sibling_leaves, children_and_distances);

} else { // leaf node
children_and_distances.push_back(std::make_pair(child, dist_to_orig_parent + child->branch_length));
}
}

for (std::pair child_and_dist : children_and_distances) {
// for the siblings of the target node, if any branch lengths
// are shorter than the path up a level, we can stop
MAT::Node *child = child_and_dist.first;
size_t child_branch_length = child_and_dist.second + dist_to_orig_parent;
if (child_branch_length < parent_branch_length) {
go_up = false;
}
if (child_branch_length < min_dist) {
min_dist = child_branch_length;
closest_samples.first.clear();
closest_samples.first.push_back(child->identifier);
closest_samples.second = min_dist + target->branch_length;

} else if (child_branch_length == min_dist) {
closest_samples.first.push_back(child->identifier);
}
}
curr_target = parent;
parent = curr_target->parent;
dist_to_orig_parent += parent_branch_length;
}
return closest_samples;
}
1 change: 1 addition & 0 deletions src/matUtils/select.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ std::vector<std::string> get_short_paths(MAT::Tree* T, std::vector<std::string>
std::unordered_map<std::string,std::unordered_map<std::string,std::string>> read_metafile(std::string metainf, std::set<std::string> samples_to_use);
std::vector<std::string> get_sample_match(MAT::Tree* T, std::vector<std::string> samples_to_check, std::string substring);
std::vector<std::string> fill_random_samples(MAT::Tree* T, std::vector<std::string> current_samples, size_t target_size, bool lca_limit = false);
std::pair<std::vector<std::string>, size_t> get_closest_samples(MAT::Tree* T, std::string nid);