Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modularized VCF reading into MAT namespace #200

Merged
merged 11 commits into from
Dec 1, 2021
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ AUX_SOURCE_DIRECTORY(src/matOptimize/Profitable_Moves_Enumerators New_Profitable
AUX_SOURCE_DIRECTORY(src/matOptimize/apply_move patch_tree)
file(GLOB MATUTIL_SRCS "src/matUtils/*.cpp" "src/matUtils/*.hpp")
file(GLOB RIPPLES_SRCS "src/ripples/*.cpp" "src/ripples/*.hpp")

#set_source_files_properties(src/matOptimize/Profitable_Moves_Enumerators/upward_integrated.cpp PROPERTIES COMPILE_FLAGS -O0)
#set_source_files_properties(src/matOptimize/Profitable_Moves_Enumerators/downward_integrated.cpp PROPERTIES COMPILE_FLAGS -O0)
#set_source_files_properties(src/matOptimize/Profitable_Moves_Enumerators/range_tree.cpp PROPERTIES COMPILE_FLAGS -O0)
Expand Down Expand Up @@ -104,7 +105,9 @@ if(DEFINED Protobuf_PATH)
)
add_executable(compareVCF
src/mutation_annotated_tree.cpp
src/compareVCF.cpp)
src/compareVCF.cpp
src/usher_mapper.cpp
)


add_executable(transpose_vcf
Expand Down Expand Up @@ -215,6 +218,7 @@ else()
add_executable(compareVCF
src/mutation_annotated_tree.cpp
src/compareVCF.cpp
src/usher_mapper.cpp
${PROTO_SRCS}
${PROTO_HDRS}
)
Expand Down Expand Up @@ -325,6 +329,8 @@ else(SAVE_PROFILE)
endif(SAVE_PROFILE)
#set_property(TARGET matOptimize PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
TARGET_LINK_LIBRARIES(compareVCF PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES} ZLIB::ZLIB) # OpenMP::OpenMP_CXX)


TARGET_LINK_LIBRARIES(matOptimize PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES} ZLIB::ZLIB ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS} ${ISAL_LIB} ) # OpenMP::OpenMP_CXX)
TARGET_LINK_LIBRARIES(output_final_protobuf PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES} ZLIB::ZLIB ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS} ) # OpenMP::OpenMP_CXX)
TARGET_LINK_LIBRARIES(transpose_vcf PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES} ZLIB::ZLIB) # OpenMP::OpenMP_CXX)
Expand Down
225 changes: 225 additions & 0 deletions src/mutation_annotated_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <unistd.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include "usher_graph.hpp"

// Uses one-hot encoding if base is unambiguous
// A:1,C:2,G:4,T:8
Expand Down Expand Up @@ -1899,3 +1900,227 @@ void Mutation_Annotated_Tree::get_sample_mutation_paths (Mutation_Annotated_Tree
}
fclose(mutation_paths_file);
}

// Read vcf with existing mat
void Mutation_Annotated_Tree::read_vcf (Mutation_Annotated_Tree::Tree* T, std::string &vcf_filename, std::vector<Missing_Sample>& missing_samples) {
fprintf(stderr, "Loading VCF file\n");
// Boost library used to stream the contents of the input VCF file in
// uncompressed or compressed .gz format
std::ifstream infile(vcf_filename, std::ios_base::in | std::ios_base::binary);
if (!infile) {
fprintf(stderr, "ERROR: Could not open the VCF file: %s!\n", vcf_filename.c_str());
exit(1);
}
boost::iostreams::filtering_istream instream;
try {
if (vcf_filename.find(".gz\0") != std::string::npos) {
instream.push(boost::iostreams::gzip_decompressor());
}
instream.push(infile);
} catch(const boost::iostreams::gzip_error& e) {
std::cout << e.what() << '\n';
}

bool header_found = false;
std::vector<std::string> variant_ids;
std::vector<size_t> missing_idx;
std::string s;
// This while loop reads the VCF file line by line and populates
// missing_samples and missing_sample_mutations based on the names and
// variants of missing samples. If a sample name in the VCF is already
// found in the tree, it gets ignored with a warning message
while (instream.peek() != EOF) {
std::getline(instream, s);
std::vector<std::string> words;
string_split(s, words);
if ((not header_found) && (words.size() > 1)) {
if (words[1] == "POS") {
for (size_t j=9; j < words.size(); j++) {
variant_ids.emplace_back(words[j]);
if ((T->get_node(words[j]) == NULL) && (T->condensed_leaves.find(words[j]) == T->condensed_leaves.end())) {
missing_samples.emplace_back(Missing_Sample(words[j]));
missing_idx.emplace_back(j);
} else {
fprintf(stderr, "WARNING: Ignoring sample %s as it is already in the tree.\n", words[j].c_str());
}
}
header_found = true;
}
} else if (header_found) {
if (words.size() != 9+variant_ids.size()) {
fprintf(stderr, "ERROR! Incorrect VCF format. Expected %zu columns but got %zu.\n", 9+variant_ids.size(), words.size());
exit(1);
}
std::vector<std::string> alleles;
alleles.clear();
string_split(words[4], ',', alleles);
for (size_t k = 0; k < missing_idx.size(); k++) {
size_t j = missing_idx[k];
auto iter = missing_samples.begin();
std::advance(iter, k);
if (iter != missing_samples.end()) {
Mutation m;
m.chrom = words[0];
m.position = std::stoi(words[1]);
m.ref_nuc = get_nuc_id(words[3][0]);
assert((m.ref_nuc & (m.ref_nuc-1)) == 0); //check if it is power of 2
m.par_nuc = m.ref_nuc;
// Alleles such as '.' should be treated as missing
// data. if the word is numeric, it is an index to one
// of the alleles
if (isdigit(words[j][0])) {
int allele_id = std::stoi(words[j]);
if (allele_id > 0) {
std::string allele = alleles[allele_id-1];
if (allele[0] == 'N') {
m.is_missing = true;
m.mut_nuc = get_nuc_id('N');
} else {
auto nuc = get_nuc_id(allele[0]);
if (nuc == get_nuc_id('N')) {
m.is_missing = true;
} else {
m.is_missing = false;
}
m.mut_nuc = nuc;
}
(*iter).mutations.emplace_back(m);
}
} else {
m.is_missing = true;
m.mut_nuc = get_nuc_id('N');
(*iter).mutations.emplace_back(m);
}
if ((m.mut_nuc & (m.mut_nuc-1)) !=0) {
(*iter).num_ambiguous++;
}
}
}
}
}
}


// If called with a tree file that needs to create a MAT
void Mutation_Annotated_Tree::read_vcf(Mutation_Annotated_Tree::Tree* T, std::string &vcf_filename, std::vector<Missing_Sample>& missing_samples, bool create_new_mat) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the point of the boolean create_new_mat? It doesn't seem like you're using it inside the function?

// Vector used to store all tree nodes in breadth-first search (BFS) order
std::vector<Node*> bfs;
// Map the node identifier string to index in the BFS traversal
std::unordered_map<std::string, size_t> bfs_idx;

// Variables below used to store the different fields of the input VCF file
bool header_found = false;
std::vector<std::string> variant_ids;

// timer object to be used to measure runtimes of individual stages
Timer timer;

// Breadth-first expansion to populate bfs and bfs_idx
bfs = T->breadth_first_expansion();
for (size_t idx = 0; idx < bfs.size(); idx++) {
bfs_idx[bfs[idx]->identifier] = idx;
}

fprintf(stderr, "Loading VCF file.\n");
timer.Start();

// Boost library used to stream the contents of the input VCF file in
// uncompressed or compressed .gz format
std::ifstream infile(vcf_filename, std::ios_base::in | std::ios_base::binary);
if (!infile) {
fprintf(stderr, "ERROR: Could not open the VCF file: %s!\n", vcf_filename.c_str());
exit(1);
}
boost::iostreams::filtering_istream instream;
try {
if (vcf_filename.find(".gz\0") != std::string::npos) {
instream.push(boost::iostreams::gzip_decompressor());
}
instream.push(infile);
} catch(const boost::iostreams::gzip_error& e) {
std::cout << e.what() << '\n';
}

fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

fprintf(stderr, "Computing parsimonious assignments for input variants.\n");
timer.Start();

// A TBB flow graph containing a single source_node (reader) connected
// to several mappers. The source_node sequentially reads in the different
// lines of the input VCF file and constructs a mapper task for each
// VCF line. Each mapper task takes a mapper_input as input, which stores
// the alternate alleles, ambiguous bases and missing data (Ns) for
// different tree samples at the corresponding VCF line/position. The
// mappers use Fitch-Sankoff algorithm to assign mutations at different
// branches of the tree and update the mutation-annotated tree (T)
// accordingly.
tbb::flow::graph mapper_graph;

tbb::flow::function_node<mapper_input, int> mapper(mapper_graph, tbb::flow::unlimited, mapper_body());
tbb::flow::source_node <mapper_input> reader (mapper_graph,
[&] (mapper_input &inp) -> bool {

//check if reached end-of-file
int curr_char = instream.peek();
if(curr_char == EOF)
return false;

std::string s;
std::getline(instream, s);
std::vector<std::string> words;
string_split(s, words);
inp.variant_pos = -1;

// Header is found when "POS" is the second word in the line
if ((not header_found) && (words.size() > 1)) {
if (words[1] == "POS") {
// Sample names start from the 10th word in the header
for (size_t j=9; j < words.size(); j++) {
variant_ids.emplace_back(words[j]);
// If sample name not in tree, add it to missing_samples
if (bfs_idx.find(words[j]) == bfs_idx.end()) {
missing_samples.emplace_back(Missing_Sample(words[j]));
}
}
header_found = true;
}
} else if (header_found) {
if (words.size() != 9+variant_ids.size()) {
fprintf(stderr, "ERROR! Incorrect VCF format.\n");
exit(1);
}
std::vector<std::string> alleles;
alleles.clear();
inp.variant_pos = std::stoi(words[1]);
string_split(words[4], ',', alleles);
// T will be modified by the mapper with mutation
// annotations
inp.T = T;
inp.chrom = words[0];
inp.bfs = &bfs;
inp.bfs_idx = &bfs_idx;
inp.variant_ids = &variant_ids;
inp.missing_samples = &missing_samples;
// Ref nuc id uses one-hot encoding (A:0b1, C:0b10, G:0b100,
// T:0b1000)
inp.ref_nuc = get_nuc_id(words[3][0]);
assert((inp.ref_nuc & (inp.ref_nuc-1)) == 0); //check if it is power of 2
inp.variants.clear();
for (size_t j=9; j < words.size(); j++) {
if (isdigit(words[j][0])) {
int allele_id = std::stoi(words[j]);
if (allele_id > 0) {
std::string allele = alleles[allele_id-1];
inp.variants.emplace_back(std::make_tuple(j-9, get_nuc_id(allele[0])));
}
} else {
inp.variants.emplace_back(std::make_tuple(j-9, get_nuc_id('N')));
}
}
}
return true;
}, true );
tbb::flow::make_edge(reader, mapper);
mapper_graph.wait_for_all();
}
7 changes: 7 additions & 0 deletions src/mutation_annotated_tree.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#pragma once
#include <fstream>
#include <unordered_map>
#include <string>
Expand All @@ -20,6 +21,9 @@
#include "parsimony.pb.h"
#include "Instrumentor.h"

// Forward declaration of structs from usher_graph
struct Missing_Sample;

#if SAVE_PROFILE == 1
# define TIMEIT() InstrumentationTimer timer##__LINE__(__PRETTY_FUNCTION__);
#else
Expand Down Expand Up @@ -165,5 +169,8 @@ void get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector<st
void get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false);
void get_sample_mutation_paths (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string mutation_paths_filename);
void clear_tree(Tree& tree);

void read_vcf (Mutation_Annotated_Tree::Tree* T, std::string &vcf_filename, std::vector<Missing_Sample>& missing_samples);
void read_vcf (Mutation_Annotated_Tree::Tree* T, std::string &vcf_filename, std::vector<Missing_Sample>& missing_samples, bool create_new_mat);
}

Loading