Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mapq filter #8

Merged
merged 1 commit into from
Jul 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 59 additions & 45 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#include "path_cluster_estimates.hpp"
#include "path_estimates_writer.hpp"

const uint32_t align_path_buffer_size = 10000;
const uint32_t align_paths_buffer_size = 10000;
const uint32_t read_path_cluster_probs_buffer_size = 10;
const double prob_precision = pow(10, -8);

Expand All @@ -46,88 +46,100 @@ typedef spp::sparse_hash_map<uint32_t, spp::sparse_hash_set<uint32_t> > connecte
typedef ProducerConsumerQueue<vector<vector<AlignmentPath> > *> align_paths_buffer_queue_t;


void addAlignmentPathsToBufferQueue(vector<vector<AlignmentPath> > * align_paths_buffer, align_paths_buffer_queue_t * align_paths_buffer_queue, const double mean_fragment_length) {
void addAlignmentPathsToBuffer(const vector<AlignmentPath> & align_paths, vector<vector<AlignmentPath> > * align_paths_buffer, const double min_mapq, const double mean_fragment_length) {

for (auto & align_paths: *align_paths_buffer) {
if (!align_paths.empty()) {

if (!align_paths.empty()) {
align_paths_buffer->emplace_back(vector<AlignmentPath>());

if (align_paths.size() == 1) {
for (auto & align_path: align_paths) {

align_paths.front().seq_length = mean_fragment_length;
align_paths.front().score_sum = 1;
if (align_path.mapq_comb >= min_mapq) {

align_paths_buffer->back().emplace_back(align_path);
}
}

if (!align_paths_buffer->back().empty()) {

if (align_paths_buffer->back().size() == 1) {

align_paths_buffer->back().front().seq_length = mean_fragment_length;
align_paths_buffer->back().front().score_sum = 1;
}

sort(align_paths.begin(), align_paths.end());
sort(align_paths_buffer->back().begin(), align_paths_buffer->back().end());

} else {

align_paths_buffer->pop_back();
}
}

align_paths_buffer_queue->push(align_paths_buffer);
}

template<class AlignmentType>
void findAlignmentPaths(ifstream & alignments_istream, align_paths_buffer_queue_t * align_paths_buffer_queue, const PathsIndex & paths_index, const FragmentLengthDist & fragment_length_dist, const uint32_t num_threads) {
void findAlignmentPaths(ifstream & alignments_istream, align_paths_buffer_queue_t * align_paths_buffer_queue, const PathsIndex & paths_index, const FragmentLengthDist & fragment_length_dist, const double min_mapq, const uint32_t num_threads) {

AlignmentPathFinder<AlignmentType> align_path_finder(paths_index, fragment_length_dist.maxLength());

auto threaded_align_path_buffer = vector<vector<vector<AlignmentPath > > *>(num_threads);
auto threaded_align_paths_buffer = vector<vector<vector<AlignmentPath > > *>(num_threads);

for (auto & align_path_buffer: threaded_align_path_buffer) {
for (auto & align_paths_buffer: threaded_align_paths_buffer) {

align_path_buffer = new vector<vector<AlignmentPath > >();
align_path_buffer->reserve(align_path_buffer_size);
align_paths_buffer = new vector<vector<AlignmentPath > >();
align_paths_buffer->reserve(align_paths_buffer_size);
}

vg::io::for_each_parallel<AlignmentType>(alignments_istream, [&](AlignmentType & alignment) {

auto align_path_buffer = threaded_align_path_buffer.at(omp_get_thread_num());
align_path_buffer->emplace_back(align_path_finder.findAlignmentPaths(alignment));
vector<vector<AlignmentPath > > * align_paths_buffer = threaded_align_paths_buffer.at(omp_get_thread_num());
addAlignmentPathsToBuffer(align_path_finder.findAlignmentPaths(alignment), align_paths_buffer, min_mapq, fragment_length_dist.mean());

if (align_path_buffer->size() == align_path_buffer_size) {
if (align_paths_buffer->size() == align_paths_buffer_size) {

addAlignmentPathsToBufferQueue(align_path_buffer, align_paths_buffer_queue, fragment_length_dist.mean());
align_paths_buffer_queue->push(align_paths_buffer);

threaded_align_path_buffer.at(omp_get_thread_num()) = new vector<vector<AlignmentPath > >();
threaded_align_path_buffer.at(omp_get_thread_num())->reserve(align_path_buffer_size);
threaded_align_paths_buffer.at(omp_get_thread_num()) = new vector<vector<AlignmentPath > >();
threaded_align_paths_buffer.at(omp_get_thread_num())->reserve(align_paths_buffer_size);
}
});

for (auto & align_path_buffer: threaded_align_path_buffer) {
for (auto & align_paths_buffer: threaded_align_paths_buffer) {

addAlignmentPathsToBufferQueue(align_path_buffer, align_paths_buffer_queue, fragment_length_dist.mean());
align_paths_buffer_queue->push(align_paths_buffer);
}
}

template<class AlignmentType>
void findPairedAlignmentPaths(ifstream & alignments_istream, align_paths_buffer_queue_t * align_paths_buffer_queue, const PathsIndex & paths_index, const FragmentLengthDist & fragment_length_dist, const uint32_t num_threads) {
void findPairedAlignmentPaths(ifstream & alignments_istream, align_paths_buffer_queue_t * align_paths_buffer_queue, const PathsIndex & paths_index, const FragmentLengthDist & fragment_length_dist, const double min_mapq, const uint32_t num_threads) {

AlignmentPathFinder<AlignmentType> align_path_finder(paths_index, fragment_length_dist.maxLength());

auto threaded_align_path_buffer = vector<vector<vector<AlignmentPath > > *>(num_threads);
auto threaded_align_paths_buffer = vector<vector<vector<AlignmentPath > > *>(num_threads);

for (auto & align_path_buffer: threaded_align_path_buffer) {
for (auto & align_paths_buffer: threaded_align_paths_buffer) {

align_path_buffer = new vector<vector<AlignmentPath > >();
align_path_buffer->reserve(align_path_buffer_size);
align_paths_buffer = new vector<vector<AlignmentPath > >();
align_paths_buffer->reserve(align_paths_buffer_size);
}

vg::io::for_each_interleaved_pair_parallel<AlignmentType>(alignments_istream, [&](AlignmentType & alignment_1, AlignmentType & alignment_2) {

auto align_path_buffer = threaded_align_path_buffer.at(omp_get_thread_num());
align_path_buffer->emplace_back(align_path_finder.findPairedAlignmentPaths(alignment_1, alignment_2));
vector<vector<AlignmentPath > > * align_paths_buffer = threaded_align_paths_buffer.at(omp_get_thread_num());
addAlignmentPathsToBuffer(align_path_finder.findPairedAlignmentPaths(alignment_1, alignment_2), align_paths_buffer, min_mapq, fragment_length_dist.mean());

if (align_path_buffer->size() == align_path_buffer_size) {
if (align_paths_buffer->size() == align_paths_buffer_size) {

addAlignmentPathsToBufferQueue(align_path_buffer, align_paths_buffer_queue, fragment_length_dist.mean());
threaded_align_path_buffer.at(omp_get_thread_num()) = new vector<vector<AlignmentPath > >();
threaded_align_path_buffer.at(omp_get_thread_num())->reserve(align_path_buffer_size);
align_paths_buffer_queue->push(align_paths_buffer);

threaded_align_paths_buffer.at(omp_get_thread_num()) = new vector<vector<AlignmentPath > >();
threaded_align_paths_buffer.at(omp_get_thread_num())->reserve(align_paths_buffer_size);
}
});

for (auto & align_path_buffer: threaded_align_path_buffer) {
for (auto & align_paths_buffer: threaded_align_paths_buffer) {

addAlignmentPathsToBufferQueue(align_path_buffer, align_paths_buffer_queue, fragment_length_dist.mean());
align_paths_buffer_queue->push(align_paths_buffer);
}
}

Expand All @@ -139,11 +151,10 @@ void addAlignmentPathsBufferToIndexes(align_paths_buffer_queue_t * align_paths_b

for (auto & align_paths: *align_paths_buffer) {

if (!align_paths.empty()) {
assert(!align_paths.empty());

auto threaded_align_paths_index_it = align_paths_index->emplace(align_paths, 0);
threaded_align_paths_index_it.first->second++;
}
auto threaded_align_paths_index_it = align_paths_index->emplace(align_paths, 0);
threaded_align_paths_index_it.first->second++;
}

delete align_paths_buffer;
Expand Down Expand Up @@ -220,6 +231,7 @@ int main(int argc, char* argv[]) {
options.add_options("Probability")
("m,frag-mean", "mean for fragment length distribution", cxxopts::value<double>())
("d,frag-sd", "standard deviation for fragment length distribution", cxxopts::value<double>())
("q,filt-mapq-prob", "filter alignments with a mapq error probability above value", cxxopts::value<double>()->default_value("0.002"))
("b,prob-output", "write read path probabilities to file", cxxopts::value<string>())
;

Expand Down Expand Up @@ -391,26 +403,28 @@ int main(int argc, char* argv[]) {

thread indexing_thread(addAlignmentPathsBufferToIndexes, align_paths_buffer_queue, &align_paths_index);

const double min_mapq = prob_to_phred(option_results["filt-mapq-prob"].as<double>());

if (is_single_end) {

if (is_multipath) {

findAlignmentPaths<vg::MultipathAlignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, num_threads);
findAlignmentPaths<vg::MultipathAlignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, min_mapq, num_threads);

} else {

findAlignmentPaths<vg::Alignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, num_threads);
findAlignmentPaths<vg::Alignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, min_mapq, num_threads);
}

} else {

if (is_multipath) {

findPairedAlignmentPaths<vg::MultipathAlignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, num_threads);
findPairedAlignmentPaths<vg::MultipathAlignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, min_mapq, num_threads);

} else {

findPairedAlignmentPaths<vg::Alignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, num_threads);
findPairedAlignmentPaths<vg::Alignment>(alignments_istream, align_paths_buffer_queue, paths_index, fragment_length_dist, min_mapq, num_threads);
}
}

Expand Down
21 changes: 19 additions & 2 deletions src/path_abundance_estimator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ void PathAbundanceEstimator::estimate(PathClusterEstimates * path_cluster_estima
Eigen::ColVectorXd noise_probs;
Eigen::RowVectorXui read_counts;

constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true, 2);
constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true);

path_cluster_estimates->initEstimates(path_cluster_estimates->paths.size() + 1, 0, false);

Expand Down Expand Up @@ -79,6 +79,23 @@ void PathAbundanceEstimator::EMAbundanceEstimator(PathClusterEstimates * path_cl

prev_abundances = path_cluster_estimates->abundances;
}

double abundances_sum = 0;

for (size_t i = 0; i < path_cluster_estimates->abundances.cols(); ++i) {

if (path_cluster_estimates->abundances(0, i) < min_abundances) {

path_cluster_estimates->abundances(0, i) = 0;
}

abundances_sum += path_cluster_estimates->abundances(0, i);
}

if (abundances_sum > 0) {

path_cluster_estimates->abundances = path_cluster_estimates->abundances / abundances_sum;
}
}

void PathAbundanceEstimator::removeNoiseAndRenormalizeAbundances(PathClusterEstimates * path_cluster_estimates) const {
Expand Down Expand Up @@ -249,7 +266,7 @@ void NestedPathAbundanceEstimator::estimate(PathClusterEstimates * path_cluster_
Eigen::ColVectorXd noise_probs;
Eigen::RowVectorXui read_counts;

constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true, 2);
constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true);

noise_probs = read_path_probs.col(read_path_probs.cols() - 1);

Expand Down
53 changes: 25 additions & 28 deletions src/path_estimator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ bool probabilityCountColSorter(const pair<Eigen::ColVectorXd, uint32_t> & lhs, c

PathEstimator::PathEstimator(const double prob_precision_in) : prob_precision(prob_precision_in) {}

void PathEstimator::constructProbabilityMatrix(Eigen::ColMatrixXd * read_path_probs, Eigen::ColVectorXd * noise_probs, Eigen::RowVectorXui * read_counts, const vector<ReadPathProbabilities> & cluster_probs, const bool add_noise, const double max_noise_prob) {
void PathEstimator::constructProbabilityMatrix(Eigen::ColMatrixXd * read_path_probs, Eigen::ColVectorXd * noise_probs, Eigen::RowVectorXui * read_counts, const vector<ReadPathProbabilities> & cluster_probs, const bool add_noise) {

assert(!cluster_probs.empty());

Expand All @@ -58,54 +58,51 @@ void PathEstimator::constructProbabilityMatrix(Eigen::ColMatrixXd * read_path_pr

for (size_t i = 0; i < read_path_probs->rows(); ++i) {

if (cluster_probs.at(i).noiseProbability() <= max_noise_prob) {
assert(cluster_probs.at(i).probabilities().size() + static_cast<uint32_t>(add_noise) == read_path_probs->cols());

assert(cluster_probs.at(i).probabilities().size() + static_cast<uint32_t>(add_noise) == read_path_probs->cols());
for (size_t j = 0; j < cluster_probs.at(i).probabilities().size(); ++j) {

for (size_t j = 0; j < cluster_probs.at(i).probabilities().size(); ++j) {

(*read_path_probs)(num_rows, j) = cluster_probs.at(i).probabilities().at(j);
}

if (add_noise) {

(*read_path_probs)(num_rows, cluster_probs.at(i).probabilities().size()) = cluster_probs.at(i).noiseProbability();
}
(*read_path_probs)(num_rows, j) = cluster_probs.at(i).probabilities().at(j);
}

(*noise_probs)(num_rows, 0) = cluster_probs.at(i).noiseProbability();
(*read_counts)(0, num_rows) = cluster_probs.at(i).readCount();
if (add_noise) {

if (num_rows > 0) {
(*read_path_probs)(num_rows, cluster_probs.at(i).probabilities().size()) = cluster_probs.at(i).noiseProbability();
}

bool is_identical = true;
(*noise_probs)(num_rows, 0) = cluster_probs.at(i).noiseProbability();
(*read_counts)(0, num_rows) = cluster_probs.at(i).readCount();

for (size_t j = 0; j < read_path_probs->cols(); ++j) {
if (num_rows > 0) {

if (abs((*read_path_probs)(num_rows - 1, j) - (*read_path_probs)(num_rows, j)) >= prob_precision) {
bool is_identical = true;

is_identical = false;
break;
}
}
for (size_t j = 0; j < read_path_probs->cols(); ++j) {

if (abs((*noise_probs)(num_rows - 1, 0) - (*noise_probs)(num_rows, 0)) >= prob_precision) {
if (abs((*read_path_probs)(num_rows - 1, j) - (*read_path_probs)(num_rows, j)) >= prob_precision) {

is_identical = false;
break;
}
}

if (is_identical) {
if (abs((*noise_probs)(num_rows - 1, 0) - (*noise_probs)(num_rows, 0)) >= prob_precision) {

(*read_counts)(0, num_rows - 1) += (*read_counts)(0, num_rows);
is_identical = false;
}

} else {
if (is_identical) {

num_rows++;
}
(*read_counts)(0, num_rows - 1) += (*read_counts)(0, num_rows);

} else {

num_rows++;
}

} else {

num_rows++;
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/path_estimator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class PathEstimator {

const double prob_precision;

void constructProbabilityMatrix(Eigen::ColMatrixXd * read_path_probs, Eigen::ColVectorXd * noise_probs, Eigen::RowVectorXui * read_counts, const vector<ReadPathProbabilities> & cluster_probs, const bool add_noise, const double max_noise_prob);
void constructProbabilityMatrix(Eigen::ColMatrixXd * read_path_probs, Eigen::ColVectorXd * noise_probs, Eigen::RowVectorXui * read_counts, const vector<ReadPathProbabilities> & cluster_probs, const bool add_noise);
void addNoiseAndNormalizeProbabilityMatrix(Eigen::ColMatrixXd * read_path_probs, const Eigen::ColVectorXd & noise_probs);

void collapseProbabilityMatrixReads(Eigen::ColMatrixXd * read_path_probs, Eigen::RowVectorXui * read_counts);
Expand Down
4 changes: 2 additions & 2 deletions src/path_posterior_estimator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ void PathPosteriorEstimator::estimate(PathClusterEstimates * path_cluster_estima
Eigen::ColVectorXd noise_probs;
Eigen::RowVectorXui read_counts;

constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true, 2);
constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true);

noise_probs = read_path_probs.col(read_path_probs.cols() - 1);
read_path_probs.conservativeResize(read_path_probs.rows(), read_path_probs.cols() - 1);
Expand Down Expand Up @@ -42,7 +42,7 @@ void PathGroupPosteriorEstimator::estimate(PathClusterEstimates * path_cluster_e
Eigen::ColVectorXd noise_probs;
Eigen::RowVectorXui read_counts;

constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true, 2);
constructProbabilityMatrix(&read_path_probs, &noise_probs, &read_counts, cluster_probs, true);

noise_probs = read_path_probs.col(read_path_probs.cols() - 1);
read_path_probs.conservativeResize(read_path_probs.rows(), read_path_probs.cols() - 1);
Expand Down