Skip to content

Commit

Permalink
Merge pull request #3673 from vgteam/gampcompare-gmq
Browse files Browse the repository at this point in the history
Add group MAPQ to gampcompare and default multimappings to mpmap
  • Loading branch information
jeizenga authored May 31, 2022
2 parents 0477507 + 58a65e7 commit 096bfdc
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 13 deletions.
16 changes: 11 additions & 5 deletions src/subcommand/gampcompare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,10 @@ int main_gampcompare(int argc, char** argv) {
}

// A buffer we use for the TSV output
vector<vector<tuple<int64_t, bool, int64_t, string>>> buffers(get_thread_count());
vector<vector<tuple<int64_t, bool, int64_t, int64_t, string>>> buffers(get_thread_count());

// We have an output function to dump all the reads in the text buffer in TSV
auto flush_buffer = [&](vector<tuple<int64_t, bool, int64_t, string>>& buffer) {
auto flush_buffer = [&](vector<tuple<int64_t, bool, int64_t, int64_t, string>>& buffer) {
// We print exactly one header line.
static bool header_printed = false;
// Output TSV to standard out in the format plot-qq.R needs.
Expand All @@ -172,7 +172,7 @@ int main_gampcompare(int argc, char** argv) {
else {
cout << "correct";
}
cout << "\tmapped\tmq\taligner\tread" << endl;
cout << "\tmapped\tmq\tgroupmq\taligner\tread" << endl;
header_printed = true;
}
for (auto& result : buffer) {
Expand All @@ -183,7 +183,7 @@ int main_gampcompare(int argc, char** argv) {
else {
cout << (get<0>(result) <= range);
}
cout << '\t' << get<1>(result) << '\t' << get<2>(result) << '\t' << aligner_name << '\t' << get<3>(result) << endl;
cout << '\t' << get<1>(result) << '\t' << get<2>(result) << '\t' << get<3>(result) << '\t' << aligner_name << '\t' << get<4>(result) << endl;
}
buffer.clear();
};
Expand Down Expand Up @@ -230,9 +230,15 @@ int main_gampcompare(int argc, char** argv) {
correct_counts[omp_get_thread_num()]++;
}

// group mapq defaults to regular mapq
int64_t group_mapq = proto_mp_aln.mapping_quality();
if (has_annotation(proto_mp_aln, "group_mapq")) {
group_mapq = get_annotation<double>(proto_mp_aln, "group_mapq");
}

// put the result on the IO buffer
auto& buffer = buffers[omp_get_thread_num()];
buffer.emplace_back(abs_dist, proto_mp_aln.subpath_size() > 0, proto_mp_aln.mapping_quality(), move(*proto_mp_aln.mutable_name()));
buffer.emplace_back(abs_dist, proto_mp_aln.subpath_size() > 0, proto_mp_aln.mapping_quality(), group_mapq, move(*proto_mp_aln.mutable_name()));
if (buffer.size() > buffer_size) {
#pragma omp critical
flush_buffer(buffer);
Expand Down
16 changes: 13 additions & 3 deletions src/subcommand/mpmap_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ void help_mpmap(char** argv) {
//<< " --suppress-tail-anchors don't produce extra anchors when aligning to alternate paths in snarls" << endl
//<< " -T, --same-strand read pairs are from the same strand of the DNA/RNA molecule" << endl
<< " -X, --not-spliced do not form spliced alignments, even if aligning with --nt-type 'rna'" << endl
<< " -M, --max-multimaps INT report (up to) this many mappings per read [1]" << endl
<< " -M, --max-multimaps INT report (up to) this many mappings per read [10 rna / 1 dna]" << endl
<< " -a, --agglomerate-alns combine separate multipath alignments into one (possibly disconnected) alignment" << endl
<< " -r, --intron-distr FILE intron length distribution (from scripts/intron_length_distribution.py)" << endl
<< " -Q, --mq-max INT cap mapping quality estimates at this much [60]" << endl
Expand Down Expand Up @@ -253,7 +253,9 @@ int main_mpmap(int argc, char** argv) {
// How many distinct single path alignments should we look for in a multipath, for MAPQ?
// TODO: create an option.
int localization_max_paths = 5;
int max_num_mappings = 1;
int max_num_mappings = 0;
int default_dna_num_mappings = 1;
int default_rna_num_mappings = 10;
int hit_max = 1024;
int hit_max_arg = numeric_limits<int>::min();
int hard_hit_max_muliplier = 3;
Expand Down Expand Up @@ -1019,8 +1021,16 @@ int main_mpmap(int argc, char** argv) {
// can be one alignment
suppress_multicomponent_splitting = true;
}
if (max_num_mappings == 0) {
max_num_mappings = default_rna_num_mappings;
}
}
else if (nt_type == "dna") {
if (max_num_mappings == 0) {
max_num_mappings = default_dna_num_mappings;
}
}
else if (nt_type != "dna") {
else {
// DNA is the default
cerr << "error:[vg mpmap] Cannot identify sequencing type preset (-n): " << nt_type << endl;
exit(1);
Expand Down
4 changes: 2 additions & 2 deletions test/t/15_vg_surject.t
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@ is $(vg map -b minigiab/NA12878.chr22.tiny.bam -x m.xg -g m.gcsa | vg surject -p
is $(vg map -f minigiab/NA12878.chr22.tiny.fq.gz -x m.xg -g m.gcsa | vg surject -p q -x m.xg -s - | grep chr22.bin8.cram:166:6027 | grep BBBBBFBFI | wc -l) 1 "mapping reproduces qualities from fastq input"
is $(vg map -f minigiab/NA12878.chr22.tiny.fq.gz -x m.xg -g m.gcsa --gaf | vg surject -p q -x m.xg -s - -G | grep chr22.bin8.cram:166:6027 | grep BBBBBFBFI | wc -l) 1 "mapping reproduces qualities from GAF input"

is "$(zcat < minigiab/NA12878.chr22.tiny.fq.gz | head -n 4000 | vg mpmap -B -p -x m.xg -g m.gcsa -f - | vg surject -m -x m.xg -p q -s - | samtools view | wc -l)" 1000 "surject works on GAMP input"
is "$(zcat < minigiab/NA12878.chr22.tiny.fq.gz | head -n 4000 | vg mpmap -B -p -x m.xg -g m.gcsa -M 1 -f - | vg surject -m -x m.xg -p q -s - | samtools view | wc -l)" 1000 "surject works on GAMP input"

is "$(vg sim -x m.xg -n 500 -l 150 -a -s 768594 -i 0.01 -e 0.01 -p 250 -v 50 | vg view -aX - | vg mpmap -B -p -b 200 -x m.xg -g m.gcsa -i -f - | vg surject -m -x m.xg -i -p q -s - | samtools view | wc -l)" 1000 "surject works on paired GAMP input"
is "$(vg sim -x m.xg -n 500 -l 150 -a -s 768594 -i 0.01 -e 0.01 -p 250 -v 50 | vg view -aX - | vg mpmap -B -p -b 200 -x m.xg -g m.gcsa -i -M 1 -f - | vg surject -m -x m.xg -i -p q -s - | samtools view | wc -l)" 1000 "surject works on paired GAMP input"

rm -rf minigiab.vg* m.xg m.gcsa

Expand Down
4 changes: 2 additions & 2 deletions test/t/33_vg_mpmap.t
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ vg snarls -T xy.vg > xy.snarls
vg index xy.vg -x xy.xg -g xy.gcsa
vg index xy.vg -j xy.dist -s xy.snarls

vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM -i --frag-mean 50 --frag-stddev 10 >xy.sam
vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM -i --frag-mean 50 --frag-stddev 10 -M 1 >xy.sam
X_HITS="$(cat xy.sam | grep -v "^@" | cut -f3 | grep x | wc -l)"
if [ "${X_HITS}" -lt 1200 ] && [ "${X_HITS}" -gt 800 ] ; then
IN_RANGE="1"
Expand All @@ -149,7 +149,7 @@ else
fi
is "${IN_RANGE}" "1" "paired reads are evenly split between equivalent mappings"

vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM >xy.sam
vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM -M 1 >xy.sam
X_HITS="$(cat xy.sam | grep -v "^@" | cut -f3 | grep x | wc -l)"
if [ "${X_HITS}" -lt 1200 ] && [ "${X_HITS}" -gt 800 ] ; then
IN_RANGE="1"
Expand Down
2 changes: 1 addition & 1 deletion vgci/vgci.py
Original file line number Diff line number Diff line change
Expand Up @@ -1328,7 +1328,7 @@ def test_sim_mhc_snp1kg_mpmap(self):
acc_threshold=0.02, auc_threshold=0.02,
sim_opts='-d 0.01 -p 1000 -v 75.0 -S 5 -I',
sim_fastq=self._input('platinum_NA12878_MHC.fq.gz'),
more_mpmap_opts=['-u 8'])
more_mpmap_opts=['-u 8 -M 1'])

@timeout_decorator.timeout(4000)
def test_sim_yeast_cactus(self):
Expand Down

2 comments on commit 096bfdc

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 11160 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch ptx-doi-tag. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 14906 seconds

Please sign in to comment.