Skip to content

Commit

Permalink
Amalgamate multiple CIGAR ops into single entry. (#1607)
Browse files Browse the repository at this point in the history
Amalgamate multiple CIGAR ops into single entry.

Multiple matching (or sequence (mis)matching)) ops (e.g. 10M40M) give a different VCF using BAQ than a single operation of the same length (e.g. 50M).  This change compresses the multiple operations into one.
  • Loading branch information
whitwham authored and vasudeva8 committed May 30, 2023
1 parent d6faf46 commit 86b3809
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 2 deletions.
22 changes: 21 additions & 1 deletion realn.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* realn.c -- BAQ calculation and realignment.
Copyright (C) 2009-2011, 2014-2016, 2018, 2021 Genome Research Ltd.
Copyright (C) 2009-2011, 2014-2016, 2018, 2021, 2023 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <[email protected]>
Expand Down Expand Up @@ -268,8 +268,28 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) {
// tseq,tref are no longer needed, so we can steal them to avoid mallocs
uint8_t *left = tseq;
uint8_t *rght = tref;
int len = 0;

for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
int op = cigar[k]&0xf, l = cigar[k]>>4;

// concatenate alignment matches (including sequence (mis)matches)
// otherwise 50M50M gives a different result to 100M
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
if ((k + 1) < c->n_cigar) {
int next_op = bam_cigar_op(cigar[k + 1]);

if (next_op == BAM_CMATCH || next_op == BAM_CEQUAL || next_op == BAM_CDIFF) {
len += l;
continue;
}
}

// last of M/X/= ops
l += len;
len = 0;
}

if (l == 0) continue;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
// Sanity check running off the end of the sequence
Expand Down
2 changes: 1 addition & 1 deletion sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -5227,7 +5227,7 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
uint32_t *cigar = bam_get_cigar(b);
int k;
// determine the current CIGAR operation
//fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
//fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
if (s->k == -1) { // never processed
p->qpos = 0;
if (c->n_cigar == 1) { // just one operation, save a loop
Expand Down
2 changes: 2 additions & 0 deletions test/realn03.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>MX
CGTCTACTACG
1 change: 1 addition & 0 deletions test/realn03.fa.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
MX 11 4 11 12
4 changes: 4 additions & 0 deletions test/realn03.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.6 SO:coordinate
@SQ SN:MX LN:11
M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII
X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII
4 changes: 4 additions & 0 deletions test/realn03_exp.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.6 SO:coordinate
@SQ SN:MX LN:11
M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D
X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D
3 changes: 3 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,9 @@ sub test_realn {

# Revert quality values (using data in ZQ tags)
test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam");

# Make sure multiple matches are treated the same way as a single match of the same length.
test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn03.fa -e -i $$opts{path}/realn03.sam -o -", out => "realn03_exp.sam");
}

sub test_bcf_set_variant_type
Expand Down

0 comments on commit 86b3809

Please sign in to comment.