Skip to content

Commit

Permalink
proper mate information
Browse files Browse the repository at this point in the history
  • Loading branch information
Heng Li committed Nov 12, 2011
1 parent e06685d commit b42910a
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 76 deletions.
10 changes: 7 additions & 3 deletions bwtsw2.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
#include "bwt_lite.h"
#include "bwt.h"

#define BSW2_FLAG_MOVED 0x80
#define BSW2_FLAG_MATESW 0x100
#define BSW2_FLAG_TANDEM 0x200
#define BSW2_FLAG_MOVED 0x400

typedef struct {
int a, b, q, r, t, qr, bw;
Expand All @@ -24,11 +24,15 @@ typedef struct {
int beg, end;
} bsw2hit_t;

typedef struct {
int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize;
uint32_t *cigar;
} bsw2aux_t;

typedef struct {
int n, max;
bsw2hit_t *hits;
int *n_cigar;
uint32_t **cigar;
bsw2aux_t *aux;
} bwtsw2_t;

typedef struct {
Expand Down
182 changes: 116 additions & 66 deletions bwtsw2_aux.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ void bsw2_destroy(bwtsw2_t *b)
{
int i;
if (b == 0) return;
if (b->cigar)
for (i = 0; i < b->n; ++i) free(b->cigar[i]);
free(b->cigar); free(b->n_cigar); free(b->hits);
if (b->aux)
for (i = 0; i < b->n; ++i) free(b->aux[i].cigar);
free(b->aux); free(b->hits);
free(b);
}

Expand Down Expand Up @@ -175,18 +175,10 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pa
i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length
target = calloc(i, 1);
path = calloc(i + lq, sizeof(path_t));
// memory clean up for b
if (b->n < b->max) {
b->max = b->n;
b->hits = realloc(b->hits, b->n * sizeof(bsw2hit_t));
}
if (b->cigar) free(b->cigar);
if (b->n_cigar) free(b->n_cigar);
b->cigar = (uint32_t**)calloc(b->max, sizeof(void*));
b->n_cigar = (int*)calloc(b->max, sizeof(int));
// generate CIGAR
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
bsw2aux_t *q = b->aux + i;
uint8_t *query;
bwtint_t k;
int score, path_len, beg, end;
Expand All @@ -197,17 +189,17 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pa
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
b->cigar[i] = aln_path2cigar32(path, path_len, &b->n_cigar[i]);
q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
if (beg != 0 || end < lq) { // write soft clipping
b->cigar[i] = realloc(b->cigar[i], 4 * (b->n_cigar[i] + 2));
q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2));
if (beg != 0) {
memmove(b->cigar[i] + 1, b->cigar[i], b->n_cigar[i] * 4);
b->cigar[i][0] = beg<<4 | 4;
++b->n_cigar[i];
memmove(q->cigar + 1, q->cigar, q->n_cigar * 4);
q->cigar[0] = beg<<4 | 4;
++q->n_cigar;
}
if (end < lq) {
b->cigar[i][b->n_cigar[i]] = (lq - end)<<4 | 4;
++b->n_cigar[i];
q->cigar[q->n_cigar] = (lq - end)<<4 | 4;
++q->n_cigar;
}
}
}
Expand Down Expand Up @@ -325,7 +317,7 @@ typedef struct {
bsw2seq1_t *seq;
} bsw2seq_t;

static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
{
// FIXME: this routine does not work if the query bridge three reference sequences
int32_t coor, refl, lq;
Expand Down Expand Up @@ -398,17 +390,82 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n
return n_cigar;
}

static int est_mapq(const bsw2hit_t *p, const bsw2opt_t *opt)
static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b)
{
float c = 1.0;
int qual, subo = p->G2 > opt->t? p->G2 : opt->t;
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
if (p->n_seeds < 2) c *= .2;
qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
if (qual > 250) qual = 250;
if (qual < 0) qual = 0;
if (p->flag&1) qual = 0; // this is a random hit
return qual;
int i;
// allocate for b->aux
if (b->n<<1 < b->max) {
b->max = b->n;
kroundup32(b->max);
b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t));
}
b->aux = calloc(b->n, sizeof(bsw2aux_t));
// generate CIGAR
gen_cigar(opt, qlen, seq, pac, b);
// fix CIGAR, generate mapQ, and write chromosomal position
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = &b->hits[i];
bsw2aux_t *q = &b->aux[i];
q->flag = p->flag & 0xfe;
q->isize = 0;
if (p->l == 0) { // unique hit
float c = 1.0;
int subo;
// fix out-of-boundary CIGAR
q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar);
// compute mapQ
subo = p->G2 > opt->t? p->G2 : opt->t;
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
if (p->n_seeds < 2) c *= .2;
q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
if (q->qual > 250) q->qual = 250;
if (q->qual < 0) q->qual = 0;
if (p->flag&1) q->qual = 0; // this is a random hit
q->pqual = q->qual; // set the paired qual as qual
// get the chromosomal position
q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr);
q->pos = p->k - bns->anns[q->chr].offset;
} else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0;
}
}

static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m)
{
int i;
if (m == 0) return;
// update flag, mchr and mpos
for (i = 0; i < b->n; ++i) {
bsw2aux_t *q = &b->aux[i];
q->flag |= 1; // paired
if (m->n == 0) q->flag |= 8; // mate unmapped
if (m->n == 1) {
q->mchr = m->aux[0].chr;
q->mpos = m->aux[0].pos;
if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand
if (q->chr == q->mchr) { // set insert size
if (q->mpos + m->hits[0].len > q->pos)
q->isize = q->mpos + m->hits[0].len - q->pos;
else q->isize = q->mpos - q->pos - b->hits[0].len;
} else q->isize = 0;
} else q->mchr = q->mpos = -1;
}
// update mapping quality
if (b->n == 1 && m->n == 1) {
bsw2hit_t *p = &b->hits[0];
int isize;
if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman
if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < m->aux[0].qual)
b->aux[0].pqual = m->aux[0].qual;
} else if (p->flag&2) { // properly paired
if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat
if (b->aux[0].pqual < m->aux[0].qual) {
b->aux[0].pqual += 20;
if (b->aux[0].pqual >= m->aux[0].qual)
b->aux[0].pqual = m->aux[0].qual;
}
}
}
}
}

/* generate SAM lines for a sequence in ks with alignment stored in
Expand All @@ -429,56 +486,42 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
}
for (i = 0; b && i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
int seqid = -1;
int64_t coor = -1;
int j, nn = 0;
int beg, end;
if (p->l == 0) {
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
nn = bns_cnt_ambi(bns, p->k, p->len, &seqid);
coor = p->k - bns->anns[seqid].offset;
}
ksprintf(&str, "%s\t%d", ks->name, (p->flag&0x7f)|(is_pe?1:0));
ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1);
if (p->l == 0) {
int qual = est_mapq(p, opt);
if (is_pe && bmate && bmate->n == 1) {
int mate_qual = est_mapq(bmate->hits, opt);
if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman
qual = qual < mate_qual? qual : mate_qual;
} else if (p->flag&2) { // properly paired
if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat
if (qual < mate_qual) {
qual += 20;
if (qual >= mate_qual) qual = mate_qual;
}
}
}
}
ksprintf(&str, "\t%d\t", qual);
for (k = 0; k < b->n_cigar[i]; ++k)
ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]);
bsw2aux_t *q = b->aux + i;
int j, beg, end, type = 0;
// print mandatory fields before SEQ
ksprintf(&str, "%s\t%d", ks->name, q->flag);
ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1);
if (p->l == 0) { // not a repetitive hit
ksprintf(&str, "\t%d\t", q->pqual);
for (k = 0; k < q->n_cigar; ++k)
ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]);
} else ksprintf(&str, "\t0\t*");
ksprintf(&str, "\t*\t0\t0\t");
ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize);
// get the sequence begin and end
beg = 0; end = ks->l;
if (opt->hard_clip) {
if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4;
if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4;
if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4;
if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4;
}
for (j = beg; j < end; ++j) {
if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
else kputc(ks->seq[j], &str);
}
// print base quality if present
if (ks->qual) {
kputc('\t', &str);
for (j = beg; j < end; ++j) {
if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
else kputc(ks->qual[j], &str);
}
} else ksprintf(&str, "\t*");
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn);
// print optional tags
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds);
if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn);
if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
if (p->flag&BSW2_FLAG_MATESW) ksprintf(&str, "\tXT:i:1");
if (p->flag&BSW2_FLAG_MATESW) type |= 1;
if (p->flag&BSW2_FLAG_TANDEM) type |= 2;
if (type) ksprintf(&str, "\tXT:i:%d", type);
kputc('\n', &str);
}
ks->sam = str.s;
Expand Down Expand Up @@ -575,10 +618,13 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const
seq[0][i] = c;
seq[1][p->l-1-i] = 3 - c;
}
gen_cigar(&opt, p->l, seq, pac, buf[x]);
print_hits(bns, &opt, p, buf[x], is_pe, buf[x^1]);
write_aux(&opt, bns, p->l, seq, pac, buf[x]);
free(seq[0]);
}
for (x = 0; x < _seq->n; ++x) {
if (is_pe) update_mate_aux(buf[x], buf[x^1]);
print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]);
}
for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]);
free(buf);
bsw2_global_destroy(pool);
Expand Down Expand Up @@ -681,6 +727,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
is_pe = 1;
} else fp2 = 0, ks2 = 0, is_pe = 0;
while (kseq_read(ks) >= 0) {
if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/')
ks->name.l -= 2, ks->name.s[ks->name.l] = 0;
if (_seq->n == _seq->max) {
_seq->max = _seq->max? _seq->max<<1 : 1024;
_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
Expand All @@ -689,6 +737,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
size += ks->seq.l;
if (ks2) {
if (kseq_read(ks2) >= 0) {
if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/')
ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0;
kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
size += ks->seq.l;
} else {
Expand Down
18 changes: 12 additions & 6 deletions bwtsw2_pair.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
ksw_query_t *q;
ksw_aux_t aux[2];
// compute the region start and end
a->n_seeds = 1; a->l = 0; a->flag |= BSW2_FLAG_MATESW;
a->n_seeds = 1; a->l = 0;
if (h->is_rev == 0) {
beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
Expand Down Expand Up @@ -168,21 +168,27 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]);
if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]);
// the following enumerate all possibilities. It is tedious but necessary...
if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not
if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not;
bwtsw2_t *p[2];
int which;
if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
else p[0] = hits[i+1], p[1] = hits[i], which = 0;
if (a[which].G == 0) continue;
a[which].flag |= BSW2_FLAG_MATESW;
if (a[which].G2) a[which].flag |= BSW2_FLAG_TANDEM;
if (p[1]->max == 0) {
p[1]->max = 1;
p[1]->hits = malloc(sizeof(bsw2hit_t));
}
p[1]->hits[0] = a[which];
p[1]->n = 1;
p[0]->hits[0].flag |= 2;
p[1]->hits[0].flag |= 2;
++n_rescued;
} else { // then both ends mapped
int ori_G2[2];
//fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
ori_G2[0] = a[0].G2; ori_G2[1] = a[1].G2;
for (j = 0; j < 2; ++j) { // first fix wrong mappings
if (hits[i+j]->hits[0].G < a[j].G) { // the orginal mapping is suboptimal
a[j].G2 = a[j].G2 > hits[i+j]->hits[0].G? a[j].G2 : hits[i+j]->hits[0].G;
Expand All @@ -194,14 +200,14 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
for (j = 0; j < 2; ++j) {
if (hits[i+j]->hits[0].G2 < a[j].G2)
hits[i+j]->hits[0].G2 = a[j].G2;
if (a[j].G2) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
if (ori_G2[j]) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
hits[i+j]->hits[0].flag |= 2;
}
} else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
for (j = 0; j < 2; ++j) {
hits[i+j]->hits[0].flag |= 2;
if (hits[i+j]->hits[0].k != a[j].k)
hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM | 2;
hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
}
} else if (a[0].G || a[1].G) { // it is possible to move one end
if (a[0].G && a[1].G) { // now we have two "proper pairs"
Expand All @@ -223,14 +229,14 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
if (diff < dev * 2.) { // then move
int tflag = 0;
if (a[which].G - a[which].G2 < 2 * (opt->a + opt->b)) tflag = BSW2_FLAG_TANDEM;
if (ori_G2[which]) tflag = BSW2_FLAG_TANDEM;
a[which].G2 = a[which].G;
p[1][0] = a[which];
p[1]->flag |= BSW2_FLAG_MOVED | 2 | tflag;
p[0]->flag |= 2;
++n_moved;
}
}
} // else, do nothing
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "utils.h"

#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.0-r77-dev"
#define PACKAGE_VERSION "0.6.0-r78-dev"
#endif

static int usage()
Expand Down

0 comments on commit b42910a

Please sign in to comment.