-
Notifications
You must be signed in to change notification settings - Fork 24
/
assembler.c
383 lines (346 loc) · 16.5 KB
/
assembler.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
/* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers.
Copyright (C) 2011-2012 Andre Masella
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include "pandaseq.h"
#include "algo.h"
#include "assembler.h"
#include "buffer.h"
#include "misc.h"
#include "module.h"
#include "prob.h"
#include "table.h"
#define LOG(flag, code) do { if(panda_debug_flags & flag) panda_log_proxy_write(assembler->logger, (code), assembler, &assembler->result.name, NULL); } while(0)
#define LOGV(flag, code, fmt, ...) do { if(panda_debug_flags & flag) { snprintf(static_buffer(), BUFFER_SIZE, fmt, __VA_ARGS__); panda_log_proxy_write(assembler->logger, (code), assembler, &assembler->result.name, static_buffer()); }} while(0)
typedef unsigned int bitstype;
#define FOR_BITS_IN_LIST(bits,index) for (index = 0; index < bits##_size; index++) if ((bits)[index / sizeof(bitstype) / 8] & (1 << (index % (8 * sizeof(bitstype)))))
#define BIT_LIST_SET(bits,index) do { if ((index) >= 0 && (size_t)(index) < bits##_size) { (bits)[(index) / sizeof(bitstype) / 8] |= (1 << ((index) % (8 * sizeof(bitstype)))); } } while (0)
#define BIT_LIST_GET(bits,index) ((bits)[(index) / sizeof(bitstype) / 8] & (1 << ((index) % (8 * sizeof(bitstype)))))
#define BITS_INIT(bits,size) bitstype bits[(size) / 8 / sizeof(bitstype) + 1]; size_t bits##_size = (size); memset(&bits, 0, ((size) / 8 / sizeof(bitstype) + 1) * sizeof(bitstype))
#define ALL_BITS_IF_NONE(bits) do { bitstype _all = 0; size_t _bitctr; for (_bitctr = 0; _bitctr < ((bits##_size) / 8 / sizeof(bitstype) + 1); _bitctr++) { _all |= (bits)[_bitctr]; } if (_all == 0) { memset(&bits, 0xFF, (bits##_size / 8 / sizeof(bitstype) + 1) * sizeof(bitstype)); }} while (0)
#define VEEZ(x) ((x) < 0 ? 0 : (x))
#define WEDGEZ(x) ((x) > 0 ? 0 : (x))
/* Try to align forward and reverse reads and return the quality of the aligned sequence and the sequence itself. */
static bool align(
PandaAssembler assembler,
panda_result_seq *result) {
size_t i, j;
ptrdiff_t df, dr;
/* Cache all algorithm information. */
double qual_nn = assembler->algo->clazz->prob_unpaired;
void *algo_data = panda_algorithm_data(assembler->algo);
PandaComputeOverlap overlap_probability = assembler->algo->clazz->overlap_probability;
PandaComputeMatch match_probability = assembler->algo->clazz->match_probability;
/* For determining overlap. */
size_t maxoverlap = result->forward_length + result->reverse_length - assembler->minoverlap - result->forward_offset - result->reverse_offset - 1;
double bestprobability = qual_nn * (result->forward_length + result->reverse_length);
ptrdiff_t bestoverlap = -1;
size_t counter;
kmer_it it;
size_t unmasked_forward_length;
size_t unmasked_reverse_length;
/* For computing new sequence. */
double fquality = 0;
double oquality = 0;
double rquality = 0;
ptrdiff_t len;
if (assembler->minoverlap + result->forward_offset >= result->forward_length || assembler->minoverlap + result->reverse_offset >= result->reverse_length) {
LOG(PANDA_DEBUG_BUILD, PANDA_CODE_NEGATIVE_SEQUENCE_LENGTH);
return false;
}
if (assembler->maxoverlap == 0) {
maxoverlap = result->forward_length < result->reverse_length ? result->forward_length : result->reverse_length;
} else if (maxoverlap > assembler->maxoverlap) {
maxoverlap = assembler->maxoverlap;
}
BITS_INIT(posn, assembler->minoverlap <= maxoverlap ? (maxoverlap - assembler->minoverlap + 1) : 1);
if (result->forward_length >= (1 << (8 * sizeof(seqindex)))) {
LOG(PANDA_DEBUG_BUILD, PANDA_CODE_INSUFFICIENT_KMER_TABLE);
return false;
}
/* Scan forward sequence building k-mers and appending the position to kmerseen[k] */
FOREACH_KMER(it, result->forward,.nt) {
LOGV(PANDA_DEBUG_KMER, PANDA_CODE_FORWARD_KMER, "%zd@%zd", KMER(it), KMER_POSITION(it));
for (j = 0; j < assembler->num_kmers && assembler->kmerseen[(KMER(it) << 1) + j] != 0; j++) ;
if (j == assembler->num_kmers) {
/* If we run out of storage, we lose k-mers. */
LOGV(PANDA_DEBUG_BUILD, PANDA_CODE_LOST_KMER, "%zd@%zd", KMER(it), KMER_POSITION(it));
} else {
assembler->kmerseen[(KMER(it) * assembler->num_kmers) + j] = KMER_POSITION(it);
}
}
/* Scan reverse sequence building k-mers. For each position in the forward sequence for this kmer (i.e., kmerseen[k]), flag that we should check the corresponding overlap. */
FOREACH_KMER_REVERSE(it, result->reverse,.nt) {
LOGV(PANDA_DEBUG_KMER, PANDA_CODE_REVERSE_KMER, "%zd@%zd", KMER(it), KMER_POSITION(it));
for (j = 0; j < assembler->num_kmers && assembler->kmerseen[(KMER(it) * assembler->num_kmers) + j] != (seqindex) 0; j++) {
int index = result->forward_length + result->reverse_length - KMER_POSITION(it) - assembler->kmerseen[(KMER(it) * assembler->num_kmers) + j] - assembler->minoverlap - 1;
BIT_LIST_SET(posn, index);
}
}
/* Reset kmerseen */
FOREACH_KMER(it, result->forward,.nt) {
for (j = 0; j < assembler->num_kmers; j++)
assembler->kmerseen[(KMER(it) * assembler->num_kmers) + j] = 0;
}
ALL_BITS_IF_NONE(posn);
result->overlaps_examined = 0;
/* Compute the quality of the overlapping region for the various overlaps and pick the best one. */
FOR_BITS_IN_LIST(posn, counter) {
double probability;
size_t overlap = counter + assembler->minoverlap;
probability = overlap_probability(algo_data, result->forward, result->forward_length, result->reverse, result->reverse_length, overlap);
LOGV(PANDA_DEBUG_RECON, PANDA_CODE_OVERLAP_POSSIBILITY, "overlap = %zd probability = %f", overlap, probability);
if (probability > bestprobability && overlap >= assembler->minoverlap) {
bestprobability = probability;
bestoverlap = overlap;
}
result->overlaps_examined++;
}
if (result->overlaps_examined == maxoverlap - assembler->minoverlap + 1) {
assembler->slowcount++;
}
LOGV(PANDA_DEBUG_BUILD, PANDA_CODE_BEST_OVERLAP, "%zd", bestoverlap);
if (bestoverlap == -1) {
return false;
}
/* Compute the correct alignment and the quality score of the entire sequence. */
len = result->forward_length - (ptrdiff_t) result->forward_offset - bestoverlap + result->reverse_length - (ptrdiff_t) result->reverse_offset + 1;
if (len <= 0) {
LOG(PANDA_DEBUG_BUILD, PANDA_CODE_NEGATIVE_SEQUENCE_LENGTH);
return false;
}
if ((size_t) len > 2 * PANDA_MAX_LEN) {
LOG(PANDA_DEBUG_BUILD, PANDA_CODE_SEQUENCE_TOO_LONG);
return false;
}
result->sequence_length = len - 1;
result->degenerates = 0;
df = (ptrdiff_t) result->forward_length - (ptrdiff_t) result->forward_offset - bestoverlap;
dr = (ptrdiff_t) result->reverse_length - (ptrdiff_t) result->reverse_offset - bestoverlap;
/* Copy the unpaired forward sequence. */
LOGV(PANDA_DEBUG_RECON, PANDA_CODE_RECONSTRUCTION_PARAM, "bestoverlap = %zd, dforward = %zd, dreverse = %zd, len = %zd", bestoverlap, df, dr, len);
for (i = 0; i < (size_t) VEEZ(df); i++) {
int findex = i + result->forward_offset;
panda_nt fbits = result->forward[findex].nt;
double q = qual_score[PHREDCLAMP(result->forward[findex].qual)];
result->sequence[i].nt = fbits;
result->sequence[i].p = q;
if (PANDA_NT_IS_DEGN(fbits)) {
result->degenerates++;
}
fquality += q;
LOGV(PANDA_DEBUG_RECON, PANDA_CODE_BUILD_FORWARD, "S[%zd] = F[%d] = %c", i, findex, panda_nt_to_ascii(result->sequence[i].nt));
}
/* Mask out the B-cliff at the end of sequences */
for (unmasked_forward_length = result->forward_length; unmasked_forward_length > 0 && result->forward[unmasked_forward_length - 1].qual == (char) 2; unmasked_forward_length--) ;
for (unmasked_reverse_length = result->reverse_length; unmasked_reverse_length > 0 && result->reverse[unmasked_reverse_length - 1].qual == (char) 2; unmasked_reverse_length--) ;
/* Copy the paired sequence adjusting the probabilities based on the quality information from both sequences. */
result->overlap_mismatches = 0;
for (i = 0; i < (size_t) (bestoverlap + WEDGEZ(df) + WEDGEZ(dr)); i++) {
int index = VEEZ(df) + i;
int findex = result->forward_offset + VEEZ(df) + i;
int rindex = result->reverse_length - i - 1 + WEDGEZ(df);
bool ismatch = (result->reverse[rindex].nt & result->forward[findex].nt) != '\0';
double fpr;
double rpr;
double q;
char nt;
if (index < 0 || findex < 0 || rindex < 0 || (size_t) findex >= result->forward_length || (size_t) rindex >= result->reverse_length)
continue;
fpr = (size_t) findex >= unmasked_forward_length ? qual_nn : qual_score[PHREDCLAMP(result->forward[findex].qual)];
rpr = (size_t) rindex >= unmasked_reverse_length ? qual_nn : qual_score[PHREDCLAMP(result->reverse[rindex].qual)];
if (!ismatch) {
LOGV(PANDA_DEBUG_MISMATCH, PANDA_CODE_MISMATCHED_BASE, "(F[%d] = %c) != (R[%d] = %c)", findex, panda_nt_to_ascii(result->forward[findex].nt), rindex, panda_nt_to_ascii(result->reverse[rindex].nt));
result->overlap_mismatches++;
}
if ((size_t) findex >= unmasked_forward_length && (size_t) rindex >= unmasked_reverse_length) {
q = qual_nn;
} else if ((size_t) findex >= unmasked_forward_length) {
q = rpr;
} else if ((size_t) rindex >= unmasked_reverse_length) {
q = fpr;
} else {
q = match_probability(algo_data, ismatch, result->forward[findex].qual, result->reverse[rindex].qual);
}
if (ismatch) {
nt = (result->reverse[rindex].nt & result->forward[findex].nt);
} else {
if (result->forward[findex].qual < result->reverse[rindex].qual) {
nt = result->reverse[rindex].nt;
} else {
nt = result->forward[findex].nt;
}
}
result->sequence[index].nt = nt;
result->sequence[index].p = q;
if (PANDA_NT_IS_DEGN(nt)) {
result->degenerates++;
}
oquality += q;
LOGV(PANDA_DEBUG_RECON, PANDA_CODE_BUILD_OVERLAP, "S[%d] = %c, F[%d] = %c, R[%d] = %c", index, panda_nt_to_ascii(nt), findex, panda_nt_to_ascii(result->forward[findex].nt), rindex, panda_nt_to_ascii(result->reverse[rindex].nt));
}
/* Copy the unpaired reverse sequence. */
for (i = 0; i < (size_t) VEEZ(dr); i++) {
int index = df + bestoverlap + i;
int rindex = result->reverse_length - bestoverlap - i - 1;
panda_nt rbits = result->reverse[rindex].nt;
double q = qual_score[PHREDCLAMP(result->reverse[rindex].qual)];
rquality += q;
result->sequence[index].nt = rbits;
result->sequence[index].p = q;
if (PANDA_NT_IS_DEGN(rbits)) {
result->degenerates++;
}
LOGV(PANDA_DEBUG_RECON, PANDA_CODE_BUILD_REVERSE, "S[%d] = R[%d] = %c", index, rindex, panda_nt_to_ascii(result->sequence[index].nt));
}
result->quality = (fquality + rquality + oquality) / len;
result->overlap = bestoverlap;
result->estimated_overlap_probability = bestprobability;
return true;
}
bool assemble_seq(
PandaAssembler assembler) {
assembler->count++;
if (assembler->result.forward_length < 2 || assembler->result.reverse_length < 2) {
assembler->badreadcount++;
return false;
}
if (!module_precheckseq(assembler, &assembler->result.name, assembler->result.forward, assembler->result.forward_length, assembler->result.reverse, assembler->result.reverse_length)) {
return false;
}
if (!assembler->post_primers) {
if (assembler->forward_primer_length > 0) {
assembler->result.forward_offset = panda_compute_offset_qual(assembler->threshold, assembler->primer_penalty, false, assembler->result.forward, assembler->result.forward_length, assembler->forward_primer, assembler->forward_primer_length);
if (assembler->result.forward_offset == 0) {
LOG(PANDA_DEBUG_STAT, PANDA_CODE_NO_FORWARD_PRIMER);
assembler->nofpcount++;
return false;
}
assembler->result.forward_offset--;
} else {
assembler->result.forward_offset = assembler->forward_trim;
}
if (assembler->reverse_primer_length > 0) {
assembler->result.reverse_offset = panda_compute_offset_qual(assembler->threshold, assembler->primer_penalty, false, assembler->result.reverse, assembler->result.reverse_length, assembler->reverse_primer, assembler->reverse_primer_length);
if (assembler->result.reverse_offset == 0) {
LOG(PANDA_DEBUG_STAT, PANDA_CODE_NO_REVERSE_PRIMER);
assembler->norpcount++;
return false;
}
assembler->result.reverse_offset--;
} else {
assembler->result.reverse_offset = assembler->reverse_trim;
}
} else {
assembler->result.forward_offset = 0;
assembler->result.reverse_offset = 0;
}
if (((assembler->result.forward_length < assembler->result.reverse_length) ? assembler->result.forward_length : assembler->result.reverse_length) < assembler->minoverlap) {
assembler->badreadcount++;
return false;
}
if (!align(assembler, &assembler->result)) {
if (assembler->noalgn != NULL) {
assembler->noalgn(assembler, &assembler->result.name, assembler->result.forward, assembler->result.forward_length, assembler->result.reverse, assembler->result.reverse_length, assembler->noalgn_data);
}
assembler->noalgncount++;
return false;
}
if (assembler->post_primers) {
size_t it;
if (assembler->forward_primer_length > 0) {
assembler->result.forward_offset = panda_compute_offset_result(assembler->threshold, assembler->primer_penalty, false, assembler->result.sequence, assembler->result.sequence_length, assembler->forward_primer, assembler->forward_primer_length);
if (assembler->result.forward_offset == 0) {
LOG(PANDA_DEBUG_STAT, PANDA_CODE_NO_FORWARD_PRIMER);
assembler->nofpcount++;
return false;
}
assembler->result.forward_offset--;
} else {
assembler->result.forward_offset = assembler->forward_trim;
}
if (assembler->reverse_primer_length > 0) {
assembler->result.reverse_offset = panda_compute_offset_result(assembler->threshold, assembler->primer_penalty, true, assembler->result.sequence, assembler->result.sequence_length, assembler->reverse_primer, assembler->reverse_primer_length);
if (assembler->result.reverse_offset == 0) {
LOG(PANDA_DEBUG_STAT, PANDA_CODE_NO_REVERSE_PRIMER);
assembler->norpcount++;
return false;
}
assembler->result.reverse_offset--;
} else {
assembler->result.reverse_offset = assembler->reverse_trim;
}
if (assembler->result.sequence_length <= assembler->result.forward_offset + assembler->result.reverse_offset) {
LOG(PANDA_DEBUG_STAT, PANDA_CODE_NO_FORWARD_PRIMER);
assembler->nofpcount++;
return false;
}
assembler->result.sequence_length -= assembler->result.forward_offset + assembler->result.reverse_offset;
for (it = 0; it < assembler->result.sequence_length; it++) {
assembler->result.sequence[it] = assembler->result.sequence[it + assembler->result.forward_offset];
}
}
if (assembler->result.quality < assembler->threshold) {
assembler->lowqcount++;
LOGV(PANDA_DEBUG_STAT, PANDA_CODE_LOW_QUALITY_REJECT, "%f < %f", exp(assembler->result.quality), exp(assembler->threshold));
return false;
}
if (module_checkseq(assembler, &assembler->result)) {
assembler->okcount++;
assembler->overlapcount[assembler->result.overlap]++;
if (assembler->longest_overlap < assembler->result.overlap) {
assembler->longest_overlap = assembler->result.overlap;
}
return true;
}
return false;
}
const panda_result_seq *panda_assembler_next(
PandaAssembler assembler) {
if (assembler->next == NULL) {
return NULL;
}
while (true) {
if (!assembler->next(&assembler->result.name, &assembler->result.forward, &assembler->result.forward_length, &assembler->result.reverse, &assembler->result.reverse_length, assembler->next_data)) {
return NULL;
}
assert(assembler->result.forward_length <= PANDA_MAX_LEN);
assert(assembler->result.reverse_length <= PANDA_MAX_LEN);
if (assemble_seq(assembler)) {
return &assembler->result;
}
}
return NULL;
}
const panda_result_seq *panda_assembler_assemble(
PandaAssembler assembler,
panda_seq_identifier *id,
const panda_qual *forward,
size_t forward_length,
const panda_qual *reverse,
size_t reverse_length) {
assert(forward_length <= PANDA_MAX_LEN);
assert(reverse_length <= PANDA_MAX_LEN);
assembler->result.name = *id;
assembler->result.forward_length = forward_length;
assembler->result.reverse_length = reverse_length;
assembler->result.forward = forward;
assembler->result.reverse = reverse;
return assemble_seq(assembler) ? &assembler->result : NULL;
}