-
Notifications
You must be signed in to change notification settings - Fork 3
/
taf_norm.c
388 lines (354 loc) · 17 KB
/
taf_norm.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
/*
* maf_norm: Normalize a taf or maf alignment to remove unnecessary blocks
*
* Released under the MIT license, see LICENSE.txt
*/
#include "taf.h"
#include "sonLib.h"
#include <getopt.h>
#include <time.h>
int64_t maximum_block_length_to_merge = 200;
int64_t maximum_gap_length = 30;
int64_t minimum_shared_rows = 1;
float fraction_shared_rows = 0.0;
static int64_t repeat_coordinates_every_n_columns = 10000;
static void usage(void) {
fprintf(stderr, "taffy norm [options]\n");
fprintf(stderr, "Normalize a taf format alignment to remove small blocks using the -m and -n options to determine what to merge \n");
fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n");
fprintf(stderr, "-o --outputFile : Output taf file. If not specified outputs to stdout\n");
fprintf(stderr, "-l --logLevel : Set the log level\n");
fprintf(stderr, "-k --maf : Print maf output instead of taf\n");
fprintf(stderr, "-m --maximumBlockLengthToMerge : Only merge together any two adjacent blocks if one or both is less than this many bases long, by default: %" PRIi64 "\n", maximum_block_length_to_merge);
fprintf(stderr, "-n --maximumGapLength : Only merge together two adjacent blocks if the total number of unaligned bases between the blocks is less than this many bases, by default: %" PRIi64 "\n", maximum_gap_length);
fprintf(stderr, "-Q --minimumSharedRows : The minimum number of rows between two blocks that need to be shared for a merge, default: %" PRIi64 "\n", minimum_shared_rows);
fprintf(stderr, "-q --fractionSharedRows : The fraction of rows between two blocks that need to be shared for a merge, default: %f\n", fraction_shared_rows);
fprintf(stderr, "-d --filterGapCausingDupes : Reduce the number of MAF blocks by filtering out rows that induce gaps > maximumGapLength. Rows are only filtered out if they are duplications (contig of same name appears elsewhere in block, or contig with same prefix up to \".\" appears in the same block).\n");
fprintf(stderr, "-s --repeatCoordinatesEveryNColumns : Repeat coordinates of each sequence at least every n columns. By default: %" PRIi64 "\n", repeat_coordinates_every_n_columns);
fprintf(stderr, "-c --useCompression : Write the output using bgzip compression.\n");
fprintf(stderr, "-a --halFile : HAL file for extracting gap sequence (MAF must be created with hal2maf *without* --onlySequenceNames)\n");
fprintf(stderr, "-b --seqFiles : Fasta files for extracting gap sequence. Do not specify both this option and --halFile\n");
fprintf(stderr, "-h --help : Print this help message\n");
}
static Alignment *get_next_taf_block(LI *li, bool run_length_encode_bases) {
static Alignment *alignments[3];
static int64_t alignment_index=0;
assert(alignment_index >= 0);
while(alignment_index < 3) {
alignments[alignment_index] = taf_read_block(alignment_index == 0 ? NULL : alignments[alignment_index-1],
run_length_encode_bases, li); // Read a block
if(alignments[alignment_index] == NULL) { // The read block is empty
break;
}
alignment_index++;
}
assert(alignment_index >= 0);
if(alignment_index == 0) {
return NULL;
}
Alignment *block = alignments[0];
// Shift down the remaining alignments
alignments[0] = alignments[1];
alignments[1] = alignments[2];
alignments[2] = NULL;
// Reduce the alignment index
alignment_index--;
return block;
}
// prototype logic to try to reduce the gap delta by greedily filtering out
// dupes with the biggest gaps. if it doesn't find enough dupes to remove
// to cover gap_delta, it returns false and does nothing. otherwise it returns
// true and removes the rows.
static bool greedy_prune_by_gap(Alignment *alignment, int64_t maximum_gap_length) {
// map row ptr to sample name, using everything up to first "." of sequence name
// (which is quite hacky but not sure there's a choice)
stHash *row_to_sample_name = stHash_construct2(NULL, free);
// hash sample name to number of rows with the sample
stHash *sample_to_count = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, NULL, free);
int64_t i = 0;
for (Alignment_Row *row = alignment->row; row != NULL; row = row->n_row, ++i) {
char *dot = strchr(row->sequence_name, '.');
char *sample_name = NULL;
if (dot != NULL) {
sample_name = stString_getSubString(row->sequence_name, 0, dot - row->sequence_name);
} else {
sample_name = stString_copy(row->sequence_name);
}
stHash_insert(row_to_sample_name, row, sample_name);
int64_t *count = stHash_search(sample_to_count, sample_name);
if (count == NULL) {
count = (int64_t*)malloc(sizeof(int64_t));
*count = 1;
stHash_insert(sample_to_count, sample_name, count);
} else {
++(*count);
}
}
// check that all rows with gaps > max_gap are dupes, ie we can merge a pruned alignment
i = 0;
bool can_prune = true;
// all rows that are dupes exceeding gap length are stored here
stList *to_prune = stList_construct();
// we only filter dupes if there's at least one row that would not get filtered
// remember when that happens here
stSet *samples_passing_gap_once = stSet_construct3(stHash_stringKey, stHash_stringEqualKey, NULL);
for (Alignment_Row *row = alignment->row; row != NULL && can_prune; row = row->n_row, ++i) {
int64_t gap = 0;
if (row->l_row != NULL && alignment_row_is_predecessor(row->l_row, row)) {
gap = row->start - (row->l_row->start + row->l_row->length);
}
char *sample_name = (char*)stHash_search(row_to_sample_name, row);
if (gap > maximum_gap_length) {
int64_t *count = stHash_search(sample_to_count, sample_name);
if (*count > 1 && row != alignment->row) {
stList_append(to_prune, row);
} else {
can_prune = false;
}
} else {
stSet_insert(samples_passing_gap_once, sample_name);
}
}
// if there isn't at least one copy that we can merge without dropping, then we
// don't bother as we don't want to drop coverage below 1 for any sample
int64_t no_to_prune = stList_length(to_prune);
for (i = 0; can_prune && i < no_to_prune; ++i) {
Alignment_Row *row_to_prune = stList_get(to_prune, i);
char *sample_name = stHash_search(row_to_sample_name, row_to_prune);
assert(sample_name);
if (stSet_search(samples_passing_gap_once, sample_name) == NULL) {
can_prune = false;
}
}
bool pruned = false;
if (can_prune) {
// remove the rows
assert(stList_length(to_prune) > 0);
Alignment_Row *p_row = NULL;
int64_t to_prune_idx = 0;
i = 0;
Alignment_Row *row_to_prune = stList_get(to_prune, to_prune_idx);
Alignment_Row *row = alignment->row;
while (row) {
Alignment_Row *n_row = row->n_row;
if (row == row_to_prune) {
assert(p_row != NULL);
p_row->n_row = row->n_row;
alignment_row_destruct(row);
++to_prune_idx;
row_to_prune = to_prune_idx < stList_length(to_prune) ? stList_get(to_prune, to_prune_idx) : NULL;
--alignment->row_number;
pruned = true;
} else {
p_row = row;
}
row = n_row;
++i;
}
}
stHash_destruct(row_to_sample_name);
stHash_destruct(sample_to_count);
stList_destruct(to_prune);
stSet_destruct(samples_passing_gap_once);
return pruned;
}
int taf_norm_main(int argc, char *argv[]) {
time_t startTime = time(NULL);
/*
* Arguments/options
*/
char *logLevelString = NULL;
char *inputFile = NULL;
char *outputFile = NULL;
bool run_length_encode_bases = 0;
bool output_maf = 0;
bool use_compression = 0;
bool filter_gap_causing_dupes = 0;
stList *fasta_files = stList_construct();
char *hal_file = NULL;
///////////////////////////////////////////////////////////////////////////
// Parse the inputs
///////////////////////////////////////////////////////////////////////////
while (1) {
static struct option long_options[] = { { "logLevel", required_argument, 0, 'l' },
{ "inputFile", required_argument, 0, 'i' },
{ "outputFile", required_argument, 0, 'o' },
{ "maf", no_argument, 0, 'k' },
{ "help", no_argument, 0, 'h' },
{ "maximumBlockLengthToMerge", required_argument, 0, 'm' },
{ "maximumGapLength", required_argument, 0, 'n' },
{ "fractionSharedRows", required_argument, 0, 'q' },
{ "minimumSharedRows", required_argument, 0, 'Q' },
{ "filterGapCausingDupes", no_argument, 0, 'd' },
{ "repeatCoordinatesEveryNColumns", required_argument, 0, 's' },
{ "useCompression", no_argument, 0, 'c' },
{ "halFile", required_argument, 0, 'a' },
{ "seqFiles", required_argument, 0, 'b' },
{ 0, 0, 0, 0 } };
int option_index = 0;
int64_t key = getopt_long(argc, argv, "l:i:o:hcm:n:dkQ:q:s:a:b:", long_options, &option_index);
if (key == -1) {
break;
}
switch (key) {
case 'l':
logLevelString = optarg;
break;
case 'i':
inputFile = optarg;
break;
case 'o':
outputFile = optarg;
break;
case 'h':
usage();
return 0;
case 'k':
output_maf = 1;
break;
case 'm':
maximum_block_length_to_merge = atol(optarg);
break;
case 'n':
maximum_gap_length = atol(optarg);
break;
case 'd':
filter_gap_causing_dupes = 1;
break;
case 'Q':
minimum_shared_rows = atol(optarg);
break;
case 'q':
fraction_shared_rows = atof(optarg);
break;
case 'c':
use_compression = 1;
break;
case 's':
repeat_coordinates_every_n_columns = atol(optarg);
break;
case 'a':
hal_file = optarg;
break;
case 'b':
// Parse the set of sequence files (this is a bit fragile - files can not start with a '-' character)
optind--;
for( ;optind < argc && *argv[optind] != '-'; optind++){
stList_append(fasta_files, argv[optind]);
}
break;
default:
usage();
return 1;
}
}
//////////////////////////////////////////////
//Log the inputs
//////////////////////////////////////////////
st_setLogLevelFromString(logLevelString);
st_logInfo("Input file string : %s\n", inputFile);
st_logInfo("Output file string : %s\n", outputFile);
st_logInfo("Maximum block length to merge : %" PRIi64 "\n", maximum_block_length_to_merge);
st_logInfo("Maximum gap length : %" PRIi64 "\n", maximum_gap_length);
st_logInfo("Filter gap-causing dupes : %d\n", (int)filter_gap_causing_dupes);
st_logInfo("Output maf : %s\n", output_maf ? "true" : "false");
st_logInfo("Repeat coordinates every n bases : %" PRIi64 "\n", repeat_coordinates_every_n_columns);
st_logInfo("Fraction shared rows to merge adjacent blocks : %f\n", fraction_shared_rows);
st_logInfo("Write compressed output : %s\n", use_compression ? "true" : "false");
if (hal_file) {
st_logInfo("HAL file string : %s\n", hal_file);
} else {
st_logInfo("Number of input FASTA files : %ld\n", argc - optind);
}
//////////////////////////////////////////////
// Read in the sequences if joining over unaligned gaps
//////////////////////////////////////////////
stHash *fastas_map = NULL;
stSet *hal_species = NULL;
int hal_handle = -1;
if (hal_file) {
hal_species = load_sequences_from_hal_file(hal_file, &hal_handle);
}
else if(stList_length(fasta_files) > 0) {
fastas_map = load_sequences_from_fasta_files(stList_getBackingArray(fasta_files), stList_length(fasta_files));
stList_destruct(fasta_files);
}
//////////////////////////////////////////////
// Read in the taf blocks and merge blocks that are sufficiently small
//////////////////////////////////////////////
FILE *input = inputFile == NULL ? stdin : fopen(inputFile, "r");
LW *output = LW_construct(outputFile == NULL ? stdout : fopen(outputFile, "w"), use_compression);
LI *li = LI_construct(input);
// Pass the header line to determine parameters and write the updated taf header
Tag *tag = taf_read_header_2(li, &run_length_encode_bases);
if(output_maf && run_length_encode_bases) { // Remove this tag from the maf output as not relevant
tag = tag_remove(tag, "run_length_encode_bases");
}
output_maf ? maf_write_header(tag, output) : taf_write_header(tag, output);
tag_destruct(tag);
Alignment *alignment, *p_alignment = NULL, *p_p_alignment = NULL;
while((alignment = get_next_taf_block(li, run_length_encode_bases)) != NULL) {
if(p_alignment != NULL) {
// First realign the rows in case we in the process of merging prior blocks we have
// identified rows that can be merged
alignment_link_adjacent(p_alignment, alignment, 1);
bool merged = false;
int64_t common_rows = alignment_number_of_common_rows(p_alignment, alignment);
int64_t total_rows = alignment->row_number + p_alignment->row_number - common_rows;
if (common_rows >= minimum_shared_rows &&
common_rows >= total_rows * fraction_shared_rows &&
(alignment_length(p_alignment) <= maximum_block_length_to_merge ||
alignment_length(alignment) <= maximum_block_length_to_merge)) {
int64_t max_gap = alignment_max_gap_length(p_alignment);
if (max_gap > maximum_gap_length && filter_gap_causing_dupes) {
// try to greedily filter dupes in order to get the gap length down
bool was_pruned = greedy_prune_by_gap(alignment, maximum_gap_length);
max_gap = alignment_max_gap_length(p_alignment);
assert(was_pruned == (max_gap <= maximum_gap_length));
}
if (max_gap <= maximum_gap_length) {
if(hal_species || fastas_map) { // Now add in any gap bases if sequences are provided
alignment_add_gap_strings(p_alignment, alignment, fastas_map, hal_handle, hal_species, -1);
}
p_alignment = alignment_merge_adjacent(p_alignment, alignment);
merged = true;
}
}
if (!merged) {
output_maf ? maf_write_block(p_alignment, output) : taf_write_block(p_p_alignment, p_alignment, run_length_encode_bases, repeat_coordinates_every_n_columns, output); // Write the maf block
if(p_p_alignment != NULL) {
alignment_destruct(p_p_alignment, 1); // Clean up the left-most block
}
p_p_alignment = p_alignment;
p_alignment = alignment;
}
}
else {
p_alignment = alignment;
}
}
if(p_alignment != NULL) {
output_maf ? maf_write_block(p_alignment, output) : taf_write_block(p_p_alignment, p_alignment, run_length_encode_bases, -1, output); // Write the last taf block
alignment_destruct(p_alignment, 1);
if(p_p_alignment != NULL) {
alignment_destruct(p_p_alignment, 1);
}
}
//////////////////////////////////////////////
// Cleanup
//////////////////////////////////////////////
if(inputFile != NULL) {
fclose(input);
}
LW_destruct(output, outputFile != NULL);
if (fastas_map) {
stHash_destruct(fastas_map);
}
if (hal_species) {
stSet_destruct(hal_species);
}
st_logInfo("taffy norm is done, %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime);
//while(1);
//assert(0);
return 0;
}