-
Notifications
You must be signed in to change notification settings - Fork 0
/
Build_bw_nova.cpp
3099 lines (2669 loc) · 92.6 KB
/
Build_bw_nova.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// BRAT is a whole genome bisulfite sequence mapping program
// Copyright (C) 2009 Elena Yavorska Harris
// Mapping is done to Concatenation of two genome versions:
// First is Complement of genome ( reverse of reverse-complement of genome is complement of genome)
// Second is Reverse of genome
// Reverse is taken to be able to map 5' end of reads (since BWT alignment is done from right-to-left)
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
/*
Same seeds as in Brat-1.1.4, but less space since it treats chroms as one continuous genome
*/
#include<iomanip>
#include<map>
#include<bitset>
#include<queue>
#include<ctime>
#include<list>
#include<cmath>
#include<vector>
#include<string.h>
#include<iostream>
#include<fstream>
#include<sstream>
#include<algorithm>
#include<assert.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
using namespace std;
typedef unsigned long int _uli;
typedef unsigned int _ui;
typedef unsigned short int _usi;
typedef vector<_uli>::iterator _uli_iter;
_ui min(_ui x, _ui y){
if(x < y)
return x;
else
return y;
}//min
void usage(){
string mes("\nUSAGE: build_bw -r <references> -P <prefix> \nOptions:\n");
string gen_ref_opt(" -r <references-file> file with file names containing references (one\n reference per file)");
string prefix( " -P <prefix> name of absolute path name for output files");
cout << mes << endl;
cout << gen_ref_opt << endl << prefix << endl;
string mes2( "Limitations:\n");
string ref_size( " a single reference size <= 2^45");
string chrom_ids( " number of references <= 2^18");
string space( " memory <= 2GB; space on hard disk <= 3GB");
cout << mes2 << endl;
cout << ref_size << endl << chrom_ids << endl << space << endl;
}//usage()
class Anchor{
public:
Anchor(int argc, char* argv[]);
~Anchor();
//TOP
vector<string> names;//for references
vector<unsigned long int> mask;
vector<_uli> C;//#occ of As and Ts in ta-genome
unsigned long int* ht_indices;//ht_idices
vector<_uli> ht_sizes;
vector< pair<_ui, _ui> > ranges;
vector<_ui> ranges_total;
vector< pair<_ui, _ui> > sub_bucket;//ranges<st, en> for sub-buckets
vector<_ui> sub_bucket_sum;//sum of indices in each sub-bucket range
vector<_uli> bucket_components;
vector<_uli> bc_indices;//pointers to indices of bucket_components
vector<_uli> end_of_suffix;//if > 0, then this component contains suffix after *value* bits, the less bits, the earlier in sorting order
vector<_ui> prev_same_comp_ids;
vector<_uli> bwt;//cat-representation, C->T
vector<_ui> bwt_marked;//mark those 1-bit bwt positions per char whose pos indices are stored
vector<_ui> pos_strand;//nova 1bit per base: 0 for 1st half of genome (- strand), 1 for 2nd half of genome with Reverse(original strand) (+ strand)
//pos_strand should be as bwt_marked
vector<_ui> marked_ns;
vector<_ui> num_occ_sqr;//to keep # occ of As in a current bucket of size 4096 bits, cumulative sum from 0 to current - 1 block
//need to make sure that neither of C[i] for any character is greater than 2^32 (otherwise, num_occ_sqr must be _uli)
vector< vector<_usi> > num_occ; // # occ of As in 64 sub-buckets of current bucket
vector<_uli> num_occ_sqr_sum;// num_occ_sqr_sum[k] = sum of entries of num_occ_sqr from 0 to k-1
vector< _ui > pos_indices;//key = pos inside a 2^16-size bucket, value = genome pos;
vector<_usi> pos_num_occ;//as num_occ for char count in bwt, this one is for count of 1s in bwt_marked
_ui pos_num_sqr;//as num_occ_sqr for char count in bwt, this one is for count of 1s in bwt_marked
//pos_num_sqr will not work for STEP=1, for STEP = 1, must be _uli
_uli pos_num_occ_sqr_sum;//similar to num_occ_sqr_sum
_ui pos_indices_ind;//index to reference pos_indices from 0 to 65,536 (ok _ui)
vector<_ui> size_chrom;//contains size of each chromosome, ok _ui
vector<_uli> starts_size_chrom;
map<_ui, string> chroms_names;//don't need map: can do in vector, _ui is ok since is in range 0...num_chroms
vector<_uli> chroms_starts;//for ta/cg/ns-genomes only
vector<_uli> starts_chroms_starts;
//auxiliary to prevent huge stacking during bucket sort
queue< pair< pair<_uli_iter, _uli_iter>, pair<_ui, _ui> > > not_sorted;
unsigned long int* ta_genome;
unsigned long int* cg_genome;
unsigned long int* ns_genome;
unsigned long int* full_genome;
unsigned long int* bwt_genome;
int parse_options(int argc, char* argv[]);
void read_genome();
void init_num_occ();//add here pos_num_occ too
void init_pos_indices();//pos_marked
void init_marked();//for bwt_marked, marked_ns, pos_strand
_uli get_ta_lmer(_uli cur_ind, _uli asize);
_uli get_ns_lmer(_uli cur_ind, _uli asize);
void make_mask();
void collect_ranges(_ui st, _ui en, _ui two_in_bucket, vector< pair<_ui, _ui> > &buckets, vector<_ui> &bucket_sum, vector<_uli> &ht_sizes_dif);
void collect_bucket_indices(_ui st, _ui en, _ui range_ind);
void init_comp_indices(_ui st, _ui en, _ui sub_bucket_size);
void update_component(_uli_iter st, _uli_iter en, _ui &component, const _ui &sub_bucket_st);
void update_component_first(_uli_iter st, _uli_iter en, _ui component, const _ui &sub_bucket_st);
_uli get_component(_uli cur_ind, _uli asize, _uli comp_id, _uli &bits_before_suffix);
void get_pivot_value(_uli_iter st, _uli_iter en, _uli &value, _ui &total_left);
void quick_multikeys(_ui component, _uli_iter start, _uli_iter fin, const _ui &sub_bucket_st, _ui total_comp);
void quick_multikeys(vector< _uli > &lengths, _uli_iter start, _uli_iter fin);
_uli_iter partition_multikeys(vector< _uli > &lengths, _uli_iter st, _uli_iter en, _ui &total_left);
void sort_small_range(_uli_iter st, _uli_iter en);
_uli_iter partition_multikeys(_uli_iter st, _uli_iter en, _ui &total_left);
void update_end_of_suffix(_uli_iter &st, _uli_iter en);
_uli get_char( _uli index );
void update_bwt(_ui sub_bucket_st);
void finish_update_bwt(_ui sub_bucket_st);
void collect_subbuckets(vector<_ui> &bucket_sum);
_uli_iter find_first_equal_range(_uli_iter st, _uli_iter en, _ui &total_left);
void flash_num_occ(_uli upto);
void flash_bwt();
void flash_pos_indices(_uli upto);
void flash_marked_ns();
void flash_marked();
void flash_genome(ofstream &out, _uli *genome, _uli asize);
void output_info();
void usi_to_char(_usi, char abuffer[]);
void uli_to_char(_uli, char abuffer[]);
void ui_to_char(_ui, char abuffer[]);
_uli Occ(_uli ch, _uli sp, vector< vector<_ui> > &num_occ_sqr);
void exact_match(_uli ta_read, _uli &sp, _uli &ep, vector< vector<_ui> > &num_occ_sqr);//sp=1, ep = 0, remainer = 0: init
_uli get_bwt_lmer(_uli sp, _uli asize, _uli ch);
void read_num_occ_sqr(vector< vector<_ui> > &num_occ_sqr);
void read_num_occ(vector< vector<_usi> > &num_occ);
void read_bwt_genome();
void precalculate_seeds();
_uli count_mism(unsigned long int bits);
ofstream out_bwt;
ofstream out_num_occ;
ofstream out_num_sqr;
ofstream out_ta_genome;
ofstream out_cg_genome;
ofstream out_ns_genome;
ofstream out_pos_strand;//nova
ofstream out_pos_indices;
ofstream out_bwt_marked;
ofstream out_pos_num_sqr;
ofstream out_pos_num_occ;
ofstream out_chrom;
ofstream out_seeds;
ifstream in_bwt;
void build_bwt();
unsigned long int len_mer;
unsigned long int read_len;
_uli orig_num_sqr;
_uli orig_num;
string genome_names;
string prefix;
//all of the below refers to size of concatenation of genome to itself
_uli binary_total;//size of binary representation of the real genome
_uli ta_size;//size of ta_genome
_uli full_size;
_uli orig_genome_size;//real size of original genome in bases
unsigned long int byte_size;
unsigned long int SIZE ;//genome char per an unsigned long int
unsigned long int two_in_Size; //log_2(SIZE)
unsigned long int MASK_SIZE;
_uli MASK_LMER;
_uli MASK_BYTE;
_uli block_64;//64*64=4096
_uli block_32;//32*32 = 1024
_uli block_256; //256*256
_uli two_in_block_64;//2^12=4096
_uli two_in_block_32;//2^10
_uli two_in_block_256;//2^16 = 256^2
_uli two_in_32;
_uli curL_bwt;//current index of BWT full size
_uli cur_ind_bwt;
_uli SEED;//24
_uli MASK_SEED;
_uli STEP;//16
_uli two_in_Step;
_uli orig_gen_inL;//index of original genome in bwt
_ui abyte;
_ui two_bytes;
_ui three_bytes;
_ui four_bytes;
_ui five_bytes;
_ui six_bytes;
_ui seven_bytes;
_ui two_in_byte;
_ui num_chroms;
_uli hash_table_size;
_uli total_components;
_uli genome_byte_size;
void print();
vector< vector<_usi> > num_occ_bytes;//hash table for the number of 1s in a byte
//first entry of hash table might be hash table huge, sort separately
vector< _uli > ends_ranges_of_As;
vector< _uli > lengths;//similar to bucket components, holds lengths of regions of As at the beg of suffixes
vector< _uli > ht_sizes_lengths;//holds number of suffixes with As by lengths subdivided into 2^16 entries
void sort_first_entry();
void collect_bucket_indices_first(_ui st, _ui en, _ui asize, vector<_uli> &count_lengths);
void collect_indices_first_eos(_uli &count_eos);//end of suffix char
_uli small_ht_size;
_uli shift_ht_lengths;
_uli two_in_half_word;//half_word = 32, two in it = 5
_uli full_binary_total;
void count_char(_uli kmer, vector<_usi> & characters, int asize);
_uli pos_indices_total;
void space_release();
_uli half_Size;
_uli half_SEED;
_uli max_range_As;
};
Anchor::~Anchor(){
delete [] bwt_genome;
out_bwt.close();
out_num_occ.close();
out_num_sqr.close();
out_pos_strand.close();
out_ta_genome.close();
out_cg_genome.close();
out_ns_genome.close();
out_pos_indices.close();
out_chrom.close();
out_pos_num_occ.close();
out_pos_num_sqr.close();
out_bwt_marked.close();
if(ht_indices != 0)
delete [] ht_indices;
if(ns_genome != 0)
delete [] ns_genome;
if(ta_genome != 0)
delete [] ta_genome;
if(cg_genome != 0)
delete [] cg_genome;
if(full_genome != 0)
delete [] full_genome;
}
void Anchor::space_release()
{
out_bwt.close();
out_num_occ.close();
out_num_sqr.close();
out_pos_strand.close();
out_ta_genome.close();
out_cg_genome.close();
out_ns_genome.close();
out_pos_indices.close();
out_chrom.close();
out_pos_num_occ.close();
out_pos_num_sqr.close();
out_bwt_marked.close();
if(ht_indices != 0)
delete [] ht_indices;
ht_indices = 0;
if(ns_genome != 0)
delete [] ns_genome;
ns_genome = 0;
if(ta_genome != 0)
delete [] ta_genome;
ta_genome = 0;
if(cg_genome != 0)
delete [] cg_genome;
cg_genome = 0;
if(full_genome != 0)
delete [] full_genome;
full_genome = 0;
if(bwt_genome != 0)
delete [] bwt_genome;
bwt_genome = 0;
}//space_release
Anchor::Anchor(int argc, char* argv[])
{
half_SEED = 12;
pos_indices_total = 0;
STEP = 16;
SIZE = 64;//genome char per an unsigned long int
half_Size = SIZE >> 1;
make_mask();
int res = parse_options(argc, argv);
if((STEP > 1) && ((STEP & mask[1]) != 0))
STEP &= ~(mask[1]);
if(STEP > 32)
STEP = 32;
if(STEP == 0 || STEP == 1)
STEP = 2;//does not support STEP = 1: some count will be off (number of occurrences or positions)
cout << "STEP = " << STEP << endl;
if(res == -1){
usage();
space_release();
exit(0);
}
if(prefix.empty()){
usage();
space_release();
exit(0);
}
string genome_repres("CT");
string rem = (prefix[prefix.length() - 1] == '/' ? "" : "/");
string output_ta_genome = prefix + rem + "ta.ta";
string output_cg_genome = prefix + rem + "cg.cg";
string output_pos_strand = prefix + rem + "pos_strand.txt";//nova 1bit per base: 0 for 1st half of genome (- strand), 1 for 2nd half of genome with Reverse(original strand) (+ strand)
string output_chrom = prefix + rem + genome_repres + ".chr";
string output_pos_indices = prefix + rem + genome_repres + ".pos_ind";
string output_pos_num_sqr = prefix + rem + genome_repres + ".pos_num_sqr";
string output_pos_num_occ = prefix + rem + genome_repres + ".pos_num_occ";
string output_bwt_marked = prefix + rem + genome_repres + ".bwt_marked";//marks 1-bit bwt entries that have genome pos stored
string output_ns_genome = prefix + rem + genome_repres + ".ns";//for each 1-bit bwt entry it marks 1 if 24-SEED prefix of this suffix contains N
string output_num_occ = prefix + rem + genome_repres + ".no";//charactor count in each 64-bit word
string output_num_sqr = prefix + rem + genome_repres + ".nosqr";//character count in square blocks: sum of counts in 0...i-1 (64*64)blocks
string output_bwt = prefix + rem + genome_repres + ".bwt";
string output_seeds = prefix + rem + genome_repres + ".seeds";
out_bwt.open(output_bwt.c_str(), ios::out);
out_num_occ.open(output_num_occ.c_str(), ios::out);
out_num_sqr.open(output_num_sqr.c_str(), ios::out);
out_seeds.open(output_seeds.c_str(), ios::out);
out_ns_genome.open(output_ns_genome.c_str(), ios::out);
out_pos_indices.open(output_pos_indices.c_str(), ios::out);
out_pos_num_sqr.open(output_pos_num_sqr.c_str(), ios::out);
out_pos_num_occ.open(output_pos_num_occ.c_str(), ios::out);
out_bwt_marked.open(output_bwt_marked.c_str(), ios::out);
out_chrom.open(output_chrom.c_str(), ios::out);
out_ta_genome.open(output_ta_genome.c_str(), ios::out);
out_cg_genome.open(output_cg_genome.c_str(), ios::out);
out_pos_strand.open(output_pos_strand.c_str(), ios::out);//nova
total_components = 10000;//stack has limitation on sgi-2 it is 42000, total stack depth = total_comp * log(binary_total);
unsigned long int one = 1;
orig_gen_inL = 0;
abyte = 8;
two_in_Size = log(static_cast<float>(SIZE))/log(static_cast<float>(2));
MASK_SIZE = (one << two_in_Size) - 1;
MASK_BYTE = (one << abyte) - 1;
two_in_32 = 5;
two_bytes = 16;
three_bytes = 24;
four_bytes = 32;
five_bytes = 40;
six_bytes = 48;
seven_bytes = 56;
byte_size = 8;//byte size is characters(genome) per a byte
two_in_byte = 3;//log_2(byte_size)
two_in_half_word = 5;
SEED = 24;
len_mer = SEED;
MASK_SEED = (one << 24) -1;
MASK_LMER = MASK_SEED;
block_64 = 4096;
block_32 = 1024;
two_in_block_64 = 12;
two_in_block_32 = 10;
two_in_Step = log(static_cast<float>(STEP))/log(static_cast<float>(2));//2^4 = 16
unsigned long int ta_ind = 0;
float base = 2;
float apow = static_cast<float>(SEED);
hash_table_size = static_cast<_ui>(pow(base, apow));
ifstream in;
in.open(genome_names.c_str(), ios::in);
if(!in){
cout << "ERROR: cannot open " << genome_names << endl;
space_release();
exit(0);
}
string aname;
num_chroms = 0;
in >> aname;
while(!in.eof()){
if(aname.length() == 0){
cout << "\nERROR: No input in the file " << genome_names << endl;
space_release();
exit(0);
}
if(aname[0] == '>' || aname[0] == '@'){
cout << "\nERROR: " << genome_names << " must contain the names of the files with references." << endl;
space_release();
exit(0);
}
if(aname[aname.size() - 1] == '\n' || aname[aname.size() - 1] == '\r')//nova AWS treats end of line char as a valid character
aname.erase(aname.size() - 1);
names.push_back(aname);
num_chroms++;
in >> aname;
}//while
in.close(); in.clear();
string line;
//find out sizes of chromosomes (or references)
// ta_genome.resize(1);
vector<string> chrom_refs_names;
_uli cur_size;
for(_ui u = 0; u < num_chroms; u++){
in.open(names[u].c_str(), ios::in);
if(!in){
cout << "ERROR: cannot open " << names[u] << endl;
space_release();
exit(0);
}
getline(in, line);
if(line.length() == 0){
cout << "\nERROR: No input in the file " << names[u] << endl;
space_release();
exit(0);
}
if(line[0] != '>' ){
cout << "\nERROR: " << names[u] << " must be in FASTA format." << endl;
space_release();
exit(0);
}
string achrom;
istringstream istr(line.substr(1, line.length() -1));
istr >> achrom;
chroms_names.insert(make_pair(u, achrom));
chrom_refs_names.push_back(achrom);
cur_size = 0;
getline(in, line);
while(!in.eof()){
long int line_len = line.length();
char last_char = line[line_len - 1] ;
if(last_char == '\n' || last_char == '\r')//nova changed because AWS reads in EOL character
line_len--;
cur_size += line_len;
getline(in, line);
}//while
size_chrom.push_back(cur_size);//in char/bits one char per bit
_uli new_size = (cur_size >> two_in_Size) ;// =/(8*64) 1pb takes 1 bit, and 1 long takes 64 bits
if((cur_size & mask[two_in_Size]) > 0)
new_size += 1;
chroms_starts.push_back(new_size);
in.close(); in.clear();
}//for u
_ui total_chroms = num_chroms;
num_chroms <<= 1;// multiply by 2
for(int j = chrom_refs_names.size() - 1; j >= 0; j--){
string achrom = chrom_refs_names[j];
chroms_names.insert(make_pair(total_chroms, achrom));
total_chroms++;
}//for j since we have concatenation of genome to itself in this order
//nova: since we concatenate two strands S_complement and S_reverse, where S is original string
//chroms will be in order 1,2,3,3,2,1
_uli last_chr = size_chrom.size() - 1;//index
_uli size_of_orig_genome = 0;
for(int j = last_chr; j >= 0; j--){
size_of_orig_genome += size_chrom[j];
size_chrom.push_back(size_chrom[j]);
chroms_starts.push_back(chroms_starts[j]);
}//for j
orig_genome_size = size_of_orig_genome;//nova in bases: we use this to convert genomic positions within
//a single genome version (ConcatenationOf(genome) or Reverse(genome)) to genomic position within entire Concatenated genome
_uli cur_total = 0;
_uli starts_total = 0;
starts_size_chrom.push_back(cur_total);
starts_chroms_starts.push_back(starts_total);
_ui i = 1;
for(i = 1; i < size_chrom.size(); i++){//nova size_chrom contains 1,2,3,...,n,n,...,3,2,1 chroms
cur_total += size_chrom[i - 1];
starts_size_chrom.push_back(cur_total);
starts_chroms_starts.push_back(starts_size_chrom[i] >> two_in_Size);
}
binary_total = cur_total + size_chrom[i - 1];//binary size of genome
starts_chroms_starts.push_back(binary_total >> two_in_Size);//the end of the last chrom
cur_total = (binary_total >> two_in_Size) + 1 ;//index of bucket plus 1 to get size, + 1 extra
ta_size = cur_total;
full_binary_total = binary_total << 1;
full_size = ((binary_total << 1) >> two_in_Size) + 1;
cout << "References sizes are calculated" << endl;
genome_byte_size = ((binary_total + 1) >> two_in_byte) + 1;
ta_genome = new unsigned long int[cur_total];
assert(ta_genome != 0);
cg_genome = new unsigned long int[cur_total];
assert(cg_genome != 0);
ns_genome = new unsigned long int[cur_total];
assert(ns_genome != 0);
full_genome = new unsigned long int[full_size];
assert(full_genome != 0);
cout << "Space is allocated for genome representation" << endl;
ht_sizes.resize(hash_table_size);
_usi alphabet_size = 4;
vector<_usi> dum(alphabet_size, 0);
num_occ.resize(SIZE, dum);
num_occ_sqr.resize(alphabet_size, 0);
num_occ_sqr_sum.resize(alphabet_size, 0);
C.resize(alphabet_size, 0);
int fast_size = static_cast<int>(pow(static_cast<double>(2), static_cast<double>(STEP)));
block_256 = pow(2.0, 16.0);
two_in_block_256 = 16;
pos_indices.resize(block_256, 0);
pos_num_sqr = 0;
pos_num_occ_sqr_sum = 0;
pos_num_occ.resize(SIZE, 0);
bwt_marked.resize(block_32, 0);
pos_strand.resize(block_32, 0);
pos_indices_ind = 0;
ht_indices = 0;
bwt.resize(block_32);//64*64
marked_ns.resize(block_32);
for(i = 0; i < block_32; i++)
{ bwt[i] = 0;
marked_ns[i] = 0;
}//for i
curL_bwt = 0;
cur_ind_bwt = 0;
int apow_seed = 16;
_uli hash_small_size = pow(2.0, apow_seed + 0.0);
num_occ_bytes.resize(hash_small_size);//2^SEED
for(_uli i = 0; i < hash_small_size; i++){
vector< _usi > characters(alphabet_size, 0);
count_char(i, characters, apow_seed);
num_occ_bytes[i] = characters;
}//for
small_ht_size = pow(2.0, 24.0);
ht_sizes_lengths.resize(small_ht_size, 0);
cout << "Initialization is completed" << endl;
}//Anchor()
void Anchor::count_char(_uli kmer, vector<_usi> & characters, int asize){
for(int i = 0; i < asize; i += 2){
_uli cur_char = kmer & mask[2];
characters[cur_char]++;
kmer >>= 2;
}
}//count_char
void Anchor::print()
{
for(_uli i = 0; i < ta_size; i++){
_uli cur = cg_genome[i];
// bitset<64> z(cur);
// cout << z << endl;
}
}//for test only
void Anchor::build_bwt()
{
//construct ta-, cg- and ns-genomes
read_genome();//counts ta, cg, ns and ht_sizes (Hash Table holds # of 24-bit mers in full genome), full_genome
cout << "Genome is read " << endl;
_uli cur_bwt_char = get_char(binary_total - 1);
bwt[0] = cur_bwt_char;
_uli no_ind = (curL_bwt & mask[two_in_block_64]) >> two_in_Size;//index of one of 64 sub-buckets inside 4096 bucket
num_occ_sqr[cur_bwt_char]++;
num_occ[no_ind][cur_bwt_char]++;
curL_bwt = 1;
//write to files cg- and ns-genomes
flash_genome(out_cg_genome, cg_genome, ta_size);
flash_genome(out_ta_genome, ta_genome, ta_size);
//WHY flash_genome on ns_genome is not done here? Answer: because we use it
//If a seed at genome pos has an N char, it is not used (in mapping)
//free space for cg- and ns-genomes
delete [] cg_genome;
cg_genome = 0;
delete [] ta_genome;
ta_genome = 0;
//collect ranges of indices of HashTable representing buckets of size N/32, where N is genome size
//first entry of hash table has to be processed separately
//NOVA_DEBUG: using total_components = ta_size might cause problem of stack overflow
// total_components = ta_size ;//sort_first_entry uses quick_multikeys conditioning on total_components
// sort_first_entry();
total_components = 5000;
//cout << "first entry is sorted" << endl;
//now start from pos 1 not 0, because we have processed entry 0 already
collect_ranges(0, hash_table_size - 1, two_in_32, ranges, ranges_total, ht_sizes);
_ui ranges_size = ranges.size();
for(_ui i = 0; i < ranges_size; i++){
collect_bucket_indices(ranges[i].first, ranges[i].second, ranges_total[i]);
//subdivide indices into sub buckets of size N/(32*32)
vector< pair<_ui, _ui> > temp_buckets;
vector<_ui> temp_sum;
collect_ranges(ranges[i].first, ranges[i].second, two_in_block_64, temp_buckets, temp_sum, ht_sizes);
collect_subbuckets(temp_sum);
//for each sub-bucket
_ui sub_size = sub_bucket.size();
for(_ui j = 0; j < sub_size; j++){
//initialize bucket components and bc-indices
init_comp_indices(sub_bucket[j].first, sub_bucket[j].second, sub_bucket_sum[j]);
//sort indices in each subrange
quick_multikeys(0, bc_indices.begin(), (bc_indices.end() -1) , sub_bucket[j].first, total_components);
//sort not sorted until everything is sorted
_uli not_sorted_size = not_sorted.size();
_uli not_sorted_count = 0;
total_components += total_components;//*2
_uli count_total_ta_size = 0;
//DEBUG
cout << "quick_multikeys for j= " << j << " of " << sub_size << ", i = " << i << " of "
<< ranges_size << ", not_sorted size = " << not_sorted_size << endl;
while(!not_sorted.empty()){
pair< pair<_uli_iter, _uli_iter>, pair<_ui, _ui> > cur_pair = not_sorted.front();
not_sorted.pop();
quick_multikeys(cur_pair.second.first, cur_pair.first.first, cur_pair.first.second, cur_pair.second.second, total_components);
not_sorted_count++;
//each of ranges in not_sorted has been processed with next value of total_components
//so increment total_components
if(not_sorted_count == not_sorted_size){
total_components += min(total_components + 5000, full_size );
not_sorted_size = not_sorted.size();
not_sorted_count = 0;
}//
if(total_components == full_size)
count_total_ta_size++;
if(count_total_ta_size > 1)
break;
}//while
total_components = 5000;
//update bwt, num_occ, pos_indices
update_bwt(sub_bucket[j].first);
}//for j
}//for i
cout << "Sorting is complete" << endl;
//check if the there is something needed to flash in bwt, num_occ and pos_indices
finish_update_bwt(sub_bucket[sub_bucket.size() - 1].first);
//output last genome: we don't need to output full genome
//finish additional infor output
output_info();
//DEBUG
cout << "output_info is done " << endl;
precalculate_seeds();
//DEBUG
cout << "precalculated_seeds are done" << endl;
}//build_bwt
void Anchor::update_bwt(_ui sub_bucket_st)
{
_uli azero = 0;
_uli asize = bc_indices.size();
for(_uli i = 0; i < asize; i++){
_uli ht_ind = bc_indices[i] + sub_bucket_st;
_uli gen_ind = ht_indices[ht_ind];//real concatenated genome index
_uli cur_bwt_char = get_char(gen_ind - 1);//prev char
marked_ns[cur_ind_bwt] <<= 1;
bwt_marked[cur_ind_bwt] <<= 1;
pos_strand[cur_ind_bwt] <<= 1;
_uli start_lmer = get_ns_lmer(gen_ind, SEED);//if 'N' in genome at this pos within 12char
if(start_lmer > 0)
marked_ns[cur_ind_bwt] |= 1; //if 'N' in genome at this pos within 12char
if(gen_ind > 0){
bwt[cur_ind_bwt] = (bwt[cur_ind_bwt] << 2) | cur_bwt_char;
//update no_occ
_uli no_ind = (curL_bwt & mask[two_in_block_64]) >> two_in_Size;//index of one of 64 sub-buckets inside 4096 bucket
num_occ_sqr[cur_bwt_char]++;
num_occ[no_ind][cur_bwt_char]++;
}//gen_ind > 0
else{
bwt[cur_ind_bwt] = (bwt[cur_ind_bwt] << 2) | 0;// $ character which is 0 here
orig_gen_inL = curL_bwt;
}//else
//update pos indices
if((gen_ind & mask[two_in_Step]) == 0){//each 16-th position is saved
//check if gen_ind is in the first half of the concatenation (strand "-", corresponds to ComplementOf(OriginalGenome))
//If gen_ind is in the second half, we start pos indices from start, because
//for human genome, genomic positions can go as high as 2^32 only
//to fit genomic pos into _ui, we need to count genomic positions for the second half of genome starting with 0
//If pos_strand[i] = 1, that means that correspondin genomic position comes from second half
//We mark only positions pos_strand[i] for which we actually store genomic positions
_uli converted_gen_ind = gen_ind;
if(gen_ind >= orig_genome_size){
converted_gen_ind = gen_ind - orig_genome_size;
pos_strand[cur_ind_bwt] |= 1; //genomic position in the second half of concatenated genome
}
pos_indices[pos_indices_ind] = converted_gen_ind;
pos_indices_ind++;
pos_indices_total++;
bwt_marked[cur_ind_bwt] |= 1;
_uli no_ind = (curL_bwt & mask[two_in_block_64]) >> two_in_Size;//index of one of 64 sub-buckets inside 4096 bucket
pos_num_occ[no_ind]++;
pos_num_sqr++;
}
curL_bwt++;
if((curL_bwt & mask[two_in_block_64]) == 0){ //NOVA_NEXT
num_occ_sqr_sum[0] += num_occ_sqr[0];
num_occ_sqr_sum[1] += num_occ_sqr[1];
num_occ_sqr_sum[2] += num_occ_sqr[2];
num_occ_sqr_sum[3] += num_occ_sqr[3];
num_occ_sqr[0] = num_occ_sqr_sum[0];
num_occ_sqr[1] = num_occ_sqr_sum[1];
num_occ_sqr[2] = num_occ_sqr_sum[2];
num_occ_sqr[3] = num_occ_sqr_sum[3];
for(_ui i = 1; i < SIZE; i++){
num_occ[i][0] += num_occ[i - 1][0];
num_occ[i][1] += num_occ[i - 1][1];
num_occ[i][2] += num_occ[i - 1][2];
num_occ[i][3] += num_occ[i - 1][3];
}//for
pos_num_occ_sqr_sum += pos_num_sqr;
pos_num_sqr = pos_num_occ_sqr_sum;
for(_ui i = 1; i < SIZE; i++)
pos_num_occ[i] += pos_num_occ[i-1];
flash_num_occ(SIZE);//pos_num_sqr and pos_num_occ flashed here too
init_num_occ();
}//if
if((curL_bwt & mask[two_in_half_word]) == 0){
cur_ind_bwt++;
if(cur_ind_bwt == block_32){
flash_bwt();
flash_marked_ns();
init_marked();
cur_ind_bwt = 0;
}
}
if(pos_indices_ind == block_256){
flash_pos_indices(block_256);
init_pos_indices();
}//if
}//for
}//update_bwt
void Anchor::finish_update_bwt(_ui sub_bucket_st)
{
if((curL_bwt & mask[two_in_block_64]) > 0){
num_occ_sqr_sum[0] += num_occ_sqr[0];
num_occ_sqr_sum[1] += num_occ_sqr[1];
num_occ_sqr_sum[2] += num_occ_sqr[2];
num_occ_sqr_sum[3] += num_occ_sqr[3];
num_occ_sqr[0] = num_occ_sqr_sum[0];
num_occ_sqr[1] = num_occ_sqr_sum[1];
num_occ_sqr[2] = num_occ_sqr_sum[2];
num_occ_sqr[3] = num_occ_sqr_sum[3];
for(_ui i = 1; i < SIZE; i++){
num_occ[i][0] += num_occ[i - 1][0];
num_occ[i][1] += num_occ[i - 1][1];
num_occ[i][2] += num_occ[i - 1][2];
num_occ[i][3] += num_occ[i - 1][3];
}//for
pos_num_occ_sqr_sum += pos_num_sqr;
pos_num_sqr = pos_num_occ_sqr_sum;
for(_ui i = 1; i < SIZE; i++)
pos_num_occ[i] += pos_num_occ[i-1];
_uli cur_ind = ((curL_bwt - 1) & mask[two_in_block_64]) >> two_in_Size;//index of one of 64 sub-buckets inside 4096 bucket
flash_num_occ(cur_ind + 1);//up to not included this index
}//if
if((curL_bwt & mask[two_in_half_word]) > 0){
//curLbwt points to unused base or bit
bwt[cur_ind_bwt] <<= (SIZE - ((curL_bwt << 1 ) & mask[two_in_Size]) );//DEBUG POSSIBLE
marked_ns[cur_ind_bwt] <<= ((SIZE >> 1) - (curL_bwt & mask[two_in_half_word]));
bwt_marked[cur_ind_bwt] <<= ((SIZE >> 1) - (curL_bwt & mask[two_in_half_word]) );
pos_strand[cur_ind_bwt] <<= ((SIZE >> 1) - (curL_bwt & mask[two_in_half_word]) );
cur_ind_bwt++;
flash_bwt(); //upto and including cur_ind_bwt
flash_marked_ns();
cur_ind_bwt = 0;
}//if bwt is not full
else if((curL_bwt & mask[two_in_half_word]) == 0){
if(cur_ind_bwt > 0){
//cur_ind_bwt points to an unused 64-bit word, so just flush upto cur ind that is unused
flash_bwt(); //upto and including cur_ind_bwt
flash_marked_ns();
cur_ind_bwt = 0;
}
}//else
if(pos_indices_ind < block_256){
flash_pos_indices(pos_indices_ind);
init_pos_indices();
}//if
//output one additional SIZE-word
bwt[0] = 0;
marked_ns[0] = 0;
bwt_marked[0] = 0;
pos_strand[0] = 0;
cur_ind_bwt = 1;
flash_bwt(); //upto not including cur_ind_bwt
flash_marked_ns();
// Not efficient: flash_genome(out_pos_strand, pos_strand, ta_size);
}//finish_update_bwt
_uli Anchor::get_char(_uli ind)//
{
//index is given in 1-bit representation
if((ind << 1) > full_binary_total)
return 0;
else if((ind << 1) == full_binary_total)
return ((full_genome[0] >> (SIZE - 2)) & mask[2]);
_uli ta_ind = (ind << 1) >> two_in_Size;
_uli inside_ind = (ind << 1) & mask[two_in_Size];
_uli res = full_genome[ta_ind];
res = (res >> (SIZE - inside_ind - 2)) & mask[2];//2 for 2-bit full-genome representation
return res;
}//get_char
void Anchor::init_num_occ()
{
for(_ui i = 0; i < SIZE; i++){
num_occ[i][0] = 0;
num_occ[i][1] = 0;
num_occ[i][2] = 0;
num_occ[i][3] = 0;
}
num_occ_sqr[0] = 0;
num_occ_sqr[1] = 0;
num_occ_sqr[2] = 0;
num_occ_sqr[3] = 0;
pos_num_sqr = 0;
for(_uli i = 0; i < SIZE; i++)
pos_num_occ[i] = 0;
}//init_num_occ
//lookup8.c, by Bob Jenkins, January 4 1997, Public Domain.
void Anchor::init_pos_indices()
{
for(_ui i = 0; i < block_256; i++)
{
pos_indices[i] = 0;
}//for i
pos_indices_ind = 0;
}//init_pos_indices
void Anchor::init_marked()
{
for(_ui i = 0; i < block_32; i++){
marked_ns[i] = 0;
bwt_marked[i] = 0;
pos_strand[i] = 0;
}
}
void Anchor::make_mask(){
_uli one = 1;
mask.push_back(0);
for(_uli i = 1; i <= SIZE; i++){
_uli res = (one << i) - 1;
mask.push_back(res);