-
Notifications
You must be signed in to change notification settings - Fork 15
/
ri-build.cpp
156 lines (103 loc) · 3.91 KB
/
ri-build.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// Copyright (c) 2017, Nicola Prezza. All rights reserved.
// Use of this source code is governed
// by a MIT license that can be found in the LICENSE file.
#include <cassert>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include "internal/r_index.hpp"
#include "utils.hpp"
using namespace ri;
using namespace std;
// Command-line state shared by parse_args() and main().

// Prefix for the index file; left empty until -o sets it or main() falls back to the input name.
string out_basename;

// Path of the text file to index (always the last command-line argument).
string input_file;

// Not read anywhere in the visible part of this file.
int sa_rate{512};

// Forwarded to the r_index constructor; cleared by the -divsufsort option.
bool sais{true};

// Build fast index with SA rate = T (the option that sets it is currently disabled).
ulint T{0};

// Build fast index; main() writes this flag at the start of the index file.
bool fast{false};

// Use hybrid bitvectors instead of sd_vectors? (the option that sets it is currently disabled)
bool hyb{false};
/*
 * Prints the usage message to standard output and terminates the process
 * with exit status 0. This function never returns to its caller.
 */
void help(){
    // One entry per printed line; the empty entry produces the blank line after the banner.
    static const char* const usage_lines[] = {
        "ri-build: builds the r-index. Extension .ri is automatically added to output index file",
        "",
        "Usage: ri-build [options] <input_file_name>",
        " -o <basename> use 'basename' as prefix for all index files. Default: basename is the specified input_file_name",
        // Disabled options, kept for reference:
        //" -h use hybrid bitvectors instead of elias-fano in both RLBWT and predecessor structures. Important: "
        //" if the index is built with -h, need to specify -h also when locating and counting (ri-count/ri-locate). "
        //" -fast build fast index (O(occ)-time locate, O(r log(n/r)) words of space). By default, "
        //" small index is built (O(occ*log(n/r))-time locate, O(r) words of space)"
        //" -sa_rate <T> T>0. if used, build the fast index (see option -fast) storing T SA samples before and after each"
        //" BWT equal-letter run. O(r*T) words of space, O(occ(log(n/r)/T) + log(n/r))-time locate. "
        " -divsufsort use divsufsort algorithm to build the BWT (fast, 7.5n Bytes of RAM). By default,",
        " SE-SAIS is used (about 4 time slower than divsufsort, 4n Bytes of RAM).",
        " <input_file_name> input text file."
    };

    for(const char* line : usage_lines)
        std::cout << line << std::endl;

    exit(0);
}
void parse_args(char** argv, int argc, int &ptr){
assert(ptr<argc);
string s(argv[ptr]);
ptr++;
if(s.compare("-o")==0){
if(ptr>=argc-1){
cout << "Error: missing parameter after -o option." << endl;
help();
}
out_basename = string(argv[ptr]);
ptr++;
}else if(s.compare("-divsufsort")==0){
sais = false;
}/*else if(s.compare("-h")==0){
hyb=true;
}/*else if(s.compare("-fast")==0){
fast=true;
}else if(s.compare("-T")==0){
T = atoi(argv[ptr]);
if(T<=0){
cout << "Error: parameter T must be T>0" << endl;
help();
}
ptr++;
fast=true;
}*/else{
cout << "Error: unrecognized '" << s << "' option." << endl;
help();
}
}
int main(int argc, char** argv){
using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::duration;
auto t1 = high_resolution_clock::now();
//parse options
out_basename=string();
input_file=string();
int ptr = 1;
if(argc<2) help();
while(ptr<argc-1)
parse_args(argv, argc, ptr);
input_file = string(argv[ptr]);
if(out_basename.compare("")==0)
out_basename = string(input_file);
string idx_file = out_basename;
idx_file.append(".ri");
cout << "Building r-index of input file " << input_file << endl;
cout << "Index will be saved to " << idx_file << endl;
string input;
{
std::ifstream fs(input_file);
std::stringstream buffer;
buffer << fs.rdbuf();
input = buffer.str();
}
string path = string(out_basename).append(".ri");
std::ofstream out(path);
//save flag storing whether index is fast or small
out.write((char*)&fast,sizeof(fast));
if(hyb){
//auto idx = r_index<sparse_hyb_vector,rle_string_hyb>(input,sais);
//idx.serialize(out);
}else{
auto idx = r_index<>(input,sais);
idx.serialize(out);
}
auto t2 = high_resolution_clock::now();
ulint total = duration_cast<duration<double, std::ratio<1>>>(t2 - t1).count();
cout << "Build time : " << get_time(total) << endl;
out.close();
}