-
Notifications
You must be signed in to change notification settings - Fork 1
/
pfpebwt
executable file
·196 lines (178 loc) · 10.7 KB
/
pfpebwt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python3
import sys, time, argparse, subprocess, os.path
Description = """
Tool to compute the eBWT and the GCA of a string collection.
"""
def main():
parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('input', help='input fasta file name', type=str)
parser.add_argument('-w', '--wsize', help='sliding window size (def. 10)', default=10, type=int)
parser.add_argument('-p', '--mod', help='hash modulus (def. 100)', default=100, type=int)
parser.add_argument('-t', help='number of helper threads (def. None)', default=0, type=int)
parser.add_argument('-n', help='number of different primes when using --reads (def. 1)', default=1, type=int)
parser.add_argument('--rle', help='store eBWT in RLE format (def. False)', action='store_true')
parser.add_argument('--samples', help='compute the GCA samples (def. False)', action='store_true')
parser.add_argument('--GCA', help='compute the complete GCA (def. False)', action='store_true')
parser.add_argument('--reads', help='process input ad a reads multiset (def. False)', action='store_true')
parser.add_argument('--remainders', help='use multiple remainders instead of multiple primes (def. False)',action='store_true')
parser.add_argument('--period', help='remove periodic sequences (def. False)', action='store_true')
parser.add_argument('--invert', help='invert the eBWT (def. False)', action='store_true')
parser.add_argument('--keep', help='keep auxiliary files (debug only)',action='store_false')
parser.add_argument('--parsing', help='stop after the parsing phase (debug only)',action='store_true')
parser.add_argument('--verbose', help='verbose (def. False)',action='store_true')
args = parser.parse_args()
dirname = os.path.dirname(os.path.abspath(__file__))
parse_exe = os.path.join(dirname, "include/PFP-algos/circpfp.x")
parseNT_exe = os.path.join(dirname, "include/PFP-algos/circpfpNT.x")
parseReads_exe = os.path.join(dirname, "include/PFP-algos/circpfpr.x")
parseReadsNT_exe = os.path.join(dirname, "include/PFP-algos/circpfprNT.x")
parseReadsD_exe = os.path.join(dirname, "include/PFP-algos/circpfpd.x")
parseReadsDNT_exe = os.path.join(dirname, "include/PFP-algos/circpfpdNT.x")
parsebwtNT_exe = os.path.join(dirname, "include/eBWT-algos/parsebwtNT.x")
parsebwtNT64_exe = os.path.join(dirname, "include/eBWT-algos/parsebwtNT64.x")
bebwtNT_exe = os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca.x")
bebwtNTp64_exe = os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca_p64.x")
bebwtNTd64_exe = os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca_d64.x")
bebwtNT64_exe = os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca_64.x")
invertNT_exe = os.path.join(dirname, "include/eBWT-algos/invertNT.x")
if args.samples or args.GCA:
parseNT_exe = os.path.join(dirname, "include/PFP-algos/circpfp-gca.x")
parsebwtNT_exe = os.path.join(dirname, "include/eBWT-algos/parsebwtgcaNT.x")
parsebwtNT64_exe = os.path.join(dirname, "include/eBWT-algos/parsebwtgcaNT64.x")
if args.reads or args.t > 0:
print("GCA computation not yet implemented for multithreading and short reads mode")
exit(1)
if args.samples and args.GCA:
args.samples = False
print("Warning --GCA flag overrides --samples flag.")
logfile_name = args.input + ".log"
# get main bigbwt directory
args.bigbwt_dir = os.path.split(sys.argv[0])[0]
print("Sending logging messages to file:", logfile_name)
with open(logfile_name,"a") as logfile:
# ---------- Parsing of the input file
start0 = start = time.time()
if args.reads:
# Input is a short sequences multiset
if args.remainders:
# Use different remainders
if args.t>0:
command = "{exe} {file} -w {wsize} -p {modulus} -t {th} -n {wnumb}".format(
exe = os.path.join(args.bigbwt_dir,parseReadsD_exe),
wsize=args.wsize, modulus = args.mod, th=args.t, wnumb=args.n, file=args.input)
else:
command = "{exe} {file} -w {wsize} -p {modulus} -n {wnumb}".format(
exe = os.path.join(args.bigbwt_dir,parseReadsDNT_exe),
wsize=args.wsize, modulus = args.mod, wnumb = args.n, file=args.input)
else:
# Use different primes
if args.t>0:
command = "{exe} {file} -w {wsize} -p {modulus} -t {th} -n {wnumb}".format(
exe = os.path.join(args.bigbwt_dir,parseReads_exe),
wsize=args.wsize, modulus = args.mod, th=args.t, wnumb=args.n, file=args.input)
else:
command = "{exe} {file} -w {wsize} -p {modulus} -n {wnumb}".format(
exe = os.path.join(args.bigbwt_dir,parseReadsNT_exe),
wsize=args.wsize, modulus = args.mod, wnumb = args.n, file=args.input)
else:
# Input is a long sequences multiset
if args.t>0:
command = "{exe} {file} -w {wsize} -p {modulus} -t {th}".format(
exe = os.path.join(args.bigbwt_dir,parse_exe),
wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input)
else:
command = "{exe} {file} -w {wsize} -p {modulus}".format(
exe = os.path.join(args.bigbwt_dir,parseNT_exe),
wsize=args.wsize, modulus = args.mod, file=args.input)
# Verbose
if args.verbose: command += " -v"
# Remove periodic sequences
if args.period: command += " -c"
print("==== Parsing. Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
if args.parsing:
# delete temporary parsing files
command = "rm -f {file}.eparse_old {file}.start {file}.offset {file}.fchar {file}.eocc".format(file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
for i in range(args.t):
command = "rm -f {file}.{i}.eparse_old {file}.{i}.offset_old ".format(file=args.input, i=i)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("==== Stopping after the parsing phase as requested")
return
# ----------- Computing the eBWT and Inverted List of the parse
start = time.time()
print("==== Computing Inverted List of parse's eBWT.")
parse_size = os.path.getsize(args.input+".eparse")/4
print("Parse contains " + str(parse_size) + " words.")
if(parse_size >= (2**32-1)):
print("IL creation running in 64 bit mode")
command = "{exe} {file} -w {wsize}".format(
exe = os.path.join(args.bigbwt_dir,parsebwtNT64_exe), wsize=args.wsize, file=args.input)
else:
print("IL creation running in 32 bit mode")
command = "{exe} {file} -w {wsize}".format(
exe = os.path.join(args.bigbwt_dir,parsebwtNT_exe), wsize=args.wsize, file=args.input)
print("Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
# ----------- Computing the eBWT of the text
start = time.time()
print("==== Computing the eBWT of the text.")
dict_size = os.path.getsize(args.input+".edict")
print("Dictionary contains " + str(dict_size) + " characters.")
if(dict_size >= (2**31-1)):
print("Dict SA running in 64 bit mode")
if(parse_size >= (2**32-1)):
command = "{exe} {file} -w {wsize}".format(
exe = os.path.join(args.bigbwt_dir,bebwtNT64_exe), wsize=args.wsize, file=args.input)
else:
command = "{exe} {file} -w {wsize}".format(
exe = os.path.join(args.bigbwt_dir,bebwtNTd64_exe), wsize=args.wsize, file=args.input)
else:
print("Dict SA running in 32 bit mode")
if(parse_size >= (2**32-1)):
command = "{exe} {file} -w {wsize}".format(
exe = os.path.join(args.bigbwt_dir,bebwtNTp64_exe), wsize=args.wsize, file=args.input)
else:
command = "{exe} {file} -w {wsize}".format(
exe = os.path.join(args.bigbwt_dir,bebwtNT_exe), wsize=args.wsize, file=args.input)
if(args.rle): command += " -r" # Activate rle
if(args.samples): command += " -s" # Activate samples computation
if(args.GCA): command += " -c" # Activate full GCA computation
print("==== Computing eBWT and GCA of the text. Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start));
print("Total construction time: {0:.4f}".format(time.time()-start0))
if args.keep:
print("Deleting auxiliary files")
command = "rm -f {file}.eparse_old {file}.offset_old {file}.eparse {file}.edict {file}.offset {file}.eocc {file}.fchar {file}.start {file}.sdsl {file}.last {file}.spos {file}.slast".format(file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
for i in range(args.t):
command = "rm -f {file}.{i}.eparse_old {file}.{i}.offset_old ".format(file=args.input, i=i)
if(execute_command(command,logfile,logfile_name)!=True):
return
if args.invert:
print("Inverting the eBWT")
command = "{exe} {file}".format(exe = os.path.join(args.bigbwt_dir,invertNT_exe), file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
# execute command: return True is everything OK, False otherwise
def execute_command(command,logfile,logfile_name,env=None):
try:
#subprocess.run(command.split(),stdout=logfile,stderr=logfile,check=True,env=env)
subprocess.check_call(command.split(),stdout=logfile,stderr=logfile,env=env)
except subprocess.CalledProcessError:
print("Error executing command line:")
print("\t"+ command)
print("Check log file: " + logfile_name)
return False
return True
if __name__ == '__main__':
main()