-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbatch_merge.py
88 lines (69 loc) · 2.49 KB
/
batch_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/python
# filename: batch_merge.py
###########################################################################
#
# Copyright (c) 2013 Bryan Briney. All rights reserved.
# Copyright (c) 2021 Rinat Mukhometzianov.
# @version: 1.0.0
# @author: Bryan Briney, Rinat Mukhometzianov
# @props: IgBLAST team (http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi)
# @license: MIT (http://opensource.org/licenses/MIT)
#
###########################################################################
import os
import glob
import shutil
import argparse
import pandaseq
# Command-line interface. Parsing happens at import time because concat() and
# main() read the resulting module-level `args`.
# The summary text is passed as `description`; given positionally it would be
# taken as `prog` and replace the program name in the usage line.
parser = argparse.ArgumentParser(
    description="Batch merging of paired-end reads with PANDAseq")
parser.add_argument(
    '-i', '--in', dest='input', required=True,
    help="The input directory, containing paired FASTQ files"
         " (uncompressed or gzip compressed). Required.")
parser.add_argument(
    '-o', '--out', dest='output', required=True,
    help="The output directory, will contain merged FASTA files. Required.")
parser.add_argument(
    '-n', '--nextseq', dest='nextseq', default=False, action='store_true',
    help="Use flag if run was performed on a NextSeq sequencer.")
args = parser.parse_args()
def make_direc(d):
    """Create directory *d* if it does not already exist.

    Missing parent directories are created as well, and an existing
    directory is left untouched.

    Raises:
        FileExistsError: if *d* exists but is not a directory.
    """
    # One call replaces the exists() + mkdir() pair, which could race with
    # another process and failed for nested paths whose parent was missing.
    os.makedirs(d, exist_ok=True)
def remove_direc(d):
    """Delete the directory tree rooted at *d*, including all its contents.

    Errors raised by the removal are propagated to the caller.
    """
    shutil.rmtree(d, ignore_errors=False)
def list_files(d):
    """Return the sorted paths of the files located directly inside *d*.

    Subdirectories are skipped (only entries accepted by ``os.path.isfile``
    are kept), and names starting with a dot are not matched by the ``*``
    glob pattern. A directory that does not exist yields an empty list.
    """
    # Escape d so that glob metacharacters in the directory name
    # ('[', '?', '*') are matched literally instead of acting as a pattern.
    pattern = glob.escape(d) + '/*'
    return sorted(f for f in glob.glob(pattern) if os.path.isfile(f))
def bin_files(files):
    """Group file paths by the prefix of their file name.

    The prefix is the base name with its final underscore-delimited field
    removed; a name without any underscore gets the empty string as prefix.

    Returns:
        A dict mapping each prefix to the list of paths sharing it, with
        paths kept in the order they appear in *files*.
    """
    grouped = {}
    for path in files:
        # Text before the last underscore of the file name is the bin key.
        prefix = os.path.basename(path).rpartition('_')[0]
        grouped.setdefault(prefix, []).append(path)
    return grouped
def concat(d):
    """Concatenate the files of *d* into one FASTA file per prefix.

    The files found by list_files(d) are grouped with bin_files(); each
    group is written, in order, to ``<prefix>.fasta`` inside the output
    directory taken from the module-level ``args``.
    """
    grouped = bin_files(list_files(d))
    for prefix, members in grouped.items():
        merged_path = os.path.join(args.output, '{}.fasta'.format(prefix))
        with open(merged_path, 'w') as merged:
            for member in members:
                with open(member) as part:
                    # Copy the member file line by line into the merged file.
                    merged.writelines(part)
def main():
    """Run the merge, then combine per-prefix files for NextSeq runs.

    pandaseq.run() is given the input directory, a target directory and the
    NextSeq flag. Without the flag the target is the output directory
    itself. With the flag it is a ``temp`` subdirectory whose files are
    concatenated into the output directory and which is deleted afterwards.
    """
    make_direc(args.output)

    # NextSeq runs are merged into a scratch directory first.
    merge_dir = os.path.join(args.output, 'temp') if args.nextseq else args.output
    if args.nextseq:
        make_direc(merge_dir)

    # NOTE(review): pandaseq is a project module not visible here; it
    # presumably writes the merged files into merge_dir - confirm.
    pandaseq.run(args.input, merge_dir, args.nextseq)

    if not args.nextseq:
        return

    print('')
    print('Concatenating NextSeq lane files for each sample...')
    concat(merge_dir)
    remove_direc(merge_dir)
    print('Done.')
    print('')
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()