-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpandaseq.py
91 lines (71 loc) · 2.62 KB
/
pandaseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/python
# filename: pandaseq.py
###########################################################################
#
# Copyright (c) 2013 Bryan Briney. All rights reserved.
# Copyright (c) 2021 Rinat Mukhometzianov
# @version: 1.1.0
# @author: Bryan Briney, Rinat Mukhometzianov
# @props: PANDAseq team (github.com/neufeld/pandaseq)
# @cite: Andre P Masella, Andrea K Bartram, Jakub M Truszkowski, Daniel G Brown and Josh D Neufeld
# PANDAseq: paired-end assembler for illumina sequences.
# BMC Bioinformatics 2012, 13:31.
# www.biomedcentral.com/1471-2105/13/31
# @license: MIT (http://opensource.org/licenses/MIT)
#
###########################################################################
import os
import glob
import subprocess as sp
from multiprocessing import cpu_count
def list_files(d):
return sorted([f for f in glob.glob(d + '/*') if os.path.isfile(f)])
def pair_files(files, nextseq):
pairs = {}
for f in files:
if nextseq:
f_prefix = '_'.join(os.path.basename(f).split('_')[:3])
else:
f_prefix = '_'.join(os.path.basename(f).split('_')[:2])
if f_prefix in pairs:
pairs[f_prefix].append(f)
else:
pairs[f_prefix] = [f, ]
return pairs
def batch_pandaseq(f, r, o):
cmd = 'pandaseq -f {0} -r {1} -d rbfkms -T {3} -w {2}'.format(f, r, o, cpu_count())
sp.Popen(cmd, shell=True, stderr=sp.STDOUT, stdout=sp.PIPE).communicate()
def merge_reads(files, output, nextseq, i):
files.sort()
f = files[0]
r = files[1]
if nextseq:
lane = os.path.basename(f).split('_')[-3]
sample_id = os.path.basename(f).split('_')[0]
sample = sample_id + '_' + lane
else:
sample = os.path.basename(f).split('_')[0]
print_sample_info(i, sample)
o = os.path.join(output, '{}.fasta'.format(sample))
batch_pandaseq(f, r, o)
def print_start_info():
print('')
print('')
print('========================================')
print('Merging reads with PANDAseq')
print('========================================')
print('')
def print_input_info(files):
print('The input directory contains {} pair(s) of files to be merged.\n'.format(len(files) / 2))
def print_sample_info(i, sample):
print('[ {} ] Processing sample {}'.format(str(i), sample))
def print_sample_end():
print('Done.\n')
def run(f_input, output, nextseq):
print_start_info()
files = list_files(f_input)
print_input_info(files)
pairs = pair_files(files, nextseq)
for i, pair in enumerate(sorted(pairs.keys())):
if len(pairs[pair]) == 2:
merge_reads(pairs[pair], output, nextseq, i)