-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_reference_slices.py
executable file
·107 lines (94 loc) · 4.38 KB
/
extract_reference_slices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#! /usr/bin/env python
from __future__ import print_function
import argparse, gzip, pysam, re, sys
def extract_reference_slices(faf_ref, f_fasta, f_out, f_bed=None):
for coord in get_coords(f_fasta, f_bed):
key="{}:{}-{}".format(*coord)
seq=faf_ref.fetch(region=key)
print(">{}".format(key), file=f_out)
while seq:
print(seq[:60], file=f_out)
seq = seq[60:]
def get_coords(f_fasta, f_bed=None):
coords = []
numSequences =0
# read BED file if available
if f_bed:
for line in f_bed:
if line.strip():
chrom, start, end = line.split()[:3]
coords.append((chrom, int(start), int(end)))
numSequences = len([l for l in f_fasta if l.startswith('>')])
# if no BED available
else:
for name, seq, qual in readfq(f_fasta):
# split on potential delimiters
numSequences += 1
for head in re.split(r'\s+|>|[|]|@', name):
match = re.match(r'^([a-zA-Z0-9_]+):(\d+)-(\d+)$', head)
if match:
coords.append((match.groups()[0],
int(match.groups()[1]),
int(match.groups()[2])))
break
if len(coords) != numSequences:
raise Exception("Number of coordinates provided does not match number "
"of reads. Please specify coordinates in BED format "
"or behind fasta headers.")
return coords
# External code
# taken form lh3
def readfq(fp): # this is a generator function
last = None # this is a buffer keeping the last unprocessed line
while True: # mimic closure; is it a bad idea?
if not last: # the first record or a record following a fastq
for l in fp: # search for the start of the next record
if l[0] in '>@': # fasta/q header line
last = l[:-1] # save this line
break
if not last: break
name, seqs, last = last[1:], [], None # EDIT: read whole name line
for l in fp: # read the sequence
if l[0] in '@+>':
last = l[:-1]
break
seqs.append(l[:-1])
if not last or last[0] != '+': # this is a fasta record
yield name, ''.join(seqs), None # yield a fasta record
if not last: break
else: # this is a fastq record
seq, leng, seqs = ''.join(seqs), 0, []
for l in fp: # read the quality
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq): # have read enough quality
last = None
yield name, seq, ''.join(seqs); # yield a fastq record
break
if last: # reach EOF before reading enough quality
yield name, seq, None # yield a fasta record instead
break
if __name__=='__main__':
parser = argparse.ArgumentParser(description='Prepare reference slices for '
'assemblies to be visualized in maze. If you provide '
'a BED file, coordinates are taken from it in the '
'given order. Without BED, I look for a pattern of '
'chr:start-end in the fasta headers, seperated by '
'white spaces.')
parser.add_argument("-c", "--coords", metavar='BED', default=None,
type=argparse.FileType('r'), required=False,
help="Bed file with coordinates.")
parser.add_argument("-f", "--fasta", metavar='FA',
type=argparse.FileType('rb'), required=True,
help="Fasta containing assemblies/long reads.")
parser.add_argument("-r", "--ref", metavar='FA',
type=pysam.FastaFile, required=True,
help='Reference genome; index required.')
parser.add_argument("-o", "--out", metavar='FA', default='-',
type=argparse.FileType('w'), required=False,
help='Output file.')
args = parser.parse_args()
# .gz support for fasta
if args.fasta != sys.stdin and args.fasta.name.endswith(".gz"):
args.fasta = gzip.open(args.fasta.name, 'r')
extract_reference_slices(args.ref, args.fasta, args.out, args.coords)