-
Notifications
You must be signed in to change notification settings - Fork 3
/
vcfSNPrate2circosLine.py.untested
executable file
·310 lines (280 loc) · 11 KB
/
vcfSNPrate2circosLine.py.untested
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/usr/bin/env python3
import sys
def help():
print('''
usage:
vcf2track.py -vcf <file> [-ref <file> -binsize <int>]
description:
If -ref is not used, output is a Circos line track of SNP rate
Then scaffold lengths are obtained from the vcf and used to
calculate SNP rate. e.g. if the VCF represents mapped
genomic reads.
If -ref is used, only the features specified in the -ref
GFF3 file are used. e.g. if the VCF represents mapped
transcript reads.
options:
-binsize <int> If not using -ref, bin size. Default 1000 (1kb)
-ref GFF3 of features to calculate SNP rate
-noindels Don't count indels
-nosnps Don't count snps
''', file=sys.stderr)
sys.exit()
class GFF3_line:
"""A class to represet GFF3 lines and allow modification of the
values of its fields.
Attributes:
------------
field0, ..., field8 strings containing the values of each field
in a GFF3 line
attributes a dictionary containing the key-value pairs
in the GFF3 line 9th field
Methods:
------------
str() Outputs GFF3 line
repr() Outputs GFF3 line
refreshAttrStr() This needs to be called if changes were made to
any of the attributes. It refreshes
"""
def __init__(self, line):
"""GFF3_line is initialized to contain the fields of the GFF3
line provided as attributes. The attributes are kept in a
dictionary and a the keys are ordered in a list to preserve
the order of attributes upon getting the sring representation
of the line from the GFF3_line object.
"""
(self.seqid,
self.source,
self.type,
self.start,
self.end,
self.score,
self.strand,
self.phase,
self.attributes_str) = line.strip().split('\t')
self.length = self.end - self.start + 1
# preserve attribute order as a list of keys (attributes_order)
attributes_list = self.attributes_str.split(';')
self.attributes_order = [attr.split('=')[0] for attr in
attributes_list]
# store attribute keys and their values in a dictionary
self.attributes = {attr.split('=')[0]:attr.split('=')[1] for attr in
attributes_list}
# rename the name attribute key to Name so it conforms to the
# GFF3 specification, where Name is a reserved attribute key
if 'name' in self.attributes:
self.attributes['Name'] = self.attributes.pop('name')
self.attributes_order[self.attributes_order.index('name')] = 'Name'
def __repr__(self):
"""Output for overloaded functions str() and repr()"""
return '\t'.join([str(self.seqid),
str(self.source),
str(self.type),
str(self.start),
str(self.end),
str(self.score),
str(self.strand),
str(self.phase),
str(self.attributes_str)])
def addAttr(self, attr_key, attr_val, replace=False, attr_pos=0):
"""
Add key-value pair to the GFF3 attribute column, at the
beginning of the list by default. If the key exists then the
value is concatenated with the current value separated by a
comma, unless replace=True in which case the existing value is
replaced.
"""
if attr_key in self.attributes:
# replace the current value of this key with then new value
if replace:
delAttr(attr_key)
self.attributes[attr_key] = attr_val
self.attributes_order.insert(attr_pos, attr_key)
self.refreshAttrStr()
# this key already exists in the attributes of the GFF3 file
# add this value to the existing key's values with commas
# separating values
else:
self.attributes[attr_key] = '{0},{1}'.format(
self.attributes[attr_key],
attr_val)
self.refreshAttrStr()
else:
self.attributes[attr_key] = attr_val
self.attributes_order.insert(attr_pos, attr_key)
self.refreshAttrStr()
def delAttr(self, attr_key):
"""Deletes attribute."""
del self.attributes[attr_key]
self.attributes_order.pop(self.attributes_order.index(attr_key))
self.refreshAttrStr()
def refreshAttrStr(self):
"""If the attributes dictionary or attributes_order has been
altered this should be called to update attributes_str.
"""
self.attributes_str = ';'.join(['='.join(
[attr, self.attributes[attr]]) for attr in self.attributes_order])
def genbincoords(n, binsize):
"""
Returns the coordinates for the nth bins of size binsize
"""
return (n*binsize, (n+1)*binsize-1)
def getbins(length, binsize):
"""
Returns all the bins (including last small bin)
"""
if length // binsize == length / binsize:
return tuple([genbincoords(i, binsize) for i in range(length // binsize)])
else:
return tuple([genbincoords(i, binsize)
for i in range(length // binsize)]
+ [(genbincoords(length // binsize, binsize)[0], length)])
def read_ref_coords(gff3):
"""
reads all features and returns a dict:
{scaf:{(start,end):gene_name}}
"""
dct = {}
by_name = {}
with open(gff3, 'r') as inFl:
for line in inFl:
if line.startswith('#'):
continue
g = GFF3_line(line)
if g.seqid in dct:
dct[g.seqid][g.coords] = g.attributes['Name']
else:
dct[g.seqid] = {g.coords:g.attributes['Name']}
by_name[g.attributes['Name']] = g
return dct, by_name
def Overlaps(j, k):
"""
Inputs, j and k, are tuples/lists with a start and a end coordinate:
(start, end). If they overlap True is returned, if they do not
overlap, False is returned.
"""
j = sorted([int(i) for i in j])
k = sorted([int(i) for i in k])
jk = sorted([j,k], key=lambda x:x[0])
if jk[0][1] >= jk[1][0]:
return True
else:
return False
def vcfref2snprate(vcf, ref, transformation='length', snps=True, indels=False):
"""
Takes a VCF file and a GFF3 file ref whose features' coordinates
are represented in the VCF file as input and outputs a four-column
file:
geneName length SNPs SNPs/length
The value of the attribute key Name in the GFF3 is used as the gene
name.
"""
# read the reference gff3 into a dicionary of the coords (Ref) and
# a dictionary containing the full gff3 lines
Ref, gff3 = read_ref_coords(ref)
snps = {}
# for each line in the vcf file,
with open(vcf, 'r') as inFl:
for line in inFl:
# Read in contig lengths
if line.startswith('#'):
continue
line = line.strip().split('\t')
contig = line[0]
pos = int(line[1])
typ = line[7].split(';')[0]
# Do not/count SNPs or indels
if typ == 'INDEL':
if not indels:
continue
else:
if not snps:
continue
if contig not in Ref:
continue
for coord in Ref[contig]:
if Overlaps(coord, (pos,pos)):
gene_name = Ref[contig][coord]
if gene_name in snps:
snps[gene_name] += 1
else:
snps[gene_name] = 1
print('gene\tgene_length\tSNPs\tSNPs/length')
for gene_name in snps:
length, SNPs, ratio = [gff3[gene_name].length,
snps[gene_name],
snps[gene_name] / gff3[gene_name].length]
print('{0}\t{1}\t{2}\t{3:.5f}'.format(gene_name, length, SNPs, ratio))
def vcf2heatmap(vcf, binsize=1000, snps=True, indels=False):
"""
Reads VCF file and outputs rate of SNPs per <binsize> to stdout
"""
contig_lengths = {}
snps = {}
with open(vcf, 'r') as inFl:
last_contig = None
for line in inFl:
# Read in contig lengths
if line.startswith('#'):
if line.startswith('##contig='):
##contig=<ID=MDC000001.73,length=2214>
info = line.strip().split('contig=<')[1].rstrip('>').split(
',')
contig = info[0].split('=')[-1]
length = int(info[1].split('=')[-1])
contig_lengths[contig] = length
continue
else:
continue
# Count SNPs per bin
line = line.strip().split('\t')
contig = line[0]
pos = int(line[1])
typ = line[7].split(';')[0]
# Do not/count SNPs or indels
if typ == 'INDEL':
if not indels:
continue
else:
if not snps:
continue
bin_no = pos//binsize
if contig in snps:
if bin_no in snps[contig]:
snps[contig][bin_no] += 1
else:
snps[contig][bin_no] = 1
else:
snps[contig] = {bin_no:1}
for contig in snps:
for i, coords in enumerate(getbins(contig_lengths[contig], binsize)):
try:
snp_rate = snps[contig][i]/(coords[1]-coords[0]+1)
except KeyError:
snp_rate = 0
print('{0}\t{1}\t{2}\t{3}'.format(contig,
coords[0],
coords[1],
snp_rate))
if __name__ == '__main__':
args = sys.argv
if '-vcf' not in args or len(args) < 3:
help()
if '-noindels' in args:
indels = False
else:
indels = True
if '-nosnps' in args:
snps = False
else:
snps = True
if '-binsize' in args:
binsize = int(args[args.index('-binsize')+1])
else:
binsize = 1000
if '-ref' in args:
refPth = args[args.index('-ref')+1]
vcfPth = args[args.index('-vcf')+1]
if '-ref' in args:
vcfref2snprate(vcfPth, refPth, snps=snps, indels=indels)
else:
vcf2heatmap(vcfPth, binsize=binsize, snps=snps, indels=indels)