-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathExACto0.4.py
585 lines (567 loc) · 38.8 KB
/
ExACto0.4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
#!/usr/bin/env python3
'''
Exome aggregation consortium annotation tool, Version 0.3
This program is designed to use the ExAC VCF data from the broad institute (downloadable from the link below).
ftp://ftp.broadinstitute.org/pub/ExAC_release
You will first need to run the library generation function using the commandline argument -s filename.vcf
After that, you will be able to annotate your variants using the commandline argument -f filename.txt
This version has not yet been tested against an ANNOVAR output, so it will probably encounter errors there.
Copyright 2014, Michael Weinstein, Cohn Lab, UCLA. This is free to use for noncommercial entities, but please
let me know that you are using if I don't already know. [email protected]
'''
def checkargs(): #subroutine for validating commandline arguments
import argparse #loads the required library for reading the commandline
import os #imports the library we will need to check if the file exists
file = False
job = False
parser = argparse.ArgumentParser()
parser.add_argument ("-f", "--file", help = "Specify the desired file to annotate for submission.") #tells the parser to look for -f and stuff after it and call that the filename
parser.add_argument ("-s", "--split", help = "Specify a VCF to split into a subVCF library for use as a reference.")
args = parser.parse_args() #puts the arguments into the args object
if not (args.file or args.split): #if the args.file value is null, give an error message and quit the program
usage("No file specified for annotation or splitting.")
quit()
elif (args.file and args.split):
quit('You must specify either a file to split or a file to annotate. You cannot specify both on a single run. Please try again with one at a time.\nGoodbye.')
elif args.file:
file = args.file
job = 'annotate'
if not os.path.isfile(file):
quit('Specified file for annotation does not exist')
if os.path.isfile(file + 'ExACtoed.txt'):
quit('Specified file for annotation already has an existing annotation from this program. Please move or delete it, then run again.')
elif args.split:
file = args.split
job = 'librarysplit'
if not os.path.isfile(file):
quit('Specified VCF for the library split does not exist.')
if os.path.exists('subvcfs'):
quit('Library directory already exists. Please remove the old one before creating a new one')
return (file, job)
def usage(sin): #This subroutine prints directions
print ('Error: ' + sin)
print ('This program will annotate a SeattleSeq or ANNOVAR (not yet tested) output and annotate the variants based upon the Exome Aggregation Consortium\'s VCF data. Before your first annotation run, you will need to download the current ExAC VCF and run the library generation on it using the commandline argument -s ExACVCF.vcf. Once that is complete, you can annotate your outputs using the commandline argument -f yourfile.txt.')
def directoryhash(directory):
import os #the library needed to look at files and directories
import hashlib #the library needed to create an MD5 hash of the directory
try:
allfiles = os.listdir(directory) #gets a list of all the files in the directory
filestring = ''.join(allfiles) #turns the list of files into a single string
filestringhash = hashlib.md5(rawbytes(filestring)) #turns the string of filenames into an MD5 hash
except: #if something gives an error here
return False #return a value of false instead of returning the hash string to let the program know it did not work
return filestringhash
def rawbytes(stringin): #subroutine to take a string and make it into UTF-8 bytes, often for sending via socket connection
return bytes(stringin, 'utf-8')
def checkintegrity(directory):
try:
hashfile = open(directory + '/hashsum', 'r')
recordedhash = (hashfile.readline()).strip('\n')
hashfile.close()
if recordedhash and recordedhash == (directoryhash(directory)).hexdigest():
return True
except:
return False
else:
return False
def createintegrity(directory):
try:
hashfile = open(directory + '/hashsum', 'w') #opens a file to save the recorded hash at time of creation
checksum = directoryhash(directory).hexdigest() #creates a hash of the directory filenames and saves it as checksum
if not checksum: #if we did not get back a checksum value (because the subroutine failed)
return False #return false to let the program know it did not work
hashfile.write(checksum) #otherwise, we write the checksum to a file
hashfile.close() #and then close it
except: #if the process fails in some unexpected way
return False #return false to let the rest of the program know that
return True #otherwise return true to indicate success
def librarysplit(filename): #subroutine for generating the subvcf library to speed searches
import re #the library we need to use regular expressions
import os #the library we need to look for files and directories
if os.path.exists('subvcfs'): #check if there is already a subvcfs directory, if so, we do not want to overwrite it
usage('Library directory already exists. Please remove the old one before creating a new one')
quit()
try:
os.makedirs('subvcfs') #actually creates the new directory
except PermissionError: #if it is trying to create a directory and lacks the permission, we give an error message and quit
print ('\nPermission to create the library directory was denied. Please run this program with appropriate permissions or create it using your file manager.')
quit()
except: #if we get some other error, give a message and quit
print ('\nUnable to make the library directory.')
quit()
lastchromosome = 0 #initializing a bunch of values here
linecount = 0
libraryfile = False
lastpositionblock = 0
try: #see if we can open the exac VCF and read a line from it
exac = open(filename, 'r')
line = exac.readline()
except: #otherwise, if we get an error, we display an error message and quit
quit('Error reading from ExAC VCF')
while line: #while line is not empty (indiciating we have hit the end of the file)
linecount += 1 #increment the linecount variable
if re.match('^\#', line): #check if the line starts with a hashtag, indicating that it is not a data line
try: #try to read the next line and then restart the loop
line = exac.readline()
continue
except: #if the reading fails, display an error message
quit('Error reading from ExAC VCF')
linearray = line.split('\t') #if the line did not start with a hashtag (indicating it is a data line), we will split it at every tab
chromosome = linearray[0] #finds the chromosome
position = linearray[1] #finds the position
positionblock = str(int(position) - (int(position) % 100000)) #rounds down the position value to the nearest 100,000 bases to know which library file to use
if positionblock != lastpositionblock or chromosome != lastchromosome: #if either the chromosome or position block for this variant is different from the last
if libraryfile: #if we already have an open subvcf that we were writing to, we close it
libraryfile.close()
libraryfile = createlibraryfile(chromosome, positionblock) #then we create a new library file for the current chromosome and position block combination
try:
libraryfile.write(line) #writes the data line to the subvcf. By this point, we either know that we are writing to the same file as the last iteration, or have already opened the new one for writing
except:
quit('Error writing new library file.') #if something goes wrong with writing, we show an error message
try:
line = exac.readline() #then we try to read the next line from the file
except:
quit('Error reading from ExAC VCF') #and display an error message if something goes wrong
lastpositionblock = positionblock #remember which chromosome and position block we were on during this iteration so we can compare with the next
lastchromosome = chromosome
print('Processed ' + str(linecount) + ' lines.', end = '\r') #and tell the user what the progress is
print('\n')
if not createintegrity('subvcfs'): #if we were not able to create the hashsum file to monitor the integrity of the library directory
return False #returns a value of false to let the rest of the program know
return True #otherwise returns a value of true, indicating success
def createlibraryfile(chromosome, positionblock): #subroutine for generating subvcf files for the library
libraryfilename = str('subvcfs/' + chromosome + 'c' + positionblock + '.subvcf') #uses the chromosome and position block to generate the actual filename
try: #try to create a new file using that filename
libraryfile = open(libraryfilename, 'w') #needs a try/except block
except: #if that fails, display an error message and quit
quit('Error creating new library file.')
return libraryfile #returns the newly created file handle to the part of the program that called it
def yesanswer(question): #asks the question passed in and returns True if the answer is yes, False if the answer is no, and keeps the user in a loop until one of those is given. Also useful for walking students through basic logical python functions
answer = False #initializes the answer variable to false. Not absolutely necessary, since it should be undefined at this point and test to false, but explicit is always better than implicit
while not answer: #enters the loop and stays in it until answer is equal to True
print (question + ' (Y/N)') #Asks the question contained in the argument passed into this subroutine
answer = input('>>') #sets answer equal to some value input by the user
if str(answer) == 'y' or str(answer) == 'Y': #checks if the answer is a valid yes answer
return True #sends back a value of True because of the yes answer
elif str(answer) == 'n' or str(answer) == 'N': #checks to see if the answer is a valid form of no
return False #sends back a value of False because it was not a yes answer
else: #if the answer is not a value indicating a yes or no
print ('Invalid response.')
answer = False #set ansewr to false so the loop will continue until a satisfactory answer is given
def quotedsplit(input, delimiter = ','): #subroutine for splitting CSV lines (that will often have quotes around them and be separated by commas) This line can take an optional argument of something other than a comma as a delimiter
import re
brokenlist = input.split(delimiter) #this will split the input at every delimiter, quoted or not
outputlist = [] #initialize a blank list for the output
outputindex = -1 #initializing the index counter for the output list to -1 (because lists start at 0 for the first element)
for item in brokenlist:
if re.match('^".*[^"]$', outputlist[outputindex]): #if the item in the current cell in the output list starts with quotes, but does not end with them
outputlist[outputindex] = outputlist[outputindex] + delimiter + item #append the next item from the broken, separated by a delimiter (fixes the broken entries)
else:
outputlist[outputindex] = outputlist[outputindex].strip('"') #removes the quotes from the entry in the list (they only get in the way at this point) we will have to return them before writing them to a file
outputindex += 1 #if the item in the output list looks complete
outputlist.append(item) #create a new cell in the list containing the next item from the broken list (we will check to see if it's broken on the next iteration)
return outputlist #returns the list to the program for use
def getkeycolumns(headers): #subroutine to extract a tupple containing the column numbers for key data points (location, reference, and observed alleles)
import re
chromosomecolumn = False #initialize this to false
positioncolumn = False #initialize this to false as well, if we don't change them during the course of this subroutine, they will come back false and trigger an error message in the program
for i in range (0, len(headers)): #iterate through the list of headers
if re.search('^chr$', headers[i], re.IGNORECASE) or re.search('^chromosome$', headers[i], re.IGNORECASE): #use a regex to test if the column name indicates that it is a useful value to us (in this case chromosome), if so, we assign that value to the appropriate variable. If we never find one, it remains false and returns false in that position to the annotation subroutine (causes the program to stop, as we need these values)
chromosomecolumn = i
elif re.search('^start$', headers[i], re.IGNORECASE) or re.search('^position$', headers[i], re.IGNORECASE):
positioncolumn = i
elif re.search('^ref$', headers[i], re.IGNORECASE) or re.search('^referencebase$', headers[i], re.IGNORECASE):
reference = i
elif re.search('^obs$', headers[i], re.IGNORECASE) or re.search('^samplealleles$', headers[i], re.IGNORECASE):
observed = i
if not chromosomecolumn and positioncolumn and reference and observed:
return False #if any of these values were not set, we do not return a tupple, instead we just return a false value
return (chromosomecolumn, positioncolumn, reference, observed) #if all these values were set, we return the tupple
def createzerohash(populations): #this subroutine returns a hash full of some set values (although it could easily be modified to return a hash full of anything) that will have the same structure as the hash of values when a variant is known and in the ExAC vcf
datapoints = ['AC','AN','Hom'] #a list of the data points we need to fill in (I will go over details of the data structure for these hashes in the getfrequencyhash subroutine)
fillins = {'AC':0,'AN':1,'Hom':0} #creates a dictionary of what to fill each data point's population:value frequencies with
frequencyhash = {} #initializes an empty hash
for point in datapoints: #iterate through each of the defined datapoints
for population in populations: #iterate through each of the defined populations at each datapoint
try: #the purpose of this try/except series is to avoid errors by putting a hash of a hash in a location that didn't already have an empty hash initialized to it (you can do that in PERL, but not python)
frequencyhash[point][population] = fillins[point] #if the empty hash for the datapoint already exists (or it's not an empty hash because there are already key:value pairs in it), write in the appropriate fillin
except KeyError: #if trying to do the above raises a key error because the hash hasn't been initialized for the datapoint
frequencyhash[point]={} #first we create the empty hash
frequencyhash[point][population] = fillins[point] #then we add a value to it
return frequencyhash #and return the hash to the higher-level subroutine
def getfrequencyhash(line, populations, position): #this is the "pointy end" of the annotation subroutine, this takes in the line from the ExAC reference and extracts the frequency data into a well-organized hash table
'''
Data structure
hashname frequencyhash
/ | \
datapoint AC AN Hom
- | | |
| |-Pop:## |-Pop:## |-Pop:##
Population |-Pop:## |-Pop:## |-Pop:##
| |-Pop:## |-Pop:## |-Pop:##
| |-Pop:## |-Pop:## |-Pop:##
| |-Pop:## |-Pop:## |-Pop:##
- |-Pop:## |-Pop:## |-Pop:##
any individual datapoint can be recalled as hashname[datapoint][population]
'''
import re
multiallele = False
populationset = ''
frequencyhash = {}
for population in populations: #iterate through populations
if populationset: #if there is already a value written in the populationset string
populationset += '|' #add in a pipe character
populationset += population #add the current population to the string
regex = re.compile('(AC|AN|Hom)_(' + populationset + ')=(.+?);') #this regex is the "tip of the pointy end." This regex will look for anything matching the datapoints (AC, AN, or Hom) followed by an underscore, then any of the populations (we built that part of the search using the populationset string), then the equal sign, then any number of characters over zero before a semicolon (we can't restrict to numbers because there will be a comma in there for multi-allele variants). This will then return all those values as a list of tupples where the first value of each tupple is the datapoint, the second is the population, and the third is the actual number (or set of numbers for multi-allele)
dataset = re.findall(regex, line) #this takes the regex we compiled above (we did not actually run it until this line) and finda ALL instances of it in the data line
dataset = [list(entry) for entry in dataset] #turns a tupple into a list so that it can be modified properly later (this is not a big issue until we have a multi-allele variant, a tupple won't allow for splitting the values within a cell)
for datum in dataset: #iterate through the dataset (that 2-dimensional list of tupples, now lists, returned by the regex)
if ',' in datum[2]: #check and see if there is a comma in the third element of each 3-element list as we iterate through them (a comma would indicate a locus with multiple reported alternate alleles)
multiallele = True #set the value multiallele to true (we don't do anything to alter the data yet, we are just seeing if we need to)
break #end the loop, once we find a comma, we know it's a multiallele case and don't need to waste cycles to confirm it repeatedly
if multiallele: #if the previous test for multiallele loci came back positive
for i in range (0,len(dataset)): #iterate through the dataset
if ',' in dataset[i][2]: #if the element being inspected contains a comma
dataset[i][2] = dataset[i][2].split(',') #split that element into its own sublist on the comma
for datum in dataset: #now iterate through the dataset again
if multiallele and datum[0] != 'AN': #iterate through the dataset list using the first element of each 3-element set as the datapoint key for the frequencyhash, the second element as the population, and the third as the value to set it equal to.
try: #the use of try statements here, like in other cases where we are building a hash table, is to see if the key has already been initialized, and to initialize it if not
frequencyhash[datum[0]][datum[1]] = datum[2][position] #note that we have to handle a multi-allele case differently: because some of the values are actually lists of values, we need an additional variable to locate our specific number (position within the list)
except KeyError:
frequencyhash[datum[0]]={}
frequencyhash[datum[0]][datum[1]] = datum[2][position]
else:
try:
frequencyhash[datum[0]][datum[1]] = datum[2] #this part of the subroutine does the same thing as the previous block, but does not have the position listed, as there will be only one element
except KeyError:
frequencyhash[datum[0]]={}
frequencyhash[datum[0]][datum[1]] = datum[2]
return frequencyhash
def annotate(file): #and now for our main event
import os
import re
if not os.path.isdir('subvcfs'): #if the library isn't already made or can't be found
usage('No subvcf library detected in this directory. Please be sure the script is running from the same directory that contains the library.')
quit()
fileformat = 'tdt' #default format is tab-delimited
delimiter = '\t' #meaning that the delimiter is a tab
lastlibraryfile = False #initializes the lastlibrary file to false (we will use this to remember which library file we used in the last iteration so we don't reopen it if we don't have to)
populations = ['AFR','AMR','EAS','FIN','NFE','SAS'] #initializing a list of the population in the ExAC
datapoints = ['AF','HOMOF'] #initializing a list of the datapoins we will output for each population
if not checkintegrity('subvcfs'): #checks the subvcf directory to be sure that no directories have been gained or lost or had their name changed since it was created
quit('Check of subvcf library directory not passed. Please be sure no files have been added, removed, or renamed in the library director')
inputfile = open(file, 'r') #opens the file to be annotated
try: #This try/except (like most of the others you will see here) is to either read or write to a file, and end more gracefully if something fails in the process
header = inputfile.readline().strip('\r\n\t') #reads the first line from the file to annotate. This should be the header with all the column names. We need to analyze this to determine where the important columns are
except:
quit('Error reading header from input file.')
if not '\t' in header: #if we analyze the header line and find that there isn't a tab in there (indicating that tabs are not the delimiter)
if not yesanswer('This does not appear to be a tab-delimited file. Is it comma-delimited?'): #ask the user if this is a comma separated file
quit('Please talk to Mike about getting this script to take your file format.')
else:
fileformat = 'csv' #if the user says yes, the we treat this input as a CSV
if fileformat == 'tdt': #probably unnecessary, as we already initialized the value, but good to be explicit here
headerlist = header.split('\t') #splits the header on each tab
delimiter = '\t'
if fileformat == 'csv':
headerlist = quotedsplit(header) #if the file is a CSV, we split the header on commas. This requires a full subroutine, since commas can be used in a field as well as being used to separate them and quoted fields make a huge difference
delimiter = ','
headercolumns = len(headerlist) #the variable headercolumns is set to the number of elements in the headerlist (this value is quite useful later)
keycolumns = getkeycolumns(headerlist) #this subroutine takes the headerlist and returns the numbers of the columns containing values we will need in a tupple in a specific order
if not keycolumns: #if we could not get all those column numbers, we quit
quit('Unable to find columns containing chromosome and/or position data.')
chromosomecolumn = keycolumns[0] #this takes the values from the keycolumns tupple and gives them easier-to-remember names
positioncolumn = keycolumns[1]
referencecolumn = keycolumns[2]
observedcolumn = keycolumns[3]
outputfilename = file + '.ExACtoed.txt' #creates the name of the output file
#if os.path.isfile(outputfilename): #halts the program if there is already an annotated version of the file present
# usage('Output file already exists, please move, delete, or rename the existing output file for this source.')
# quit()
try:
outputfile = open(outputfilename, 'w', 1) #opens the output file
except:
quit('Error opening the output file.')
try:
line = inputfile.readline().strip('\r\n\t') #reads a new line from the user's annotation file
except:
quit('Error reading first data line from input file.')
linenumber = 0 #initializes a counter of lines
outputgroups = []
for population in populations:
outputgroups.append(population) #iterates through the populations and adds each one to the outputgroups list
outputgroups.append('max') #then adds the max value to the list (we will calculate the max for each later)
outputheaderlist = []
for header in headerlist: #iterate through the headers that came with the file we are annotating
outputheaderlist.append(header) #adds each value to the outputheader
for allele in range(0,2):
for point in datapoints:
for group in outputgroups:
outputheaderlist.append('A' + str(int(allele) + 1) + '_' + point + '_' + group)
outputheaderlist.append('F_rarest_allele')
outputheaderlist.append('Combo_max')
outputheaderstring = ''
for item in outputheaderlist:
if outputheaderstring:
outputheaderstring += delimiter
outputheaderstring += item
outputheaderstring += '\n'
try:
outputfile.write(outputheaderstring)
except:
quit('Error writing header to output file.')
lastchromosome = False
lastposition = False
lastreference = False
lastobserved = False
newlines = 0
repeatlines = 0
while line:
line = line.strip('\r\n\t')
linenumber += 1
if line[0] == '#':
print('Skipped line ' + str(linenumber) + ' as it appears to be a comment.')
try:
outputfile.write(line)
except:
quit('Error writing to output file.')
line = inputfile.readline()
continue
if linenumber == -1: #this if/then statement here entirely for debugging purposes and nothing else
pass
print ('Processing line ' + str(linenumber), end = '\r')
if fileformat == 'tdt':
linearray = line.split('\t')
elif fileformat == 'csv':
linearray = quotedsplit(line)
chromosome = re.sub('^chr', '',linearray[chromosomecolumn], flags=re.IGNORECASE)
position = linearray[positioncolumn]
reference = linearray[referencecolumn]
observed = linearray[observedcolumn]
if not (chromosome == lastchromosome and position == lastposition and reference == lastreference and observed == lastobserved): #In an output listing several possible transcripts for each variant, this recycling of already-calculated values cuts more than 50% of the time required.
newlines += 1
frequencyhash = {}
refmismatch = False
extraalleles = False
observedarray = []
if '/' in observedarray:
observedarray = observed.split('/')
elif ',' in observedarray:
observedarray = observed.split(',')
else:
observedarray.append(observed)
currentlibraryfile = str(chromosome) + 'c' + str(int(position) - (int(position) % 100000)) + '.subvcf'
if os.path.isfile('subvcfs/' + currentlibraryfile):
founddata = True
else:
founddata = False
if founddata:
if currentlibraryfile != lastlibraryfile:
try:
libraryfile = open('subvcfs/' + currentlibraryfile, 'r')
library = libraryfile.read() #slurps the whole file into the library string
libraryfile.close()
except:
quit('Error opening subvcf library file.')
refline = re.search('^(' + str(chromosome) + '\t' + str(position) + '\t.*?)$', library, re.MULTILINE)
if not refline:
founddata = False
else:
refline = refline.group(0)
reflinearray = refline.split('\t')
if founddata:
exacalts = reflinearray[4].split(',')
exacalthash = {}
for i in range (0, len(exacalts)):
exacalthash[exacalts[i]]= i
if refline and (reflinearray[3] != reference): #if this doesn't analyze lazy, we need two if statements
print('Reference base mismatch starting on line ' + str(linenumber) + '.')
refmismatch = True
if reflinearray[3] in observedarray:
for allele in observedarray:
if allele in reflinearray[3] or allele in exactalts:
refandaltswapped = True
else:
try:
observedarray.remove(reference)
singlenonref = True
except:
singlenonref = False
if len(observedarray) == 1:
homozygousrare = True
elif len(observedarray) > 1 and observedarray[0] == observedarray[1]: #if this doesn't lazy evaluate, it needs to be nested
homozygousrare = True
observedarray.remove(observedarray[0])
else:
homozygousrare = False
if len(observedarray) in range(1,3):
extraalleles = False
for allele in (observedarray):
try:
frequencyhash[allele] = getfrequencyhash(reflinearray[7], populations, exacalthash[allele])
except KeyError:
frequencyhash[allele] = createzerohash(populations)
else:
print('Possible error on line ' + str(linenumber) + ', subject appears to have more than 2 alleles.')
extraalleles = True
if not extraalleles:
i = 0
namelist = []
namehash = {}
valuearray = []
for allele in observedarray:
for point in datapoints:
for population in populations:
try:
namehash[allele][point][population] = i
except KeyError:
try:
namehash[allele][point] = {}
namehash[allele][point][population] = i
except KeyError:
try:
namehash[allele] = {}
namehash[allele][point] = {}
namehash[allele][point][population] = i
except:
print('This is what you get for not making the data structure simpler')
namelist.append([allele, point, population])
if point == 'AF':
try:
entry = int(frequencyhash[allele]['AC'][population])/int(frequencyhash[allele]['AN'][population])
except KeyError:
entry = 0
except ZeroDivisionError:
entry = 'NA'
elif point == 'HOMOF':
if chromosome not in ('X','x','Y','y','MT','Mt','mt'):
try:
entry = int(frequencyhash[allele]['Hom'][population])/(int(frequencyhash[allele]['AN'][population])/2)
except KeyError:
entry = 0
except ZeroDivisionError:
entry = 'NA'
else:
entry = 'NA'
valuearray.append(entry)
i += 1
maximum = 0
namehash[allele][point]['max'] = i
namelist.append([allele, point, 'max'])
for population in populations:
if valuearray[namehash[allele][point][population]] != 'NA':
if maximum < valuearray[namehash[allele][point][population]]:
maximum = valuearray[namehash[allele][point][population]]
valuearray.append(maximum)
i += 1
if not singlenonref and not homozygousrare:
allele1max = valuearray[namehash[observedarray[0]][point]['max']]
allele2max = valuearray[namehash[observedarray[1]][point]['max']]
rarestallele = min(int(allele1max),int(allele2max))
if allele1max == 0:
allele1max = 1/65000
if allele2max == 0:
allele2max = 1/65000
combomax = allele1max * allele2max
elif singlenonref:
rarestallele = valuearray[namehash[observedarray[0]][point]['max']]
combomax = 'NA'
elif homozygousrare:
rarestallele = valuearray[namehash[observedarray[0]][point]['max']]
combomax = (1/65000)**2
datastring = ''
for i in range(0,headercolumns):
if datastring:
datastring += delimiter
if delimiter in linearray[i]:
linearray[i] = '"' + linearray[i] + '"'
datastring += linearray[i]
for value in valuearray:
datastring += delimiter + str(value)
if len(observedarray) == 1:
for value in valuearray:
datastring += delimiter + 'NA'
datastring += delimiter + str(valuearray[namehash[observedarray[0]][point]['max']]) + delimiter + str(valuearray[namehash[observedarray[0]][point]['max']]**2)
for i in range (headercolumns, len(linearray)):
if delimiter in linearray[i]:
linearray[i] = '"' + linearray[i] + '"'
datastring += delimiter + linearray[i]
if refmismatch or extraalleles:
if extraalleles:
fillin = 'Too many observed alleles'
if refmismatch:
if refandaltswapped:
fillin = 'Swapped reference and alternate alleles (likely common).'
else:
fillin = 'Mismatched reference'
datastring = ''
for i in range(0,headercolumns):
if datastring:
datastring += delimiter
if delimiter in linearray[i]:
linearray[i] = '"' + linearray[i] + '"'
datastring += linearray[i]
for i in range(0,30):
datastring += delimiter + fillin
for i in range (headercolumns, len(linearray)):
if delimiter in linearray[i]:
linearray[i] = '"' + linearray[i] + '"'
datastring += delimiter + linearray[i]
elif not founddata:
datastring = ''
for i in range(0,headercolumns):
if datastring:
datastring += delimiter
if delimiter in linearray[i]:
linearray[i] = '"' + linearray[i] + '"'
datastring += linearray[i]
for i in range(0,29):
datastring += delimiter + '0'
datastring += delimiter + str(float((1/65000)*(1/65000)))
for i in range (headercolumns, len(linearray)):
if delimiter in linearray[i]:
linearray[i] = '"' + linearray[i] + '"'
datastring += delimiter + linearray[i]
else: #this is what we do if the current line being annotated is the same base as the last line (meaning we can skip over finding the data for it again)
repeatlines += 1
datastring = datastring.strip('\n')
datastringarray = datastring.split(delimiter)
linearrayindex = 0
for i in range (0,len(datastringarray)):
if i < headercolumns or i >= headercolumns + 30:
datastringarray[i] = linearray[linearrayindex]
linearrayindex += 1
datastring = ''
for datum in datastringarray:
if datastring:
datastring += delimiter
if delimiter in datum:
datum = '"' + datum + '"'
datastring += datum
datastring += '\n'
try:
outputfile.write(datastring)
except:
quit('Error writing to output file.')
lastchromosome = chromosome
lastposition = position
lastreference = reference
lastobserved = observed
try:
line = inputfile.readline()
except:
quit('Error reading from input file.')
print ('Annotated ' + str(newlines) + ' unique loci and ' + str(repeatlines) + ' duplicated lines.' )
def main():
import time #this module lets us determine how long the run took (it is used only once at the very start and once at the very end of the program)
starttime = time.time() #mark the start time
job = checkargs()
file = job[0]
jobtype = job[1]
if jobtype == 'librarysplit':
if not librarysplit(file):
quit('Error creating checksum hash for library.')
elif jobtype == 'annotate':
annotate(file)
print ('Job completed successfully in ' + str(round(time.time() - starttime, 1)) + ' seconds.')
main()