-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetaSNV_post.py
executable file
·436 lines (335 loc) · 17 KB
/
metaSNV_post.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
#!/usr/bin/env python
import os
import sys
import argparse
import glob
try:
import numpy as np
except ImportError:
sys.stderr.write("Numpy is necessary to run postfiltering.\n")
sys.exit(1)
try:
import pandas as pd
except ImportError:
sys.stderr.write("Pandas is necessary to run postfiltering.\n")
sys.exit(1)
basedir = os.path.dirname(os.path.abspath(__file__))
#############################
# Parse Commandline Arguments
#############################
def get_arguments():
'''
Get commandline arguments and return namespace
'''
## Initialize Parser
parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Not Showns:
parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
# TODO: POPULATION STATISTICS
## Population Statistics (Fst - ):
# parser.add_argument("-f", "--fst", action="store_true")
## pNpS ratio:
# parser.add_argument("-n", "--pnps", action="store_true")
# OPTIONAL arguments:
parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
help="Coverage depth: minimal average vertical genome coverage per sample per species")
parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
parser.add_argument('-div',action='store_true',help="Compute diversity measures")
# REQUIRED arguments:
parser.add_argument('projdir', help='project name', metavar='Proj')
return parser.parse_args()
##############################
def debugging(taxids_of_interest, header_cov):
'''Sanity check for the Dict of TaxID:Samples'''
nr_keys = 0
taxids_of_interest = []
for key in samples_of_interest:
nr_keys += 1
taxids_of_interest.append(key)
taxids_of_interest.sort()
print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
print(taxids_of_interest)
# print("\n")
# print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
# header = "\t".join(header_cov)
# print("header: {}".format(header))
sys.exit("only filter 1")
def file_check():
''' Check if required files exist (True / False)'''
args.projdir = args.projdir.rstrip('/')
args.coverage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_cov.tab'
args.percentage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_perc.tab'
args.all_samples = args.projdir+'/'+'all_samples'
print("Checking for necessary input files...")
if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
else:
sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
if os.path.isfile(args.all_samples):
print("found: '{}'\n".format(args.all_samples))
else:
sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
def print_arguments():
## Print Defined Thresholds:
print("Options:")
if args.b:
print("threshold: percentage covered (breadth) {}".format(args.b) )
if args.d:
print("threshold: average coverage (depth) {}".format(args.d) )
if args.m:
print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
if args.c:
print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
if args.p:
print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
print("")
###############################
## FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
# Example: "Which taxon has at least 10 SoI?"
# --Coverage Conditions:
# Sample of Interest:
# 1. breadth > 40 %
# 2. depth > 5 X
# --Min. number Samples:
# 3. min samples/TaxID > 10
def relevant_taxa(args):
'''function that goes through the coverage files and determines taxa and samples of interest'''
samples_of_interest = {}
with open(args.coverage_file, 'r') as COV, open(args.percentage_file, 'r') as PER:
header_cov = COV.readline().split()
header_per = PER.readline().split()
COV.readline() # skip second row in COV_FILE
PER.readline() # skip second row in PER_FILE
# Check if header match:
if header_cov != header_per:
sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message
# Read taxon by taxon (line by line) check coverage conditions (thresholds)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
cstring = cov.split()
pstring = perc.split()
cov_taxID = cstring.pop(0)
perc_taxID = pstring.pop(0)
coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
percentage = list(map(float, pstring)) # convert list_of_strings 2 list_of_floats
sample_count = 0
sample_names = []
# print(cov_taxID,perc_taxID)
if cov_taxID != perc_taxID: # check if taxIDs match!
sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")
for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
sample_count += 1
if c >= args.d and p >= args.b:
sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
if sample_count == len(header_cov) and len(sample_names) >= args.m:
samples_of_interest[cov_taxID] = sample_names
return {'SoI':samples_of_interest, 'h':header_cov}#return dict()
COV.close()
PER.close()
#################################
# FILTER I: END 'returns(<'samples_of_interest', type=dict>)'
#################################
def header_comparison(header_cov):
'''Comparing the sample order in the coverage and all_samples file'''
# sort snp_indices by sample_of_interest names
#header = "\t".join(header_cov)
#print("\nNr.Samples_cov_header: {}".format(len(header_cov)))
#print("cov_header: {}\n\n".format(header_cov[:10]))
#print("\ncov_header: {}\n {}\n\n".format(len(header_cov),header))
### read <all_samples> for snp_file header:
all_samples = open(args.all_samples,'r')
snp_header = all_samples.read().splitlines()
snp_header = [i.split('/')[-1] for i in snp_header] #get name /trim/off/path/to/sample.name.bam
#print("Nr.Samples in (all_samples): {}".format(len(snp_header)))
#print("all_samples: {}\n".format(snp_header[:10]))
# Compare the sample order in ALL.COV/ALL.PERC and all_samples
#print("Comparing headers..")
#snp_header_joined = "\t".join(snp_header)
#if header == snp_header_joined:
# print("Headers match nicely\n")
#else:
# print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")
return snp_header
#################################
## FILTER II: ACQUIRE SNPs WITH SUFFICIENT OCCURRENCE WITHIN SAMPLES_OF_INTEREST
# --SNP Conditions (Default):
# 1. Position covered by at least (5) reads
# 2. Position present in at least 50 % of the accepted samples_of_interest
def filter_two(args, snp_header, snp_files, outdir):
'''position wise filtering'''
header_taxID = ''
snp_taxID = '_'
# print(locals())
for best_split_x in snp_files:
with open(best_split_x, 'r') as file:
for snp_line in file: #position wise loop
snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]
## SPECIES FILTER:
if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
continue #Taxon is not relevant, NEXT!
else:
## SAMPLE FILTER: only load samples with enough coverage
sample_list = samples_of_interest[snp_taxID]#Load Sample List
# !!! Sample order, get indices - INDICES based on ORDER in COV/PERC file!!!
sample_indices = []
for name in sample_list:
sample_indices.append(snp_header.index(name))
## POSITION FILTER:
# Positions with sufficient coverage (c) and support (p, proportion) in the discovery set (covered samples).
# Discovery set: sufficiently covered samples
#..site coverage >= cX
#..proportion >= p% (SoIs)
whole_line = snp_line.split()
site_coverage = list(map(int, whole_line[4].split('|'))) # Site coverages as list of ints
nr_good = 0
for index in sample_indices:
if site_coverage[index] < args.c or site_coverage[index] == 0:
continue#NOT enough coverage depth at position in sample, no increment
else:
nr_good += 1
#FILTER: Position incidence with sufficient coverage:
if float(nr_good)/len(sample_indices) < args.p:#if snp_incidence < x % drop SNP
continue#mainly uninformative position, SNP incidence < x %, SNP droped!
else:
# CALCULATE SNP ALLELE FREQUENCY:
# If at least one position passed cutoffs open file (if new TaxID):
if header_taxID != snp_taxID:
if 'outfile' in locals():
print("closing: {}".format(header_taxID) )
outfile.close()
outfile = open(outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
print("Generating: {}".format(outdir+'/'+'%s.filtered.freq' % snp_taxID))
outfile.write('\t' + "\t".join(sample_list) + '\n')
header_taxID = snp_taxID#Jump to current TaxID
#Loop through alternative alleles [5](comma separated):
line_id = ":".join(whole_line[:4])#line_id composed of CHROM:REFGENE:POS:REFBASE
reference_base = whole_line[3]
alt_bases_totalcov = []#total coverage per alt allele
#VCF format
#LOOP Start:
for snp in whole_line[5].split(','):
xS = snp.split('|')
snp_coverage = list(map(float, xS[3:]))#coverage string (unfiltered!)
#Sanity check:
if len(site_coverage) != len(snp_coverage):
print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
sys.exit("ERROR: Site coverage and SNP coverage string have uneven length!")
#Compute allele frequency tables:
alt_base = snp.split('|')[1]#alternative base
#FREQUENCY Computation
total_reads = 0
snp_frq = []#frequencies for SNPs (pos >5X in at least 50% of the SoIs)
for index in sample_indices:
#prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
if site_coverage[index] >= args.c and site_coverage[index] != 0:
snp_frq.append(snp_coverage[index]/site_coverage[index])
else:
snp_frq.append(-1)
# WRITE OUTPUT Allele Frequencies (Default)
outfile.write(":".join(snp_line.split()[:4])+'>'+alt_base +':'+ xS[2] + '\t' + "\t".join(str(x) for x in snp_frq) + '\n')
#if header_taxID != snp_taxID:
if 'outfile' in locals():
#print("closing: last = {}".format(header_taxID))
outfile.close()
def l1nonans(d1,d2):
return np.abs(d1 - d2).mean()
def alleledist(d1,d2, threshold=.6):
return (np.abs(d1 - d2) > threshold).mean()
def computeAllDist(args):
print("Computing distances")
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
data = pd.read_table(f, index_col=0, na_values=['-1']).T
dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
dist = pd.DataFrame(dist, index=data.index, columns=data.index)
dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
dist = pd.DataFrame(dist, index=data.index, columns=data.index)
dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
def genetic_distance(sample1, sample2):
'''Pairwise genetic distance'''
# The expression used to compute dist_nd would compute all the necessary
# values if appplied to sample[12]. However, the case where there are no
# duplicates is the majority (often >90% of cases) and can be done much
# faster, so it is special cased here:
sample1nd = sample1.reset_index().drop_duplicates(subset='index', keep=False).set_index('index')
sample2nd = sample2.reset_index().drop_duplicates(subset='index', keep=False).set_index('index')
sample2nd = sample2nd.reindex(index=sample1nd.index)
s1 = sample1nd.values
s2 = sample2nd.values
valid = ~(np.isnan(s1) | np.isnan(s2))
s1 = s1[valid]
s2 = s2[valid]
s1 = np.vstack([s1, 1 - s1])
s2 = np.vstack([s2, 1 - s2])
dist_nd = (s1[0]*s2[1]+s1[1]*s2[0]).sum()
def compute_diversity(x):
out = np.outer(x.s1.values, x.s2.values)
return np.nansum(out) - np.nansum(out.diagonal())
sample1d = sample1.ix[sample1.index[sample1.index.duplicated()]]
sample2d = sample2.ix[sample2.index[sample2.index.duplicated()]]
if not len(sample1d) or not len(sample2d):
# No duplicates
return dist_nd
both = pd.DataFrame({'s1' : sample1d, 's2' : sample2d})
both = both.reset_index()
both = pd.concat([both,(1. - both.groupby('index').sum()).reset_index()])
dist_d = both.groupby('index', group_keys=False).apply(compute_diversity).sum()
return dist_d + dist_nd
def computeAllDiv(args):
# Load external info : Coverage, genomes size, genes size
horizontal_coverage = pd.read_table(args.percentage_file, skiprows=[1], index_col=0)
vertical_coverage = pd.read_table(args.coverage_file, skiprows=[1], index_col=0)
bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
species = int(f.split('/')[-1].split('.')[0])
genome_length = bedfile_tab.loc[str(species), 2].sum()
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
data = data.set_index(pd.Index([item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]))
data = data.sort_index()
########
# Correcting the coverage by the genome length observed in each pairwise comparison
correction_coverage = [[(min(horizontal_coverage.loc[species, i], horizontal_coverage.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
########
# Vertical coverage in pi within : AvgCov / (AvgCov - 1)
for j,i in enumerate(data.columns):
correction_within = vertical_coverage.loc[species,i] / (vertical_coverage.loc[species,i] - 1)
correction_coverage[j][j] = correction_coverage[j][j] / correction_within
dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_coverage[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
FST = pd.DataFrame(FST, index=data.columns, columns=data.columns)
dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
FST.to_csv(args.projdir + '/distances/' + '%s.FST' % species, sep='\t')
if __name__ == "__main__":
# print("globals: {}".format(globals()))
args = get_arguments()
print_arguments()
file_check()
if args.debug:
print_arguments()
#==========================================
# Filtering I - Determine Taxa of Interest:
#==========================================
samples_of_interest = relevant_taxa(args)['SoI']
header_cov = relevant_taxa(args)['h']
if args.debug:
debugging(samples_of_interest, header_cov)#needs: samples_of_interest,header_cov
#=========================================
# Filtering II - Position wise filtering
#=========================================
snp_header = header_comparison(header_cov) #not necessary?! we identify it anyways..
filter_two(args, snp_header,glob.glob(args.projdir + '/snpCaller/called*'),args.projdir + '/filtered/pop')
filter_two(args, snp_header,glob.glob(args.projdir + '/snpCaller/indiv*'),args.projdir + '/filtered/ind')
computeAllDist(args)
if args.div:
computeAllDiv(args)