-
Notifications
You must be signed in to change notification settings - Fork 0
/
RemoveInParalogFromTree.py
118 lines (96 loc) · 3.68 KB
/
RemoveInParalogFromTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python2
import optparse
from Bio import SeqIO
from ete2 import Tree
from collections import defaultdict
################################# Command line options
desc='Remove paralogs from the Ortholog Group (OG)\n Cite: Cicconardi et al 2023 Tribe-wide genomics reveals the evolutionary dynamics of genome size, content, and selection across the adaptive radiation of Heliconiini butterflies. In prep.'
parser = optparse.OptionParser(description=desc, version='%prog version 0.1 - 06-01-2022 - Author: FCicconardi')
parser.add_option('-t', '--in-tree', dest='tree', help='Tree file name.', action='store', type='string', metavar='<FILE>')
parser.add_option('-f', '--fasta', dest='fasta', help='Fasta file name.', action='store', type='string', metavar='<FILE>')
(opts, args) = parser.parse_args()
mandatories = ['tree','fasta']
for m in mandatories:
if not opts.__dict__[m]:
print "\nWARNING! output file is not specified\n"
parser.print_help()
exit(-1)
############################## Reading files and parameters
fasta_dct=defaultdict(list)
for record in SeqIO.parse(opts.fasta, 'fasta'):
fasta_dct[record.id].append(record.seq)
SpList=defaultdict(list)
tree=open(opts.tree, 'r').readline()
Tree=Tree(tree.strip(), format=1)
c=0
NodeSupport=defaultdict(list)
for node in Tree.traverse("postorder"):
if node.is_leaf():
sp=node.name.split('.')[0]
SpList[sp].append(node.name)
else:
if len(node.name) > 1:
if float(node.name) < 0.9:
removed_node = node.delete()
else:
c+=1
Nname='Node'+str(c)
NodeSupport[Nname].append(node.name)
node.name=Nname
EueidesTaxa=[]
HeliconiusTaxa=[]
print
for sp in SpList:
MinDist=defaultdict(list)
AncestorNode=[]
if len(SpList[sp]) > 1:
for leaf in SpList[sp]:
#print leaf
AncestorNode.append(Tree.search_nodes(name=leaf)[0].up.name)
if len(set(AncestorNode)) == 1:
for leaf in SpList[sp]:
MinDist[Tree.search_nodes(name=leaf)[0].dist].append(leaf)
bestPara=Tree.search_nodes(name=MinDist[min(MinDist.keys())][0])[0].name
if bestPara.split('_')[1].startswith('H'):
HeliconiusTaxa.append(SpList[sp][0])
elif bestPara.split('_')[1].startswith('E'):
EueidesTaxa.append(SpList[sp][0])
else:
for leaf in SpList[sp]:
print leaf
if len(Tree.search_nodes(name=leaf)[0].up.name) == 0: print 'Root'
else:
print Tree.search_nodes(name=leaf)[0].up.name, NodeSupport[Tree.search_nodes(name=leaf)[0].up.name][0]
print Tree.search_nodes(name=leaf)[0].up
print
if leaf.split('_')[1].startswith('H'):
HeliconiusTaxa.append(SpList[sp][0])
elif leaf.split('_')[1].startswith('E'):
EueidesTaxa.append(SpList[sp][0])
else:
if SpList[sp][0].split('_')[1].startswith('H'):
HeliconiusTaxa.append(SpList[sp][0])
elif SpList[sp][0].split('_')[1].startswith('E'):
EueidesTaxa.append(SpList[sp][0])
print
SpList=[]
for sp in EueidesTaxa:
SpList.append(sp.split('.')[0])
if len(SpList) == len(set(SpList)):
out=open(opts.fasta.replace('NT.fasta','Eueides.NT.fasta'), 'w')
for sp in EueidesTaxa:
if len(str(fasta_dct[sp][0])) > 1:
print >> out, '>%s\n%s' % (sp,str(fasta_dct[sp][0]).replace('-',''))
SpList=[]
for sp in HeliconiusTaxa:
SpList.append(sp.split('.')[0])
if len(SpList) == len(set(SpList)):
out=open(opts.fasta.replace('NT.fasta','Heliconius.NT.fasta'), 'w')
for sp in HeliconiusTaxa:
if len(str(fasta_dct[sp][0])) > 1:
print >> out, '>%s\n%s' % (sp,str(fasta_dct[sp][0]).replace('-',''))
for node in Tree.traverse("postorder"):
if node.name in NodeSupport:
node.name=NodeSupport[node.name][0]
treeout=open(opts.tree.replace('NT.fasta.treefile','NT.Pruned.treefile'), 'w')
print >> treeout, Tree.write(format=1)