-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathancestral_dists.py
executable file
·123 lines (101 loc) · 3.95 KB
/
ancestral_dists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python2.4
"""
Construct ancestral probability distribution for a set of alignment columns
usage: %prog tree_mod species_names col_count_file
`tree_mod`: A tree model as produced by phyloFit from PHAST.
`species_names`: a comma seperated list of the species names in the order
they appear in the alignment
`col_file`: if present, a list of alignment columns and counts for which
distributions will be computed. Otherwise distributions will
be computed for all possible columns. Column will be taken from
the first field in this file, and all other fields will be kept.
"""
from __future__ import division
from bx import cookbook
import sys
from numpy import *
from scipy.linalg import expm
import bx.phylo.newick
import bx.phylo.phast
def main():
tm = bx.phylo.phast.TreeModel.from_file( open( sys.argv[1] ) )
t = bx.phylo.newick.newick_parser.parse_string( tm.tree )
names = sys.argv[2].split( ',' )
nucs = tm.alphabet
background = tm.background
print "#", ','.join( names )
if len( sys.argv ) > 3:
cols = []
for line in open( sys.argv[3] ):
fields = line.rstrip( "\r\n" ).split()
col = fields[0]
# HACK!
col = col.replace( "N", "*" )
cols.append( ( fields[0], int( fields[1]) ) )
a = dict( zip( names, col ) )
lik = felsen( t, a, tm )
# print >>sys.stderr, col
prob = lik_to_prob( lik, background, tm )
print "\t".join( fields + [ ' '.join( map( str, prob ) ) ] )
else:
for rows in cookbook.cross_lists( *( [ nucs ] * len( names ) ) ):
a = dict( zip( names, rows ) )
lik = felsen( t, a, tm )
prob = lik_to_prob( lik, background, tm )
print "\t".join( [ ''.join(rows), "?" , ' '.join( map( str, prob ) ) ] )
matrix_by_time_cache = {}
def felsen( node, column, tm ):
"""Reconstruct likelihood using felsen's algorithm"""
nucs = tm.alphabet
# Is it a leaf node?
if node.edges is None:
symbol = column[ node.label ]
if symbol == '*':
## Uniform distribution
# return [ 1 / len( nucs ) ] # len( nucs )
## Equilibrium (\pi) distribution
#return tm.background
# Eliminate entirely
return None
return [ int( symbol == nuc ) for nuc in nucs ]
else:
# Traverse children and determine likelihoods
## l_children = [ felsen( edge.tip, column, tm ) for edge in node.edges ]
l_children = []
for edge in node.edges:
t = felsen( edge.tip, column, tm )
if t is not None:
l_children.append( ( t, edge ) )
if not l_children: return None
l_self = []
# Determine liklihood for each possible 'ancestral' sequence
for i_y in range( len( nucs ) ):
y = nucs[ i_y ]
p_y = 1
# For each child, sum over all paths
for cl, edge in l_children:
if edge.length not in matrix_by_time_cache:
matrix_by_time_cache[ edge.length ] = matrix_for_time( tm.matrix, edge.length )
psub = matrix_by_time_cache[ edge.length ]
p_y_c = 0
for i_a in range( len( nucs ) ):
p_y_c += psub[i_y,i_a] * cl[ i_a ]
# Product over all children
p_y *= p_y_c
l_self.append( p_y )
return l_self
def lik_to_prob( lik, p_parent, tm ):
nucs = tm.alphabet
p = []
for i in range( len( nucs ) ):
p.append( lik[i] * p_parent[i] )
p_x = sum( p )
# print >>sys.stderr, lik
# print >>sys.stderr, p
rval = [ top / p_x for top in p ]
# print >>sys.stderr, rval
return rval
def matrix_for_time( matrix, t ):
return expm( matrix * t )
if __name__ == "__main__":
main()