-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrp_adapt_mpi.py
executable file
·145 lines (111 loc) · 4.47 KB
/
rp_adapt_mpi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
"""
Collapse from a starting alphabet to a series of nested alphabets using
cross validation as 'merit'. Uses MPI via pypar.
usage: %prog pos_data neg_data out_dir [options]
-f, --format=NAME: Format of input data. 'ints' by default, or 'maf'
-m, --mapping=FILE: A mapping (alphabet reduction) to apply to each sequence
"""
import align.maf
import cookbook.doc_optparse
import os.path
import sys
import traceback
from cookbook.progress_bar import *
import rp.cv
import rp.io
import rp.mapping
import rp.models.averaging
# Startup pypar and get some info about what node we are
import pypar
nodes = pypar.size()
node_id = pypar.rank()
print "I am node %d of %d" % ( node_id, nodes )
# Things that shouldn't be hardcoded'
stop_size = 5
fold = 10
passes = 10
model = rp.models.averaging
def run( pos_file, neg_file, out_dir, format, align_count, mapping ):
# Open merit output
merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )
# Read integer sequences
pos_strings = list( rp.io.get_reader( pos_file, format, None ) )
neg_strings = list( rp.io.get_reader( neg_file, format, None ) )
symbol_count = mapping.get_out_size()
# Collapse
while symbol_count > stop_size:
# Sync nodes on each pass, may not be required
pypar.barrier()
if node_id == 0:
print "Collapsing from:", symbol_count
pairs = all_pairs( symbol_count )
# Decide which subrange of all pairs this node will handle
lo, hi = pypar.balance( len( pairs ), nodes, node_id )
# Find best collapsed mapping in interval
best_i, best_j, best_merit = None, None, 0
for i, j in pairs[lo:hi]:
merit = calc_merit( pos_strings, neg_strings, mapping.collapse( i, j ) )
if merit > best_merit:
best_i, best_j, best_merit = i, j, merit
# Aggregate results
if node_id != 0:
# Send best i, j, merit to the master
pypar.send( ( best_i, best_j, merit ), 0 )
else:
# I am the master, get results from all other nodes and determine
# which had the best merit
for other_node_id in range( 1, nodes ):
i, j, merit = pypar.receive( other_node_id )
if merit > best_merit:
best_i, best_j, best_merit = i, j, merit
# Collapse the two symbols that resulted in the best merit
mapping = mapping.collapse( best_i, best_j )
symbol_count -= 1
# Ensure only the master writes files
if node_id == 0:
# Append merit to merit output
print >>merit_out, symbol_count, best_merit
print "\nBest Merit %d." % best_merit,
# Write best mapping to a file
mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' )
for i, symbol in enumerate( mapping.get_table() ):
print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol
mapping_out.close()
def all_pairs( n ):
rval = []
for i in range( 0, n ):
for j in range( i + 1, n ):
rval.append( ( i, j ) )
return rval
def max_order( radix ):
"""Determine max order based on size of alphabet"""
if radix <= 4: return 4
elif radix <= 7: return 3
elif radix <= 14: return 2
else: return 1
def calc_merit( pos_strings, neg_strings, mapping ):
# Apply mapping to strings
pos_strings = [ mapping.translate( s ) for s in pos_strings ]
neg_strings = [ mapping.translate( s ) for s in neg_strings ]
# Cross validate using those strings
radix = mapping.get_out_size()
order = max_order( radix )
model_factory = lambda d0, d1: model.train( order, radix, d0, d1 )
cv_engine = rp.cv.CV( model_factory, pos_strings, neg_strings, fold=fold, passes=passes )
cv_engine.run()
# Merit is TP + TN
return cv_engine.cls1.pos + cv_engine.cls2.neg
def main():
# Parse command line
options, args = cookbook.doc_optparse.parse( __doc__ )
#try:
pos_fname, neg_fname, out_dir = args
align_count, mapping = rp.mapping.alignment_mapping_from_file( file( options.mapping ) )
#except:
# cookbook.doc_optparse.exit()
try:
run( open( pos_fname ), open( neg_fname ), out_dir, options.format, align_count, mapping )
finally:
pypar.finalize()
if __name__ == "__main__": main()