-
Notifications
You must be signed in to change notification settings - Fork 0
/
protein_lib.py
365 lines (277 loc) · 10.9 KB
/
protein_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#!/usr/bin/env python3
def read_fasta_lists( file_to_read ):
"""
Reads a list of fastas from file_to_read
Returns:
names- a list of names of the sequences found in the fasta file
sequences- a list of the sequences found in the fasta file
"""
file_in = open( file_to_read, 'r' )
count = 0
names = []
sequences = []
current_sequence = ''
for line in file_in:
line = line.strip()
if line and line[ 0 ] == '>':
count += 1
names.append( line[ 1: ] )
if count > 1:
sequences.append( current_sequence )
current_sequence = ''
else:
current_sequence += line
sequences.append( current_sequence )
file_in.close()
return names, sequences
def write_fastas( names_list, sequence_list, output_name="out.txt" ):
"""
Writes a fasta file from a list of names and sequences to output file provided
"""
out_file = open( output_name, 'w+' )
for index in range( len( names_list ) ):
out_file.write( '>' + names_list[ index ] + '\n' +
sequence_list[ index ] + '\n'
)
out_file.close()
def char_in_string( test_string, character ):
"""
Checks if a character is found within a given string
"""
return character in test_string
def percentage_of_char_in_string( test_string, character ):
"""
Calculates what percent of a given test_string is given character
Params:
test_string- string to test for character
character- character to test for in string
Returns:
floating point value percent of the string that
is character
"""
length = len( test_string )
char_count = 0.0
for index in range( length ):
if test_string[ index ] == character:
char_count += 1
if length == 0:
return 0
return ( char_count / length ) * 100
def count_char_in_string( test_string, character ):
"""
Counts how much of a certain character are in test_string
Params:
test_string- string to count
character- character to count in string
Returns:
integer value representing the number of character
were found in test_string
"""
length = len( test_string )
count = 0
for index in range( length ):
if test_string[ index ] == character:
count += 1
return count
def min_concurrent_chars( test_string, delimeter_char ):
"""
Finds the minimum number of concurrent non-delimiter char in
test_string
Params:
test_string- string to search
delimeter_char- character to reset the count
Returns:
integer value, the smallest amount of concurrent characters
between delimeter character
"""
split_string = test_string.split( delimeter_char )
min_length = len( split_string[ 0 ] )
for substring in split_string[ 1: ]:
current_length = len( substring )
if current_length > 0 and current_length < min_length:
min_length = current_length
return min_length
def remove_char_from_string( test_string, to_remove ):
"""
Removes character to_remove from string test_string
Note: Case sensitive method, 'a' != 'A'
Returns:
test_string, minus any instance of to_remove character
"""
output = ""
for index in range( len( test_string ) ):
if test_string[ index ] != to_remove:
output += test_string[ index ]
return output
def get_unique_sequences( names_list, sequence_list ):
sequence_dict = {}
out_names, out_seqs = list(), list()
if len( names_list ) > 0:
for item in range( len( sequence_list ) ):
sequence_dict[ sequence_list[ item ] ] = names_list[ item ]
for sequence, name in sequence_dict.items():
out_names.append( name )
out_seqs.append( sequence )
return out_names, out_seqs
return [ ], sequence_list
def create_list_of_uniques( names, sequences ):
"""
Removes duplicates from a list
Params:
names- names of the sequences
sequences:
a list of sequences who may or may not be unique
"""
return_names = []
return_sequences = []
unique_values = set()
for index in range( len( sequences ) ):
starting_length = len( unique_values )
unique_values.add( sequences[ index ] )
if len( unique_values ) > starting_length:
return_names.append( names[ index ] )
return_sequences.append( sequences[ index ] )
return return_names, return_sequences
def create_valid_sequence_list( names_list, sequence_list, min_length, percent_valid ):
"""
Creates a sequence list of valid sequences.
A valid sequence is defined by not having any 'X' characters,
and not violating the parameters of either min_length or percent_valid
Returns:
a list of names of those sequences that were valid, with the new bounds appended to the name
a list of the sequences that were found valid
"""
valid_names = []
valid_sequences = []
for sequence in range( len( sequence_list ) ):
current_sequence = sequence_list[ sequence ]
if is_valid_sequence( current_sequence, min_length, percent_valid ):
current_sequence = remove_char_from_string( current_sequence, '-' )
valid_sequences.append( current_sequence )
if names_list:
valid_names.append( names_list[ sequence ] )
return valid_names, valid_sequences
def is_valid_sequence( sequence, gap_constraint ):
"""
Determines whether a given sequence is valid
A valid sequence is defined by not having any 'X' characters,
and not violating the parameters of either min_length or percent_valid
"""
if sequence.count( 'X' ) == 0:
if gap_constraint == None:
return True
elif isinstance( gap_constraint, float ):
return percentage_of_char_in_string( sequence, '-' ) < ( 100 - gap_constraint )
elif isinstance( gap_constraint, int ):
return ( sequence.count( '-' ) <= len( sequence.replace( '-', '' ) ) - gap_constraint )
return False
def append_suffix( string, start, end ):
"""
Appends _start_end to a string
"""
return "%s_%s_%s" % ( string, str( start ), str( end ) )
def subset_lists_iter( name, sequence, window_size, step_size, span_gaps, gap_constraints = None):
new_names = []
new_seqs = []
start = 0
end = window_size
index = 0
while start < len( sequence ):
xmer = grab_xmer_from_seq( sequence, start, window_size, span_gaps )
if len( xmer ) == window_size and is_valid_sequence( xmer, gap_constraints ):
xmer = xmer.replace( '-', '' )
if xmer:
new_seqs.append( xmer )
new_names.append( append_suffix( name, start + 1, end ) )
start += step_size
end = start + window_size
return new_names, new_seqs
def grab_xmer_from_seq( sequence, start, window_size, span_gaps ):
out_xmer = ""
xmer_len = 0
probe_index = start
sequence_len = len( sequence )
probe_index = calc_start_index( sequence, start, window_size, span_gaps )
while xmer_len < window_size and probe_index < sequence_len:
probe_char = sequence[ probe_index ]
skipped = False
if span_gaps:
while probe_char == '-' and probe_index < sequence_len:
probe_char = sequence[ probe_index ]
probe_index += 1
skipped = True
if probe_index < sequence_len:
out_xmer += probe_char
xmer_len += 1
if not skipped:
probe_index += 1
return out_xmer
def calc_start_index( in_seq, start_index, window_size, span_gaps ):
in_seq_len = len( in_seq )
probe_index = start_index
current_index = 0
if span_gaps:
kmer_length = ( in_seq_len - start_index ) - in_seq.count( '-', start_index )
else:
kmer_length = ( in_seq_len - start_index )
if ( start_index + window_size ) >= in_seq_len:
while kmer_length < window_size and probe_index >= 0:
if span_gaps and in_seq[ probe_index ] != '-':
kmer_length += 1
else:
kmer_length += 1
probe_index -= 1
return probe_index
def get_kmers_from_seqs(
names,
sequences,
window_size,
step_size,
span_gaps,
gap_constraint = None
):
total_kmers = 0
subset_names = list()
subset_seqs = list()
for index in range( len( sequences ) ):
current_sequence = sequences[ index ]
if len( names ) > 0:
current_name = names[ index ]
else:
current_name = ""
win_names, current_kmers = subset_lists_iter( current_name, current_sequence,
window_size, step_size,
span_gaps,
gap_constraint
)
total_kmers += len( set( current_kmers ) )
for each in set( current_kmers ):
subset_seqs.append( each )
subset_names.append( win_names[ current_kmers.index( each ) ] + "_" + str( index ) + "_" + str( index + window_size ) )
subset_names, subset_seqs = get_unique_sequences( subset_names, subset_seqs )
return subset_names, subset_seqs, total_kmers
def subset_lists( name, sequence, window_size, step_size ):
"""
Creates a list of subsets of windowSize size in intervals of stepSize
Note: Uses recursive subset_lists_helper for operations
Params:
name: String name of sequence to be split up
sequence: String sequence to be split up into a list
Returns:
a list of the split up names, with a suffix applied, and a list of the segments
of the list, as specified
"""
new_names = []
new_seqs = []
return subset_lists_helper( name, sequence, new_names, new_seqs, window_size, step_size, 0, window_size )
def subset_lists_helper( name, sequence, name_arr, seq_arr, window_size, step_size, start, end ):
"""
Recursive helper method called by subset_lists
"""
if start + window_size <= len( sequence ):
if len( sequence[ start: end ] ) == 1:
return
seq_arr.append( sequence[ start : end ] )
name_arr.append( append_suffix( name, start + 1, end ) )
subset_lists_helper( name, sequence, name_arr, seq_arr, window_size, step_size, start + step_size, start + step_size + window_size )
return name_arr, seq_arr