forked from faircloth-lab/uce-probe-design
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sequence.py
102 lines (94 loc) · 3.62 KB
/
sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# encoding: utf-8
"""
sequence.py
Created by Brant Faircloth on 2008-07-08.
Copyright (c) 2008 Brant Faircloth. All rights reserved.
This module is meant to be used with the summary.py program for MAF files.
"""
import pdb
import numpy
import re
class stack(numpy.ndarray):
'''Takes a sequence stack (ndarray of strings) and performs several operations'''
def __new__(subtype, data, info='Sequence Stack', dtype='|S1', copy=False):
assert data.ndim > 1, 'sequence is not a stack'
# Make sure we are working with an array, and copy the data if requested
subarr = numpy.array(data, dtype=dtype, copy=copy)
# Transform 'subarr' from an ndarray to our new subclass.
subarr = subarr.view(subtype)
# Use the specified 'info' parameter if given
if info is not None:
subarr.info = info
# Otherwise, use data's info attribute if it exists
elif hasattr(data, 'info'):
subarr.info = data.info
# Finally, we must return the newly created object:
return subarr
def __array_finalize__(self,obj):
# We use the getattr method to set a default if 'obj' doesn't have the 'info' attribute
self.info = getattr(obj, 'info', {})
def __repr__(self):
desc = """array(data=%(data)s,tag='%(tag)s')"""
return desc % {'data': str(self), 'tag':self.info }
def iupac(self, s):
'''define translation for mismatched bases, forgetting gaps'''
iupacCode = {
'A':'A',
'C':'C',
'G':'G',
'T':'T',
'AG':'R',
'CT':'Y',
'AC':'M',
'GT':'K',
'AT':'W',
'CG':'S',
'CGT':'B',
'AGT':'D',
'ACT':'H',
'ACG':'V',
'ACGT':'N'
}
return iupacCode[s]
def consensus(self):
'''loops over consensus => dumb consensus and maintaining mask status'''
# ensure that sequence is indeed a stack
assert self.ndim > 1, 'sequence is not a stack'
# setup regex to search for lowercase bases (masking)
lc = re.compile('[acgt]+')
cons = []
# for i in the length of the row
for b in range(len(self[0,:])):
# slice the data across columns (i,e. each colum is a
# stack of bases at the aligned position).
stringRep = self[:,b]
# if gap anywhere, leave it and only it
if '-' in stringRep:
cons.append('-')
# there appear to be some 'N'|'n' bases. If in the sequence column,
# represent all bases as N|n, maintaining case.
elif 'N' in stringRep:
cons.append('N')
elif 'n' in stringRep:
cons.append('n')
# otherwise, see if there are mismatches and translate
else:
bases = []
# ignore same base > 1 time, uppercase all
[bases.append(i.upper()) for i in stringRep if i.upper() not in bases]
# sort the bases, so they match the dict entries
bases.sort()
# translate, if needed
try:
base = self.iupac(''.join(bases))
except KeyError:
pdb.set_trace()
# if the column contains a masked base, from any
# species, lowercase the base
string = ''.join(stringRep)
if lc.search(string):base = base.lower()
cons.append(base)
# return the consensus as a string
self.cons = ''.join(cons)
return self.cons