-
Notifications
You must be signed in to change notification settings - Fork 18
/
backtranslate.py
128 lines (108 loc) · 4.12 KB
/
backtranslate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Py3 and UTF-8 encoding required for special characters.
# Code that converts a protein sequence into a representation of the available
# codon encodings. Note: it doesn't enumerate all individual possibilities (as one
# might do using itertools.product), as this simply takes too long to calculate/print.
# Usage:
# $ python3 backtranslate.py <outputformat> <proteinfile> <proteinfileformat>
# E.g:
# $ python3 backtranslate.py pretty myproteins.fasta fasta
# If the output is wrapping weirdly:
# $ python3 backtranslate.py pretty myproteins.fasta fasta |cut -c1-$(stty size </dev/tty | cut -d' ' -f2)
# TODO:
# - Refactor to classes/structures
# - Add argparse.
# - Include different codon tables (create a codon table class to hold them all?)
# - Maybe lean on Bio.IUPAC for this?
# - Figure out screen wrapping. Currently needs exporting to a file, or trimming with
# the construct ( |cut -c1-$(stty size </dev/tty | cut -d' ' -f2) )
# Big thanks to the guys on Code Golf and Programming Puzzle SE for the pretty/boxed output
# Lookup from one-letter amino-acid code to the list of RNA codons (U, not T)
# that encode it. Presumably NCBI translation table 11, going by the name.
# "*" holds the codons UAA/UAG/UGA, and "X" maps to the placeholder "nnn".
codon_table_11 = {
    residue: codons.split()
    for residue, codons in [
        ("A", "GCU GCC GCA GCG"),
        ("C", "UGU UGC"),
        ("D", "GAU GAC"),
        ("E", "GAA GAG"),
        ("F", "UUU UUC"),
        ("G", "GGU GGC GGA GGG"),
        ("H", "CAU CAC"),
        ("I", "AUU AUC AUA"),
        ("K", "AAA AAG"),
        ("L", "UUA UUG CUU CUC CUA CUG"),
        ("M", "AUG"),
        ("N", "AAU AAC"),
        ("P", "CCU CCC CCA CCG"),
        ("Q", "CAA CAG"),
        ("R", "CGU CGC CGA CGG AGA AGG"),
        ("S", "AGU AGC UCU UCC UCA UCG"),
        ("T", "ACU ACC ACA ACG"),
        ("V", "GUU GUC GUA GUG"),
        ("W", "UGG"),
        ("X", "nnn"),  # Tolerates unknown AAs but returns unknown codon
        ("Y", "UAU UAC"),
        ("*", "UAA UAG UGA"),
    ]
}
def parse(infile, format):
    """Yield the codon options for every protein record in a sequence file.

    Args:
        infile: Input handed straight to ``SeqIO.parse`` (the script passes a
            file path taken from the command line).
        format: Sequence file format name handed to ``SeqIO.parse``
            (e.g. "fasta").

    Yields:
        One ``(amino_acids, protein_id)`` tuple per record. ``amino_acids``
        has one entry per residue, each entry being a list of the codons that
        ``codon_table_11`` holds for that residue; ``protein_id`` is the
        record's ``id`` attribute. A record with no residues yields ``[]``.

    Raises:
        KeyError: If a residue is not a key of ``codon_table_11``.
    """
    # Imported here so the function also works when this file is imported as
    # a module rather than run as a script.
    from Bio import SeqIO

    for protein in SeqIO.parse(infile, format):
        # A single pass over the residues is all that is needed. Each codon
        # list is copied so callers cannot alter the shared lookup table.
        amino_acids = [list(codon_table_11[residue]) for residue in protein]
        yield amino_acids, protein.id
def comboprint(matrix):
    """Print the codon columns side by side, top-aligned.

    Each element of ``matrix`` is one column (a list of strings). Output has
    as many lines as the longest column. Every cell is followed by a single
    space; where a column has run out of entries, a run of spaces as wide as
    that column's longest entry is printed instead.

    Args:
        matrix: List of columns, each a list of strings.
    """
    height = len(max(matrix, key=len))
    lines = []
    for row_index in range(height):
        cells = []
        for column in matrix:
            if row_index < len(column):
                cell = column[row_index]
            else:
                # Column exhausted: fill with blanks of the column's width.
                cell = " " * max([len(entry) for entry in column])
            cells.append(cell + " ")
        lines.append("".join(cells))
    print("\n".join(lines))
def comboprint_boxed(matrix):
    """Print the codon columns as a grid drawn with box-drawing characters.

    Each element of ``matrix`` is one column (a list of strings). Shorter
    columns are padded with blank cells above and below so that all columns
    reach the height of the tallest one, and every cell is left-justified to
    the width of the widest entry in its column. A horizontal rule is drawn
    between every pair of rows.

    The caller's ``matrix`` is not modified. Nothing is printed when
    ``matrix`` is empty or contains only empty columns.

    Args:
        matrix: List of columns, each a list of strings.
    """
    height = max(map(len, matrix), default=0)
    if height == 0:
        # No columns, or only empty ones: there is no grid to draw.
        return

    columns = []
    widths = []
    for column in matrix:
        missing = height - len(column)
        # An odd leftover goes on top: ceil(missing / 2) blank cells above,
        # floor(missing / 2) below.
        padded = (
            [""] * (missing - missing // 2) + list(column) + [""] * (missing // 2)
        )
        width = max(map(len, padded))
        widths.append(width)
        columns.append([cell.ljust(width) for cell in padded])

    rules = ["─" * width for width in widths]
    separator = "├" + "┼".join(rules) + "┤"
    rows = ["│" + "│".join(row) + "│" for row in zip(*columns)]

    print("┌" + "┬".join(rules) + "┐")
    # Rows are separated by a rule; the last row is followed by the bottom edge.
    print(("\n" + separator + "\n").join(rows))
    print("└" + "┴".join(rules) + "┘")
def comboprint_pretty(matrix):
    """Print the codon columns side by side, vertically centred.

    Each element of ``matrix`` is one column (a list of strings). Shorter
    columns are padded with blank cells above and below so that all columns
    reach the height of the tallest one, every cell is left-justified to the
    width of the widest entry in its column, and the cells of each row are
    joined with a single space.

    The caller's ``matrix`` is not modified. Nothing is printed when
    ``matrix`` is empty or contains only empty columns.

    Args:
        matrix: List of columns, each a list of strings.
    """
    height = max(map(len, matrix), default=0)
    if height == 0:
        # No columns, or only empty ones: nothing to print.
        return

    columns = []
    for column in matrix:
        missing = height - len(column)
        # An odd leftover goes on top: ceil(missing / 2) blank cells above,
        # floor(missing / 2) below.
        padded = (
            [""] * (missing - missing // 2) + list(column) + [""] * (missing // 2)
        )
        width = max(map(len, padded))
        columns.append([cell.ljust(width) for cell in padded])

    for row in zip(*columns):
        print(" ".join(row))
if __name__ == "__main__":
    # Script entry point:
    #   python3 backtranslate.py <outputformat> <proteinfile> [<proteinfileformat>]
    # <proteinfileformat> defaults to "fasta" when omitted or empty.
    import sys

    # Check the interpreter before pulling in third-party code.
    if sys.version_info < (3, 0):
        sys.exit("Exited (1). Requires python3 to run correctly.")

    # Kept as a module-level import so SeqIO remains a global name for code
    # defined earlier in this file.
    from Bio import SeqIO

    functions = {
        "pretty": comboprint_pretty,
        "boxed": comboprint_boxed,
        "simple": comboprint,
    }

    if len(sys.argv) < 3:
        sys.exit(
            "Usage: python3 backtranslate.py {pretty|boxed|simple} "
            "<proteinfile> [<proteinfileformat>]"
        )

    output_style = sys.argv[1]
    protein_file = sys.argv[2]
    # Indexing sys.argv[3] directly would raise IndexError when the argument
    # is absent, so test the length before falling back to the default.
    protein_format = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] else "fasta"

    # Reject an unknown style up front with a readable message instead of
    # failing inside the loop.
    printer = functions.get(output_style)
    if printer is None:
        sys.exit("Invalid function choice {pretty|boxed|simple}")

    for amino_acids, name in parse(protein_file, protein_format):
        print(name)
        printer(amino_acids)