forked from gringer/bioinfscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
readthrough.py
executable file
·127 lines (118 loc) · 4.5 KB
/
readthrough.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/python
import HTSeq
import string
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import re
import sys
def writeSeqTrans(name, exons, seq, interval):
if(interval.strand == "-"):
#print(">%s (rc)\n%s" % (name, seq))
seq = string.replace(seq,"|","")
translation = str(Seq(seq, generic_dna).reverse_complement().translate())
endText = " from RC strand"
else:
#print(">%s\n%s" % (name, seq))
seq = string.replace(seq,"|","")
translation = str(Seq(seq, generic_dna).translate())
endText = ""
translation = string.replace(translation, "*", "X", 1)
translation = re.sub("\\*.*$","*",translation)
if(translation.endswith("X")):
endText += ", no sequence beyond stop codon"
translation = string.replace(translation, "X", "*", 1)
throughDist = 0
elif(not translation.endswith("*")):
endText += ", no additional stop codons found"
throughDist = 0
else:
throughDist = translation.find("*") - translation.find("X")
endText += ", stop codon distance: %d" % throughDist
name = "%05d_%s" % (throughDist, name)
if(numExons == 1):
print(">%s [translation of %d exon%s]\n%s" % (name, exons, endText, translation))
else:
print(">%s [translation of %d exons%s]\n%s" % (name, exons, endText, translation))
def writeSeqMrna(name, exons, seq, interval):
if(interval.strand == "-"):
#print(">%s (rc)\n%s" % (name, seq))
seq = string.replace(seq,"|","")
translation = str(Seq(seq, generic_dna).reverse_complement().translate())
endText = " from RC strand"
else:
#print(">%s\n%s" % (name, seq))
seq = string.replace(seq,"|","")
translation = str(Seq(seq, generic_dna).translate())
endText = ""
translation = string.replace(translation, "*", "X", 1)
translation = re.sub("\\*.*$","*",translation)
if(translation.endswith("X")):
endText += ", no sequence beyond stop codon"
translation = string.replace(translation, "X", "*", 1)
throughDist = 0
elif(not translation.endswith("*")):
endText += ", no additional stop codons found"
throughDist = 0
else:
throughDist = translation.find("*") - translation.find("X")
endText += ", stop codon distance: %d" % throughDist
name = "%05d_%s" % (throughDist, name)
if(numExons == 1):
print(">%s [translation of %d exon%s]\n%s" % (name, exons, endText, translation))
else:
print(">%s [translation of %d exons%s]\n%s" % (name, exons, endText, translation))
# load Scer reference genome
scerFile = open('saccharomyces_cerevisiae.gff', 'r')
gffLines = ()
fastaLines = ()
hitFasta = False
for(line in scerFile):
if(line.startswith('##FASTA')):
hitFasta = True
if(not hitFasta):
gfflines.append(line)
else if(not line.startswith('#')):
fastaLines.append(line)
gtfFile = HTSeq.GFF_Reader(gffLines)
# load all sequences into memory
sequences = dict()
for s in HTSeq.FastaReader(fastaLines):
s.seq = string.replace(s.seq,"\r","")
sequences[s.name] = s
lastSequence = ""
lastName = ""
lastInterval = None
extend = 50 # number of base pairs to extend
readDistance = 5000
numExons = 0
numFeatures = 0
sys.stderr.write("\n")
for feature in gtfFile:
sys.stderr.write("\rFeatures read: %d" % numFeatures)
numFeatures += 1
if(feature.type != "CDS"):
continue
if(feature.name != lastName):
if((lastInterval != None) and (lastInterval.strand != "-")):
# add readDistance bases to end of sequence
lastSequence += sequences[lastInterval.chrom].seq[lastInterval.end:lastInterval.end+readDistance]
# write out sequence / translation
if(lastName != ""):
writeSeqTrans(lastName, numExons, lastSequence, lastInterval)
lastSequence = ""
numExons = 0
if(feature.iv.strand == "-"):
# add readDistance bases to start of sequence
startPos = lastInterval.start - readDistance
if(startPos < 0):
startPos = 0
lastSequence += sequences[feature.iv.chrom].seq[startPos:lastInterval.end]
lastName = feature.name
lastInterval = feature.iv
# add current interval to sequence
lastSequence += sequences[feature.iv.chrom].seq[feature.iv.start:feature.iv.end]
lastSequence += "|"
numExons += 1
# write out sequence / translation
writeSeqTrans(lastName, numExons, lastSequence, lastInterval)
sys.stderr.write("\n")