forked from chunjie-sam-liu/useful-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfindMotifByHand.py
executable file
·55 lines (48 loc) · 1.8 KB
/
findMotifByHand.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/python
#-*- coding:utf-8 -*-
################################################
#File Name: find_motif.py
#Author: C.J. Liu
#Mail: [email protected]
#Created Time: Sun 09 Oct 2016 05:25:28 PM CST
################################################
import os,sys
import re
from itertools import product
from Bio import SeqIO
import numpy as np
ROOT = os.path.dirname(os.path.abspath(__file__))
def help():
helpMessage=''' Description:
Find motif through nt number. It relies on the exact match, not caculating the possibility or confidence coeffient like MEME, it's used for tuning motif results produced by MEME and speeding up handpick.
It's based on the Cartesian product with 4 alphabets. The nt number is not recommended over 10nt. It's very slow when nt number is over 10 which will loop for one million times.
Example:
python find_motif.py seq.fasta motifLength
'''
if len(sys.argv) < 3:
print(helpMessage)
sys.exit(1)
def motifList(motifLength):
motifLength = int(motifLength)
alpha = ["A", "G", "C", "T"]
searchList = list(product(alpha, repeat=motifLength))
searchList = [''.join(i) for i in searchList]
return searchList
def run(fastaFile,motifLength):
# print(motifList(motifLength))
searchList = motifList(motifLength)
for motif in searchList:
searchedID = list()
prog = re.compile(motif, re.I)
for record in SeqIO.parse(fastaFile, 'fasta'):
id = record.id
seq = str(record.seq)
if prog.search(seq):
searchedID.append(id)
if len(searchedID) < 5: continue
print(motif,str(len(searchedID)),','.join(searchedID), sep="\t")
def main():
help()
run(sys.argv[1], sys.argv[2])
if __name__ == "__main__":
main()