-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_match.py
67 lines (59 loc) · 3.39 KB
/
find_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 22 17:30:57 2016
@author: Krishna
"""
import re
from fgm import *
from rules1 import *
# Token delimiters for a full line of text: whitespace, ';', '-', parentheses,
# and any ',' or '.' that is not sandwiched between two digits (so "6.1" and
# "1,000" stay whole).  See also http://goo.gl/RPQNbT
_LINE_DELIMS = re.compile(r'\s|(?<!\d)[,.]|[,.](?!\d)|;|[()]|-')
# Delimiters for the search terms themselves: same as above minus parentheses.
_TERM_DELIMS = re.compile(r'\s|(?<!\d)[,.]|[,.](?!\d)|;|-')

# Placeholders substituted into the line so that multi-word terms collapse to
# a single whitespace-delimited token whose position can be indexed.
_GMK_TAG = '_MKKEYWORD_1_'
_GS_TAG = '_SKEYWORD_2_'


def _split_tokens(pattern, text):
    """Split *text* on *pattern* and drop the empty strings the split leaves."""
    return [tok for tok in pattern.split(text) if tok]


def _contains_run(tokens, run):
    """Return True if *run* occurs in *tokens* as a contiguous, ordered slice."""
    width = len(run)
    return any(tokens[i:i + width] == run
               for i in range(len(tokens) - width + 1))


def find_matches(s, gmk, distance=8):
    """Report gene symbols that occur close to the keyword *gmk* in line *s*.

    Every entry of the module-level ``keywords`` (stripped and lower-cased)
    that is a substring of *s* is treated as a candidate gene symbol.  A
    candidate is kept only if both it and *gmk* appear in the tokenised line
    as whole, in-order token runs; this rejects partial-word hits such as
    'beta c' inside 'beta cells'.  For each surviving candidate, the
    whitespace-token distance between every gmk occurrence and every symbol
    occurrence is computed, and the pairs within *distance* tokens are passed
    to ``cl``; its result is printed.

    Args:
        s: the line of text to scan.  The substring tests are case-sensitive
            while the symbols are lower-cased, so presumably the caller
            passes lower-cased text -- TODO confirm.
        gmk: the keyword to look for in *s*.
        distance: maximum gap, in whitespace-separated tokens, between a gmk
            occurrence and a gene-symbol occurrence for the pair to count.

    Returns:
        None.  Results are written to stdout, one line per gene symbol that
        has at least one pair within *distance*.

    ``keywords`` and ``cl`` are not defined in this block; they are expected
    to come from the module's star imports.
    """
    if gmk not in s:
        return

    # Empty strings must be removed here: the term token lists below are
    # filtered too, and a stray '' between two words would otherwise stop a
    # term such as "a, b" from matching as a contiguous run.
    line_tokens = _split_tokens(_LINE_DELIMS, s)

    gs_list = []
    for raw_symbol in keywords:
        symbol = raw_symbol.rstrip().lower()
        if symbol in s:
            gs_list.append(symbol)

    # Neither the gmk token run nor the gmk-tagged line depends on the gene
    # symbol, so both are computed once rather than per candidate.
    if not _contains_run(line_tokens, _split_tokens(_TERM_DELIMS, gmk)):
        return
    # Plain (boundary-free) replacement on purpose: gmks like '-/-' or '+'
    # may sit directly against a word, where \b would not match.
    gmk_tagged = re.sub(re.escape(gmk), _GMK_TAG, s, flags=re.I)

    for gs in gs_list:  # gene symbols
        if not _contains_run(line_tokens, _split_tokens(_TERM_DELIMS, gs)):
            continue

        # From here on both the gmk and the gene symbol are known to be in
        # the line as whole units.
        tagged = re.sub(r'(\b%s\b)' % (re.escape(gs)), _GS_TAG, gmk_tagged,
                        flags=re.I)
        words = tagged.split()
        gmk_positions = [pos for pos, word in enumerate(words)
                         if _GMK_TAG in word]
        gs_positions = [pos for pos, word in enumerate(words)
                        if _GS_TAG in word]

        # (gap, gmk index, symbol index) for every pair that is close enough,
        # nearest pairs first; the sort is stable so ties keep scan order.
        data = [(abs(p1 - p2), p1, p2)
                for p1 in gmk_positions
                for p2 in gs_positions
                if abs(p1 - p2) <= distance]
        data.sort(key=lambda pair: pair[0])

        # NOTE(review): cl() is invoked once per gene symbol, not once per
        # pair -- confirm this matches the intended behaviour.
        if data:
            a1 = int(cl(s, gmk, gs, gs_list, data))
            # Parenthesised single argument prints identically on Python 2
            # and Python 3.
            print('cl for %s is %d' % (gs, a1))