-
Notifications
You must be signed in to change notification settings - Fork 0
/
untitled22.py
93 lines (90 loc) · 4.26 KB
/
untitled22.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
## -*- coding: utf-8 -*-
#"""
#Created on Tue Jan 05 18:29:02 2016
#
#@author: Krishna
#"""
#
#import os
#import re
#import time
#start_time = time.time()
#q=open('C:\Users\Krishna\Desktop\up down analysis for orgs\gmk_gs vicinity_small_testt.txt','w')
#a = open('F:\M.Tech\patterns for gmk_down.txt','r').readlines()
#a1 = open('F:\M.Tech\patterns for gmk_up.txt','r').readlines()
#a=a+a1
##print a
#keywords = open('F:\M.Tech\mouse_gs_small_simple.txt','r').readlines() # this has the large gs's
##gmk = 'knockout'
#def find_matches(s, gmk):
# r=0
# if gmk in s: # checking if gmk is in the line
# l = re.split('\s|(?<!\d)[,.]|[,.](?!\d)|;|[()]|-', s) # split the line by comma, semicolon and space to check for gmks and gs. Also http://goo.gl/RPQNbT
# filter(None, l) # remove empty elements in the list
# for gs in keywords: # gene symbols
# #q.write(gs)
# #q.write('\n')
# gs = gs.rstrip() #remove lagging whitespace characters
# #q.write(gs)
# #print gs
# #re.compile(gs)
# #print hg=hg+1
# if gs in s: # search for GS in line. using 'gs in s' led to a lot of partial word matches
# #print 'title'
# #print gs, gmk
# gs1 = re.split('\s|(?<!\d)[,.]|[,.](?!\d)|;|-', gs)
# gs1=filter(None, gs1)
# gs11=gs1[0]
# gmk1 = re.split('\s|(?<!\d)[,.]|[,.](?!\d)|;|-', gmk)
# gmk1=filter(None, gmk1)
# gmk11=gmk1[0]
# #print 'gmk is %s' % gmk
# #print 'l is %s' % l
# #print gs, gmk, l, s
# if any(l[i:i+len(gs1)]==gs1 for i in xrange(len(l)-len(gs1)+1)) and (any(l[i:i+len(gmk1)]==gmk1 for i in xrange(len(l)-len(gmk1)+1))): # this ensures that both gs and gmk are in l, as a unit(i.e. and in order) otherwise it was detecting things like 'beta c' from beta cells
# #if (gs1 in l) and (gmk1 in l):
# #if (gmk in s): # ensure both are in the list. sometimes, they are not because of some attached delimiters
## index = s.find(gs) #find index of the GS in the line
## if (s[index-1].isalpha() or s[index+len(gs)+1].isalpha())==False: #see if the 2 characters preceding and succeeding the GS are not alphabets. this is imp because if they are, then we dont want it, cause it means the GS is within a word
# r = abs(l.index(gs11) - l.index(gmk11)) # find distance between first character of gs and gmk
# #print 'title %d' % r
# if r<=8:
# q.write(file)
# q.write('\n')
# q.write(gs)
# q.write (' r is %d ' % r)
# q.write('\n')
# q.write('Found %s in %s' % (gmk,s.strip()))
# q.write('\n')
#
#c=0
#for path, dirs, files in os.walk('F:\M.Tech\org segregated\mus musculus'):
# for file in files:
# sentences = open(os.path.join(path,file),'r').readlines();
# c = c+1
# r=0
# print c # one value printed for each file
# #print file
# print("--- %s seconds ---" % (time.time() - start_time))
# k=0
# hg=0
# for gmk in a:
# gmk = gmk.rstrip() #to remove the lagging \n in many GMKs
# #print gmk
# for s in sentences:
# s = s.rstrip()
# if s.startswith('!Series_title'):
# find_matches(s, gmk)
# if s.startswith('!Series_summary'):
# find_matches(s, gmk)
# if s.startswith('!Series_overall_design'):
# find_matches(s, gmk)
#q.close()
#print("--- %s seconds ---" % (time.time() - start_time))
##
a="""rt wer werw
ds sldkjsdf
sdfsldkf""".splitlines()
for aa in a:
aa=aa.strip()
print aa