PrecessEEdata.py (forked from zsctju/triplets-extraction)
#coding=utf-8
__author__ = 'Suncong Zheng'
import numpy as np
import cPickle
import json
import re


def load_vec_pkl(fname, vocab, k=300):
    """
    Load k-dimensional (default 300) word vectors from a pickled word2vec dict
    and build the embedding matrix W aligned with the vocabulary indices.
    """
    W = np.zeros(shape=(vocab.__len__() + 1, k))
    w2v = cPickle.load(open(fname, 'rb'))
    # Random vector for unknown words; every out-of-vocabulary word maps to it.
    w2v["UNK"] = np.random.uniform(-0.25, 0.25, k)
    for word in vocab:
        if not w2v.__contains__(word):
            w2v[word] = w2v["UNK"]
        W[vocab[word]] = w2v[word]
    return w2v, k, W
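
# Illustrative sketch (not part of the original script; the file name and vocab
# below are hypothetical): load_vec_pkl expects a pickled dict mapping words to
# k-dimensional numpy vectors, and returns an embedding matrix whose row 0 is
# left as zeros for padding.
#
#   vocab = {"Obama": 1, "visited": 2}
#   w2v, k, W = load_vec_pkl("w2v.pkl", vocab)
#   assert W.shape == (len(vocab) + 1, k)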


def make_idx_data_index_EE_LSTM(file, max_s, source_vob, target_vob):
    """
    Encode the word sequence and tag sequence of each sentence with the integer
    indices provided by source_vob and target_vob.
    :param file: the tag file containing word sequences and tag sequences
    :param max_s: the maximum sentence length
    :param source_vob: the word-to-index map
    :param target_vob: the tag-to-index map
    :return: [data_s_all, data_t_all], the index-encoded word and tag sequences
    """
    data_s_all = []
    data_t_all = []
    f = open(file, 'r')
    fr = f.readlines()
    for line in fr:
        sent = json.loads(line.strip('\r\n'))
        s_sent = sent['tokens']
        t_sent = sent['tags']
        data_t = []
        data_s = []
        # Word sequence: truncate to max_s if too long, otherwise pad with 0 at
        # the front; tokens are written in reversed order.
        if len(s_sent) > max_s:
            i = max_s - 1
            while i >= 0:
                data_s.append(source_vob[s_sent[i]])
                i -= 1
        else:
            num = max_s - len(s_sent)
            for inum in range(0, num):
                data_s.append(0)
            i = len(s_sent) - 1
            while i >= 0:
                data_s.append(source_vob[s_sent[i]])
                i -= 1
        data_s_all.append(data_s)
        # Tag sequence: truncate to max_s if too long, otherwise pad with 0 at the end.
        if len(t_sent) > max_s:
            for i in range(0, max_s):
                data_t.append(target_vob[t_sent[i]])
        else:
            for word in t_sent:
                data_t.append(target_vob[word])
            while len(data_t) < max_s:
                data_t.append(0)
        data_t_all.append(data_t)
    f.close()
    return [data_s_all, data_t_all]
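
# Illustrative sketch (hypothetical tokens and tags, not from the original
# data): with max_s = 5, a 3-token sentence is front-padded with zeros and its
# tokens written in reversed order, while the tag sequence keeps its order and
# is padded at the end:
#
#   tokens = ['Trump', 'visited', 'Iraq']  ->  data_s = [0, 0, idx(Iraq), idx(visited), idx(Trump)]
#   tags   = ['T1', 'O', 'T2']             ->  data_t = [idx(T1), idx(O), idx(T2), 0, 0]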


def get_word_index(train, test):
    """
    Give each word and each tag an integer index.
    :param train: the train file
    :param test: the test file
    :return: the word_index map, the index_word map, the tag_index map, the index_tag map,
             and the maximum word-sentence length
    """
    source_vob = {}
    target_vob = {}
    sourc_idex_word = {}
    target_idex_word = {}
    count = 1
    tarcount = 1
    max_s = 0
    max_t = 0

    # Index the words and tags of the training file.
    f = open(train, 'r')
    fr = f.readlines()
    for line in fr:
        sent = json.loads(line.strip('\r\n'))
        sourc = sent['tokens']
        for word in sourc:
            if not source_vob.__contains__(word):
                source_vob[word] = count
                sourc_idex_word[count] = word
                count += 1
        if sourc.__len__() > max_s:
            max_s = sourc.__len__()
        target = sent['tags']
        if target.__len__() > max_t:
            max_t = target.__len__()
        for word in target:
            if not target_vob.__contains__(word):
                target_vob[word] = tarcount
                target_idex_word[tarcount] = word
                tarcount += 1
    f.close()

    # Index the words and tags of the test file as well.
    f = open(test, 'r')
    fr = f.readlines()
    for line in fr:
        sent = json.loads(line.strip('\r\n'))
        sourc = sent['tokens']
        for word in sourc:
            if not source_vob.__contains__(word):
                source_vob[word] = count
                sourc_idex_word[count] = word
                count += 1
        if sourc.__len__() > max_s:
            max_s = sourc.__len__()
        target = sent['tags']
        if not source_vob.__contains__(target[0]):
            source_vob[target[0]] = count
            sourc_idex_word[count] = target[0]
            count += 1
        if target.__len__() > max_t:
            max_t = target.__len__()
        for word in target:
            if not target_vob.__contains__(word):
                target_vob[word] = tarcount
                target_idex_word[tarcount] = word
                tarcount += 1
    f.close()

    # Reserve entries for the sentence-end marker and unknown words.
    if not source_vob.__contains__("**END**"):
        source_vob["**END**"] = count
        sourc_idex_word[count] = "**END**"
        count += 1
    if not source_vob.__contains__("UNK"):
        source_vob["UNK"] = count
        sourc_idex_word[count] = "UNK"
        count += 1
    return source_vob, sourc_idex_word, target_vob, target_idex_word, max_s
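
# Illustrative sketch (hypothetical file names): word indices start at 1 so
# that 0 can serve as padding; "**END**" and "UNK" are appended to the source
# vocabulary after both files have been scanned.
#
#   source_vob, sourc_idex_word, target_vob, target_idex_word, max_s = \
#       get_word_index("train.json", "test.json")
#   assert sourc_idex_word[source_vob["UNK"]] == "UNK"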


def get_data_e2e(trainfile, testfile, w2v_file, eelstmfile, maxlen=50):
    """
    Convert the input files into the end-to-end model input format.
    :param trainfile: the train tag file produced by TaggingScheme.py
    :param testfile: the test tag file produced by TaggingScheme.py
    :param w2v_file: the word2vec file, extracted from the word2vec resource
    :param eelstmfile: the output file for the end-to-end model format data
    :param maxlen: the maximum sentence length we want to set
    :return: the end-to-end model format data, dumped to eelstmfile
    """
    source_vob, sourc_idex_word, target_vob, target_idex_word, max_s = \
        get_word_index(trainfile, testfile)
    print "source vocab size: " + str(len(source_vob))
    print "target vocab size: " + str(len(target_vob))

    source_w2v, k, source_W = load_vec_pkl(w2v_file, source_vob)
    print "word2vec loaded!"
    print "num words in source word2vec: " + str(len(source_w2v)) + \
          " source unknown words: " + str(len(source_vob) - len(source_w2v))

    if max_s > maxlen:
        max_s = maxlen
    print 'max source sent length is ' + str(max_s)

    train = make_idx_data_index_EE_LSTM(trainfile, max_s, source_vob, target_vob)
    test = make_idx_data_index_EE_LSTM(testfile, max_s, source_vob, target_vob)
    print "dataset created!"
    cPickle.dump([train, test, source_W, source_vob, sourc_idex_word,
                  target_vob, target_idex_word, max_s, k], open(eelstmfile, 'wb'))
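
# Hypothetical usage sketch (the file names below are placeholders, not paths
# from the original repository):
#
#   get_data_e2e("data/train_tag.json", "data/test_tag.json",
#                "data/w2v.pkl", "data/e2edata.pkl", maxlen=50)
#
# The resulting pickle bundles the encoded train/test data, the embedding
# matrix source_W, both vocabularies with their inverse maps, max_s and k for
# the downstream end-to-end LSTM model.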


def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub(r'\d', '0', s)


def peplacedigital(s):
    """
    Replace a digit string by its order of magnitude: '1', '10', '100' or '1000'.
    """
    if len(s) == 1:
        s = '1'
    elif len(s) == 2:
        s = '10'
    elif len(s) == 3:
        s = '100'
    else:
        s = '1000'
    return s
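
# Illustrative sketch of the two digit-normalisation helpers:
#
#   zero_digits("born in 1984")  ->  "born in 0000"
#   peplacedigital("1984")       ->  "1000"   (digit strings are bucketed by length)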