__author__ = 'pv'
import gzip
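# Convert the Polyglot English embeddings and the CoNLL-2003 NER data into
# integer-indexed files: each embedding row is rewritten as "index<TAB>vector",
# every CoNLL token becomes a fixed-size context window of token indices
# prefixed with its NER label index, and the token/label maps are exported at
# the end. The relative paths below assume the script is run from the
# directory that contains data/.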
window = 2
vocab = {}
labels = {}
w2v_file = 'data/embeddings/polyglot-en.w2v.gz'
vector_out_file = 'data/embeddings/polyglot-en.index'
data_dir = 'data/conll2003/'
vocab_map_file = 'data/conll2003/vocab-map.index'
label_map_file = 'data/conll2003/label-map.index'
in_files = [data_dir+'eng.testa', data_dir+'eng.testb', data_dir+'eng.train']
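# CoNLL-2003 lines are space-separated columns: word, POS tag, chunk tag, NER tag;
# only the word (parts[0]), chunk tag (parts[2]) and NER tag (parts[3]) are read below.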
# create token -> index map
out = open(vector_out_file, 'w')
with gzip.open(w2v_file, 'rb') as f:
    next(f)  # skip the header line of the embedding file
    for line in f:
        parts = line.split(' ')
        token = parts[0]
        # assign the next free index (1-based) to this token
        vocab[token] = str(len(vocab) + 1)
        vector = parts[1:]
        # write "index<TAB>embedding values"; the source line's trailing newline is kept by the join
        out.write(vocab[token] + '\t' + ' '.join(vector))
print('loaded ' + str(len(vocab)) + ' tokens to vocab')
out.close()
# iterate over train, dev, test files for conll
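# Each output line is the NER label index, a tab, then the indices of the tokens
# in a window of `window` words on either side of the current token, with <S>,
# </S>, and <PAD> standing in for positions beyond the sentence boundaries.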
for in_file in in_files:
    print('Processing ' + in_file)
    out_file = in_file + '.index'
    tokens = []
    chunks = []
    ner = []
    labeled_windows = []
    for line in open(in_file, 'r'):
        line = line.strip()
        if not line.startswith('-DOCSTART-'):
            if line != '':
                parts = line.split(' ')
                tokens.append(parts[0])
                chunks.append(parts[2])
                # assign the next free index (1-based) to unseen NER labels
                if parts[3] not in labels:
                    labels[parts[3]] = str(len(labels) + 1)
                ner.append(labels[parts[3]])
            # blank line -> end of sentence
            else:
                # process the last sentence into labeled windows
                for i in range(0, len(tokens)):
                    # each output line starts with the label index and a tab
                    current_window = [ner[i] + '\t']
                    for j in range(i - window, i + window + 1):
                        if j < -1 or j > len(tokens):
                            token = '<PAD>'
                        elif j == -1:
                            token = '<S>'
                        elif j == len(tokens):
                            token = '</S>'
                        else:
                            token = tokens[j]
                        token_idx = vocab[token] if token in vocab else vocab['<UNK>']
                        current_window.append(token_idx)
                    labeled_windows.append(' '.join(current_window))
                tokens = []
                chunks = []
                ner = []
    # write the windows to file
    out = open(out_file, 'w')
    for w in labeled_windows:
        out.write(w + '\n')
    out.close()
# export maps
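# each line of the exported maps is "token<TAB>index" or "label<TAB>index"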
out = open(vocab_map_file, 'w')
for t, i in vocab.iteritems():
    out.write(t + '\t' + str(i) + '\n')
out.close()
out = open(label_map_file, 'w')
for l, i in labels.iteritems():
    out.write(l + '\t' + str(i) + '\n')
out.close()