-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathcandidatesreader.py
201 lines (147 loc) · 5.96 KB
/
candidatesreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import sys
#import dlm.utils as U
#import dlm.io.logging as L
import codecs
class NBestList():
    """Read or write an n-best list file whose fields are separated by ``|||``.

    Every line is expected to look like::

        index ||| hypothesis ||| features [||| score [||| phrase alignments [||| word alignments]]]

    In ``'r'`` mode the object is an iterator that yields one ``NBestGroup``
    per run of consecutive lines sharing the same index.  In ``'w'`` mode
    items are written one per line with ``write()``.  The object can also be
    used as a context manager, which closes the file on exit.
    """

    def __init__(self, nbest_path, mode='r', reference_list=None):
        """Open ``nbest_path`` as UTF-8 for reading (``'r'``) or writing (``'w'``).

        ``reference_list`` is an optional list of reference file paths handed
        to ``RefernceManager``; it is only accepted in ``'r'`` mode.
        """
        assert mode == 'r' or mode == 'w', "Invalid mode: " + mode
        # Validate every argument before touching the file: opening in 'w'
        # mode truncates it, which must not happen for a rejected call.
        assert not reference_list or mode == 'r', "Cannot accept a reference_list in 'w' mode"
        self.mode = mode
        # Build the reference manager first so that a failure there does not
        # leave an open n-best file handle behind.
        self.ref_manager = RefernceManager(reference_list) if reference_list else None
        self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8')
        self.prev_index = -1
        self.curr_item = None   # look-ahead: first item of the group to return next
        self.curr_index = 0     # index of the group currently being assembled
        self.eof_flag = False   # set once the underlying file is exhausted

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __iter__(self):
        assert self.mode == 'r', "Iteration can only be done in 'r' mode"
        return self

    def next_item(self):
        """Parse the next line of the file and return it as an ``NBestItem``.

        Raises ``StopIteration`` (after closing the file) when no line is
        left, and ``ValueError`` when the line has fewer than three segments
        or its first segment is not an integer.
        """
        assert self.mode == 'r', "next() method can only be used in 'r' mode"
        try:
            # The builtin next() works for the codecs stream on Python 2 and 3.
            line = next(self.nbest_file)
        except StopIteration:
            self.close()
            raise
        segments = line.split("|||")
        if len(segments) < 3:
            raise ValueError(
                "An n-best list line must have at least three '|||'-separated segments: "
                + repr(line))
        try:
            index = int(segments[0])
        except ValueError:
            raise ValueError(
                "The first segment in an n-best list must be an integer: " + repr(line))
        hyp = segments[1].strip()
        features = segments[2].strip()
        # Segments 3..5 are optional; missing ones are reported as None.
        optional = [segment.strip() for segment in segments[3:6]]
        optional.extend([None] * (3 - len(optional)))
        score, phrase_alignments, word_alignments = optional
        return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments)

    def __next__(self):
        """Return an ``NBestGroup`` of the next consecutive items sharing one index.

        Raises ``StopIteration`` once the file is exhausted; an empty file
        yields no group at all.
        """
        if self.eof_flag:
            raise StopIteration
        assert self.mode == 'r', "next_group() method can only be used in 'r' mode"
        if self.curr_item is None:
            # First call: read ahead one item so the group index is known.
            try:
                self.curr_item = self.next_item()
            except StopIteration:
                self.eof_flag = True
                raise
        group = NBestGroup(self.ref_manager)
        self.curr_index = self.curr_item.index
        while self.curr_item.index == self.curr_index:
            group.add(self.curr_item)
            try:
                self.curr_item = self.next_item()
            except StopIteration:
                self.eof_flag = True
                break
        return group

    next = __next__  # Python 2 iterator protocol / existing callers

    def write(self, item):
        """Write the text form of ``item`` followed by a newline ('w' mode only)."""
        assert self.mode == 'w', "write() method can only be used in 'w' mode"
        # type(u"") is `unicode` on Python 2 and `str` on Python 3.
        self.nbest_file.write(type(u"")(item) + u"\n")

    def close(self):
        """Close the underlying file."""
        self.nbest_file.close()
class NBestItem:
    """One line of an n-best list: an index, a hypothesis and its annotations.

    ``score``, ``phrase_alignments`` and ``word_alignments`` are optional and
    may be ``None``.
    """

    def __init__(self, index, hyp, features, score, phrase_alignments, word_alignments):
        self.index = index
        self.hyp = hyp
        self.features = features
        self.score = score
        self.phrase_alignments = phrase_alignments
        self.word_alignments = word_alignments

    def __unicode__(self):
        """Return the item as a single ``' ||| '``-joined line (no newline)."""
        # type(u"") is `unicode` on Python 2 and `str` on Python 3.
        fields = [type(u"")(self.index), self.hyp, self.features]
        optional = [self.score, self.phrase_alignments, self.word_alignments]
        # Trailing empty/missing fields are omitted, but an empty field that is
        # followed by a non-empty one is kept as a placeholder so the later
        # field stays in its own column when the line is parsed again.
        while optional and not optional[-1]:
            optional.pop()
        fields.extend(field or u"" for field in optional)
        return u' ||| '.join(fields)

    if str is not bytes:
        # Python 3 has no __unicode__ hook; str() must produce the same text.
        __str__ = __unicode__

    def append_feature(self, feature_name, feature_value):
        """Append ``' <name>= <value> '`` to the feature string."""
        to_text = type(u"")  # avoids ASCII-only str() on Python 2
        self.features += ' ' + to_text(feature_name) + '= ' + to_text(feature_value) + ' '
class NBestGroup:
    """An ordered collection of n-best items that all share the same index.

    The group is its own iterator: the iteration position is stored on the
    group itself, so nested or interleaved loops over one group share state.
    """

    def __init__(self, refrence_manager=None):
        self.group_index = -1   # -1 means "no item added yet"
        self.group = []
        self.ref_manager = refrence_manager
        self.refs = None        # filled by add() when a reference manager is set
        self.item_index = 0     # iteration cursor, reset by __iter__

    def __unicode__(self):
        """Return the text form of every item, one per line."""
        # type(u"") is `unicode` on Python 2 and `str` on Python 3.
        return u'\n'.join([type(u"")(item) for item in self.group])

    if str is not bytes:
        # Python 3 has no __unicode__ hook; str() must produce the same text.
        __str__ = __unicode__

    def __iter__(self):
        self.item_index = 0
        return self

    def __getitem__(self, index):
        return self.group[index]

    def add(self, item):
        """Append ``item`` to the group; ``None`` is ignored.

        The first item fixes the group index (and triggers the reference
        lookup when a reference manager was given); every later item must
        carry the same index.
        """
        if item is None:
            return
        if self.group_index == -1:
            self.group_index = item.index
            if self.ref_manager:
                self.refs = self.ref_manager.get_all_refs(self.group_index)
        else:
            assert item.index == self.group_index, "Cannot add an nbest item with an incompatible index"
        self.group.append(item)

    def __next__(self):
        """Return the next item of the current iteration or raise ``StopIteration``."""
        if self.item_index >= len(self.group):
            raise StopIteration
        item = self.group[self.item_index]
        self.item_index += 1
        return item

    next = __next__  # Python 2 iterator protocol / existing callers

    def size(self):
        """Return the number of items in the group."""
        return len(self.group)

    def append_features(self, features_list, feature_name=None):
        """Append one feature to every item of the group.

        ``features_list`` must have one entry per item.  When ``feature_name``
        is given, each entry is the feature value and all items receive that
        name; otherwise each entry must be a ``(name, value)`` pair.
        """
        assert len(features_list) == len(self.group), 'Number of features and number of items in this group do not match'
        for item, feature in zip(self.group, features_list):
            if feature_name is None:
                name, value = feature
            else:
                name, value = feature_name, feature
            # append_feature takes a name and a value.
            item.append_feature(name, value)
class RefernceManager:
    """Hold the lines of several parallel reference files.

    ``ref_list[k][i]`` is line ``i`` of the ``k``-th file, exactly as returned
    by ``readlines()`` (line endings are not stripped).
    """

    def __init__(self, paths_list):
        """Read every file in ``paths_list`` (UTF-8); all must have equally many lines."""
        assert isinstance(paths_list, list), "The input to a RefernceManager class must be a list"
        self.ref_list = []
        self.num_lines = -1   # -1 until the first file has been read
        self.num_refs = 0
        for path in paths_list:
            with codecs.open(path, mode='r', encoding='UTF-8') as f:
                sentences = f.readlines()
            if self.num_lines == -1:
                self.num_lines = len(sentences)
            else:
                assert self.num_lines == len(sentences), "Reference files must have the same number of lines"
            self.ref_list.append(sentences)
            self.num_refs += 1

    def get_all_refs(self, index):
        """Return line ``index`` of every reference file, in file order."""
        # Reject negative indices as well: they would otherwise wrap around
        # and silently return lines counted from the end of the files.
        assert 0 <= index < self.num_lines, "Index out of bound"
        return [sentences[index] for sentences in self.ref_list]