-
Notifications
You must be signed in to change notification settings - Fork 13
/
tagger.py
172 lines (146 loc) · 5.78 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import collections
import collections.abc

import suc
import suc_ne
import udt_suc_sv
# Tags a sentence with SUC tags based on a trained model
class SucTagger():
def __init__(self, tagging_model):
with open(tagging_model, 'rb') as f:
self.tagger_weights = f.read()
def tag(self, sentence):
tags_list = suc.tag(self.tagger_weights, sentence)
return tags_list
# Tags a sentence with SUC-style named entity tags based on a trained model
class SucNETagger():
def __init__(self, tagging_model):
with open(tagging_model, 'rb') as f:
self.tagger_weights = f.read()
def tag(self, sentence):
tags_list = suc_ne.tag(self.tagger_weights, sentence)
return tags_list
# Tags a sentence with UD tags based on a model trained on SUC tags
class UDTagger():
FEATURE_MAPPING = {
"AKT": ["Voice=Act"],
"DEF": ["Definite=Def"],
"GEN": ["Case=Gen"],
"IND": ["Definite=Ind"],
"INF": ["VerbForm=Inf"],
"IMP": ["VerbForm=Fin", "Mood=Imp"],
"KOM": ["Degree=Cmp"],
"KON": ["Mood=Sub"],
"NEU": ["Gender=Neut"],
"NOM": ["Case=Nom"],
"MAS": ["Gender=Masc"],
"OBJ": ["Case=Acc"],
"PLU": ["Number=Plur"],
"POS": ["Degree=Pos"],
"PRF": ["VerbForm=Part", "Tense=Past"],
"PRT": ["VerbForm=Fin", "Tense=Past"],
"PRS": ["VerbForm=Fin", "Tense=Pres"],
"SFO": ["Voice=Pass"],
"SIN": ["Number=Sing"],
"SMS": [],
"SUB": ["Case=Nom"],
"SUP": ["VerbForm=Sup"],
"SUV": ["Degree=Sup"],
"UTR": ["Gender=Com"],
"AN": ["Abbr=Yes"],
"-": [],
}
# Words that should have the feature Polarity=Neg
NEGATIVE = {
('inte', 'AB'),
('icke', 'AB'),
('aldrig', 'AB'),
('knappast', 'AB'),
('näppeligen', 'AB'),
('varken', 'AB'),
('föga', 'AB'),
('igalunda', 'AB'),
('ej', 'AB'),
#('nej', 'IN'),
#('nehej', 'IN'),
#('nejdå', 'IN'),
#('nix', 'IN'),
}
# Words that should have the feature Polarity=Pos
# NOTE: this is currently not used in the Swedish version
#POSITIVE = {
# ('ja', 'IN'),
# ('jaa', 'IN'),
# ('jadå', 'IN'),
# ('jajamen', 'IN'),
# ('jajamän', 'IN'),
# ('jajamensan', 'IN'),
#}
def __init__(self, tagging_model):
with open(tagging_model, 'rb') as f:
self.tagger_weights = f.read()
def _is_nonstring_iterable(self, value):
if not isinstance(value, collections.Iterable) or isinstance(value, str):
raise TypeError("Argument is not of the correct type")
def tag(self, sentence, lemmas, suc_tags_list):
self._is_nonstring_iterable(sentence)
self._is_nonstring_iterable(lemmas)
self._is_nonstring_iterable(suc_tags_list)
suc_sentence = [(lemma, tag.split('|',1)[0], tag)
for lemma, tag in zip(lemmas, suc_tags_list)]
tag_list = udt_suc_sv.tag(self.tagger_weights, suc_sentence)
tag_list = self.ud_verb_heuristics(tag_list, sentence, lemmas)
features = self.ud_features(suc_tags_list, lemmas)
return tuple(["|".join(t) for t in zip(tag_list, features)])
def ud_verb_heuristics(self, ud_tags, tokens, lemmas):
"""Heuristics to improve accuracy of UD tags, return modified ud_tags"""
ud_tags = list(ud_tags)
n = len(ud_tags)
for i in range(n):
if ud_tags[i] == 'AUX':
if lemmas[i] == 'vara':
# Trust the copula classifier
continue
for j in range(i + 1, n):
if ud_tags[j] in ('AUX', 'VERB'):
# If followed by AUX or VERB, do nothing
break
if (ud_tags[j] in ('SCONJ', 'PUNCT')) \
or tokens[j].lower() == 'som' or j == n - 1:
# If no AUX/VERB before SCONJ, PUNCT, "som" or end of
# sentence, change to VERB
ud_tags[i] = 'VERB'
break
return ud_tags
def ud_features(self, suc_tags_list, lemmas):
ud_features = []
for suc_tags, lemma in zip(suc_tags_list, lemmas):
# Apparently incorrect code from the UD 1 version:
#if "|" not in suc_tags:
# ud_features.append("_")
# continue
if "|" in suc_tags:
fields = suc_tags.split("|")
suc_tag = fields[0]
suc_features = fields[1:]
else:
suc_tag = suc_tags
suc_features = []
ud_feature_list = []
for suc_feature in suc_features:
# Don't include suc_features with multiple options in the UD suc_features
if "/" not in suc_feature:
ud_feature_list += self.FEATURE_MAPPING[suc_feature]
if "VerbForm=Fin" in ud_feature_list and "Mood=Imp" not in ud_feature_list and "Mood=Sub" not in ud_feature_list:
ud_feature_list += ["Mood=Ind"]
if suc_tag in ["HA", "HD", "HP", "HS"]:
ud_feature_list += ["PronType=Int,Rel"]
if suc_tag in ["HS", "PS"]:
ud_feature_list += ["Poss=Yes"] # Test this!
if suc_tag == "UO":
ud_feature_list += ["Foreign=Yes"]
if (lemma, suc_tag) in self.NEGATIVE:
ud_feature_list += ["Polarity=Neg"]
# Currently not used in the Swedish UD treebank:
#elif (lemma, suc_tag) in self.POSITIVE:
# ud_feature_list += ["Polarity=Pos"]
ud_features.append("|".join(sorted(ud_feature_list)) or "_")
return ud_features