-
Notifications
You must be signed in to change notification settings - Fork 0
/
proper_tagger_ph2.m4
188 lines (153 loc) · 7.26 KB
/
proper_tagger_ph2.m4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
! -*- coding: utf-8 -*-
!======================================================================
!==== Auxiliary definitions
!======================================================================
m4_include(`finer_defs.m4')
!======================================================================
!==== Recognition rule
!======================================================================
!----------------------------------------------------------------------
! Expansion
! Assign identical NE tag to neighbouring untagged proper names in lists
! NB: These rules take input with additional fields and field separators,
! which must be accounted for when marking word boundaries.
!----------------------------------------------------------------------
! FWSep: the separator sequence between two tokens, i.e. zero or more
! field separators followed by one word separator.
Define FWSep [ FSep* WSep ] ;
! MakeTag(S): a token that already carries the tag S, i.e. a word, one
! field separator and then S inside angle brackets, with an optional
! slash before and/or after S.
Define MakeTag(S)
    [ Word FSep [ "<" ("/") S ("/") ">" ] ] ;
! CompleteList1(Tag): untagged item at the END of a list.
! Matches a candidate word whose left context consists of two tokens
! carrying Tag: the first one followed by a comma, the second one
! followed by a comma, "ja" or "sekä".
Define CompleteList1(Tag)
    LC( Tag FWSep lemma_exact(Comma) FWSep
        Tag FWSep lemma_exact( Comma | {ja} | {sekä} ) FWSep )
    CapWord ;
! CompleteList2(Tag): untagged item in the MIDDLE of a list.
! Matches a candidate word that has a token carrying Tag plus a comma
! on its left, and a comma, "ja" or "sekä" plus a token carrying Tag
! on its right.
! The right-hand conjunction set is the same one that CompleteList1 and
! CompleteList3 use, so that lists joined with "sekä" are completed
! regardless of which position the untagged item occupies.
Define CompleteList2(Tag)
    LC( Tag FWSep lemma_exact(Comma) FWSep )
    CapWord
    RC( FWSep lemma_exact( Comma | {ja} | {sekä} ) FWSep Tag ) ;
! CompleteList3(Tag): untagged item at the START of a list.
! Matches a candidate word whose right context is a comma, a token
! carrying Tag, then a comma, "ja" or "sekä", and a second token
! carrying Tag.
Define CompleteList3(Tag)
    CapWord
    RC( FWSep lemma_exact(Comma) FWSep
        Tag FWSep lemma_exact( Comma | {ja} | {sekä} ) FWSep
        Tag ) ;
! CompleteList(Tag): union of the three list positions (end, middle,
! start) handled by CompleteList1, CompleteList2 and CompleteList3.
Define CompleteList(Tag)
    [ CompleteList1(Tag)
    | CompleteList2(Tag)
    | CompleteList3(Tag)
    ] ;
! Tag patterns for one exact subtype each.  Every definition is
! MakeTag applied to the literal tag name, so it matches a token that
! carries exactly that tag.

! -- person subtypes
Define PrsHumTag    MakeTag( {EnamexPrsHum} ) ;
Define PrsMytTag    MakeTag( {EnamexPrsMyt} ) ;

! -- location subtypes
Define LocGplTag    MakeTag( {EnamexLocGpl} ) ;
Define LocPplTag    MakeTag( {EnamexLocPpl} ) ;
Define LocFncTag    MakeTag( {EnamexLocFnc} ) ;
Define LocAstTag    MakeTag( {EnamexLocAst} ) ;
Define LocMytTag    MakeTag( {EnamexLocMyt} ) ;

! -- organization subtypes
Define OrgCrpTag    MakeTag( {EnamexOrgCrp} ) ;
Define OrgAthTag    MakeTag( {EnamexOrgAth} ) ;
Define OrgPltTag    MakeTag( {EnamexOrgPlt} ) ;
Define OrgFinTag    MakeTag( {EnamexOrgFin} ) ;
Define OrgEduTag    MakeTag( {EnamexOrgEdu} ) ;
! Category-level tag patterns.  The tag name is the category prefix
! followed by any run of letters, so each pattern accepts every
! subtype of its category.
Define PrsTag    MakeTag( {EnamexPrs} Alpha* ) ;
Define LocTag    MakeTag( {EnamexLoc} Alpha* ) ;
Define OrgTag    MakeTag( {EnamexOrg} Alpha* ) ;
Define EvtTag    MakeTag( {EnamexEvt} Alpha* ) ;
Define ProTag    MakeTag( {EnamexPro} Alpha* ) ;
! Subtype-specific list completion.  Each rule looks for neighbours
! carrying one exact subtype tag and closes the newly found item with
! that same subtype.

! -- persons
Define CompleteListPrsHum
    CompleteList(PrsHumTag) EndTag(EnamexPrsHum) ;
Define CompleteListPrsMyt
    CompleteList(PrsMytTag) EndTag(EnamexPrsMyt) ;

! -- organizations
Define CompleteListOrgCrp
    CompleteList(OrgCrpTag) EndTag(EnamexOrgCrp) ;
Define CompleteListOrgAth
    CompleteList(OrgAthTag) EndTag(EnamexOrgAth) ;
Define CompleteListOrgFin
    CompleteList(OrgFinTag) EndTag(EnamexOrgFin) ;
Define CompleteListOrgEdu
    CompleteList(OrgEduTag) EndTag(EnamexOrgEdu) ;
Define CompleteListOrgPlt
    CompleteList(OrgPltTag) EndTag(EnamexOrgPlt) ;

! -- locations
Define CompleteListLocGpl
    CompleteList(LocGplTag) EndTag(EnamexLocGpl) ;
Define CompleteListLocPpl
    CompleteList(LocPplTag) EndTag(EnamexLocPpl) ;
Define CompleteListLocAst
    CompleteList(LocAstTag) EndTag(EnamexLocAst) ;
Define CompleteListLocFnc
    CompleteList(LocFncTag) EndTag(EnamexLocFnc) ;
Define CompleteListLocMyt
    CompleteList(LocMytTag) EndTag(EnamexLocMyt) ;
! Category-level list completion.  The neighbours may carry any
! subtype of the category; the newly found item is closed with one
! fixed subtype per category:
!   organization -> EnamexOrgCrp    location -> EnamexLocPpl
!   person       -> EnamexPrsHum    product  -> EnamexProXxx
!   event        -> EnamexEvtXxx
Define CompleteListOrg
    CompleteList(OrgTag) EndTag(EnamexOrgCrp) ;
Define CompleteListLoc
    CompleteList(LocTag) EndTag(EnamexLocPpl) ;
Define CompleteListPrs
    CompleteList(PrsTag) EndTag(EnamexPrsHum) ;
Define CompleteListPro
    CompleteList(ProTag) EndTag(EnamexProXxx) ;
Define CompleteListEvt
    CompleteList(EvtTag) EndTag(EnamexEvtXxx) ;
! AbbrInParenthesesOrg: a parenthesised word directly after an
! organization.  The left context is a token carrying any
! organization tag followed by a left parenthesis; the right context
! is a right parenthesis.  The matched word form starts with one or
! more uppercase letters and is closed with the EnamexOrgCrp tag.
! The word pattern refers to the name OrgCpt, which OrgCaptured uses
! again (presumably the matched form is stored under that name for
! later look-up - confirm against the included definitions).
Define AbbrInParenthesesOrg
    LC( OrgTag FWSep lemma_exact( LPar ) FWSep )
    wordform_exact( AlphaUp+ Field Capture(OrgCpt) )
    RC( FWSep lemma_exact( RPar ) )
    EndTag(EnamexOrgCrp) ;
! AbbrInParenthesesPro: a parenthesised word directly after a
! product.  The left context is a token carrying any product tag
! followed by a left parenthesis; the right context is a right
! parenthesis.  The matched word form starts with one or more
! uppercase letters and is closed with the EnamexProXxx tag.
! The word pattern refers to the name ProCpt, which ProCaptured uses
! again (presumably the matched form is stored under that name for
! later look-up - confirm against the included definitions).
Define AbbrInParenthesesPro
    LC( ProTag FWSep lemma_exact( LPar ) FWSep )
    wordform_exact( AlphaUp+ Field Capture(ProCpt) )
    RC( FWSep lemma_exact( RPar ) )
    EndTag(EnamexProXxx) ;
! OrgCaptured / ProCaptured: further occurrences of a form referred to
! as OrgCpt / ProCpt by the parenthesised-abbreviation rules, closed
! with the EnamexOrgCrp / EnamexProXxx tag respectively.
! The optional inflection part is a colon followed by ONE OR MORE
! lowercase letters, so that multi-letter endings (e.g. :ssa, :lle,
! :hun) are accepted in addition to single-letter ones (e.g. :n).
! The two definitions are kept parallel on purpose.
Define OrgCaptured
    wordform_exact( OrgCpt (":" AlphaDown+ ) )
    EndTag(EnamexOrgCrp) ;
Define ProCaptured
    wordform_exact( ProCpt (":" AlphaDown+ ) )
    EndTag(EnamexProXxx) ;
! InQPro: a quoted stretch of text treated as a product name.
! Two parallel alternatives, one delimited by apostrophe tokens and one
! by double-quote tokens: an opening quote token, a separator sequence,
! an uppercase letter, one or more repetitions of the inner pattern
! (which excludes the delimiting quote character as its first symbol),
! and a closing quote token of the same kind.
! The whole match is closed with the EnamexProXxx1 tag.
! NOTE(review): the tag name ends in 1, unlike the EnamexProXxx used by
! the other product rules in this file - presumably a marker resolved
! by a later processing stage; confirm.
Define InQPro [
[ wordform_exact(Apostr) FWSep AlphaUp [ [ ? - Apostr ] Word FSep FWSep ]+ wordform_exact(Apostr) ] |
[ wordform_exact(DoubleQuote) FWSep AlphaUp [ [ ? - DoubleQuote ] Word FSep FWSep ]+ wordform_exact(DoubleQuote) ]
] EndTag(EnamexProXxx1) ;
! ProQuoteAndQuote: continuation of a list of quoted product names.
! Left context: a quote token that carries a tag whose name contains
! the string Pro, with a slash either before or after the name.
! Match: zero or more groups of comma plus quoted item (InQPro),
! then a final group introduced by a comma, "ja" or "sekä" and ending
! in a quoted item.
Define ProQuoteAndQuote
    LC( Quote FSep Word FSep
        "<" [ Alpha* {Pro} Alpha* "/" | "/" Alpha* {Pro} Alpha* ] ">"
        FWSep )
    [ lemma_exact( Comma ) FWSep InQPro FWSep ]*
    [ lemma_exact( Comma | {ja} | {sekä} ) FWSep InQPro ] ;
!* HEAD
! Expand: union of all expansion rules of this phase.
! The category-level list rules carry a weight of 0.25 while the
! subtype-specific ones are unweighted.  The order of alternatives is
! kept as it was.
Define Expand [
      CompleteListPrsHum | CompleteListPrsMyt
    | CompleteListOrgCrp | CompleteListOrgAth | CompleteListOrgFin
    | CompleteListOrgEdu | CompleteListOrgPlt
    | CompleteListLocGpl | CompleteListLocPpl | CompleteListLocAst
    | CompleteListLocFnc | CompleteListLocMyt
    | CompleteListOrg::0.25 | CompleteListLoc::0.25
    | CompleteListPrs::0.25 | CompleteListPro::0.25
    | CompleteListEvt::0.25
    | AbbrInParenthesesOrg | AbbrInParenthesesPro
    | OrgCaptured | ProCaptured
    | ProQuoteAndQuote
    ] ;
! PersTitleStr: strings accepted as person titles.  These are the
! entries of gaz/gPersTitle.txt with an arbitrary prefix in front,
! minus any string that ends in one of the five excluded words below.
Define PersTitleStr
    [ [ Field @txt"gaz/gPersTitle.txt" ]
      -
      [ Field [ {digiassistentti}
              | {laitetoimittaja}
              | {järjestelmätoimittaja}
              | {markkinajohtaja}
              | {syöjätär}
              ] ]
    ] ;
! TitleAdj: modifier that may precede a title; matched on the lemma.
! Members: johtava, vastaava, vt., operatiivinen, entinen.
Define TitleAdj
    lemma_exact(
        {johtava} | {vastaava} | {vt.} | {operatiivinen} | {entinen} ) ;
! PersTitle1: a person title expressed by a single title token.
! Alternative (a): a token whose lemma is in PersTitleStr, restricted
! by the singular-number argument and with essive and translative case
! forms subtracted; it may be preceded by a TitleAdj modifier and/or by
! a prefix token plus "ja" (presumably a truncated compound part -
! confirm against the included definitions).
! Alternative (b): a token whose lemma is listed in
! gaz/gPersTitleAbbr.txt.
! NOTE(review): the singular restriction is spelled differently here
! than in PersTitle2 - confirm that both spellings are accepted by the
! morphology-matching helper.
Define PersTitle1
[ ( TitleAdj FWSep )
( TruncPfx FWSep wordform_exact({ja}) FWSep )
[ lemma_exact_morph( PersTitleStr, {[NUM=SG]}) - morphtag({CASE=ESS}|{CASE=TRA}) ] |
lemma_exact( @txt"gaz/gPersTitleAbbr.txt" ) ] ;
! PersTitle2: a two-token academic title.
! Core: a lowercase word form ending in one of -iikan, -sofian,
! -logian, -tieteen, -emian, -tutkimuksen, -nomian, followed by a token
! whose lemma is opiskelija, kandidaatti, maisteri, dosentti, tohtori
! or professori, in the singular and not in essive or translative case.
! Optional material in front of the core, in this order:
!   1. another word form with one of the same endings, plus "ja";
!   2. a prefix token plus "ja" (presumably a truncated compound
!      part - confirm against the included definitions).
! The list of endings appears twice and must be kept in sync by hand.
Define PersTitle2
( wordform_ends( AlphaDown+ [ {iikan} | {sofian} | {logian} | {tieteen} |
{emian} | {tutkimuksen} | {nomian} ] ) FWSep wordform_exact({ja}) FWSep )
( TruncPfx FWSep wordform_exact({ja}) FWSep )
wordform_ends( AlphaDown+ [ {iikan} | {sofian} | {logian} | {tieteen} |
{emian} | {tutkimuksen} | {nomian} ] ) FWSep
lemma_exact_morph( {opiskelija} | {kandidaatti} | {maisteri} | {dosentti} | {tohtori} | {professori}, [ Field {NUM=SG} Field ] - [ Field [{CASE=ESS}|{CASE=TRA}] Field ]) ;
! PersTitle3: fixed two-token titles.  The first token of each
! alternative goes through the optional-capitalisation helper
! (presumably allowing an initial capital - confirm).
!   hallituksen / johtoryhmän   + puheenjohtaja
!   luova                       + johtaja
!   tasavallan / istuva         + presidentti
!   teollinen / graafinen       + muotoilija / suunnittelija
!   stand(-)up                  + (-)koomikko
Define PersTitle3
    [ wordform_exact(OptCap({hallituksen}|{johtoryhmän})) FWSep
      lemma_exact({puheenjohtaja}) ]
  | [ lemma_exact(OptCap({luova})) FWSep
      lemma_exact({johtaja}) ]
  | [ wordform_exact(OptCap({tasavallan}|{istuva})) FWSep
      lemma_exact({presidentti}) ]
  | [ lemma_exact(OptCap({teollinen}|{graafinen})) FWSep
      lemma_exact({muotoilija}|{suunnittelija}) ]
  | [ wordform_exact(OptCap({stand} (Dash) {up})) FWSep
      lemma_exact((Dash) {koomikko}) ] ;
! PersTitle: any one of the three title patterns (PersTitle1,
! PersTitle2, PersTitle3), closed with the EnamexPrsTit1 tag.
Define PersTitle
    [ Ins(PersTitle1)
    | Ins(PersTitle2)
    | Ins(PersTitle3)
    ] EndTag(EnamexPrsTit1) ;
! PersTitleRule: a title directly in front of a person name.
! Matches one PersTitle, optionally preceded by another PersTitle and
! "ja", provided that the right context is a token carrying any
! person tag.
Define PersTitleRule
    ( Ins(PersTitle) FWSep wordform_exact({ja}) FWSep )
    Ins(PersTitle)
    RC( FWSep PrsTag ) ;
!----------------------------------------------------------------------
! Exceptions
!----------------------------------------------------------------------
!----------------------------------------------------------------------
! TOP: Main entry of the recognizer
!----------------------------------------------------------------------
! TOP: entry point.  Applies either the expansion rules or the
! title rule; a match must start at a word boundary and be followed
! by optional field separators and a word boundary.
Define TOP
    LC( WordBoundary )
    [ Expand | PersTitleRule ]
    RC( FSep* WordBoundary ) ;