Skip to content

Commit

Permalink
Put an empty ^$ after wblank
Browse files Browse the repository at this point in the history
I thought we were good because

```sh
$ echo '^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»' |cg-proc -1ng nob-nno.genprefs.rlx.bin
ikkje «[[tf:i:a]]s\^å[[/]]»
```

worked, but if there's an analysis after the unmerged word blank, cg-proc errors out:

```sh
$ echo '^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å» ^.<sent>/.$' |cg-proc -1ng nob-nno.genprefs.rlx.bin
Error: Word-bound blank was not immediately prior to token on line 0
```

Fair enough, so lt-merge must put an empty ^$ right after each word
blank, so that cg-proc finds a token immediately following it. It seems
like tf-inject finds the right point at which to end it anyway?
  • Loading branch information
unhammer committed Dec 19, 2024
1 parent c7b2eb2 commit 8ebdfc2
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
1 change: 1 addition & 0 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1967,6 +1967,7 @@ FSTProcessor::quoteMerge(InputFile& input, UFILE *output)
if(surface.size() > 0) {
surface += reader.blank;
appendEscaped(surface, reader.wblank);
if(!reader.wblank.empty()) { appendEscaped(surface, "^$"_u); } // otherwise cg-proc will Error wordblank not prior to token
}
else {
// The initial blank should just be output before the merged LU:
Expand Down
8 changes: 4 additions & 4 deletions tests/lt_merge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ class EscapeTest(MergeTest):

class WordblankTest(MergeTest):
# Using r'' to avoid doubling escapes even more:
inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$[[/]]^»/»<rquot><MERGE_END>$']
expectedOutputs = [r'^«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»/«\[\[tf:i:a\]\]ve\\\/ldig\[\[\/\]\]»<MERGED>$']
inputs = [r'^«/«<lquot><MERGE_BEG>$[[tf:i:a]]^ve\/ldig/v<adv>$^»/»<rquot><MERGE_END>$']
expectedOutputs = [r'^«\[\[tf:i:a\]\]\^\$ve\\\/ldig»/«\[\[tf:i:a\]\]\^\$ve\\\/ldig»<MERGED>$']


class SimpleUnmergeTest(MergeTest):
Expand All @@ -52,5 +52,5 @@ class SimpleUnmergeTest(MergeTest):
class EscapedUnmergeTest(MergeTest):
procflags = ['--unmerge']
# Using r'' to avoid doubling escapes even more:
inputs = [r'^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»$']
expectedOutputs = [r'^ikkje<adv>/ikkje$ «[[tf:i:a]]s\^å[[/]]»']
inputs = [r'^ikkje<adv>/ikkje$ ^«\[\[tf:i:a\]\]\^\$s\\\^å»<MERGED>/«\[\[tf:i:a\]\]\^\$s\\\^å»$']
expectedOutputs = [r'^ikkje<adv>/ikkje$ «[[tf:i:a]]^$s\^å»']

0 comments on commit 8ebdfc2

Please sign in to comment.