Skip to content

Commit

Permalink
Let lt-proc -b handle special ANY_CHAR tag (<w/> from lsx)
Browse files Browse the repository at this point in the history
  • Loading branch information
unhammer committed Dec 19, 2024
1 parent 716a00f commit 51898cc
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 4 deletions.
11 changes: 10 additions & 1 deletion lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,8 @@ void
FSTProcessor::load(FILE *input)
{
  // Fill alphabetic_chars, alphabet and transducers from the input file.
  readTransducerSet(input, alphabetic_chars, alphabet, transducers);

  // Register the <ANY_CHAR> tag in the alphabet and remember its index,
  // so bilingual() can offer it as an alternative symbol when stepping.
  const auto any_char_tag = "<ANY_CHAR>"_u;
  alphabet.includeSymbol(any_char_tag);
  any_char = alphabet(any_char_tag);
}

void
Expand Down Expand Up @@ -1755,7 +1757,14 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
if (reader.readings[index].mark == '#') current_state.step('#');
for (size_t i = 0; i < symbols.size(); i++) {
seenTags = seenTags || alphabet.isTag(symbols[i]);
current_state.step_case(symbols[i], beCaseSensitive(current_state));
UString source;
alphabet.getSymbol(source, symbols[i]);
if(beCaseSensitive(current_state)) { // allow any_char
current_state.step_override(symbols[i], any_char, symbols[i]);
}
else { // include lower alt
current_state.step_override(symbols[i], towlower(symbols[i]), any_char, symbols[i]);
}
if (current_state.isFinal(all_finals)) {
queue_start = i;
current_state.filterFinalsArray(result,
Expand Down
5 changes: 5 additions & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,11 @@ class FSTProcessor
*/
int maxWeightClasses = INT_MAX;

/**
 * The alphabet index of the tag <ANY_CHAR>.
 * Assigned by load(); zero-initialised so the member never holds an
 * indeterminate value if it is read before load() has run.
 */
int any_char = 0;

/**
* Prints an error of input stream and exits
*/
Expand Down
6 changes: 3 additions & 3 deletions lttoolbox/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ class State

/**
 * Make a transition, but overriding the output symbol
 * @param input symbol read from infile
 * @param old_sym output symbol from the FST
 * @param new_sym output symbol we want to appear in outfile
 */
void apply_override(int const input, int const old_sym, int const new_sym);

Expand Down
20 changes: 20 additions & 0 deletions tests/data/pass-through.lsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Test dictionary for the lt-proc bilingual (b flag) tests: words tagged MERGED are passed through as they are. -->
<dictionary type="separable">
<alphabet></alphabet>
<sdefs>
<sdef n="MERGED"/>
</sdefs>

<pardefs>
<!-- NOTE(review): "foo" is not referenced by the main section below; confirm whether it is still needed. -->
<pardef n="foo">
<e> <i>foo<d/></i> </e>
</pardef>
</pardefs>

<section id="main" type="standard">

<!-- The w element is the lsx any-character symbol (handled as ANY_CHAR by lt-proc), followed here by the MERGED tag. -->
<e c="pass-through MERGED words">
<i><w/><s n="MERGED"/></i>
</e>
</section>
</dictionary>
21 changes: 21 additions & 0 deletions tests/lt_proc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,11 @@ class BiltransGarbage(ProcTest):
inputs = ['^$']
expectedOutputs = ['^$']

# Basic lt-proc -b lookup of a single word. No procdix is set here, so the
# dictionary presumably comes from ProcTest's default; verify in ProcTest.
class BiltransSimple(ProcTest):
procflags = ['-b', '-z']
inputs = ['^abc$']
expectedOutputs = ['^abc/ab<n><def>$']

class SlashesInTags(ProcTest):
procdix = 'data/slash-tags.dix'
procflags = ['-b', '-z']
Expand All @@ -496,5 +501,21 @@ class SlashesInTags(ProcTest):
'^\\*lobwana1.1<n><1/2><a/b>/*lopwana1.1<n><1/2><a/b>$',
'^\\*lobwana1.1<n><3/4><a/b>/@\\*lobwana1.1<n><3/4><a/b>$']

# Checks that lt-proc -b, using the <w/> (ANY_CHAR) entry of
# data/pass-through.lsx, outputs a plain word tagged <MERGED> unchanged.
class BiltransAnyChar(ProcTest):
procdix = 'data/pass-through.lsx'
procflags = ['-b', '-z']
# r'' is not strictly needed here (the input has no backslashes); it is kept
# for consistency with BiltransAnyCharEscapes.
inputs = [r'^simple<MERGED>$']
expectedOutputs = [r'^simple<MERGED>/simple<MERGED>$']


# Same pass-through check as BiltransAnyChar, but the word contains
# backslash-escaped characters (\[ \] \\ \^ \/) and non-ASCII letters; the
# expected output repeats them exactly as they appear in the input.
class BiltransAnyCharEscapes(ProcTest):
procdix = 'data/pass-through.lsx'
procflags = ['-b', '-z']
# Using r'' to avoid doubling escapes even more:
inputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>$']
expectedOutputs = [r'^«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>/«\[\[tf:i:a\]\]s\\\^å\[\[\/\]\]»<MERGED>$']


# These fail on some systems:
#from null_flush_invalid_stream_format import *

0 comments on commit 51898cc

Please sign in to comment.