From fcc9f0f34014329e2fb75f84489457c09d117ced Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Wed, 23 Feb 2022 14:07:10 +0100 Subject: [PATCH] =?UTF-8?q?Fix=20#129=20=E2=80=93=20don't=20drop=2032-bit?= =?UTF-8?q?=20chars=20down=20to=2016-bit=20in=20postgen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lttoolbox/fst_processor.cc | 10 +++++----- lttoolbox/fst_processor.h | 2 +- tests/lt_proc/__init__.py | 7 +++++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index a370474e..846ffd1b 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -383,7 +383,7 @@ FSTProcessor::readTMAnalysis(InputFile& input) return val; } -int +int32_t FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) { if(!input_buffer.isEmpty()) @@ -903,7 +903,7 @@ FSTProcessor::lastBlank(UString const &str) } void -FSTProcessor::printSpace(UChar const val, UFILE *output) +FSTProcessor::printSpace(UChar32 const val, UFILE *output) { if(blankqueue.size() > 0) { @@ -1803,7 +1803,7 @@ FSTProcessor::postgeneration(InputFile& input, UFILE *output) int last = 0; set empty_escaped_chars; - while(UChar val = readPostgeneration(input, output)) + while(UChar32 val = readPostgeneration(input, output)) { if(val == '~') { @@ -2027,7 +2027,7 @@ FSTProcessor::intergeneration(InputFile& input, UFILE *output) while (true) { - UChar val = readPostgeneration(input, output); + UChar32 val = readPostgeneration(input, output); if (val == '~') { @@ -2165,7 +2165,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output) UString sf; int last = 0; - while(UChar val = readPostgeneration(input, output)) + while(UChar32 val = readPostgeneration(input, output)) { if(u_ispunct(val) || u_isspace(val)) { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 04cc68a0..7cde42e1 100644 --- a/lttoolbox/fst_processor.h +++ 
b/lttoolbox/fst_processor.h @@ -452,7 +452,7 @@ class FSTProcessor * @param val the space character to use if no blank queue * @param output stream where the word is written */ - void printSpace(UChar const val, UFILE *output); + void printSpace(UChar32 const val, UFILE *output); void skipUntil(InputFile& input, UFILE *output, UChar32 const character); static UString removeTags(UString const &str); diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index 7de96a52..22cf1794 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -254,5 +254,12 @@ class AlphabeticMultibyteTest(ProcTest): expectedOutputs = ["^𝜊/*𝜊$"] +class AlphabeticMultibyteTestPost(ProcTest): + procdix = "data/minimal-mono.dix" + inputs = ["𝜊"] # code point >65535, needs four bytes in UTF-8 (two UTF-16 code units), isAlphabetic + procflags = ['-z', '-p'] + expectedOutputs = ["𝜊"] + + # These fail on some systems: #from null_flush_invalid_stream_format import *