From e0733f305c83287fdbc3dcf4ff860fa93f0fbb25 Mon Sep 17 00:00:00 2001 From: Max Hniebergall <137079448+maxhniebergall@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:47:26 -0500 Subject: [PATCH] [ML] Fix deberta tokenizer bug caused by bug in normalizer (#117189) (#117254) * Fix deberta tokenizer bug caused by bug in normalizer which caused offsets to be negative * Update docs/changelog/117189.yaml --- docs/changelog/117189.yaml | 5 +++++ .../tokenizers/PrecompiledCharMapNormalizer.java | 2 +- .../nlp/tokenizers/DebertaV2TokenizerTests.java | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/117189.yaml diff --git a/docs/changelog/117189.yaml b/docs/changelog/117189.yaml new file mode 100644 index 0000000000000..e89c2d81506d9 --- /dev/null +++ b/docs/changelog/117189.yaml @@ -0,0 +1,5 @@ +pr: 117189 +summary: Fix deberta tokenizer bug caused by bug in normalizer +area: Machine Learning +type: bug +issues: [] diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java index bbe5bea691c35..5dd7dbbffaa61 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java @@ -194,7 +194,7 @@ Reader normalize(CharSequence str) { if (charDelta < 0) { // normalised form is shorter int lastDiff = getLastCumulativeDiff(); - addOffCorrectMap(normalizedCharPos, lastDiff + charDelta); + addOffCorrectMap(normalizedCharPos, lastDiff - charDelta); } else if (charDelta > 0) { // inserted chars, add the offset in the output stream int lastDiff = getLastCumulativeDiff(); diff --git 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java index bbe509da67452..a8461de8630ae 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java @@ -94,6 +94,20 @@ public void testTokenize() throws IOException { } } + public void testTokenizeWithHiddenControlCharacters() throws IOException { + try ( + DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder( + TEST_CASE_VOCAB, + TEST_CASE_SCORES, + new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1) + ).build() + ) { + TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0); + assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z")); + + } + } + public void testSurrogatePair() throws IOException { try ( DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(