From c9af95cc9eb2bf27ce98047cd4a17f036cc2add2 Mon Sep 17 00:00:00 2001 From: Luigi Dell'Aquila Date: Tue, 23 Jul 2024 17:54:04 +0200 Subject: [PATCH] Fix Dissect with leading non-ascii characters (#111184) (#111196) Co-authored-by: Elastic Machine --- docs/changelog/111184.yaml | 5 +++++ .../org/elasticsearch/dissect/DissectParser.java | 2 +- .../elasticsearch/dissect/DissectParserTests.java | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/111184.yaml diff --git a/docs/changelog/111184.yaml b/docs/changelog/111184.yaml new file mode 100644 index 0000000000000..5ecdba54b09be --- /dev/null +++ b/docs/changelog/111184.yaml @@ -0,0 +1,5 @@ +pr: 111184 +summary: Fix Dissect with leading non-ascii characters +area: Ingest Node +type: bug +issues: [] diff --git a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java index f3f53f1b3c5ea..3c01e490369de 100644 --- a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java +++ b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java @@ -203,7 +203,7 @@ public Map parse(String inputString) { DissectKey key = dissectPair.key(); byte[] delimiter = dissectPair.delimiter().getBytes(StandardCharsets.UTF_8); // start dissection after the first delimiter - int i = leadingDelimiter.length(); + int i = leadingDelimiter.getBytes(StandardCharsets.UTF_8).length; int valueStart = i; int lookAheadMatches; // start walking the input string byte by byte, look ahead for matches where needed diff --git a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java index 431b26fc1155d..2893e419a84a3 100644 --- a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java +++ b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java @@ -211,6 +211,18 @@ public void testMatchUnicode() { assertMatch("%{a->}࿏%{b}", "⟳༒࿏࿏࿏࿏࿏༒⟲", Arrays.asList("a", "b"), Arrays.asList("⟳༒", "༒⟲")); assertMatch("%{*a}࿏%{&a}", "⟳༒࿏༒⟲", Arrays.asList("⟳༒"), Arrays.asList("༒⟲")); assertMatch("%{}࿏%{a}", "⟳༒࿏༒⟲", Arrays.asList("a"), Arrays.asList("༒⟲")); + assertMatch( + "Zürich, the %{adjective} city in Switzerland", + "Zürich, the largest city in Switzerland", + Arrays.asList("adjective"), + Arrays.asList("largest") + ); + assertMatch( + "Zürich, the %{one} city in Switzerland; Zürich, the %{two} city in Switzerland", + "Zürich, the largest city in Switzerland; Zürich, the LARGEST city in Switzerland", + Arrays.asList("one", "two"), + Arrays.asList("largest", "LARGEST") + ); } public void testMatchRemainder() {