From 4c99e93597cd354da97147662f92f1e109f57a42 Mon Sep 17 00:00:00 2001 From: Luigi Dell'Aquila Date: Tue, 23 Jul 2024 11:33:19 +0200 Subject: [PATCH 1/3] Fix Dissect with leading non-ascii characters --- .../org/elasticsearch/dissect/DissectParser.java | 2 +- .../elasticsearch/dissect/DissectParserTests.java | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java index f3f53f1b3c5ea..3c01e490369de 100644 --- a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java +++ b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java @@ -203,7 +203,7 @@ public Map parse(String inputString) { DissectKey key = dissectPair.key(); byte[] delimiter = dissectPair.delimiter().getBytes(StandardCharsets.UTF_8); // start dissection after the first delimiter - int i = leadingDelimiter.length(); + int i = leadingDelimiter.getBytes(StandardCharsets.UTF_8).length; int valueStart = i; int lookAheadMatches; // start walking the input string byte by byte, look ahead for matches where needed diff --git a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java index 431b26fc1155d..57400589f2c1c 100644 --- a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java +++ b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java @@ -211,6 +211,18 @@ public void testMatchUnicode() { assertMatch("%{a->}࿏%{b}", "⟳༒࿏࿏࿏࿏࿏༒⟲", Arrays.asList("a", "b"), Arrays.asList("⟳༒", "༒⟲")); assertMatch("%{*a}࿏%{&a}", "⟳༒࿏༒⟲", Arrays.asList("⟳༒"), Arrays.asList("༒⟲")); assertMatch("%{}࿏%{a}", "⟳༒࿏༒⟲", Arrays.asList("a"), Arrays.asList("༒⟲")); + assertMatch( + "Zürich, the %{adjective} city in Switzerland", + "Zürich, the largest city in Switzerland", + Arrays.asList("adjective"), + Arrays.asList("largest") + ); + assertMatch( + "Zürich, the %{one} city in Switzerland; Zürich, the %{two} city in Switzerland", + "Zürich, the largest city in Switzerland; Zürich, the best city in Switzerland", + Arrays.asList("one", "two"), + Arrays.asList("largest", "best") + ); } public void testMatchRemainder() { From 61dab89c5473f70bd3dbf461c13e680e5847693b Mon Sep 17 00:00:00 2001 From: Luigi Dell'Aquila Date: Tue, 23 Jul 2024 11:40:09 +0200 Subject: [PATCH 2/3] Update docs/changelog/111184.yaml --- docs/changelog/111184.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/111184.yaml diff --git a/docs/changelog/111184.yaml b/docs/changelog/111184.yaml new file mode 100644 index 0000000000000..5ecdba54b09be --- /dev/null +++ b/docs/changelog/111184.yaml @@ -0,0 +1,5 @@ +pr: 111184 +summary: Fix Dissect with leading non-ascii characters +area: Ingest Node +type: bug +issues: [] From f2ef1364af40d176d24968d84b5d883ddb40eb70 Mon Sep 17 00:00:00 2001 From: Luigi Dell'Aquila Date: Tue, 23 Jul 2024 12:54:57 +0200 Subject: [PATCH 3/3] Tests --- .../java/org/elasticsearch/dissect/DissectParserTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java index 57400589f2c1c..2893e419a84a3 100644 --- a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java +++ b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java @@ -219,9 +219,9 @@ public void testMatchUnicode() { ); assertMatch( "Zürich, the %{one} city in Switzerland; Zürich, the %{two} city in Switzerland", - "Zürich, the largest city in Switzerland; Zürich, the best city in Switzerland", + "Zürich, the largest city in Switzerland; Zürich, the LARGEST city in Switzerland", Arrays.asList("one", "two"), - Arrays.asList("largest", "best") + Arrays.asList("largest", "LARGEST") ); }