From e4a8e173a585c9696e3a808258a2e7eb0038b64e Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 26 Feb 2024 14:34:39 +0800 Subject: [PATCH 1/3] add delimiter chunker Signed-off-by: xinyual --- .../processor/chunker/DelimiterChunker.java | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index c9ef5e211..b95ccd088 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -4,20 +4,42 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Scanner; public class DelimiterChunker implements IFieldChunker { public DelimiterChunker() {} + public static String DELIMITER_FIELD = "delimiter"; + + @Override public void validateParameters(Map parameters) { - throw new UnsupportedOperationException("delimiter chunker has not been implemented yet"); + if (parameters.containsKey(DELIMITER_FIELD)) + { + Object delimiter = parameters.get(DELIMITER_FIELD); + if (!(delimiter instanceof String)){ + throw new IllegalArgumentException("delimiter parameters " + delimiter + " must be string"); + } + } + else { + throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter"); + } } @Override public List chunk(String content, Map parameters) { - throw new UnsupportedOperationException("delimiter chunker has not been implemented yet"); + List chunkingResult = new ArrayList<>(); + String delimiter = (String) parameters.get(DELIMITER_FIELD); + Scanner scanner = new Scanner(content); + scanner.useDelimiter(delimiter); + while (scanner.hasNext()) { + String nextChunk = scanner.next(); + chunkingResult.add(nextChunk); + } + return 
chunkingResult; } } From e4eb355611bc4782b4a33d4d24fbf57e609f6181 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 26 Feb 2024 20:15:01 +0800 Subject: [PATCH 2/3] add UT for delimiter chunker Signed-off-by: xinyual --- .../processor/chunker/DelimiterChunker.java | 33 ++++---- .../chunker/DelimiterChunkerTests.java | 84 +++++++++++++++++++ 2 files changed, 102 insertions(+), 15 deletions(-) create mode 100644 src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index b95ccd088..625562a36 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -7,7 +7,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Scanner; public class DelimiterChunker implements IFieldChunker { @@ -15,31 +14,35 @@ public DelimiterChunker() {} public static String DELIMITER_FIELD = "delimiter"; - @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(DELIMITER_FIELD)) - { + if (parameters.containsKey(DELIMITER_FIELD)) { Object delimiter = parameters.get(DELIMITER_FIELD); - if (!(delimiter instanceof String)){ - throw new IllegalArgumentException("delimiter parameters " + delimiter + " must be string"); + if (!(delimiter instanceof String)) { + throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string"); } - } - else { + } else { throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter"); } } @Override public List chunk(String content, Map parameters) { - List chunkingResult = new ArrayList<>(); String delimiter = (String) parameters.get(DELIMITER_FIELD); - Scanner scanner = new Scanner(content); - 
scanner.useDelimiter(delimiter); - while (scanner.hasNext()) { - String nextChunk = scanner.next(); - chunkingResult.add(nextChunk); + List chunkResult = new ArrayList<>(); + int start = 0; + int end = content.indexOf(delimiter); + + while (end != -1) { + chunkResult.add(content.substring(start, end + delimiter.length())); + start = end + delimiter.length(); + end = content.indexOf(delimiter, start); } - return chunkingResult; + + if (start < content.length()) { + chunkResult.add(content.substring(start)); + } + return chunkResult; + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java new file mode 100644 index 000000000..776f96479 --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -0,0 +1,84 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; +import java.util.Map; + +import static junit.framework.TestCase.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; + +public class DelimiterChunkerTests { + + @Test + public void testChunkerWithNoDelimiterField() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of("", ""); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter", exception.getMessage()); + } + + @Test + public void testChunkerWithDelimiterFieldNotString() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = 
Map.of(DELIMITER_FIELD, List.of("")); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string", exception.getMessage()); + } + + @Test + public void testChunker() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(4, chunkResult.size()); + assertEquals(7, cntLength(chunkResult)); + } + + @Test + public void testChunkerWithDelimiterEnd() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(4, chunkResult.size()); + assertEquals(8, cntLength(chunkResult)); + } + + @Test + public void testChunkerWithOnlyDelimiter() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(1, chunkResult.size()); + assertEquals(1, cntLength(chunkResult)); + } + + @Test + public void testChunkerWithAllDelimiters() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "\n\n\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(3, chunkResult.size()); + assertEquals(3, cntLength(chunkResult)); + } + + private int cntLength(List outputs) { + int totalLength = 0; + for (String output : outputs) { + totalLength += output.length(); + } + return totalLength; + } +} From 0713a25f8531d6d4f04b2079402fe067e8ba7311 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 26 Feb 2024 20:16:36 +0800 Subject: [PATCH 3/3] add delimiter chunker processor Signed-off-by: xinyual --- 
{
  "description": "An example delimiter chunker pipeline",
  "processors": [
    {
      "chunking": {
        "field_map": {
          "body_chunk1": {
            "delimiter": {
              "delimiter": "\n"
            },
            "output_field": "body_chunk2"
          }
        }
      }
    }
  ]
}