Skip to content

Commit

Permalink
Merge pull request #1 from xinyual/addDelimiterChunker
Browse files Browse the repository at this point in the history
Add delimiter chunker
  • Loading branch information
xinyual authored Feb 26, 2024
2 parents 084b97b + 0713a25 commit 1088125
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,45 @@
*/
package org.opensearch.neuralsearch.processor.chunker;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Field chunker that splits a piece of text on a caller-supplied delimiter string.
 *
 * <p>Every chunk except possibly the last one ends with the delimiter itself, so
 * concatenating the returned chunks in order reproduces the original content.
 */
public class DelimiterChunker implements IFieldChunker {

    /** Name of the parameter that carries the delimiter string. */
    public static final String DELIMITER_FIELD = "delimiter";

    public DelimiterChunker() {}

    /**
     * Checks that the parameters contain a usable delimiter.
     *
     * @param parameters chunker parameters; must map {@link #DELIMITER_FIELD} to a non-empty string
     * @throws IllegalArgumentException if the delimiter is missing, is not a string, or is empty
     */
    @Override
    public void validateParameters(Map<String, Object> parameters) {
        requireDelimiter(parameters);
    }

    /**
     * Splits {@code content} after every occurrence of the configured delimiter.
     *
     * @param content    text to split
     * @param parameters chunker parameters; must map {@link #DELIMITER_FIELD} to a non-empty string
     * @return the chunks in order of appearance; each chunk keeps its trailing delimiter, and a
     *         final chunk without a delimiter is added only when text remains after the last match
     * @throws IllegalArgumentException if the delimiter is missing, is not a string, or is empty
     */
    @Override
    public List<String> chunk(String content, Map<String, Object> parameters) {
        // Re-validate here: an empty delimiter would otherwise never advance the cursor below.
        String delimiter = requireDelimiter(parameters);
        List<String> chunkResult = new ArrayList<>();
        int start = 0;
        int end = content.indexOf(delimiter);

        while (end != -1) {
            // Keep the delimiter attached to the chunk that precedes it.
            int chunkEnd = end + delimiter.length();
            chunkResult.add(content.substring(start, chunkEnd));
            start = chunkEnd;
            end = content.indexOf(delimiter, start);
        }

        // Trailing text after the last delimiter (or the whole content when no delimiter matched).
        if (start < content.length()) {
            chunkResult.add(content.substring(start));
        }
        return chunkResult;
    }

    /** Extracts the delimiter from the parameters, rejecting missing, non-string and empty values. */
    private static String requireDelimiter(Map<String, Object> parameters) {
        if (!parameters.containsKey(DELIMITER_FIELD)) {
            throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter");
        }
        Object delimiter = parameters.get(DELIMITER_FIELD);
        if (!(delimiter instanceof String)) {
            throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string");
        }
        String value = (String) delimiter;
        if (value.isEmpty()) {
            // String.indexOf("", from) always matches at 'from', which would loop forever in chunk().
            throw new IllegalArgumentException("delimiter parameters should not be empty.");
        }
        return value;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.processor.chunker;

import org.junit.Assert;
import org.junit.Test;

import java.util.List;
import java.util.Map;

import static junit.framework.TestCase.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD;

/**
 * Unit tests for {@link DelimiterChunker}: parameter validation and the exact chunks produced
 * for content with, without, and consisting solely of delimiters.
 */
public class DelimiterChunkerTests {

    /** Validation must fail when the delimiter parameter is absent. */
    @Test
    public void testChunkerWithNoDelimiterField() {
        DelimiterChunker chunker = new DelimiterChunker();
        Map<String, Object> inputParameters = Map.of("", "");
        Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
        Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter", exception.getMessage());
    }

    /** Validation must fail when the delimiter parameter is not a string. */
    @Test
    public void testChunkerWithDelimiterFieldNotString() {
        DelimiterChunker chunker = new DelimiterChunker();
        Map<String, Object> inputParameters = Map.of(DELIMITER_FIELD, List.of(""));
        Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters));
        Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string", exception.getMessage());
    }

    /** Content without a trailing delimiter yields a final chunk that has no delimiter. */
    @Test
    public void testChunker() {
        DelimiterChunker chunker = new DelimiterChunker();
        String content = "a\nb\nc\nd";
        Map<String, Object> inputParameters = Map.of(DELIMITER_FIELD, "\n");
        List<String> chunkResult = chunker.chunk(content, inputParameters);
        assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult);
    }

    /** A trailing delimiter stays attached to the last chunk and adds no empty chunk. */
    @Test
    public void testChunkerWithDelimiterEnd() {
        DelimiterChunker chunker = new DelimiterChunker();
        String content = "a\nb\nc\nd\n";
        Map<String, Object> inputParameters = Map.of(DELIMITER_FIELD, "\n");
        List<String> chunkResult = chunker.chunk(content, inputParameters);
        assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult);
    }

    /** Content that is exactly one delimiter yields that delimiter as the only chunk. */
    @Test
    public void testChunkerWithOnlyDelimiter() {
        DelimiterChunker chunker = new DelimiterChunker();
        String content = "\n";
        Map<String, Object> inputParameters = Map.of(DELIMITER_FIELD, "\n");
        List<String> chunkResult = chunker.chunk(content, inputParameters);
        assertEquals(List.of("\n"), chunkResult);
    }

    /** Consecutive delimiters each become their own chunk. */
    @Test
    public void testChunkerWithAllDelimiters() {
        DelimiterChunker chunker = new DelimiterChunker();
        String content = "\n\n\n";
        Map<String, Object> inputParameters = Map.of(DELIMITER_FIELD, "\n");
        List<String> chunkResult = chunker.chunk(content, inputParameters);
        assertEquals(List.of("\n", "\n", "\n"), chunkResult);
    }
}
17 changes: 17 additions & 0 deletions src/test/resources/processor/DelimiterChunkerProcessor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"description": "An example delimiter chunker pipeline",
"processors" : [
{
"chunking": {
"field_map": {
"body_chunk1": {
"delimiter": {
"delimiter": "\n"
},
"output_field": "body_chunk2"
}
}
}
}
]
}

0 comments on commit 1088125

Please sign in to comment.