diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index affc3624a..0d9d26f42 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -31,7 +31,7 @@
 import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.Chunker.DEFAULT_MAX_CHUNK_LIMIT;
 import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT;
-import static org.opensearch.neuralsearch.processor.chunker.Chunker.STRING_TOBE_CHUNKED_FIELD;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
 
 /**
@@ -172,10 +172,10 @@ public IngestDocument execute(final IngestDocument ingestDocument) {
         // fixed token length algorithm needs runtime parameter max_token_count for tokenization
         Map<String, Object> runtimeParameters = new HashMap<>();
         int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap);
-        int stringTobeChunkedCount = getStringTobeChunkedCountFromMap(sourceAndMetadataMap, fieldMap);
+        int chunkStringCount = getChunkStringCountFromMap(sourceAndMetadataMap, fieldMap);
         runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
         runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
-        runtimeParameters.put(STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount);
+        runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount);
         chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters);
         return ingestDocument;
     }
@@ -234,8 +234,8 @@ private void validateListTypeValue(final String sourceKey, final Object sourceVa
     }
 
     @SuppressWarnings("unchecked")
-    private int getStringTobeChunkedCountFromMap(Map<String, Object> sourceAndMetadataMap, final Map<String, Object> fieldMap) {
-        int stringTobeChunkedCount = 0;
+    private int getChunkStringCountFromMap(Map<String, Object> sourceAndMetadataMap, final Map<String, Object> fieldMap) {
+        int chunkStringCount = 0;
         for (Map.Entry<String, Object> fieldMapEntry : fieldMap.entrySet()) {
             String originalKey = fieldMapEntry.getKey();
             Object targetKey = fieldMapEntry.getValue();
@@ -246,14 +246,14 @@ private int getStringTobeChunkedCountFromMap(Map<String, Object> sourceAndMetada
                     List<Object> sourceObjectList = (List<Object>) sourceObject;
                     for (Object source : sourceObjectList) {
                         if (source instanceof Map) {
-                            stringTobeChunkedCount += getStringTobeChunkedCountFromMap(
+                            chunkStringCount += getChunkStringCountFromMap(
                                 (Map<String, Object>) source,
                                 (Map<String, Object>) targetKey
                             );
                         }
                     }
                 } else if (sourceObject instanceof Map) {
-                    stringTobeChunkedCount += getStringTobeChunkedCountFromMap(
+                    chunkStringCount += getChunkStringCountFromMap(
                         (Map<String, Object>) sourceObject,
                         (Map<String, Object>) targetKey
                     );
                 }
@@ -261,14 +261,14 @@ private int getStringTobeChunkedCountFromMap(Map<String, Object> sourceAndMetada
             } else {
                 // chunk the object when target key is of leaf type (null, string and list of string)
                 Object chunkObject = sourceAndMetadataMap.get(originalKey);
-                stringTobeChunkedCount += getStringTobeChunkedCountFromLeafType(chunkObject);
+                chunkStringCount += getChunkStringCountFromLeafType(chunkObject);
             }
         }
-        return stringTobeChunkedCount;
+        return chunkStringCount;
     }
 
     @SuppressWarnings("unchecked")
-    private int getStringTobeChunkedCountFromLeafType(final Object value) {
+    private int getChunkStringCountFromLeafType(final Object value) {
         // leaf type means null, String or List<String>
         // the result should be an empty list when the input is null
         if (value instanceof String) {
@@ -319,9 +319,9 @@ private List<String> chunkString(final String content, final Map<String, Object
             return List.of();
         }
         List<String> contentResult = chunker.chunk(content, runTimeParameters);
-        // update string_tobe_chunked_count for each string
-        int stringTobeChunkedCount = parseIntegerParameter(runTimeParameters, STRING_TOBE_CHUNKED_FIELD, 1);
-        runTimeParameters.put(STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount - 1);
+        // update chunk_string_count for each string
+        int chunkStringCount = parseIntegerParameter(runTimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
+        runTimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount - 1);
         // update runtime max_chunk_limit if not disabled
         int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
         if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java
index 4a651f7df..3fa2eeb7c 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java
@@ -14,7 +14,7 @@ public interface Chunker {
     String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";
-    String STRING_TOBE_CHUNKED_FIELD = "string_tobe_chunked_count";
+    String CHUNK_STRING_COUNT_FIELD = "chunk_string_count";
     int DEFAULT_MAX_CHUNK_LIMIT = 100;
     int DISABLED_MAX_CHUNK_LIMIT = -1;
 
@@ -41,9 +41,9 @@ public interface Chunker {
      *
      * @param chunkResultSize the size of chunking result
      * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize
-     * @param stringTobeChunkedCount runtime string_tobe_chunked_count, used to check with chunkResultSize
+     * @param chunkStringCount runtime chunk_string_count, used to check with chunkResultSize
      */
-    static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int stringTobeChunkedCount) {
-        return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + stringTobeChunkedCount >= runtimeMaxChunkLimit;
+    static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) {
+        return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit;
     }
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
index 86d61fd31..fe2418ee8 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
@@ -49,19 +49,19 @@ public void parseParameters(Map<String, Object> parameters) {
      * @param content input string
      * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
      * 1. max_chunk_limit field level max chunk limit
-     * 2. string_tobe_chunked_count number of non-empty strings (including itself) which need to be chunked later
+     * 2. chunk_string_count number of non-empty strings (including itself) which need to be chunked later
      */
     @Override
     public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
         int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
-        int stringTobeChunkedCount = parseIntegerParameter(runtimeParameters, STRING_TOBE_CHUNKED_FIELD, 1);
+        int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
 
         List<String> chunkResult = new ArrayList<>();
         int start = 0, end;
         int nextDelimiterPosition = content.indexOf(delimiter);
 
         while (nextDelimiterPosition != -1) {
-            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, stringTobeChunkedCount)) {
+            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) {
                 break;
             }
             end = nextDelimiterPosition + delimiter.length();
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index 055b9a729..276e41ac7 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -117,13 +117,13 @@ public void parseParameters(Map<String, Object> parameters) {
      * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
      * 1. max_token_count the max token limit for the tokenizer
      * 2. max_chunk_limit field level max chunk limit
-     * 3. string_tobe_chunked_count number of non-empty strings (including itself) which need to be chunked later
+     * 3. chunk_string_count number of non-empty strings (including itself) which need to be chunked later
      */
     @Override
     public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
         int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
         int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, this.maxChunkLimit);
-        int stringTobeChunkedCount = parseIntegerParameter(runtimeParameters, STRING_TOBE_CHUNKED_FIELD, 1);
+        int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
 
         List<AnalyzeToken> tokens = tokenize(content, tokenizer, maxTokenCount);
         List<String> chunkResult = new ArrayList<>();
@@ -139,7 +139,7 @@ public List<String> chunk(final String content, final Map<String, Object> runtim
             } else {
                 startContentPosition = tokens.get(startTokenIndex).getStartOffset();
             }
-            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, stringTobeChunkedCount)) {
+            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) {
                 // include all characters till the end if exceeds max chunk limit
                 chunkResult.add(content.substring(startContentPosition));
                 break;
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
index cb25f2621..e4c2a5c05 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
@@ -13,7 +13,7 @@
 import org.opensearch.test.OpenSearchTestCase;
 
 import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
-import static org.opensearch.neuralsearch.processor.chunker.Chunker.STRING_TOBE_CHUNKED_FIELD;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD;
 
 public class DelimiterChunkerTests extends OpenSearchTestCase {
@@ -112,10 +112,10 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_withTwoStringsTobeChunked_t
         int maxChunkLimit = 3;
         DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
         String content = "\n\na\n\n\n";
-        int runtimeMaxChunkLimit = 2, stringTobeChunkedCount = 2;
+        int runtimeMaxChunkLimit = 2, chunkStringCount = 2;
         List<String> passages = chunker.chunk(
             content,
-            Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit, STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount)
+            Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit, CHUNK_STRING_COUNT_FIELD, chunkStringCount)
         );
         List<String> expectedPassages = List.of("\n\na\n\n\n");
         assertEquals(expectedPassages, passages);
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
index 527830642..d2a607a5b 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
@@ -24,7 +24,7 @@
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
 import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
-import static org.opensearch.neuralsearch.processor.chunker.Chunker.STRING_TOBE_CHUNKED_FIELD;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ALGORITHM_NAME;
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD;
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD;
@@ -292,7 +292,7 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_thenLastPassageGetConcatena
     }
 
     public void testChunk_whenExceedRuntimeMaxChunkLimit_withOneStringTobeChunked_thenLastPassageGetConcatenated() {
-        int maxChunkLimit = 3, runtimeMaxChunkLimit = 2, stringTobeChunkedCount = 1;
+        int maxChunkLimit = 3, runtimeMaxChunkLimit = 2, chunkStringCount = 1;
         Map<String, Object> parameters = new HashMap<>();
         parameters.put(TOKEN_LIMIT_FIELD, 10);
         parameters.put(TOKENIZER_FIELD, "standard");
@@ -301,7 +301,7 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_withOneStringTobeChunked_th
         Map<String, Object> runtimeParameters = new HashMap<>();
         runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
         runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit);
-        runtimeParameters.put(STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount);
+        runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount);
         String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
         List<String> passages = fixedTokenLengthChunker.chunk(content, runtimeParameters);
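
For context, a minimal standalone sketch (not part of the diff) of how the renamed runtime parameters reach a chunker at ingest time. It mirrors the DelimiterChunkerTests case above; the example class name, delimiter, and content are made up for illustration, and the constructor/field accessibility is assumed to match what the existing tests rely on.

package org.opensearch.neuralsearch.processor.chunker;

import java.util.List;
import java.util.Map;

public class ChunkStringCountExample {
    public static void main(String[] args) {
        // Field-level max_chunk_limit is fixed when the chunker is created.
        DelimiterChunker chunker = new DelimiterChunker(
            Map.of(DelimiterChunker.DELIMITER_FIELD, "\n\n", Chunker.MAX_CHUNK_LIMIT_FIELD, 3)
        );

        // At chunk time the processor also passes two runtime parameters:
        // max_chunk_limit (remaining chunk budget for the document) and
        // chunk_string_count (non-empty strings, including this one, still to be chunked).
        List<String> passages = chunker.chunk(
            "first paragraph\n\nsecond paragraph\n\nthird paragraph",
            Map.of(Chunker.MAX_CHUNK_LIMIT_FIELD, 2, Chunker.CHUNK_STRING_COUNT_FIELD, 2)
        );

        // Chunker.checkRunTimeMaxChunkLimit trips on the first split (0 + 2 >= 2),
        // so the whole input comes back as a single passage, matching the
        // DelimiterChunkerTests expectation shown above.
        System.out.println(passages);
    }
}

The check chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit in Chunker.checkRunTimeMaxChunkLimit effectively reserves one output passage for every string that still has to be chunked, so a single long field cannot consume the whole shared max_chunk_limit budget before the remaining fields are processed.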