From 463de71bd0075af4524fe29b4140769193049d90 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 14:24:10 +0800
Subject: [PATCH] update parameter setting of max chunk limit

Signed-off-by: yuye-aws
---
 .../processor/TextChunkingProcessor.java      | 27 ++++++---
 .../chunker/ChunkerParameterParser.java       | 19 ++++--
 .../processor/TextChunkingProcessorTests.java | 60 +++++++++++++------
 3 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index c9bd5e46e..268248867 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -29,7 +29,8 @@ import org.opensearch.index.mapper.IndexFieldMapper;
 import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;
+
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
 
 /**
  * This processor is used for user input data text chunking.
@@ -47,6 +48,7 @@ public final class TextChunkingProcessor extends AbstractProcessor {
 
     static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";
     private static final int DEFAULT_MAX_CHUNK_LIMIT = 100;
+    private static final int DISABLED_MAX_CHUNK_LIMIT = -1;
     private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
 
     private int maxChunkLimit;
@@ -131,7 +133,17 @@ private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
             chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
         }
         this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
-        this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
+        this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
+        if (maxChunkLimit <= 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
+            throw new IllegalArgumentException(
+                String.format(
+                    Locale.ROOT,
+                    "Parameter [%s] must be positive or %s to disable this parameter",
+                    MAX_CHUNK_LIMIT_FIELD,
+                    DISABLED_MAX_CHUNK_LIMIT
+                )
+            );
+        }
     }
 
     @SuppressWarnings("unchecked")
@@ -283,13 +295,14 @@ private int chunkString(final String content, List<String> result, final Map<St
         List<String> contentResult = chunker.chunk(content, runTimeParameters);
         updatedChunkCount += contentResult.size();
-        if (updatedChunkCount > maxChunkLimit) {
-            throw new IllegalArgumentException(
+        if (updatedChunkCount > maxChunkLimit && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
+            throw new IllegalStateException(
                 String.format(
                     Locale.ROOT,
-                    "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]",
-                    updatedChunkCount,
-                    maxChunkLimit
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
+                    TYPE,
+                    maxChunkLimit,
+                    MAX_CHUNK_LIMIT_FIELD
                 )
             );
         }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
index 56916ea34..4ef83e03a 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
@@ -38,23 +38,30 @@ public static String parseStringParameter(final Map<String, Object> parameters,
     }
 
     /**
-     * Parse Integer type parameter with positive value.
-     * Throw IllegalArgumentException if parameter is not a positive integer.
+     * Parse Integer type parameter.
+     * Throw IllegalArgumentException if parameter is not an integer.
      */
-    public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
+    public static int parseIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
         if (!parameters.containsKey(fieldName)) {
-            // all chunking algorithm parameters are optional
+            // all integer parameters are optional
             return defaultValue;
         }
-        int fieldValueInt;
         String fieldValueString = parameters.get(fieldName).toString();
         try {
-            fieldValueInt = NumberUtils.createInteger(fieldValueString);
+            return NumberUtils.createInteger(fieldValueString);
         } catch (Exception e) {
             throw new IllegalArgumentException(
                 String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
             );
         }
+    }
+
+    /**
+     * Parse Integer type parameter with positive value.
+     * Throw IllegalArgumentException if parameter is not a positive integer.
+     */
+    public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
+        int fieldValueInt = parseIntegerParameter(parameters, fieldName, defaultValue);
         if (fieldValueInt <= 0) {
             throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
         }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 239537a52..37b66cef1 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -201,10 +201,21 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() {
             IllegalArgumentException.class,
             () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
-        assertEquals(
-            String.format(Locale.ROOT, "Parameter [%s] must be positive.", MAX_CHUNK_LIMIT_FIELD),
-            illegalArgumentException.getMessage()
-        );
+        assert (illegalArgumentException.getMessage()
+            .contains(String.format(Locale.ROOT, "Parameter [%s] must be positive", MAX_CHUNK_LIMIT_FIELD)));
+    }
+
+    @SneakyThrows
+    public void testCreate_whenMaxChunkNumDisabledValue_thenSucceed() {
+        Map<String, Processor.Factory> registry = new HashMap<>();
+        Map<String, Object> config = new HashMap<>();
+        Map<String, Object> fieldMap = new HashMap<>();
+        Map<String, Object> algorithmMap = new HashMap<>();
+        fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
+        algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(-1));
+        config.put(FIELD_MAP_FIELD, fieldMap);
+        config.put(ALGORITHM_FIELD, algorithmMap);
+        textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
     public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() {
@@ -379,23 +390,36 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT
         }
     }
 
+    @SneakyThrows
+    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumDisabled_thenSucceed() {
+        TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), -1);
+        IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
+        IngestDocument document = processor.execute(ingestDocument);
+        assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
+        Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
+        assert (passages instanceof List);
+        List<String> expectedPassages = new ArrayList<>();
+        expectedPassages.add("This is an example document to be chunked. The document ");
+        expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by ");
+        expectedPassages.add("standard tokenizer in OpenSearch.");
+        assertEquals(expectedPassages, passages);
+    }
+
     @SneakyThrows
     public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() {
-        TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1);
+        int maxChunkLimit = 1;
+        TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
-        IllegalArgumentException illegalArgumentException = assertThrows(
-            IllegalArgumentException.class,
-            () -> processor.execute(ingestDocument)
-        );
-        assertEquals(
-            String.format(
-                Locale.ROOT,
-                "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]",
-                3,
-                1
-            ),
-            illegalArgumentException.getMessage()
-        );
+        IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument));
+        assert (illegalStateException.getMessage()
+            .contains(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
+                    TYPE,
+                    maxChunkLimit
+                )
+            ));
     }
 
     @SneakyThrows
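
Note on the new contract: max_chunk_limit now accepts any positive integer, or -1 (DISABLED_MAX_CHUNK_LIMIT) to disable the check. The ingest pipeline request below is an illustrative sketch, not part of this patch; the pipeline name and the body/body_chunk field names are hypothetical, while text_chunking, field_map, algorithm, fixed_token_length, token_limit and max_chunk_limit are the identifiers used by this processor and its tests above:

PUT _ingest/pipeline/chunking-pipeline
{
  "processors": [
    {
      "text_chunking": {
        "field_map": { "body": "body_chunk" },
        "algorithm": {
          "fixed_token_length": {
            "token_limit": 10,
            "max_chunk_limit": -1
          }
        }
      }
    }
  ]
}

With "max_chunk_limit": -1 the chunk-count check in chunkString is skipped entirely. Any other non-positive value is rejected at pipeline creation with an IllegalArgumentException, and exceeding a positive limit at ingest time now surfaces as an IllegalStateException rather than an IllegalArgumentException, since it reflects the state of the document being chunked rather than an invalid parameter.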