From 89c465c8ca56c77ea1d5736fcd798ab6648894fe Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 12:15:52 +0800 Subject: [PATCH] assign positive default value for max chunk limit Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 10 ++++++---- .../processor/chunker/ChunkerParameterParser.java | 3 +-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 5e648152a..963135e68 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -45,7 +45,7 @@ public final class TextChunkingProcessor extends AbstractProcessor { @VisibleForTesting static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; - private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; + private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME; private int maxChunkLimit; @@ -114,8 +114,10 @@ private void parseAlgorithmMap(final Map algorithmMap) { } Map chunkerParameters = (Map) algorithmValue; - // fixed token length algorithm needs analysis registry for tokenization - chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + if (algorithmKey.equals(FixedTokenLengthChunker.ALGORITHM_NAME)) { + // fixed token length algorithm needs analysis registry for tokenization + chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + } this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); } @@ -269,7 +271,7 @@ private int chunkString(final String content, List result, final Map contentResult = chunker.chunk(content, runTimeParameters); updatedChunkCount += contentResult.size(); - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && updatedChunkCount > maxChunkLimit) { + if (updatedChunkCount > maxChunkLimit) { throw new IllegalArgumentException( String.format( Locale.ROOT, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index d9a0e75ba..56916ea34 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -55,8 +55,7 @@ public static int parsePositiveIntegerParameter(final Map parame String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName()) ); } - // some parameter has negative default value, indicating that this parameter is not effective - if (fieldValueInt != defaultValue && fieldValueInt <= 0) { + if (fieldValueInt <= 0) { throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); } return fieldValueInt;