diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index e03c4780f..5e648152a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -28,13 +28,13 @@ import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter; /** * This processor is used for user input data text chunking. - * The chunking results could be fed to downstream embedding processor, - * algorithm defines chunking algorithm and parameters, + * The chunking results could be fed to downstream embedding processor. + * The processor needs two fields: algorithm and field_map, + * where algorithm defines chunking algorithm and parameters, * and field_map specifies which fields needs chunking and the corresponding keys for the chunking results. 
*/ public final class TextChunkingProcessor extends AbstractProcessor { @@ -117,8 +117,7 @@ private void parseAlgorithmMap(final Map algorithmMap) { // fixed token length algorithm needs analysis registry for tokenization chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); - validatePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); - this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); + this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); } @SuppressWarnings("unchecked") diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index af8290f7e..da67c91d0 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -13,16 +13,9 @@ */ public interface Chunker { - /** - * Validate the parameters for chunking algorithm, - * will throw IllegalArgumentException when parameters are invalid - * - * @param parameters a map containing non-runtime parameters for chunking algorithms - */ - void validateParameters(Map parameters); - /** * Parse the parameters for chunking algorithm. + * Throw IllegalArgumentException when parameters are invalid. * The parameters must be validated before parsing. 
* * @param parameters a map containing non-runtime parameters for chunking algorithms diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index 5fb5d6666..d9a0e75ba 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -4,47 +4,80 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; +import java.util.Locale; import java.util.Map; /** * Parse the parameter for text chunking processor and algorithms. - * The parameter must be validated before parsing. + * Throw IllegalArgumentException when parameters are invalid. */ public class ChunkerParameterParser { /** - * Parse string type parameter + * Parse string type parameter. + * Throw IllegalArgumentException if parameter is not a string or empty. */ public static String parseStringParameter(final Map parameters, final String fieldName, final String defaultValue) { if (!parameters.containsKey(fieldName)) { + // all string parameters are optional return defaultValue; } - return parameters.get(fieldName).toString(); + Object fieldValue = parameters.get(fieldName); + if (!(fieldValue instanceof String)) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName()) + ); + } + if (StringUtils.isEmpty(fieldValue.toString())) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName)); + } + return fieldValue.toString(); } /** - * Parse integer type parameter + * Parse Integer type parameter with positive value. + * Throw IllegalArgumentException if parameter is not a positive integer. 
*/ - public static int parseIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { + public static int parsePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { if (!parameters.containsKey(fieldName)) { // all chunking algorithm parameters are optional return defaultValue; } + int fieldValueInt; String fieldValueString = parameters.get(fieldName).toString(); - return NumberUtils.createInteger(fieldValueString); + try { + fieldValueInt = NumberUtils.createInteger(fieldValueString); + } catch (Exception e) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName()) + ); + } + // some parameters have a negative default value, indicating that the parameter is not effective + if (fieldValueInt != defaultValue && fieldValueInt <= 0) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); + } + return fieldValueInt; } /** - * parse double type parameter + * Parse double type parameter. + * Throw IllegalArgumentException if parameter is not a double. 
*/ public static double parseDoubleParameter(final Map parameters, final String fieldName, final double defaultValue) { if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional + // all double parameters are optional return defaultValue; } String fieldValueString = parameters.get(fieldName).toString(); - return NumberUtils.createDouble(fieldValueString); + try { + return NumberUtils.createDouble(fieldValueString); + } catch (Exception e) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Double.class.getName()) + ); + } } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java deleted file mode 100644 index abf37de0f..000000000 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ -package org.opensearch.neuralsearch.processor.chunker; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.math.NumberUtils; - -import java.util.Map; -import java.util.Locale; - -/** - * Validate the parameter for text chunking processor and algorithms - */ -public class ChunkerParameterValidator { - - /** - * Validate string type parameter - */ - public static void validateStringParameter(final Map parameters, final String fieldName, final boolean allowEmpty) { - if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional - return; - } - Object fieldValue = parameters.get(fieldName); - if (!(fieldValue instanceof String)) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName()) - ); - } - if (!allowEmpty && 
StringUtils.isEmpty(fieldValue.toString())) { - throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName)); - } - } - - /** - * Validate integer type parameter with positive value - */ - public static void validatePositiveIntegerParameter( - final Map parameters, - final String fieldName, - final int defaultValue - ) { - if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional - return; - } - String fieldValueString = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValueString))) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Number.class.getName()) - ); - } - int fieldValueInt = NumberUtils.createInteger(fieldValueString); - // sometimes the parameter has negative default value, indicating that this parameter is not effective - if (fieldValueInt != defaultValue && fieldValueInt <= 0) { - throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); - } - } - - /** - * Validate double type parameter within range [lowerBound, upperBound] - */ - public static void validateDoubleParameterWithinRange( - final Map parameters, - final String fieldName, - final double lowerBound, - final double upperBound - ) { - if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional - return; - } - String fieldValueString = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValueString))) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Number.class.getName()) - ); - } - double fieldValueDouble = NumberUtils.createDouble(fieldValueString); - if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", 
fieldName, lowerBound, upperBound) - ); - } - } -} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 168936a8d..0008fad21 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -9,7 +9,6 @@ import java.util.ArrayList; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter; /** * The implementation {@link Chunker} for delimiter algorithm @@ -23,28 +22,15 @@ public class DelimiterChunker implements Chunker { private String delimiter; public DelimiterChunker(final Map parameters) { - validateParameters(parameters); parseParameters(parameters); } /** - * Validate the parameters for delimiter algorithm, - * will throw IllegalArgumentException if delimiter is not a string or empty + * Parse the parameters for delimiter algorithm. + * Throw IllegalArgumentException if delimiter is not a string or empty. * * @param parameters a map containing parameters, containing the following parameters - * 1. A string as the paragraph split indicator - */ - @Override - public void validateParameters(Map parameters) { - validateStringParameter(parameters, DELIMITER_FIELD, false); - } - - /** - * Parse the parameters for delimiter algorithm, - * will throw IllegalArgumentException if delimiter is not a string or empty - * - * @param parameters a map containing parameters, containing the following parameters - * 1. A string as the paragraph split indicator + * 1. 
delimiter A string as the paragraph split indicator */ @Override public void parseParameters(Map parameters) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 145484b79..c21fcf12b 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -17,10 +17,7 @@ import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateDoubleParameterWithinRange; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter; /** * The implementation {@link Chunker} for fixed token length algorithm. 
@@ -62,35 +59,44 @@ public class FixedTokenLengthChunker implements Chunker { private final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(final Map parameters) { - validateParameters(parameters); parseParameters(parameters); this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD); } /** - * Validate the parameters for fixed token length algorithm, - * will throw IllegalArgumentException when parameters are invalid + * Parse the parameters for fixed token length algorithm. + * Throw IllegalArgumentException when parameters are invalid. * - * @param parameters a map containing non-runtime parameters as the following: - * 1. tokenizer: the analyzer tokenizer in opensearch + * @param parameters a map containing non-runtime parameters as the following: + * 1. tokenizer: the word tokenizer in opensearch * 2. token_limit: the token limit for each chunked passage * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage * Here are requirements for parameters: - * max_token_count and token_limit should be a positive integer - * overlap_rate should be within range [0, 0.5] - * tokenizer should be string + * 1. token_limit must be a positive integer + * 2. overlap_rate must be within range [0, 0.5] + * 3. 
tokenizer must be a word tokenizer */ @Override - public void validateParameters(Map parameters) { - validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); - validateDoubleParameterWithinRange(parameters, OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND); - validateStringParameter(parameters, TOKENIZER_FIELD, false); - String tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); + public void parseParameters(Map parameters) { + this.tokenLimit = parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); + this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE); + this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); + if (overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Parameter [%s] must be between %s and %s", + OVERLAP_RATE_FIELD, + OVERLAP_RATE_LOWER_BOUND, + OVERLAP_RATE_UPPER_BOUND + ) + ); + } if (!WORD_TOKENIZERS.contains(tokenizer)) { throw new IllegalArgumentException( String.format( Locale.ROOT, - "tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s", + "Tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s", tokenizer, ALGORITHM_NAME, WORD_TOKENIZERS @@ -100,33 +106,19 @@ public void validateParameters(Map parameters) { } /** - * Parse the parameters for fixed token length algorithm, - * will throw IllegalArgumentException when parameters are invalid - * - * @param parameters a map non-runtime parameters as the following: - * 1. tokenizer: the word tokenizer in opensearch - * 2. token_limit: the token limit for each chunked passage - * 3. 
overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - */ - @Override - public void parseParameters(Map parameters) { - this.tokenLimit = parseIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); - this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE); - this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); - } - - /** - * Return the chunked passages for fixed token length algorithm + * Return the chunked passages for fixed token length algorithm. + * Throw IllegalArgumentException when runtime parameters are invalid. * * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * 1. max_token_count the max token limit for the tokenizer + * Here are requirements for runtime parameters: + * 1. max_token_count must be a positive integer */ @Override public List chunk(final String content, final Map runtimeParameters) { - // before chunking, validate and parse runtimeParameters - validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); - int maxTokenCount = parseIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); + // parse runtimeParameters before chunking + int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); List tokens = tokenize(content, tokenizer, maxTokenCount); List chunkResult = new ArrayList<>(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 7bba1e73b..979228366 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ 
b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -63,7 +63,7 @@ public Map> getTokeniz } public void testValidateAndParseParameters_whenNoParams_thenSuccessful() { - fixedTokenLengthChunker.validateParameters(Map.of()); + fixedTokenLengthChunker.parseParameters(Map.of()); } public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() { @@ -71,10 +71,10 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKEN_LIMIT_FIELD, Number.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKEN_LIMIT_FIELD, Integer.class.getName()), illegalArgumentException.getMessage() ); } @@ -84,7 +84,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be positive.", TOKEN_LIMIT_FIELD), @@ -97,10 +97,10 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", 
OVERLAP_RATE_FIELD, Number.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", OVERLAP_RATE_FIELD, Double.class.getName()), illegalArgumentException.getMessage() ); } @@ -110,7 +110,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail( parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5), @@ -123,7 +123,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKENIZER_FIELD, String.class.getName()), @@ -136,10 +136,10 @@ public void testValidateAndParseParameters_whenUnsupportedTokenizer_thenFail() { Map parameters = Map.of(TOKENIZER_FIELD, ngramTokenizer); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assert (illegalArgumentException.getMessage() - .contains(String.format(Locale.ROOT, "tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); + .contains(String.format(Locale.ROOT, "Tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); } public void testChunk_withTokenLimit_10() {