merge parameter validator into the parser
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 15, 2024
1 parent f3decb4 commit 3b8a3af
Showing 7 changed files with 91 additions and 176 deletions.
TextChunkingProcessor.java
@@ -28,13 +28,13 @@
import org.opensearch.index.mapper.IndexFieldMapper;
import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

/**
* This processor is used for user input data text chunking.
* The chunking results could be fed to downstream embedding processor,
* algorithm defines chunking algorithm and parameters,
* The chunking results could be fed to a downstream embedding processor.
* The processor needs two fields: algorithm and field_map,
* where algorithm defines chunking algorithm and parameters,
* and field_map specifies which fields need chunking and the corresponding keys for the chunking results.
*/
public final class TextChunkingProcessor extends AbstractProcessor {
@@ -117,8 +117,7 @@ private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
// fixed token length algorithm needs analysis registry for tokenization
chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
validatePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
}

@SuppressWarnings("unchecked")
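The call-site change above is the heart of the commit: one parser call now both validates and converts max_chunk_limit. Here is a minimal, self-contained sketch of the merged behavior; the stand-in method, the literal field name "max_chunk_limit", and the default value 100 are assumptions for illustration, not the plugin's actual constants.

```java
import java.util.HashMap;
import java.util.Map;

// Self-contained sketch of the merged validate-and-parse behavior.
// Not the plugin's class: parsePositiveInteger stands in for
// ChunkerParameterParser.parsePositiveIntegerParameter.
public class MergedParseSketch {

    static int parsePositiveInteger(Map<String, Object> parameters, String fieldName, int defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            return defaultValue; // all chunking parameters are optional
        }
        final int value;
        try {
            value = Integer.parseInt(parameters.get(fieldName).toString());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Parameter [" + fieldName + "] must be of java.lang.Integer type");
        }
        // a negative default acts as a sentinel, so only reject non-positive
        // values that differ from it
        if (value != defaultValue && value <= 0) {
            throw new IllegalArgumentException("Parameter [" + fieldName + "] must be positive.");
        }
        return value;
    }

    public static void main(String[] args) {
        Map<String, Object> chunkerParameters = new HashMap<>();
        chunkerParameters.put("max_chunk_limit", "50");
        // one call now replaces the old validate-then-parse pair
        int maxChunkLimit = parsePositiveInteger(chunkerParameters, "max_chunk_limit", 100);
        System.out.println(maxChunkLimit); // prints 50
    }
}
```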
Chunker.java
@@ -13,16 +13,9 @@
*/
public interface Chunker {

/**
* Validate the parameters for chunking algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map containing non-runtime parameters for chunking algorithms
*/
void validateParameters(Map<String, Object> parameters);

/**
* Parse the parameters for chunking algorithm.
* Throw IllegalArgumentException when parameters are invalid.
* The parameters must be validated before parsing.
*
* @param parameters a map containing non-runtime parameters for chunking algorithms
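With validateParameters removed, an implementation of the slimmed-down interface only needs parseParameters and chunk, and its constructor calls parseParameters, which throws on invalid input. A hedged sketch of a toy implementation, assuming it sits in the same package as Chunker; the whitespace-splitting algorithm is invented for illustration, only the interface shape comes from the diff.

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;

// Toy implementation of the reduced interface; not part of the plugin.
public class WhitespaceChunker implements Chunker {

    public WhitespaceChunker(final Map<String, Object> parameters) {
        // validation now happens inside parseParameters
        parseParameters(parameters);
    }

    @Override
    public void parseParameters(Map<String, Object> parameters) {
        // no parameters for this toy chunker; a real implementation parses
        // its fields here and throws IllegalArgumentException when invalid
    }

    @Override
    public List<String> chunk(String content, Map<String, Object> runtimeParameters) {
        return Arrays.asList(content.split("\\s+"));
    }
}
```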
ChunkerParameterParser.java
@@ -4,47 +4,80 @@
*/
package org.opensearch.neuralsearch.processor.chunker;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;

import java.util.Locale;
import java.util.Map;

/**
* Parse the parameters for the text chunking processor and chunking algorithms.
* The parameter must be validated before parsing.
* Throw IllegalArgumentException when parameters are invalid.
*/
public class ChunkerParameterParser {

/**
* Parse string type parameter
* Parse string type parameter.
* Throw IllegalArgumentException if parameter is not a string or empty.
*/
public static String parseStringParameter(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all string parameters are optional
return defaultValue;
}
return parameters.get(fieldName).toString();
Object fieldValue = parameters.get(fieldName);
if (!(fieldValue instanceof String)) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName())
);
}
if (StringUtils.isEmpty(fieldValue.toString())) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName));
}
return fieldValue.toString();
}

/**
* Parse integer type parameter
* Parse Integer type parameter with positive value.
* Throw IllegalArgumentException if parameter is not a positive integer.
*/
public static int parseIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
return defaultValue;
}
int fieldValueInt;
String fieldValueString = parameters.get(fieldName).toString();
return NumberUtils.createInteger(fieldValueString);
try {
fieldValueInt = NumberUtils.createInteger(fieldValueString);
} catch (Exception e) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
);
}
// some parameters have a negative default value, indicating that the parameter is not effective
if (fieldValueInt != defaultValue && fieldValueInt <= 0) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
}
return fieldValueInt;
}

/**
* parse double type parameter
* Parse double type parameter.
* Throw IllegalArgumentException if parameter is not a double.
*/
public static double parseDoubleParameter(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
// all double parameters are optional
return defaultValue;
}
String fieldValueString = parameters.get(fieldName).toString();
return NumberUtils.createDouble(fieldValueString);
try {
return NumberUtils.createDouble(fieldValueString);
} catch (Exception e) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Double.class.getName())
);
}
}
}

ChunkerParameterValidator.java (this file was deleted)
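Taken together, the three parse methods now own both validation and conversion, which is why the separate validator class above could be deleted. A usage sketch follows, assuming the parser is on the classpath along with commons-lang3; the literal field names ("tokenizer", "token_limit", "overlap_rate", "max_chunk_limit") and the defaults (384, 0.0, "standard") are assumptions standing in for the plugin's constants.

```java
import java.util.HashMap;
import java.util.Map;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;

public class ParserUsageSketch {
    public static void main(String[] args) {
        Map<String, Object> parameters = new HashMap<>();
        parameters.put("tokenizer", "standard");
        parameters.put("token_limit", 384);
        parameters.put("overlap_rate", 0.2);

        // present parameters are validated and converted in a single call
        String tokenizer = parseStringParameter(parameters, "tokenizer", "standard");
        int tokenLimit = parsePositiveIntegerParameter(parameters, "token_limit", 384);
        double overlapRate = parseDoubleParameter(parameters, "overlap_rate", 0.0);

        // absent parameters fall back to the default; a negative default like -1
        // can act as a "not effective" sentinel, as the parser's comment notes
        int maxChunkLimit = parsePositiveIntegerParameter(parameters, "max_chunk_limit", -1);

        // invalid input surfaces as IllegalArgumentException
        parameters.put("token_limit", "not-a-number");
        try {
            parsePositiveIntegerParameter(parameters, "token_limit", 384);
        } catch (IllegalArgumentException e) {
            System.out.println(e.getMessage()); // Parameter [token_limit] must be of java.lang.Integer type
        }
        System.out.println(tokenizer + " " + tokenLimit + " " + overlapRate + " " + maxChunkLimit);
    }
}
```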

DelimiterChunker.java
@@ -9,7 +9,6 @@
import java.util.ArrayList;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;

/**
* The implementation of {@link Chunker} for the delimiter algorithm
@@ -23,28 +22,15 @@ public class DelimiterChunker implements Chunker {
private String delimiter;

public DelimiterChunker(final Map<String, Object> parameters) {
validateParameters(parameters);
parseParameters(parameters);
}

/**
* Validate the parameters for delimiter algorithm,
* will throw IllegalArgumentException if delimiter is not a string or empty
* Parse the parameters for delimiter algorithm.
* Throw IllegalArgumentException if delimiter is not a string or empty.
*
* @param parameters a map containing parameters, containing the following parameters
* 1. A string as the paragraph split indicator
*/
@Override
public void validateParameters(Map<String, Object> parameters) {
validateStringParameter(parameters, DELIMITER_FIELD, false);
}

/**
* Parse the parameters for delimiter algorithm,
* will throw IllegalArgumentException if delimiter is not a string or empty
*
* @param parameters a map containing parameters, containing the following parameters
* 1. A string as the paragraph split indicator
* 1. delimiter: a string as the paragraph split indicator
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
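Since the constructor now calls parseParameters directly, invalid input fails at new DelimiterChunker(...) rather than in a separate validation step. A hedged usage sketch; the literal field name "delimiter" and the paragraph-break value are assumptions.

```java
import java.util.List;
import java.util.Map;

import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;

public class DelimiterChunkerSketch {
    public static void main(String[] args) {
        // parseParameters runs inside the constructor, so an empty or
        // non-string delimiter throws IllegalArgumentException right here;
        // a missing one falls back to the default
        DelimiterChunker chunker = new DelimiterChunker(Map.of("delimiter", "\n\n"));
        List<String> passages = chunker.chunk("First paragraph.\n\nSecond paragraph.", Map.of());
        System.out.println(passages.size()); // two passages
    }
}
```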
FixedTokenLengthChunker.java
@@ -17,10 +17,7 @@
import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateDoubleParameterWithinRange;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

/**
* The implementation of {@link Chunker} for the fixed token length algorithm.
@@ -62,35 +59,44 @@ public class FixedTokenLengthChunker implements Chunker {
private final AnalysisRegistry analysisRegistry;

public FixedTokenLengthChunker(final Map<String, Object> parameters) {
validateParameters(parameters);
parseParameters(parameters);
this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);
}

/**
* Validate the parameters for fixed token length algorithm,
* will throw IllegalArgumentException when parameters are invalid
* Parse the parameters for fixed token length algorithm.
* Throw IllegalArgumentException when parameters are invalid.
*
* @param parameters a map containing non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">analyzer tokenizer</a> in opensearch
* @param parameters a map containing non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">word tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
* Here are requirements for parameters:
* max_token_count and token_limit should be a positive integer
* overlap_rate should be within range [0, 0.5]
* tokenizer should be string
* 1. token_limit must be a positive integer
* 2. overlap_rate must be within range [0, 0.5]
* 3. tokenizer must be a word tokenizer
*/
@Override
public void validateParameters(Map<String, Object> parameters) {
validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
validateDoubleParameterWithinRange(parameters, OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND);
validateStringParameter(parameters, TOKENIZER_FIELD, false);
String tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
public void parseParameters(Map<String, Object> parameters) {
this.tokenLimit = parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
if (overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Parameter [%s] must be between %s and %s",
OVERLAP_RATE_FIELD,
OVERLAP_RATE_LOWER_BOUND,
OVERLAP_RATE_UPPER_BOUND
)
);
}
if (!WORD_TOKENIZERS.contains(tokenizer)) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s",
"Tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s",
tokenizer,
ALGORITHM_NAME,
WORD_TOKENIZERS
@@ -100,33 +106,19 @@ public void validateParameters(Map<String, Object> parameters) {
}

/**
* Parse the parameters for fixed token length algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">word tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
this.tokenLimit = parseIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
}

/**
* Return the chunked passages for fixed token length algorithm
* Return the chunked passages for fixed token length algorithm.
* Throw IllegalArgumentException when runtime parameters are invalid.
*
* @param content input string
* @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
* 1. max_token_count: the max token limit for the tokenizer
* Here are requirements for runtime parameters:
* 1. max_token_count must be a positive integer
*/
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
// before chunking, validate and parse runtimeParameters
validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
int maxTokenCount = parseIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
// parse runtimeParameters before chunking
int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);

List<AnalyzeToken> tokens = tokenize(content, tokenizer, maxTokenCount);
List<String> chunkResult = new ArrayList<>();
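The reshuffle above leaves a clean split: non-runtime parameters (token_limit, overlap_rate, tokenizer) are parsed once at construction, while max_token_count is parsed on every chunk() call. Below is a self-contained sketch of that pattern, not the plugin's analyzer-backed implementation: whitespace splitting stands in for real tokenization, the overflow check is invented, and the field names and defaults 384 and 10000 are assumptions.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;

// Standalone sketch of the constructor-time vs. chunk-time parsing split.
public class FixedLengthSketch {

    private final int tokenLimit;

    public FixedLengthSketch(final Map<String, Object> parameters) {
        // non-runtime parameter: parsed and validated once
        this.tokenLimit = parsePositive(parameters, "token_limit", 384);
    }

    public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
        // runtime parameter: parsed and validated on every call
        int maxTokenCount = parsePositive(runtimeParameters, "max_token_count", 10000);
        String[] tokens = content.split("\\s+");
        if (tokens.length > maxTokenCount) {
            throw new IllegalArgumentException("Content exceeds max_token_count of " + maxTokenCount);
        }
        List<String> chunks = new ArrayList<>();
        for (int start = 0; start < tokens.length; start += tokenLimit) {
            int end = Math.min(start + tokenLimit, tokens.length);
            chunks.add(String.join(" ", Arrays.copyOfRange(tokens, start, end)));
        }
        return chunks;
    }

    private static int parsePositive(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            return defaultValue;
        }
        final int value;
        try {
            value = Integer.parseInt(parameters.get(fieldName).toString());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
            );
        }
        if (value <= 0) {
            throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
        }
        return value;
    }
}
```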
