merge parameter validator into the parser
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 15, 2024
1 parent f3decb4 commit 3b8a3af
Showing 7 changed files with 91 additions and 176 deletions.
TextChunkingProcessor.java
@@ -28,13 +28,13 @@
import org.opensearch.index.mapper.IndexFieldMapper;
import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

/**
* This processor is used for user input data text chunking.
* The chunking results could be fed to downstream embedding processor,
* algorithm defines chunking algorithm and parameters,
* The chunking results could be fed to a downstream embedding processor.
* The processor needs two fields: algorithm and field_map,
* where algorithm defines chunking algorithm and parameters,
* and field_map specifies which fields need chunking and the corresponding keys for the chunking results.
*/
public final class TextChunkingProcessor extends AbstractProcessor {
@@ -117,8 +117,7 @@ private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
// fixed token length algorithm needs analysis registry for tokenization
chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
validatePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
}

@SuppressWarnings("unchecked")
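The call-site change above is the heart of the commit: one parser call now both validates and converts max_chunk_limit. Here is a minimal, self-contained sketch of the merged behavior; the stand-in method, the literal field name "max_chunk_limit", and the default value 100 are assumptions for illustration, not the plugin's actual constants.

```java
import java.util.HashMap;
import java.util.Map;

// Self-contained sketch of the merged validate-and-parse behavior.
// Not the plugin's class: parsePositiveInteger stands in for
// ChunkerParameterParser.parsePositiveIntegerParameter.
public class MergedParseSketch {

    static int parsePositiveInteger(Map<String, Object> parameters, String fieldName, int defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            return defaultValue; // all chunking parameters are optional
        }
        final int value;
        try {
            value = Integer.parseInt(parameters.get(fieldName).toString());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Parameter [" + fieldName + "] must be of java.lang.Integer type");
        }
        // a negative default acts as a sentinel, so only reject non-positive
        // values that differ from it
        if (value != defaultValue && value <= 0) {
            throw new IllegalArgumentException("Parameter [" + fieldName + "] must be positive.");
        }
        return value;
    }

    public static void main(String[] args) {
        Map<String, Object> chunkerParameters = new HashMap<>();
        chunkerParameters.put("max_chunk_limit", "50");
        // one call now replaces the old validate-then-parse pair
        int maxChunkLimit = parsePositiveInteger(chunkerParameters, "max_chunk_limit", 100);
        System.out.println(maxChunkLimit); // prints 50
    }
}
```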
Chunker.java
@@ -13,16 +13,9 @@
*/
public interface Chunker {

/**
* Validate the parameters for chunking algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map containing non-runtime parameters for chunking algorithms
*/
void validateParameters(Map<String, Object> parameters);

/**
* Parse the parameters for chunking algorithm.
* Throw IllegalArgumentException when parameters are invalid.
* The parameters must be validated before parsing.
*
* @param parameters a map containing non-runtime parameters for chunking algorithms
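With validateParameters removed, an implementation of the slimmed-down interface only needs parseParameters and chunk, and its constructor calls parseParameters, which throws on invalid input. A hedged sketch of a toy implementation, assuming it sits in the same package as Chunker; the whitespace-splitting algorithm is invented for illustration, only the interface shape comes from the diff.

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;

// Toy implementation of the reduced interface; not part of the plugin.
public class WhitespaceChunker implements Chunker {

    public WhitespaceChunker(final Map<String, Object> parameters) {
        // validation now happens inside parseParameters
        parseParameters(parameters);
    }

    @Override
    public void parseParameters(Map<String, Object> parameters) {
        // no parameters for this toy chunker; a real implementation parses
        // its fields here and throws IllegalArgumentException when invalid
    }

    @Override
    public List<String> chunk(String content, Map<String, Object> runtimeParameters) {
        return Arrays.asList(content.split("\\s+"));
    }
}
```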
ChunkerParameterParser.java
@@ -4,47 +4,80 @@
*/
package org.opensearch.neuralsearch.processor.chunker;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;

import java.util.Locale;
import java.util.Map;

/**
* Parse the parameters for the text chunking processor and chunking algorithms.
* The parameter must be validated before parsing.
* Throw IllegalArgumentException when parameters are invalid.
*/
public class ChunkerParameterParser {

/**
* Parse string type parameter
* Parse string type parameter.
* Throw IllegalArgumentException if parameter is not a string or empty.
*/
public static String parseStringParameter(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all string parameters are optional
return defaultValue;
}
return parameters.get(fieldName).toString();
Object fieldValue = parameters.get(fieldName);
if (!(fieldValue instanceof String)) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName())
);
}
if (StringUtils.isEmpty(fieldValue.toString())) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName));
}
return fieldValue.toString();
}

/**
* Parse integer type parameter
* Parse Integer type parameter with positive value.
* Throw IllegalArgumentException if parameter is not a positive integer.
*/
public static int parseIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
return defaultValue;
}
int fieldValueInt;
String fieldValueString = parameters.get(fieldName).toString();
return NumberUtils.createInteger(fieldValueString);
try {
fieldValueInt = NumberUtils.createInteger(fieldValueString);
} catch (Exception e) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
);
}
// some parameters have a negative default value, indicating that the parameter is not effective
if (fieldValueInt != defaultValue && fieldValueInt <= 0) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
}
return fieldValueInt;
}

/**
* parse double type parameter
* Parse double type parameter.
* Throw IllegalArgumentException if parameter is not a double.
*/
public static double parseDoubleParameter(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
// all double parameters are optional
return defaultValue;
}
String fieldValueString = parameters.get(fieldName).toString();
return NumberUtils.createDouble(fieldValueString);
try {
return NumberUtils.createDouble(fieldValueString);
} catch (Exception e) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Double.class.getName())
);
}
}
}

ChunkerParameterValidator.java (this file was deleted)
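Taken together, the three parse methods now own both validation and conversion, which is why the separate validator class above could be deleted. A usage sketch follows, assuming the parser is on the classpath along with commons-lang3; the literal field names ("tokenizer", "token_limit", "overlap_rate", "max_chunk_limit") and the defaults (384, 0.0, "standard") are assumptions standing in for the plugin's constants.

```java
import java.util.HashMap;
import java.util.Map;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;

public class ParserUsageSketch {
    public static void main(String[] args) {
        Map<String, Object> parameters = new HashMap<>();
        parameters.put("tokenizer", "standard");
        parameters.put("token_limit", 384);
        parameters.put("overlap_rate", 0.2);

        // present parameters are validated and converted in a single call
        String tokenizer = parseStringParameter(parameters, "tokenizer", "standard");
        int tokenLimit = parsePositiveIntegerParameter(parameters, "token_limit", 384);
        double overlapRate = parseDoubleParameter(parameters, "overlap_rate", 0.0);

        // absent parameters fall back to the default; a negative default like -1
        // can act as a "not effective" sentinel, as the parser's comment notes
        int maxChunkLimit = parsePositiveIntegerParameter(parameters, "max_chunk_limit", -1);

        // invalid input surfaces as IllegalArgumentException
        parameters.put("token_limit", "not-a-number");
        try {
            parsePositiveIntegerParameter(parameters, "token_limit", 384);
        } catch (IllegalArgumentException e) {
            System.out.println(e.getMessage()); // Parameter [token_limit] must be of java.lang.Integer type
        }
        System.out.println(tokenizer + " " + tokenLimit + " " + overlapRate + " " + maxChunkLimit);
    }
}
```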

DelimiterChunker.java
@@ -9,7 +9,6 @@
import java.util.ArrayList;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;

/**
* The implementation of {@link Chunker} for the delimiter algorithm
@@ -23,28 +22,15 @@ public class DelimiterChunker implements Chunker {
private String delimiter;

public DelimiterChunker(final Map<String, Object> parameters) {
validateParameters(parameters);
parseParameters(parameters);
}

/**
* Validate the parameters for delimiter algorithm,
* will throw IllegalArgumentException if delimiter is not a string or empty
* Parse the parameters for delimiter algorithm.
* Throw IllegalArgumentException if delimiter is not a string or empty.
*
* @param parameters a map containing parameters, containing the following parameters
* 1. A string as the paragraph split indicator
*/
@Override
public void validateParameters(Map<String, Object> parameters) {
validateStringParameter(parameters, DELIMITER_FIELD, false);
}

/**
* Parse the parameters for delimiter algorithm,
* will throw IllegalArgumentException if delimiter is not a string or empty
*
* @param parameters a map containing parameters, containing the following parameters
* 1. A string as the paragraph split indicator
* 1. delimiter: a string as the paragraph split indicator
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
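Since the constructor now calls parseParameters directly, invalid input fails at new DelimiterChunker(...) rather than in a separate validation step. A hedged usage sketch; the literal field name "delimiter" and the paragraph-break value are assumptions.

```java
import java.util.List;
import java.util.Map;

import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;

public class DelimiterChunkerSketch {
    public static void main(String[] args) {
        // parseParameters runs inside the constructor, so an empty or
        // non-string delimiter throws IllegalArgumentException right here;
        // a missing one falls back to the default
        DelimiterChunker chunker = new DelimiterChunker(Map.of("delimiter", "\n\n"));
        List<String> passages = chunker.chunk("First paragraph.\n\nSecond paragraph.", Map.of());
        System.out.println(passages.size()); // two passages
    }
}
```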
FixedTokenLengthChunker.java
@@ -17,10 +17,7 @@
import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateDoubleParameterWithinRange;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

/**
* The implementation of {@link Chunker} for the fixed token length algorithm.
@@ -62,35 +59,44 @@ public class FixedTokenLengthChunker implements Chunker {
private final AnalysisRegistry analysisRegistry;

public FixedTokenLengthChunker(final Map<String, Object> parameters) {
validateParameters(parameters);
parseParameters(parameters);
this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);
}

/**
* Validate the parameters for fixed token length algorithm,
* will throw IllegalArgumentException when parameters are invalid
* Parse the parameters for fixed token length algorithm.
* Throw IllegalArgumentException when parameters are invalid.
*
* @param parameters a map containing non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">analyzer tokenizer</a> in opensearch
* @param parameters a map containing non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">word tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
* Here are requirements for parameters:
* max_token_count and token_limit should be a positive integer
* overlap_rate should be within range [0, 0.5]
* tokenizer should be string
* 1. token_limit must be a positive integer
* 2. overlap_rate must be within range [0, 0.5]
* 3. tokenizer must be a word tokenizer
*/
@Override
public void validateParameters(Map<String, Object> parameters) {
validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
validateDoubleParameterWithinRange(parameters, OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND);
validateStringParameter(parameters, TOKENIZER_FIELD, false);
String tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
public void parseParameters(Map<String, Object> parameters) {
this.tokenLimit = parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
if (overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Parameter [%s] must be between %s and %s",
OVERLAP_RATE_FIELD,
OVERLAP_RATE_LOWER_BOUND,
OVERLAP_RATE_UPPER_BOUND
)
);
}
if (!WORD_TOKENIZERS.contains(tokenizer)) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s",
"Tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s",
tokenizer,
ALGORITHM_NAME,
WORD_TOKENIZERS
@@ -100,33 +106,19 @@ public void validateParameters(Map<String, Object> parameters) {
}

/**
* Parse the parameters for fixed token length algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">word tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
this.tokenLimit = parseIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
}

/**
* Return the chunked passages for fixed token length algorithm
* Return the chunked passages for fixed token length algorithm.
* Throw IllegalArgumentException when runtime parameters are invalid.
*
* @param content input string
* @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
* 1. max_token_count: the max token limit for the tokenizer
* Here are requirements for runtime parameters:
* 1. max_token_count must be a positive integer
*/
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
// before chunking, validate and parse runtimeParameters
validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
int maxTokenCount = parseIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
// parse runtimeParameters before chunking
int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);

List<AnalyzeToken> tokens = tokenize(content, tokenizer, maxTokenCount);
List<String> chunkResult = new ArrayList<>();
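The reshuffle above leaves a clean split: non-runtime parameters (token_limit, overlap_rate, tokenizer) are parsed once at construction, while max_token_count is parsed on every chunk() call. Below is a self-contained sketch of that pattern, not the plugin's analyzer-backed implementation: whitespace splitting stands in for real tokenization, the overflow check is invented, and the field names and defaults 384 and 10000 are assumptions.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;

// Standalone sketch of the constructor-time vs. chunk-time parsing split.
public class FixedLengthSketch {

    private final int tokenLimit;

    public FixedLengthSketch(final Map<String, Object> parameters) {
        // non-runtime parameter: parsed and validated once
        this.tokenLimit = parsePositive(parameters, "token_limit", 384);
    }

    public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
        // runtime parameter: parsed and validated on every call
        int maxTokenCount = parsePositive(runtimeParameters, "max_token_count", 10000);
        String[] tokens = content.split("\\s+");
        if (tokens.length > maxTokenCount) {
            throw new IllegalArgumentException("Content exceeds max_token_count of " + maxTokenCount);
        }
        List<String> chunks = new ArrayList<>();
        for (int start = 0; start < tokens.length; start += tokenLimit) {
            int end = Math.min(start + tokenLimit, tokens.length);
            chunks.add(String.join(" ", Arrays.copyOfRange(tokens, start, end)));
        }
        return chunks;
    }

    private static int parsePositive(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            return defaultValue;
        }
        final int value;
        try {
            value = Integer.parseInt(parameters.get(fieldName).toString());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
            );
        }
        if (value <= 0) {
            throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
        }
        return value;
    }
}
```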
