Skip to content

Commit

Permalink
implement parser and validator
Browse files Browse the repository at this point in the history
Signed-off-by: yuye-aws <[email protected]>
  • Loading branch information
yuye-aws committed Mar 14, 2024
1 parent f16882d commit a969a60
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 79 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
import org.opensearch.neuralsearch.processor.chunker.Chunker;
import org.opensearch.index.mapper.IndexFieldMapper;
import org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;

/**
* This processor is used for user input data text chunking.
Expand Down Expand Up @@ -116,11 +117,8 @@ private void validateAndParseAlgorithmMap(final Map<String, Object> algorithmMap
// fixed token length algorithm needs analysis registry for tokenization
chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
this.maxChunkLimit = ChunkerParameterValidator.validatePositiveIntegerParameter(
chunkerParameters,
MAX_CHUNK_LIMIT_FIELD,
DEFAULT_MAX_CHUNK_LIMIT
);
validatePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
}

@SuppressWarnings("unchecked")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,20 @@
public interface Chunker {

/**
* Validate and parse the parameters for chunking algorithm,
* Validate the parameters for chunking algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map containing non-runtime parameters for chunking algorithms
*/
void validateAndParseParameters(Map<String, Object> parameters);
void validateParameters(Map<String, Object> parameters);

/**
* Parse the parameters for chunking algorithm.
* The parameters must be validated before parsing.
*
* @param parameters a map containing non-runtime parameters for chunking algorithms
*/
void parseParameters(Map<String, Object> parameters);

/**
* Chunk the incoming string according to parameters and return chunked passages
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.processor.chunker;

import org.apache.commons.lang3.math.NumberUtils;

import java.util.Map;

/**
 * Parse the parameters for the text chunking processor and chunking algorithms.
 * Each method reads one entry from the parameter map and converts it to the requested type,
 * returning the supplied default when the map has no entry for the field.
 * The parameters must be validated before parsing.
 */
public final class ChunkerParameterParser {

    private ChunkerParameterParser() {} // static utility class, never instantiated

    /**
     * Parse string type parameter
     *
     * @param parameters   a map containing the chunking parameters
     * @param fieldName    the key to look up in the map
     * @param defaultValue the value returned when the map has no entry for the key
     * @return the string form of the mapped value, or the default value when the key is absent
     * @throws IllegalArgumentException if the key is present but mapped to null
     */
    public static String parseStringParameter(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            // all chunking algorithm parameters are optional
            return defaultValue;
        }
        return requireValue(parameters, fieldName).toString();
    }

    /**
     * Parse integer type parameter
     *
     * @param parameters   a map containing the chunking parameters
     * @param fieldName    the key to look up in the map
     * @param defaultValue the value returned when the map has no entry for the key
     * @return the mapped value as an int, or the default value when the key is absent
     * @throws IllegalArgumentException if the key is mapped to null or to a value that is not a decimal integer
     */
    public static int parseIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            // all chunking algorithm parameters are optional
            return defaultValue;
        }
        final String fieldValueString = requireValue(parameters, fieldName).toString();
        try {
            // plain decimal parsing: a leading zero must not be read as an octal prefix,
            // so "010" is 10 and "08" is 8
            return Integer.parseInt(fieldValueString);
        } catch (NumberFormatException e) {
            // name the offending parameter instead of surfacing a bare "For input string" message
            throw new IllegalArgumentException(
                "Parameter [" + fieldName + "] cannot be cast to [" + Integer.class.getName() + "]",
                e
            );
        }
    }

    /**
     * Parse double type parameter
     *
     * @param parameters   a map containing the chunking parameters
     * @param fieldName    the key to look up in the map
     * @param defaultValue the value returned when the map has no entry for the key
     * @return the mapped value as a double, or the default value when the key is absent
     * @throws IllegalArgumentException if the key is mapped to null or to a value that is not a number
     */
    public static double parseDoubleParameter(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            // all chunking algorithm parameters are optional
            return defaultValue;
        }
        final String fieldValueString = requireValue(parameters, fieldName).toString();
        try {
            return Double.parseDouble(fieldValueString);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                "Parameter [" + fieldName + "] cannot be cast to [" + Double.class.getName() + "]",
                e
            );
        }
    }

    /**
     * Fetch the value mapped to the field, rejecting an explicit null with a descriptive error
     * rather than letting a NullPointerException escape from toString().
     */
    private static Object requireValue(final Map<String, Object> parameters, final String fieldName) {
        final Object fieldValue = parameters.get(fieldName);
        if (fieldValue == null) {
            throw new IllegalArgumentException("Parameter [" + fieldName + "] cannot be null");
        }
        return fieldValue;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,85 +11,78 @@
import java.util.Locale;

/**
* Validate and parse the parameter for text chunking processor and algorithms
* Validate the parameter for text chunking processor and algorithms
*/
public class ChunkerParameterValidator {

/**
* Validate and parse the parameter for string parameters
* Validate string type parameter
*/
public static String validateStringParameters(
final Map<String, Object> parameters,
final String fieldName,
final String defaultValue,
final boolean allowEmpty
) {
public static void validateStringParameter(final Map<String, Object> parameters, final String fieldName, final boolean allowEmpty) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
return defaultValue;
return;
}
Object fieldValue = parameters.get(fieldName);
if (!(fieldValue instanceof String)) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, String.class.getName())
);
} else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) {
}
if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName));
}
return (String) fieldValue;
}

/**
* Validate and parse the parameter for numeric parameters
* Validate integer type parameter with positive value
*/
public static Number validateNumberParameter(final Map<String, Object> parameters, final String fieldName, final Number defaultValue) {
public static void validatePositiveIntegerParameter(
final Map<String, Object> parameters,
final String fieldName,
final int defaultValue
) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
return defaultValue;
return;
}
String fieldValue = parameters.get(fieldName).toString();
if (!(NumberUtils.isParsable(fieldValue))) {
String fieldValueString = parameters.get(fieldName).toString();
if (!(NumberUtils.isParsable(fieldValueString))) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName())
);
}
return NumberUtils.createNumber(fieldValue);
}

/**
* Validate and parse the parameter for positive integer parameters
*/
public static int validatePositiveIntegerParameter(
final Map<String, Object> parameters,
final String fieldName,
final int defaultValue
) {
Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue);
int fieldValueInt = fieldValueNumber.intValue();
// sometimes parameter has negative default value, indicating that this parameter is not effective
int fieldValueInt = NumberUtils.createInteger(fieldValueString);
// sometimes the parameter has negative default value, indicating that this parameter is not effective
if (fieldValueInt != defaultValue && fieldValueInt <= 0) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
}
return fieldValueInt;
}

/**
* Validate and parse the parameter for double parameters within [lowerBound, upperBound]
* Validate double type parameter within range [lowerBound, upperBound]
*/
public static double validateRangeDoubleParameter(
public static void validateDoubleParameterWithinRange(
final Map<String, Object> parameters,
final String fieldName,
final double lowerBound,
final double upperBound,
final double defaultValue
final double upperBound
) {
Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue);
double fieldValueDouble = fieldValueNumber.doubleValue();
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
return;
}
String fieldValueString = parameters.get(fieldName).toString();
if (!(NumberUtils.isParsable(fieldValueString))) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName())
);
}
double fieldValueDouble = NumberUtils.createDouble(fieldValueString);
if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", fieldName, lowerBound, upperBound)
);
}
return fieldValueDouble;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import java.util.List;
import java.util.ArrayList;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;

/**
* The implementation {@link Chunker} for delimiter algorithm
Expand All @@ -22,19 +23,32 @@ public class DelimiterChunker implements Chunker {
private String delimiter;

public DelimiterChunker(final Map<String, Object> parameters) {
validateAndParseParameters(parameters);
validateParameters(parameters);
parseParameters(parameters);
}

/**
* Validate and parse the parameters for delimiter algorithm,
* Validate the parameters for delimiter algorithm,
* will throw IllegalArgumentException if delimiter is not a string or empty
*
* @param parameters a map containing parameters, containing the following parameters
* 1. A string as the paragraph split indicator
*/
@Override
public void validateAndParseParameters(final Map<String, Object> parameters) {
this.delimiter = validateStringParameters(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER, false);
public void validateParameters(Map<String, Object> parameters) {
validateStringParameter(parameters, DELIMITER_FIELD, false);
}

/**
* Parse the parameters for delimiter algorithm.
* The parameters must be validated before parsing.
*
* @param parameters a map containing parameters, containing the following parameters
* 1. A string as the paragraph split indicator
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
this.delimiter = parseStringParameter(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@
import org.opensearch.action.admin.indices.analyze.AnalyzeAction;
import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken;
import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateRangeDoubleParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateDoubleParameterWithinRange;

/**
* The implementation {@link Chunker} for fixed token length algorithm.
Expand Down Expand Up @@ -59,48 +62,59 @@ public class FixedTokenLengthChunker implements Chunker {
private final AnalysisRegistry analysisRegistry;

public FixedTokenLengthChunker(final Map<String, Object> parameters) {
validateAndParseParameters(parameters);
validateParameters(parameters);
parseParameters(parameters);
this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);
}

/**
* Validate and parse the parameters for fixed token length algorithm,
* Validate the parameters for fixed token length algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map containing parameters, containing the following parameters:
* @param parameters a map containing non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">analyzer tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
* 4. max_token_count: the max token limit for the tokenizer
* Here are requirements for parameters:
* max_token_count and token_limit should be a positive integer
* overlap_rate should be within range [0, 0.5]
* tokenizer should be string
*/
@Override
public void validateAndParseParameters(final Map<String, Object> parameters) {
this.tokenLimit = validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
this.overlapRate = validateRangeDoubleParameter(
parameters,
OVERLAP_RATE_FIELD,
OVERLAP_RATE_LOWER_BOUND,
OVERLAP_RATE_UPPER_BOUND,
DEFAULT_OVERLAP_RATE
);
this.tokenizer = validateStringParameters(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER, false);
if (!WORD_TOKENIZERS.contains(this.tokenizer)) {
public void validateParameters(Map<String, Object> parameters) {
validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
validateDoubleParameterWithinRange(parameters, OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND);
validateStringParameter(parameters, TOKENIZER_FIELD, false);
String tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
if (!WORD_TOKENIZERS.contains(tokenizer)) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s",
this.tokenizer,
tokenizer,
ALGORITHM_NAME,
WORD_TOKENIZERS
)
);
}
}

/**
* Parse the parameters for fixed token length algorithm,
* will throw IllegalArgumentException when parameters are invalid
*
* @param parameters a map containing non-runtime parameters as the following:
* 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">analyzer tokenizer</a> in opensearch
* 2. token_limit: the token limit for each chunked passage
* 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
*/
@Override
public void parseParameters(Map<String, Object> parameters) {
this.tokenLimit = parseIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
}

/**
* Return the chunked passages for fixed token length algorithm
*
Expand All @@ -111,7 +125,8 @@ public void validateAndParseParameters(final Map<String, Object> parameters) {
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
// before chunking, validate and parse runtimeParameters
int maxTokenCount = validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
int maxTokenCount = parseIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);

List<AnalyzeToken> tokens = tokenize(content, tokenizer, maxTokenCount);
List<String> chunkResult = new ArrayList<>();
Expand Down
Loading

0 comments on commit a969a60

Please sign in to comment.