Skip to content

Commit

Permalink
update parameter setting of max chunk limit
Browse files Browse the repository at this point in the history
Signed-off-by: yuye-aws <[email protected]>
  • Loading branch information
yuye-aws committed Mar 15, 2024
1 parent e7dffe0 commit 463de71
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
import org.opensearch.index.mapper.IndexFieldMapper;
import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;

/**
* This processor is used for user input data text chunking.
Expand All @@ -47,6 +48,7 @@ public final class TextChunkingProcessor extends AbstractProcessor {
static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

private static final int DEFAULT_MAX_CHUNK_LIMIT = 100;
private static final int DISABLED_MAX_CHUNK_LIMIT = -1;
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;

private int maxChunkLimit;
Expand Down Expand Up @@ -131,7 +133,17 @@ private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
}
this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
if (maxChunkLimit <= 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Parameter [%s] must be positive or %s to disable this parameter",
MAX_CHUNK_LIMIT_FIELD,
DISABLED_MAX_CHUNK_LIMIT
)
);
}
}

@SuppressWarnings("unchecked")
Expand Down Expand Up @@ -283,13 +295,14 @@ private int chunkString(final String content, List<String> result, final Map<Str
int updatedChunkCount = chunkCount;
List<String> contentResult = chunker.chunk(content, runTimeParameters);
updatedChunkCount += contentResult.size();
if (updatedChunkCount > maxChunkLimit) {
throw new IllegalArgumentException(
if (updatedChunkCount > maxChunkLimit && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
throw new IllegalStateException(
String.format(
Locale.ROOT,
"Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]",
updatedChunkCount,
maxChunkLimit
"The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
TYPE,
maxChunkLimit,
MAX_CHUNK_LIMIT_FIELD
)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,23 +38,30 @@ public static String parseStringParameter(final Map<String, Object> parameters,
}

/**
* Parse Integer type parameter with positive value.
* Throw IllegalArgumentException if parameter is not a positive integer.
* Parse Integer type parameter.
* Throw IllegalArgumentException if parameter is not an integer.
*/
public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
public static int parseIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
if (!parameters.containsKey(fieldName)) {
// all chunking algorithm parameters are optional
// all integer parameters are optional
return defaultValue;
}
int fieldValueInt;
String fieldValueString = parameters.get(fieldName).toString();
try {
fieldValueInt = NumberUtils.createInteger(fieldValueString);
return NumberUtils.createInteger(fieldValueString);
} catch (Exception e) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
);
}
}

/**
* Parse Integer type parameter with positive value.
* Throw IllegalArgumentException if parameter is not a positive integer.
*/
public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
int fieldValueInt = parseIntegerParameter(parameters, fieldName, defaultValue);
if (fieldValueInt <= 0) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,21 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() {
IllegalArgumentException.class,
() -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
);
assertEquals(
String.format(Locale.ROOT, "Parameter [%s] must be positive.", MAX_CHUNK_LIMIT_FIELD),
illegalArgumentException.getMessage()
);
assert (illegalArgumentException.getMessage()
.contains(String.format(Locale.ROOT, "Parameter [%s] must be positive", MAX_CHUNK_LIMIT_FIELD)));
}

@SneakyThrows
public void testCreate_whenMaxChunkNumDisabledValue_thenSucceed() {
Map<String, Processor.Factory> registry = new HashMap<>();
Map<String, Object> config = new HashMap<>();
Map<String, Object> fieldMap = new HashMap<>();
Map<String, Object> algorithmMap = new HashMap<>();
fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(-1));
config.put(FIELD_MAP_FIELD, fieldMap);
config.put(ALGORITHM_FIELD, algorithmMap);
textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
}

public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() {
Expand Down Expand Up @@ -379,23 +390,36 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT
}
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumDisabled_thenFail() {
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), -1);
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
assert (passages instanceof List<?>);
List<String> expectedPassages = new ArrayList<>();
expectedPassages.add("This is an example document to be chunked. The document ");
expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by ");
expectedPassages.add("standard tokenizer in OpenSearch.");
assertEquals(expectedPassages, passages);
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() {
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1);
int maxChunkLimit = 1;
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit);
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
IllegalArgumentException illegalArgumentException = assertThrows(
IllegalArgumentException.class,
() -> processor.execute(ingestDocument)
);
assertEquals(
String.format(
Locale.ROOT,
"Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]",
3,
1
),
illegalArgumentException.getMessage()
);
IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument));
assert (illegalStateException.getMessage()
.contains(
String.format(
Locale.ROOT,
"The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
TYPE,
maxChunkLimit
)
));
}

@SneakyThrows
Expand Down

0 comments on commit 463de71

Please sign in to comment.