diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 7b45d6d45..22c152b36 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -40,15 +40,13 @@ public final class TextChunkingProcessor extends AbstractProcessor { public static final String TYPE = "text_chunking"; - public static final String FIELD_MAP_FIELD = "field_map"; - public static final String ALGORITHM_FIELD = "algorithm"; - @VisibleForTesting static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; + private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME; private int maxChunkLimit; @@ -88,29 +86,32 @@ public String getType() { @SuppressWarnings("unchecked") private void validateAndParseAlgorithmMap(final Map algorithmMap) { - if (algorithmMap.isEmpty()) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD) - ); - } else if (algorithmMap.size() > 1) { + if (algorithmMap.size() > 1) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD) ); } - Entry algorithmEntry = algorithmMap.entrySet().iterator().next(); - String algorithmKey = algorithmEntry.getKey(); - Object algorithmValue = algorithmEntry.getValue(); - if (!(algorithmValue instanceof Map)) { - throw new IllegalArgumentException( - String.format( - Locale.ROOT, - "Unable to create %s processor as [%s] parameters cannot be cast to [%s]", - TYPE, - algorithmKey, - Map.class.getName() - ) - ); + String algorithmKey; + Object algorithmValue; + if (algorithmMap.isEmpty()) { + algorithmKey = DEFAULT_ALGORITHM; + algorithmValue = new HashMap<>(); + } else { + Entry algorithmEntry = algorithmMap.entrySet().iterator().next(); + algorithmKey = algorithmEntry.getKey(); + algorithmValue = algorithmEntry.getValue(); + if (!(algorithmValue instanceof Map)) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Unable to create %s processor as [%s] parameters cannot be cast to [%s]", + TYPE, + algorithmKey, + Map.class.getName() + ) + ); + } } Map chunkerParameters = (Map) algorithmValue; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 2ea5f314a..4df563150 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -125,6 +125,16 @@ private Map createNestedFieldMap() { return fieldMap; } + @SneakyThrows + private TextChunkingProcessor createDefaultAlgorithmInstance(Map fieldMap) { + Map config = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + config.put(FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + Map registry = new HashMap<>(); + return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + @SneakyThrows private TextChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); @@ -195,24 +205,6 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { ); } - public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() { - Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - Map algorithmMap = new HashMap<>(); - fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(ALGORITHM_FIELD, algorithmMap); - Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) - ); - assertEquals( - String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD), - illegalArgumentException.getMessage() - ); - } - public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); @@ -403,7 +395,21 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumE ), illegalArgumentException.getMessage() ); + } + @SneakyThrows + public void testCreate_withDefaultAlgorithm_andSourceDataString_thenSucceed() { + TextChunkingProcessor processor = createDefaultAlgorithmInstance(createStringFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + assertEquals(expectedPassages, passages); } @SneakyThrows