From 8a786fe0d9d97228245ef9fdac3c054a90a657e3 Mon Sep 17 00:00:00 2001 From: Ian Menendez <61611304+IanMenendez@users.noreply.github.com> Date: Wed, 2 Oct 2024 18:15:46 -0300 Subject: [PATCH] [Feature]: add ignore missing field to text chunking processors (#907) * feat: add ignore missing field to text chunking processor Signed-off-by: Ian Menendez Co-authored-by: Ian Menendez (cherry picked from commit 00e622e98a3f05a8b3cc096163032c8b02422629) --- CHANGELOG.md | 1 + .../processor/TextChunkingProcessor.java | 17 ++++++++++-- .../factory/TextChunkingProcessorFactory.java | 15 ++++++++++- .../processor/TextChunkingProcessorTests.java | 27 +++++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7d4c926a..cc72c5d68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.17...2.x) ### Features ### Enhancements +- Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907)) ### Bug Fixes ### Infrastructure ### Documentation diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 49435746c..bc1945bee 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor { public static final String FIELD_MAP_FIELD = "field_map"; public static final String ALGORITHM_FIELD = "algorithm"; private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME; + public static final String IGNORE_MISSING = "ignore_missing"; + public static final boolean 
DEFAULT_IGNORE_MISSING = false; private int maxChunkLimit; private Chunker chunker; private final Map fieldMap; + private final boolean ignoreMissing; private final ClusterService clusterService; private final AnalysisRegistry analysisRegistry; private final Environment environment; @@ -59,12 +62,14 @@ public TextChunkingProcessor( final String description, final Map fieldMap, final Map algorithmMap, + final boolean ignoreMissing, final Environment environment, final ClusterService clusterService, final AnalysisRegistry analysisRegistry ) { super(tag, description); this.fieldMap = fieldMap; + this.ignoreMissing = ignoreMissing; this.environment = environment; this.clusterService = clusterService; this.analysisRegistry = analysisRegistry; @@ -75,6 +80,11 @@ public String getType() { return TYPE; } + // if ignore missing is true, null fields are skipped and no output field is written. If ignore missing is false, null fields return an empty list + private boolean shouldProcessChunk(Object chunkObject) { + return !ignoreMissing || Objects.nonNull(chunkObject); + } + @SuppressWarnings("unchecked") private void parseAlgorithmMap(final Map algorithmMap) { if (algorithmMap.size() > 1) { @@ -250,8 +260,11 @@ private void chunkMapType( } else { // chunk the object when target key is of leaf type (null, string and list of string) Object chunkObject = sourceAndMetadataMap.get(originalKey); - List chunkedResult = chunkLeafType(chunkObject, runtimeParameters); - sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); + + if (shouldProcessChunk(chunkObject)) { + List chunkedResult = chunkLeafType(chunkObject, runtimeParameters); + sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); + } } } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java index 91b9ac5c1..b9904f05d 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java @@ -14,7 +14,10 @@ import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING; import static org.opensearch.ingest.ConfigurationUtils.readMap; +import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty; /** * Factory for chunking ingest processor for ingestion pipeline. @@ -45,6 +48,16 @@ public TextChunkingProcessor create( ) throws Exception { Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD); - return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry); + boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING); + return new TextChunkingProcessor( + processorTag, + description, + fieldMap, + algorithmMap, + ignoreMissing, + environment, + clusterService, + analysisRegistry + ); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 433e51ef5..9cdb9aad0 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -42,6 +42,7 @@ import static 
org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING; import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; public class TextChunkingProcessorTests extends OpenSearchTestCase { @@ -181,6 +182,20 @@ private TextChunkingProcessor createDelimiterInstance() { return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } + @SneakyThrows + private TextChunkingProcessor createIgnoreMissingInstance() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + algorithmMap.put(DelimiterChunker.ALGORITHM_NAME, createDelimiterParameters()); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + config.put(FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + config.put(IGNORE_MISSING, true); + Map registry = new HashMap<>(); + return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + public void testCreate_whenAlgorithmFieldMissing_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); @@ -945,4 +960,16 @@ public void testExecute_withDelimiter_andSourceDataString_thenSucceed() { expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } + + @SneakyThrows + public void testExecute_withIgnoreMissing_thenSucceed() { + Map sourceAndMetadata = new HashMap<>(); + sourceAndMetadata.put("text_field", ""); + sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); + IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>()); + + TextChunkingProcessor processor = 
createIgnoreMissingInstance(); + IngestDocument document = processor.execute(ingestDocument); + assertFalse(document.getSourceAndMetadata().containsKey(OUTPUT_FIELD)); + } }