Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature]: add ignore missing field to text chunking processors #907

1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.17...2.x)
### Features
- Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907))
### Enhancements
### Bug Fixes
### Infrastructure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
// Processor config key holding the source-to-target field mapping (read by the factory via readMap).
public static final String FIELD_MAP_FIELD = "field_map";
// Processor config key holding the chunking algorithm definition (read by the factory via readMap).
public static final String ALGORITHM_FIELD = "algorithm";
// NOTE(review): presumably the algorithm applied when the algorithm map is empty — confirm in parseAlgorithmMap.
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
// Processor config key for the optional boolean flag that skips null source values instead of chunking them.
public static final String IGNORE_MISSING = "ignore_missing";
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
// Value used for ignore_missing when the processor config does not set it.
public static final boolean DEFAULT_IGNORE_MISSING = false;

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
// When true, shouldProcessChunk rejects null leaf values, so no target key is written for them.
private final boolean ignoreMissing;
private final ClusterService clusterService;
private final AnalysisRegistry analysisRegistry;
private final Environment environment;
Expand All @@ -59,12 +62,14 @@ public TextChunkingProcessor(
final String description,
final Map<String, Object> fieldMap,
final Map<String, Object> algorithmMap,
final boolean ignoreMissing,
final Environment environment,
final ClusterService clusterService,
final AnalysisRegistry analysisRegistry
) {
super(tag, description);
this.fieldMap = fieldMap;
this.ignoreMissing = ignoreMissing;
this.environment = environment;
this.clusterService = clusterService;
this.analysisRegistry = analysisRegistry;
Expand All @@ -75,6 +80,11 @@ public String getType() {
return TYPE;
}

/**
 * Decides whether a leaf value should be chunked and stored under its target key.
 * <p>
 * When {@code ignoreMissing} is disabled every value is processed, including {@code null}
 * (per the original note, a null value then yields an empty chunk list). When it is enabled,
 * a {@code null} value is skipped and nothing is written for it.
 *
 * @param chunkObject the value read for the source key; may be {@code null}
 * @return {@code false} only when {@code ignoreMissing} is set and the value is {@code null}
 */
private boolean shouldProcessChunk(Object chunkObject) {
    if (ignoreMissing) {
        // Missing values are only skipped when the flag is on.
        return chunkObject != null;
    }
    return true;
}

@SuppressWarnings("unchecked")
private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
if (algorithmMap.size() > 1) {
Expand Down Expand Up @@ -250,8 +260,11 @@ private void chunkMapType(
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);

IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
if (shouldProcessChunk(chunkObject)) {
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
}
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
import static org.opensearch.ingest.ConfigurationUtils.readMap;
import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;

/**
* Factory for chunking ingest processor for ingestion pipeline.
Expand Down Expand Up @@ -45,6 +48,16 @@ public TextChunkingProcessor create(
) throws Exception {
Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
return new TextChunkingProcessor(
martin-gaievski marked this conversation as resolved.
Show resolved Hide resolved
processorTag,
description,
fieldMap,
algorithmMap,
ignoreMissing,
environment,
clusterService,
analysisRegistry
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;

public class TextChunkingProcessorTests extends OpenSearchTestCase {
Expand Down Expand Up @@ -181,6 +182,20 @@ private TextChunkingProcessor createDelimiterInstance() {
return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
}

/**
 * Builds a delimiter-based chunking processor whose config sets {@code ignore_missing} to true
 * and maps {@code INPUT_FIELD} to {@code OUTPUT_FIELD}.
 */
@SneakyThrows
private TextChunkingProcessor createIgnoreMissingInstance() {
    // Source field -> target field mapping.
    Map<String, Object> sourceToTarget = new HashMap<>();
    sourceToTarget.put(INPUT_FIELD, OUTPUT_FIELD);

    // Delimiter algorithm with the shared test parameters.
    Map<String, Object> chunkingAlgorithm = new HashMap<>();
    chunkingAlgorithm.put(DelimiterChunker.ALGORITHM_NAME, createDelimiterParameters());

    // Full processor configuration, with the flag under test switched on.
    Map<String, Object> processorConfig = new HashMap<>();
    processorConfig.put(FIELD_MAP_FIELD, sourceToTarget);
    processorConfig.put(ALGORITHM_FIELD, chunkingAlgorithm);
    processorConfig.put(IGNORE_MISSING, true);

    Map<String, Processor.Factory> processorFactories = new HashMap<>();
    return textChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, processorConfig);
}

public void testCreate_whenAlgorithmFieldMissing_thenFail() {
Map<String, Object> config = new HashMap<>();
Map<String, Object> fieldMap = new HashMap<>();
Expand Down Expand Up @@ -945,4 +960,16 @@ public void testExecute_withDelimiter_andSourceDataString_thenSucceed() {
expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.");
assertEquals(expectedPassages, passages);
}

/**
 * Runs a processor created with {@code ignore_missing = true} against a document and checks
 * that the output field is not added.
 */
@SneakyThrows
public void testExecute_withIgnoreMissing_thenSucceed() {
    TextChunkingProcessor ignoreMissingProcessor = createIgnoreMissingInstance();

    // NOTE(review): presumably INPUT_FIELD differs from "text_field", so the mapped input
    // is absent from this document — confirm against the constant's definition.
    Map<String, Object> docFields = new HashMap<>();
    docFields.put(IndexFieldMapper.NAME, INDEX_NAME);
    docFields.put("text_field", "");
    IngestDocument input = new IngestDocument(docFields, new HashMap<>());

    Map<String, Object> resultFields = ignoreMissingProcessor.execute(input).getSourceAndMetadata();
    assertFalse(resultFields.containsKey(OUTPUT_FIELD));
}
}
Loading