Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature]: add ignore missing field to text chunking processors #907

1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.17...2.x)
### Features
- Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907))
### Enhancements
### Bug Fixes
### Infrastructure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
public static final String FIELD_MAP_FIELD = "field_map";
public static final String ALGORITHM_FIELD = "algorithm";
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
public static final String IGNORE_MISSING = "ignore_missing";
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
public static final Boolean DEFAULT_IGNORE_MISSING = false;
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
private final Boolean ignoreMissing;
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
private final ClusterService clusterService;
private final AnalysisRegistry analysisRegistry;
private final Environment environment;
Expand All @@ -59,12 +62,14 @@ public TextChunkingProcessor(
final String description,
final Map<String, Object> fieldMap,
final Map<String, Object> algorithmMap,
final boolean ignoreMissing,
final Environment environment,
final ClusterService clusterService,
final AnalysisRegistry analysisRegistry
) {
super(tag, description);
this.fieldMap = fieldMap;
this.ignoreMissing = ignoreMissing;
this.environment = environment;
this.clusterService = clusterService;
this.analysisRegistry = analysisRegistry;
Expand Down Expand Up @@ -250,8 +255,11 @@ private void chunkMapType(
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);

IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
if (!(ignoreMissing && chunkObject == null)) {
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
}
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
import static org.opensearch.ingest.ConfigurationUtils.readMap;
import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;

/**
* Factory for chunking ingest processor for ingestion pipeline.
Expand Down Expand Up @@ -45,6 +48,16 @@ public TextChunkingProcessor create(
) throws Exception {
Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
return new TextChunkingProcessor(
martin-gaievski marked this conversation as resolved.
Show resolved Hide resolved
processorTag,
description,
fieldMap,
algorithmMap,
ignoreMissing,
environment,
clusterService,
analysisRegistry
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json";

private static final String TEST_LONG_DOCUMENT = "processor/chunker/TextChunkingTestLongDocument.json";
private static final String TEST_MISSING_FIELD_DOCUMENT = "processor/chunker/TextChunkingTestMissingFieldDocument.json";

IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
private static final String IGNORE_MISSING_PIPELINE_NAME = "pipeline-with-ignore-missing";

private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME,
Expand All @@ -59,7 +62,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
DELIMITER_PIPELINE_NAME,
"processor/chunker/PipelineForDelimiterChunker.json",
CASCADE_PIPELINE_NAME,
"processor/chunker/PipelineForCascadedChunker.json"
"processor/chunker/PipelineForCascadedChunker.json",
IGNORE_MISSING_PIPELINE_NAME,
"processor/chunker/PipelineWithIgnoreMissing.json"
);

@Before
Expand Down Expand Up @@ -176,6 +181,33 @@ public void testTextChunkingProcessor_withCascadePipeline_successful() {
}
}

@SneakyThrows
public void testTextChunkingProcessor_withIgnoreMissing() {
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
try {
createPipelineProcessor(IGNORE_MISSING_PIPELINE_NAME);
createTextChunkingIndex(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME);
ingestDocument(TEST_MISSING_FIELD_DOCUMENT);

validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, null);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
} finally {
wipeOfTestResources(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME, null, null);
}
}

@SneakyThrows
public void testTextChunkingProcessor_withoutIgnoreMissing() {
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
try {
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
ingestDocument(TEST_MISSING_FIELD_DOCUMENT);

List<String> expectedPassages = new ArrayList<>();
validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
} finally {
wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, null, null);
}
}

private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
assertEquals(1, getDocCount(indexName));
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"description": "An example fixed token length chunker pipeline with ignore missing == true",
"processors" : [
{
"text_chunking": {
"ignore_missing": true,
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "letter"
}
}
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
"name": "OpenSearch"
}
Loading