From 354d0a39ec89615143d9174ab555f444844658a0 Mon Sep 17 00:00:00 2001
From: Mike Pellegrini
Date: Fri, 3 Jan 2025 10:24:24 -0500
Subject: [PATCH] Semantic text - Clear inference results on explicit nulls
 (#119463)

Fix a bug where setting a semantic_text source field explicitly to null in an
update request to clear inference results did not actually clear the inference
results for that field. This bug only affects the new _inference_fields format.
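Reviewer note: the failing sequence is easiest to see as two consecutive bulk
updates. A minimal repro sketch (Java string literals only, the client call is
omitted; index and field names mirror the YAML tests below, the class name is
illustrative):

    class ExplicitNullRepro {
        // NDJSON bodies for POST /_bulk. Step 1 computes and stores inference
        // results for sparse_field under the _inference_fields metadata field.
        static final String COMPUTE = """
            {"update": {"_index": "test-index", "_id": "doc_1"}}
            {"doc": {"sparse_field": "inference test"}, "doc_as_upsert": true}
            """;

        // Step 2: clear the field with an explicit null. Before this patch, with
        // the new _inference_fields format, the chunks computed in step 1 survived
        // this update; now it writes an empty chunk list that overwrites them.
        static final String CLEAR = """
            {"update": {"_index": "test-index", "_id": "doc_1"}}
            {"doc": {"sparse_field": null}, "doc_as_upsert": true}
            """;
    }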
---
 .../xpack/inference/InferenceFeatures.java    |   4 +-
 .../ShardBulkInferenceActionFilter.java       |  55 +++-
 ...SemanticInferenceMetadataFieldsMapper.java |   3 +
 .../inference/mapper/SemanticTextField.java   |   9 +-
 .../mapper/SemanticTextFieldMapper.java       |  13 +-
 .../ShardBulkInferenceActionFilterTests.java  | 131 ++++++++
 .../mapper/SemanticTextFieldMapperTests.java  |  29 ++
 .../60_semantic_text_inference_update.yml     | 288 +++++++++++++-----
 .../60_semantic_text_inference_update_bwc.yml |  25 ++
 9 files changed, 461 insertions(+), 96 deletions(-)

diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
index 3fd671383369d..fc9d2641a8cfc 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
@@ -9,6 +9,7 @@
 
 import org.elasticsearch.features.FeatureSpecification;
 import org.elasticsearch.features.NodeFeature;
+import org.elasticsearch.xpack.inference.mapper.SemanticInferenceMetadataFieldsMapper;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
 import org.elasticsearch.xpack.inference.rank.random.RandomRankRetrieverBuilder;
 import org.elasticsearch.xpack.inference.rank.textsimilarity.TextSimilarityRankRetrieverBuilder;
@@ -46,7 +47,8 @@ public Set<NodeFeature> getTestFeatures() {
             SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX,
             SEMANTIC_TEXT_HIGHLIGHTER,
             SEMANTIC_MATCH_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
-            SEMANTIC_SPARSE_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED
+            SEMANTIC_SPARSE_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
+            SemanticInferenceMetadataFieldsMapper.EXPLICIT_NULL_FIXES
         );
     }
 }
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
index 22d6157b335ca..f4aa49bad1648 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
@@ -39,6 +39,7 @@
 import org.elasticsearch.inference.UnparsedModel;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
+import org.elasticsearch.xcontent.XContent;
 import org.elasticsearch.xpack.core.inference.results.ChunkedInferenceError;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextField;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
@@ -50,6 +51,7 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -67,6 +69,8 @@
  */
 public class ShardBulkInferenceActionFilter implements MappedActionFilter {
     protected static final int DEFAULT_BATCH_SIZE = 512;
+    private static final Object EXPLICIT_NULL = new Object();
+    private static final ChunkedInference EMPTY_CHUNKED_INFERENCE = new EmptyChunkedInference();
 
     private final ClusterService clusterService;
     private final InferenceServiceRegistry inferenceServiceRegistry;
@@ -393,11 +397,22 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
         for (var entry : response.responses.entrySet()) {
             var fieldName = entry.getKey();
             var responses = entry.getValue();
-            var model = responses.get(0).model();
+            Model model = null;
+
+            InferenceFieldMetadata inferenceFieldMetadata = fieldInferenceMap.get(fieldName);
+            if (inferenceFieldMetadata == null) {
+                throw new IllegalStateException("No inference field metadata for field [" + fieldName + "]");
+            }
+
             // ensure that the order in the original field is consistent in case of multiple inputs
             Collections.sort(responses, Comparator.comparingInt(FieldInferenceResponse::inputOrder));
             Map<String, List<SemanticTextField.Chunk>> chunkMap = new LinkedHashMap<>();
             for (var resp : responses) {
+                // Get the first non-null model from the response list
+                if (model == null) {
+                    model = resp.model;
+                }
+
                 var lst = chunkMap.computeIfAbsent(resp.sourceField, k -> new ArrayList<>());
                 lst.addAll(
                     SemanticTextField.toSemanticTextFieldChunks(
@@ -409,21 +424,26 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
                     )
                 );
             }
+
             List<String> inputs = responses.stream()
                 .filter(r -> r.sourceField().equals(fieldName))
                 .map(r -> r.input)
                 .collect(Collectors.toList());
+
+            // The model can be null if we are only processing update requests that clear inference results. This is ok because we will
+            // merge in the field's existing model settings on the data node.
             var result = new SemanticTextField(
                 useLegacyFormat,
                 fieldName,
                 useLegacyFormat ? inputs : null,
                 new SemanticTextField.InferenceResult(
-                    model.getInferenceEntityId(),
-                    new SemanticTextField.ModelSettings(model),
+                    inferenceFieldMetadata.getInferenceId(),
+                    model != null ? new SemanticTextField.ModelSettings(model) : null,
                     chunkMap
                 ),
                 indexRequest.getContentType()
             );
+
             if (useLegacyFormat) {
                 SemanticTextUtils.insertValue(fieldName, newDocMap, result);
             } else {
@@ -490,7 +510,8 @@ private Map<ShardId, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
             } else {
                 var inferenceMetadataFieldsValue = XContentMapValues.extractValue(
                     InferenceMetadataFieldsMapper.NAME + "." + field,
-                    docMap
+                    docMap,
+                    EXPLICIT_NULL
                 );
                 if (inferenceMetadataFieldsValue != null) {
                     // Inference has already been computed
@@ -500,9 +521,22 @@ private Map<ShardId, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
             int order = 0;
             for (var sourceField : entry.getSourceFields()) {
-                // TODO: Detect when the field is provided with an explicit null value
-                var valueObj = XContentMapValues.extractValue(sourceField, docMap);
-                if (valueObj == null) {
+                var valueObj = XContentMapValues.extractValue(sourceField, docMap, EXPLICIT_NULL);
+                if (useLegacyFormat == false && isUpdateRequest && valueObj == EXPLICIT_NULL) {
+                    /**
+                     * It's an update request, and the source field is explicitly set to null,
+                     * so we need to propagate this information to the inference fields metadata
+                     * to overwrite any inference previously computed on the field.
+                     * This ensures that the field is treated as intentionally cleared,
+                     * preventing any unintended carryover of prior inference results.
+                     */
+                    var slot = ensureResponseAccumulatorSlot(itemIndex);
+                    slot.addOrUpdateResponse(
+                        new FieldInferenceResponse(field, sourceField, null, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
+                    );
+                    continue;
+                }
+                if (valueObj == null || valueObj == EXPLICIT_NULL) {
                     if (isUpdateRequest && useLegacyFormat) {
                         addInferenceResponseFailure(
                             item.id(),
@@ -552,4 +586,11 @@ static IndexRequest getIndexRequestOrNull(DocWriteRequest<?> docWriteRequest) {
             return null;
         }
     }
+
+    private static class EmptyChunkedInference implements ChunkedInference {
+        @Override
+        public Iterator<Chunk> chunksAsMatchedTextAndByteReference(XContent xcontent) {
+            return Collections.emptyIterator();
+        }
+    }
 }
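The fix pivots on the three-argument XContentMapValues.extractValue overload
used above: it returns the supplied placeholder when the path is present with
an explicit null value, and plain null when the path is absent, which is
exactly the distinction the old two-argument call could not make. A standalone
sketch of that contract (flat map only and hypothetical class name; the real
helper also walks dotted paths):

    import java.util.HashMap;
    import java.util.Map;

    public class ExplicitNullSentinelDemo {
        private static final Object EXPLICIT_NULL = new Object();

        // Returns the placeholder for "key present, value null", plain null for "key absent".
        static Object extractValue(Map<String, Object> map, String key, Object nullPlaceholder) {
            if (map.containsKey(key) == false) {
                return null;
            }
            Object value = map.get(key);
            return value == null ? nullPlaceholder : value;
        }

        public static void main(String[] args) {
            Map<String, Object> doc = new HashMap<>();
            doc.put("sparse_field", null); // explicit null: the user wants the field cleared

            // Explicit null -> sentinel -> emit an empty chunk list for the field.
            System.out.println(extractValue(doc, "sparse_field", EXPLICIT_NULL) == EXPLICIT_NULL); // true
            // Absent field -> null -> nothing to do for this field.
            System.out.println(extractValue(doc, "dense_field", EXPLICIT_NULL)); // null
        }
    }

Comparing with reference identity (==) rather than equals makes the sentinel
unforgeable: no value parsed out of a user document can ever compare equal to
the private EXPLICIT_NULL instance.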
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java
index 7a1a9b056d0a1..3f49973d6e35f 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java
@@ -12,6 +12,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.join.BitSetProducer;
 import org.elasticsearch.common.xcontent.XContentParserUtils;
+import org.elasticsearch.features.NodeFeature;
 import org.elasticsearch.index.mapper.ContentPath;
 import org.elasticsearch.index.mapper.DocumentParserContext;
 import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper;
@@ -38,6 +39,8 @@ public class SemanticInferenceMetadataFieldsMapper extends InferenceMetadataFieldsMapper {
     private static final SemanticInferenceMetadataFieldsMapper INSTANCE = new SemanticInferenceMetadataFieldsMapper();
 
+    public static final NodeFeature EXPLICIT_NULL_FIXES = new NodeFeature("semantic_text.inference_metadata_fields.explicit_null_fixes");
+
     public static final TypeParser PARSER = new FixedTypeParser(
         c -> InferenceMetadataFieldsMapper.isEnabled(c.getSettings()) ? INSTANCE : null
     );
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java
index d99889f11d3f2..78ecacc09c1a7 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java
@@ -338,16 +338,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
 
     static {
         SEMANTIC_TEXT_FIELD_PARSER.declareStringArray(optionalConstructorArg(), new ParseField(TEXT_FIELD));
-        SEMANTIC_TEXT_FIELD_PARSER.declareObject(
-            constructorArg(),
-            (p, c) -> INFERENCE_RESULT_PARSER.parse(p, c),
-            new ParseField(INFERENCE_FIELD)
-        );
+        SEMANTIC_TEXT_FIELD_PARSER.declareObject(constructorArg(), INFERENCE_RESULT_PARSER, new ParseField(INFERENCE_FIELD));
 
         INFERENCE_RESULT_PARSER.declareString(constructorArg(), new ParseField(INFERENCE_ID_FIELD));
-        INFERENCE_RESULT_PARSER.declareObject(
+        INFERENCE_RESULT_PARSER.declareObjectOrNull(
             constructorArg(),
             (p, c) -> MODEL_SETTINGS_PARSER.parse(p, null),
+            null,
             new ParseField(MODEL_SETTINGS_FIELD)
         );
         INFERENCE_RESULT_PARSER.declareField(constructorArg(), (p, c) -> {
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
index b47c55c302273..690a136c566e0 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
@@ -384,6 +384,17 @@ void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextFiel
             mapper = this;
         }
 
+        if (mapper.fieldType().getModelSettings() == null) {
+            for (var chunkList : field.inference().chunks().values()) {
+                if (chunkList.isEmpty() == false) {
+                    throw new DocumentParsingException(
+                        xContentLocation,
+                        "[" + MODEL_SETTINGS_FIELD + "] must be set for field [" + fullFieldName + "] when chunks are provided"
+                    );
+                }
+            }
+        }
+
         var chunksField = mapper.fieldType().getChunksField();
         var embeddingsField = mapper.fieldType().getEmbeddingsField();
         var offsetsField = mapper.fieldType().getOffsetsField();
@@ -895,7 +906,7 @@ private static boolean canMergeModelSettings(
         if (Objects.equals(previous, current)) {
             return true;
         }
-        if (previous == null) {
+        if (previous == null || current == null) {
             return true;
         }
         conflicts.addConflict("model_settings", "");
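canMergeModelSettings now tolerates null on either side, since a clearing
update carries no model and the field's existing settings are merged back in
on the data node. A standalone restatement of the relaxed rule (simplified and
hypothetical: strings stand in for ModelSettings, and conflict recording is
reduced to the boolean result):

    import java.util.Objects;

    public class ModelSettingsMergeDemo {
        static boolean canMerge(Object previous, Object current) {
            if (Objects.equals(previous, current)) {
                return true;
            }
            // Null on either side is mergeable: settings are unknown until the first
            // inference runs, or the incoming update cleared the field and carries none.
            return previous == null || current == null;
        }

        public static void main(String[] args) {
            System.out.println(canMerge(null, "sparse_embedding"));             // true (first inference)
            System.out.println(canMerge("sparse_embedding", null));             // true (newly allowed by this patch)
            System.out.println(canMerge("sparse_embedding", "text_embedding")); // false -> model_settings conflict
        }
    }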
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
index 478c81f7c5a32..0432a2ff3fc9e 100644
--- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
@@ -18,6 +18,7 @@
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.action.support.ActionFilterChain;
 import org.elasticsearch.action.support.WriteRequest;
+import org.elasticsearch.action.update.UpdateRequest;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
@@ -67,6 +68,8 @@
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.awaitLatch;
 import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.DEFAULT_BATCH_SIZE;
 import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.getIndexRequestOrNull;
+import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getChunksFieldName;
+import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getOriginalTextFieldName;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomChunkedInferenceEmbeddingSparse;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomSemanticText;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomSemanticTextInput;
@@ -75,12 +78,15 @@
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
 import static org.mockito.Mockito.any;
 import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
 public class ShardBulkInferenceActionFilterTests extends ESTestCase {
+    private static final Object EXPLICIT_NULL = new Object();
+
     private final boolean useLegacyFormat;
     private ThreadPool threadPool;
 
@@ -205,6 +211,11 @@ public void testItemFailures() throws Exception {
                     XContentMapValues.extractValue(useLegacyFormat ? "field1.text" : "field1", actualRequest.sourceAsMap()),
                     equalTo("I am a success")
                 );
+                if (useLegacyFormat == false) {
+                    assertNotNull(
+                        XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME + ".field1", actualRequest.sourceAsMap())
+                    );
+                }
 
                 // item 2 is a failure
                 assertNotNull(bulkShardRequest.items()[2].getPrimaryResponse());
@@ -232,6 +243,79 @@ public void testItemFailures() throws Exception {
         awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
     }
 
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    public void testExplicitNull() throws Exception {
+        StaticModel model = StaticModel.createRandomInstance();
+        model.putResult("I am a failure", new ChunkedInferenceError(new IllegalArgumentException("boom")));
+        model.putResult("I am a success", randomChunkedInferenceEmbeddingSparse(List.of("I am a success")));
+
+        ShardBulkInferenceActionFilter filter = createFilter(
+            threadPool,
+            Map.of(model.getInferenceEntityId(), model),
+            randomIntBetween(1, 10),
+            useLegacyFormat
+        );
+
+        CountDownLatch chainExecuted = new CountDownLatch(1);
+        ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
+            try {
+                BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
+                assertNull(bulkShardRequest.getInferenceFieldMap());
+                assertThat(bulkShardRequest.items().length, equalTo(5));
+
+                // item 0
+                assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
+                IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
+                assertThat(XContentMapValues.extractValue("obj.field1", actualRequest.sourceAsMap(), EXPLICIT_NULL), is(EXPLICIT_NULL));
+                assertNull(XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME, actualRequest.sourceAsMap(), EXPLICIT_NULL));
+
+                // item 1 is a success
+                assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", "I am a success", 1);
+
+                // item 2 is a failure
+                assertNotNull(bulkShardRequest.items()[2].getPrimaryResponse());
+                assertTrue(bulkShardRequest.items()[2].getPrimaryResponse().isFailed());
+                var failure = bulkShardRequest.items()[2].getPrimaryResponse().getFailure();
+                assertThat(failure.getCause().getCause().getMessage(), containsString("boom"));
+
+                // item 3
+                assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
+
+                // item 4
+                assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[4].request());
+                assertNull(XContentMapValues.extractValue("obj.field1", actualRequest.sourceAsMap(), EXPLICIT_NULL));
+                assertNull(XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME, actualRequest.sourceAsMap(), EXPLICIT_NULL));
+            } finally {
+                chainExecuted.countDown();
+            }
+        };
+        ActionListener actionListener = mock(ActionListener.class);
+        Task task = mock(Task.class);
+
+        Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
+            "obj.field1",
+            new InferenceFieldMetadata("obj.field1", model.getInferenceEntityId(), new String[] { "obj.field1" })
+        );
+        Map<String, Object> sourceWithNull = new HashMap<>();
+        sourceWithNull.put("field1", null);
+
+        BulkItemRequest[] items = new BulkItemRequest[5];
+        items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("obj", sourceWithNull)));
+        items[1] = new BulkItemRequest(1, new IndexRequest("index").source("obj.field1", "I am a success"));
+        items[2] = new BulkItemRequest(2, new IndexRequest("index").source("obj.field1", "I am a failure"));
+        items[3] = new BulkItemRequest(3, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("obj", sourceWithNull))));
+        items[4] = new BulkItemRequest(4, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("field2", "value"))));
+        BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
+        request.setInferenceFieldMap(inferenceFieldMap);
+        filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
+        awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
+    }
+
     @SuppressWarnings({ "unchecked", "rawtypes" })
     public void testManyRandomDocs() throws Exception {
         Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -435,6 +519,53 @@ private static BulkItemRequest[] randomBulkItemRequest(
             new BulkItemRequest(requestId, new IndexRequest("index").source(expectedDocMap, requestContentType)) };
     }
 
+    @SuppressWarnings({ "unchecked" })
+    private static void assertInferenceResults(
+        boolean useLegacyFormat,
+        IndexRequest request,
+        String fieldName,
+        Object expectedOriginalValue,
+        int expectedChunkCount
+    ) {
+        final Map<String, Object> requestMap = request.sourceAsMap();
+        if (useLegacyFormat) {
+            assertThat(
+                XContentMapValues.extractValue(getOriginalTextFieldName(fieldName), requestMap, EXPLICIT_NULL),
+                equalTo(expectedOriginalValue)
+            );
+
+            List chunks = (List) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
+            if (expectedChunkCount > 0) {
+                assertNotNull(chunks);
+                assertThat(chunks.size(), equalTo(expectedChunkCount));
+            } else {
+                // If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not
+                // be transformed, and thus the semantic text field structure should not be created.
+                assertNull(chunks);
+            }
+        } else {
+            assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
+
+            Map inferenceMetadataFields = (Map) XContentMapValues.extractValue(
+                InferenceMetadataFieldsMapper.NAME,
+                requestMap,
+                EXPLICIT_NULL
+            );
+            assertNotNull(inferenceMetadataFields);
+
+            // When using the inference metadata fields format, chunks are mapped by source field. We handle clearing inference results
+            // for a field by emitting an empty chunk list for it. This is done to prevent the clear operation from clearing inference
+            // results for other source fields.
+            List chunks = (List) XContentMapValues.extractValue(
+                getChunksFieldName(fieldName) + "." + fieldName,
+                inferenceMetadataFields,
+                EXPLICIT_NULL
+            );
+            assertNotNull(chunks);
+            assertThat(chunks.size(), equalTo(expectedChunkCount));
+        }
+    }
+
     private static class StaticModel extends TestModel {
         private final Map<String, ChunkedInference> resultMap;
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
index 09073b800f009..ddc697881eccb 100644
--- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
@@ -770,6 +770,35 @@ public void testDenseVectorElementType() throws IOException {
         assertMapperService.accept(byteMapperService, DenseVectorFieldMapper.ElementType.BYTE);
     }
 
+    public void testModelSettingsRequiredWithChunks() throws IOException {
+        // Create inference results where model settings are set to null and chunks are provided
+        Model model = TestModel.createRandomInstance(TaskType.SPARSE_EMBEDDING);
+        SemanticTextField randomSemanticText = randomSemanticText(useLegacyFormat, "field", model, List.of("a"), XContentType.JSON);
+        SemanticTextField inferenceResults = new SemanticTextField(
+            randomSemanticText.useLegacyFormat(),
+            randomSemanticText.fieldName(),
+            randomSemanticText.originalValues(),
+            new SemanticTextField.InferenceResult(
+                randomSemanticText.inference().inferenceId(),
+                null,
+                randomSemanticText.inference().chunks()
+            ),
+            randomSemanticText.contentType()
+        );
+
+        MapperService mapperService = createMapperService(
+            mapping(b -> addSemanticTextMapping(b, "field", model.getInferenceEntityId(), null)),
+            useLegacyFormat
+        );
+        SourceToParse source = source(b -> addSemanticTextInferenceResults(useLegacyFormat, b, List.of(inferenceResults)));
+        DocumentParsingException ex = expectThrows(
+            DocumentParsingException.class,
+            DocumentParsingException.class,
+            () -> mapperService.documentMapper().parse(source)
+        );
+        assertThat(ex.getMessage(), containsString("[model_settings] must be set for field [field] when chunks are provided"));
+    }
+
     private MapperService mapperServiceForFieldWithModelSettings(
         String fieldName,
         String inferenceId,
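The helper's comment above captures the key invariant of the new format:
chunks are keyed by source field, and clearing is expressed as an empty chunk
list for just the cleared key. A map-level sketch of why that leaves copy_to
siblings intact (field names borrowed from the YAML tests below; strings stand
in for chunk objects):

    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class ChunkMapClearDemo {
        public static void main(String[] args) {
            // Shape of _inference_fields.sparse_field.inference.chunks, keyed by source field.
            Map<String, List<String>> chunks = new LinkedHashMap<>();
            chunks.put("sparse_field", List.of("chunk computed from sparse_field"));
            chunks.put("sparse_source_field", List.of("chunk copied in via copy_to"));

            // Explicit null on sparse_field: overwrite only that key with an empty list.
            chunks.put("sparse_field", List.of());

            System.out.println(chunks);
            // {sparse_field=[], sparse_source_field=[chunk copied in via copy_to]}
        }
    }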
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml
index 660d3e37f4242..27c405f6c23bf 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml
@@ -819,84 +819,210 @@ setup:
   - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
   - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 30 }
 
-# TODO: Uncomment this test once we implement a fix
-#---
-#"Bypass inference on bulk update operation":
-#  # Update as upsert
-#  - do:
-#      bulk:
-#        body:
-#          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
-#          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
-#
-#  - match: { errors: false }
-#  - match: { items.0.update.result: "created" }
-#
-#  - do:
-#      bulk:
-#        body:
-#          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
-#          - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}'
-#        refresh: true
-#
-#  - match: { errors: false }
-#  - match: { items.0.update.result: "updated" }
-#
-#  - do:
-#      search:
-#        index: test-index
-#        body:
-#          fields: [ _inference_fields ]
-#          query:
-#            match_all: { }
-#
-#  - match: { hits.total.value: 1 }
-#  - match: { hits.total.relation: eq }
-#
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 }
-#  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
-#  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
-#  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
-#
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 }
-#  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings
-#  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
-#  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 }
-#
-#  - match: { hits.hits.0._source.sparse_field: "inference test" }
-#  - match: { hits.hits.0._source.dense_field: "another inference test" }
-#  - match: { hits.hits.0._source.non_inference_field: "another value" }
-#
-#  - do:
-#      bulk:
-#        body:
-#          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
-#          - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}'
-#        refresh: true
-#
-#  - match: { errors: false }
-#  - match: { items.0.update.result: "updated" }
-#
-#  - do:
-#      search:
-#        index: test-index
-#        body:
-#          fields: [ _inference_fields ]
-#          query:
-#            match_all: { }
-#
-#  - match: { hits.total.value: 1 }
-#  - match: { hits.total.relation: eq }
-#
-#  # TODO: BUG! Setting sparse_field & dense_field to null does not clear _inference_fields
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 0 }
-#
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 0 }
-#
-#  - not_exists: hits.hits.0._source.sparse_field
-#  - not_exists: hits.hits.0._source.dense_field
-#  - match: { hits.hits.0._source.non_inference_field: "updated value" }
+---
+"Bypass inference on bulk update operation":
+  # Update as upsert
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "created" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}'
+        refresh: true
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      search:
+        index: test-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
+
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 }
+
+  - match: { hits.hits.0._source.sparse_field: "inference test" }
+  - match: { hits.hits.0._source.dense_field: "another inference test" }
+  - match: { hits.hits.0._source.non_inference_field: "another value" }
+
+---
+"Explicit nulls clear inference results on bulk update operation":
+  - requires:
+      cluster_features: "semantic_text.inference_metadata_fields.explicit_null_fixes"
+      reason: Fixes explicit null handling when using the _inference_fields metafield
+
+  - skip:
+      features: [ "headers" ]
+
+  - do:
+      indices.create:
+        index: test-copy-to-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              sparse_source_field:
+                type: text
+                copy_to: sparse_field
+              dense_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+              dense_source_field:
+                type: text
+                copy_to: dense_field
+              non_inference_field:
+                type: text
+
+  - do:
+      index:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          sparse_field: "inference test"
+          sparse_source_field: "sparse source test"
+          dense_field: "another inference test"
+          dense_source_field: "dense source test"
+          non_inference_field: "non inference test"
+        refresh: true
+
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      search:
+        index: test-copy-to-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings
+  - set: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings: sparse_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 18 }
+
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 2 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings
+  - set: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings: dense_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 17 }
+
+  - match: { hits.hits.0._source.sparse_field: "inference test" }
+  - match: { hits.hits.0._source.sparse_source_field: "sparse source test" }
+  - match: { hits.hits.0._source.dense_field: "another inference test" }
+  - match: { hits.hits.0._source.dense_source_field: "dense source test" }
+  - match: { hits.hits.0._source.non_inference_field: "non inference test" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-copy-to-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}'
+        refresh: true
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      search:
+        index: test-copy-to-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings: $sparse_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 18 }
+
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings: $dense_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 17 }
+
+  - not_exists: hits.hits.0._source.sparse_field
+  - match: { hits.hits.0._source.sparse_source_field: "sparse source test" }
+  - not_exists: hits.hits.0._source.dense_field
+  - match: { hits.hits.0._source.dense_source_field: "dense source test" }
+  - match: { hits.hits.0._source.non_inference_field: "updated value" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-copy-to-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_source_field": null, "dense_source_field": null, "non_inference_field": "another value" }, "doc_as_upsert": true}'
+        refresh: true
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      search:
+        index: test-copy-to-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - not_exists: hits.hits.0._source._inference_fields
+  - not_exists: hits.hits.0._source.sparse_field
+  - not_exists: hits.hits.0._source.sparse_source_field
+  - not_exists: hits.hits.0._source.dense_field
+  - not_exists: hits.hits.0._source.dense_source_field
+  - match: { hits.hits.0._source.non_inference_field: "another value" }
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml
index 6b494d531b2d1..912cdb5a85d35 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml
@@ -632,6 +632,31 @@ setup:
   - match: { _source.dense_field.inference.chunks.0.text: "another inference test" }
   - match: { _source.non_inference_field: "another value" }
 
+---
+"Explicit nulls clear inference results on bulk update operation":
+  # Update as upsert
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "created" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "inference test" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.sparse_field.inference.chunks.0.text: "inference test" }
+  - match: { _source.dense_field.text: "another inference test" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.inference.chunks.0.text: "another inference test" }
+  - match: { _source.non_inference_field: "non inference test" }
+
+  - do:
+      bulk:
+        body: