From 354d0a39ec89615143d9174ab555f444844658a0 Mon Sep 17 00:00:00 2001
From: Mike Pellegrini
Date: Fri, 3 Jan 2025 10:24:24 -0500
Subject: [PATCH] Semantic text - Clear inference results on explicit nulls
 (#119463)

Fix a bug where setting a semantic_text source field explicitly to null in an
update request to clear inference results did not actually clear the inference
results for that field. This bug only affects the new _inference_fields format.
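Reviewer note: the failing sequence is easiest to see as two consecutive bulk
updates. A minimal repro sketch (Java string literals only, the client call is
omitted; index and field names mirror the YAML tests below, the class name is
illustrative):

    class ExplicitNullRepro {
        // NDJSON bodies for POST /_bulk. Step 1 computes and stores inference
        // results for sparse_field under the _inference_fields metadata field.
        static final String COMPUTE = """
            {"update": {"_index": "test-index", "_id": "doc_1"}}
            {"doc": {"sparse_field": "inference test"}, "doc_as_upsert": true}
            """;

        // Step 2: clear the field with an explicit null. Before this patch, with
        // the new _inference_fields format, the chunks computed in step 1 survived
        // this update; now it writes an empty chunk list that overwrites them.
        static final String CLEAR = """
            {"update": {"_index": "test-index", "_id": "doc_1"}}
            {"doc": {"sparse_field": null}, "doc_as_upsert": true}
            """;
    }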
---
 .../xpack/inference/InferenceFeatures.java    |   4 +-
 .../ShardBulkInferenceActionFilter.java       |  55 +++-
 ...SemanticInferenceMetadataFieldsMapper.java |   3 +
 .../inference/mapper/SemanticTextField.java   |   9 +-
 .../mapper/SemanticTextFieldMapper.java       |  13 +-
 .../ShardBulkInferenceActionFilterTests.java  | 131 ++++++++
 .../mapper/SemanticTextFieldMapperTests.java  |  29 ++
 .../60_semantic_text_inference_update.yml     | 288 +++++++++++++-----
 .../60_semantic_text_inference_update_bwc.yml |  25 ++
 9 files changed, 461 insertions(+), 96 deletions(-)

diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
index 3fd671383369d..fc9d2641a8cfc 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
@@ -9,6 +9,7 @@
 
 import org.elasticsearch.features.FeatureSpecification;
 import org.elasticsearch.features.NodeFeature;
+import org.elasticsearch.xpack.inference.mapper.SemanticInferenceMetadataFieldsMapper;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
 import org.elasticsearch.xpack.inference.rank.random.RandomRankRetrieverBuilder;
 import org.elasticsearch.xpack.inference.rank.textsimilarity.TextSimilarityRankRetrieverBuilder;
@@ -46,7 +47,8 @@ public Set<NodeFeature> getTestFeatures() {
             SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX,
             SEMANTIC_TEXT_HIGHLIGHTER,
             SEMANTIC_MATCH_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
-            SEMANTIC_SPARSE_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED
+            SEMANTIC_SPARSE_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
+            SemanticInferenceMetadataFieldsMapper.EXPLICIT_NULL_FIXES
         );
     }
 }
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
index 22d6157b335ca..f4aa49bad1648 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java
@@ -39,6 +39,7 @@
 import org.elasticsearch.inference.UnparsedModel;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
+import org.elasticsearch.xcontent.XContent;
 import org.elasticsearch.xpack.core.inference.results.ChunkedInferenceError;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextField;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
@@ -50,6 +51,7 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -67,6 +69,8 @@
  */
 public class ShardBulkInferenceActionFilter implements MappedActionFilter {
     protected static final int DEFAULT_BATCH_SIZE = 512;
+    private static final Object EXPLICIT_NULL = new Object();
+    private static final ChunkedInference EMPTY_CHUNKED_INFERENCE = new EmptyChunkedInference();
 
     private final ClusterService clusterService;
     private final InferenceServiceRegistry inferenceServiceRegistry;
@@ -393,11 +397,22 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
         for (var entry : response.responses.entrySet()) {
             var fieldName = entry.getKey();
             var responses = entry.getValue();
-            var model = responses.get(0).model();
+            Model model = null;
+
+            InferenceFieldMetadata inferenceFieldMetadata = fieldInferenceMap.get(fieldName);
+            if (inferenceFieldMetadata == null) {
+                throw new IllegalStateException("No inference field metadata for field [" + fieldName + "]");
+            }
+
             // ensure that the order in the original field is consistent in case of multiple inputs
             Collections.sort(responses, Comparator.comparingInt(FieldInferenceResponse::inputOrder));
             Map<String, List<SemanticTextField.Chunk>> chunkMap = new LinkedHashMap<>();
             for (var resp : responses) {
+                // Get the first non-null model from the response list
+                if (model == null) {
+                    model = resp.model;
+                }
+
                 var lst = chunkMap.computeIfAbsent(resp.sourceField, k -> new ArrayList<>());
                 lst.addAll(
                     SemanticTextField.toSemanticTextFieldChunks(
@@ -409,21 +424,26 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
                     )
                 );
             }
+
             List<String> inputs = responses.stream()
                 .filter(r -> r.sourceField().equals(fieldName))
                 .map(r -> r.input)
                 .collect(Collectors.toList());
+
+            // The model can be null if we are only processing update requests that clear inference results. This is ok because we will
+            // merge in the field's existing model settings on the data node.
             var result = new SemanticTextField(
                 useLegacyFormat,
                 fieldName,
                 useLegacyFormat ? inputs : null,
                 new SemanticTextField.InferenceResult(
-                    model.getInferenceEntityId(),
-                    new SemanticTextField.ModelSettings(model),
+                    inferenceFieldMetadata.getInferenceId(),
+                    model != null ? new SemanticTextField.ModelSettings(model) : null,
                     chunkMap
                 ),
                 indexRequest.getContentType()
             );
+
             if (useLegacyFormat) {
                 SemanticTextUtils.insertValue(fieldName, newDocMap, result);
             } else {
@@ -490,7 +510,8 @@ private Map<ShardId, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
             } else {
                 var inferenceMetadataFieldsValue = XContentMapValues.extractValue(
                     InferenceMetadataFieldsMapper.NAME + "." + field,
-                    docMap
+                    docMap,
+                    EXPLICIT_NULL
                 );
                 if (inferenceMetadataFieldsValue != null) {
                     // Inference has already been computed
@@ -500,9 +521,22 @@ private Map<ShardId, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
             int order = 0;
             for (var sourceField : entry.getSourceFields()) {
-                // TODO: Detect when the field is provided with an explicit null value
-                var valueObj = XContentMapValues.extractValue(sourceField, docMap);
-                if (valueObj == null) {
+                var valueObj = XContentMapValues.extractValue(sourceField, docMap, EXPLICIT_NULL);
+                if (useLegacyFormat == false && isUpdateRequest && valueObj == EXPLICIT_NULL) {
+                    /**
+                     * It's an update request, and the source field is explicitly set to null,
+                     * so we need to propagate this information to the inference fields metadata
+                     * to overwrite any inference previously computed on the field.
+                     * This ensures that the field is treated as intentionally cleared,
+                     * preventing any unintended carryover of prior inference results.
+                     */
+                    var slot = ensureResponseAccumulatorSlot(itemIndex);
+                    slot.addOrUpdateResponse(
+                        new FieldInferenceResponse(field, sourceField, null, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
+                    );
+                    continue;
+                }
+                if (valueObj == null || valueObj == EXPLICIT_NULL) {
                     if (isUpdateRequest && useLegacyFormat) {
                         addInferenceResponseFailure(
                             item.id(),
@@ -552,4 +586,11 @@ static IndexRequest getIndexRequestOrNull(DocWriteRequest<?> docWriteRequest) {
             return null;
         }
     }
+
+    private static class EmptyChunkedInference implements ChunkedInference {
+        @Override
+        public Iterator<Chunk> chunksAsMatchedTextAndByteReference(XContent xcontent) {
+            return Collections.emptyIterator();
+        }
+    }
 }
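The fix pivots on the three-argument XContentMapValues.extractValue overload
used above: it returns the supplied placeholder when the path is present with
an explicit null value, and plain null when the path is absent, which is
exactly the distinction the old two-argument call could not make. A standalone
sketch of that contract (flat map only and hypothetical class name; the real
helper also walks dotted paths):

    import java.util.HashMap;
    import java.util.Map;

    public class ExplicitNullSentinelDemo {
        private static final Object EXPLICIT_NULL = new Object();

        // Returns the placeholder for "key present, value null", plain null for "key absent".
        static Object extractValue(Map<String, Object> map, String key, Object nullPlaceholder) {
            if (map.containsKey(key) == false) {
                return null;
            }
            Object value = map.get(key);
            return value == null ? nullPlaceholder : value;
        }

        public static void main(String[] args) {
            Map<String, Object> doc = new HashMap<>();
            doc.put("sparse_field", null); // explicit null: the user wants the field cleared

            // Explicit null -> sentinel -> emit an empty chunk list for the field.
            System.out.println(extractValue(doc, "sparse_field", EXPLICIT_NULL) == EXPLICIT_NULL); // true
            // Absent field -> null -> nothing to do for this field.
            System.out.println(extractValue(doc, "dense_field", EXPLICIT_NULL)); // null
        }
    }

Comparing with reference identity (==) rather than equals makes the sentinel
unforgeable: no value parsed out of a user document can ever compare equal to
the private EXPLICIT_NULL instance.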
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java
index 7a1a9b056d0a1..3f49973d6e35f 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java
@@ -12,6 +12,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.join.BitSetProducer;
 import org.elasticsearch.common.xcontent.XContentParserUtils;
+import org.elasticsearch.features.NodeFeature;
 import org.elasticsearch.index.mapper.ContentPath;
 import org.elasticsearch.index.mapper.DocumentParserContext;
 import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper;
@@ -38,6 +39,8 @@ public class SemanticInferenceMetadataFieldsMapper extends InferenceMetadataFieldsMapper {
     private static final SemanticInferenceMetadataFieldsMapper INSTANCE = new SemanticInferenceMetadataFieldsMapper();
 
+    public static final NodeFeature EXPLICIT_NULL_FIXES = new NodeFeature("semantic_text.inference_metadata_fields.explicit_null_fixes");
+
     public static final TypeParser PARSER = new FixedTypeParser(
         c -> InferenceMetadataFieldsMapper.isEnabled(c.getSettings()) ? INSTANCE : null
     );
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java
index d99889f11d3f2..78ecacc09c1a7 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java
@@ -338,16 +338,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
 
     static {
         SEMANTIC_TEXT_FIELD_PARSER.declareStringArray(optionalConstructorArg(), new ParseField(TEXT_FIELD));
-        SEMANTIC_TEXT_FIELD_PARSER.declareObject(
-            constructorArg(),
-            (p, c) -> INFERENCE_RESULT_PARSER.parse(p, c),
-            new ParseField(INFERENCE_FIELD)
-        );
+        SEMANTIC_TEXT_FIELD_PARSER.declareObject(constructorArg(), INFERENCE_RESULT_PARSER, new ParseField(INFERENCE_FIELD));
 
         INFERENCE_RESULT_PARSER.declareString(constructorArg(), new ParseField(INFERENCE_ID_FIELD));
-        INFERENCE_RESULT_PARSER.declareObject(
+        INFERENCE_RESULT_PARSER.declareObjectOrNull(
             constructorArg(),
             (p, c) -> MODEL_SETTINGS_PARSER.parse(p, null),
+            null,
             new ParseField(MODEL_SETTINGS_FIELD)
         );
         INFERENCE_RESULT_PARSER.declareField(constructorArg(), (p, c) -> {
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
index b47c55c302273..690a136c566e0 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java
@@ -384,6 +384,17 @@ void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextFiel
             mapper = this;
         }
 
+        if (mapper.fieldType().getModelSettings() == null) {
+            for (var chunkList : field.inference().chunks().values()) {
+                if (chunkList.isEmpty() == false) {
+                    throw new DocumentParsingException(
+                        xContentLocation,
+                        "[" + MODEL_SETTINGS_FIELD + "] must be set for field [" + fullFieldName + "] when chunks are provided"
+                    );
+                }
+            }
+        }
+
         var chunksField = mapper.fieldType().getChunksField();
         var embeddingsField = mapper.fieldType().getEmbeddingsField();
         var offsetsField = mapper.fieldType().getOffsetsField();
@@ -895,7 +906,7 @@ private static boolean canMergeModelSettings(
         if (Objects.equals(previous, current)) {
             return true;
         }
-        if (previous == null) {
+        if (previous == null || current == null) {
             return true;
         }
         conflicts.addConflict("model_settings", "");
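canMergeModelSettings now tolerates null on either side, since a clearing
update carries no model and the field's existing settings are merged back in
on the data node. A standalone restatement of the relaxed rule (simplified and
hypothetical: strings stand in for ModelSettings, and conflict recording is
reduced to the boolean result):

    import java.util.Objects;

    public class ModelSettingsMergeDemo {
        static boolean canMerge(Object previous, Object current) {
            if (Objects.equals(previous, current)) {
                return true;
            }
            // Null on either side is mergeable: settings are unknown until the first
            // inference runs, or the incoming update cleared the field and carries none.
            return previous == null || current == null;
        }

        public static void main(String[] args) {
            System.out.println(canMerge(null, "sparse_embedding"));             // true (first inference)
            System.out.println(canMerge("sparse_embedding", null));             // true (newly allowed by this patch)
            System.out.println(canMerge("sparse_embedding", "text_embedding")); // false -> model_settings conflict
        }
    }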
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
index 478c81f7c5a32..0432a2ff3fc9e 100644
--- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java
@@ -18,6 +18,7 @@
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.action.support.ActionFilterChain;
 import org.elasticsearch.action.support.WriteRequest;
+import org.elasticsearch.action.update.UpdateRequest;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
@@ -67,6 +68,8 @@
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.awaitLatch;
 import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.DEFAULT_BATCH_SIZE;
 import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.getIndexRequestOrNull;
+import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getChunksFieldName;
+import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getOriginalTextFieldName;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomChunkedInferenceEmbeddingSparse;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomSemanticText;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomSemanticTextInput;
@@ -75,12 +78,15 @@
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
 import static org.mockito.Mockito.any;
 import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
 public class ShardBulkInferenceActionFilterTests extends ESTestCase {
+    private static final Object EXPLICIT_NULL = new Object();
+
     private final boolean useLegacyFormat;
     private ThreadPool threadPool;
 
@@ -205,6 +211,11 @@ public void testItemFailures() throws Exception {
                     XContentMapValues.extractValue(useLegacyFormat ? "field1.text" : "field1", actualRequest.sourceAsMap()),
                     equalTo("I am a success")
                 );
+                if (useLegacyFormat == false) {
+                    assertNotNull(
+                        XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME + ".field1", actualRequest.sourceAsMap())
+                    );
+                }
 
                 // item 2 is a failure
                 assertNotNull(bulkShardRequest.items()[2].getPrimaryResponse());
@@ -232,6 +243,79 @@ public void testItemFailures() throws Exception {
         awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
     }
 
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    public void testExplicitNull() throws Exception {
+        StaticModel model = StaticModel.createRandomInstance();
+        model.putResult("I am a failure", new ChunkedInferenceError(new IllegalArgumentException("boom")));
+        model.putResult("I am a success", randomChunkedInferenceEmbeddingSparse(List.of("I am a success")));
+
+        ShardBulkInferenceActionFilter filter = createFilter(
+            threadPool,
+            Map.of(model.getInferenceEntityId(), model),
+            randomIntBetween(1, 10),
+            useLegacyFormat
+        );
+
+        CountDownLatch chainExecuted = new CountDownLatch(1);
+        ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
+            try {
+                BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
+                assertNull(bulkShardRequest.getInferenceFieldMap());
+                assertThat(bulkShardRequest.items().length, equalTo(5));
+
+                // item 0
+                assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
+                IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
+                assertThat(XContentMapValues.extractValue("obj.field1", actualRequest.sourceAsMap(), EXPLICIT_NULL), is(EXPLICIT_NULL));
+                assertNull(XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME, actualRequest.sourceAsMap(), EXPLICIT_NULL));
+
+                // item 1 is a success
+                assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", "I am a success", 1);
+
+                // item 2 is a failure
+                assertNotNull(bulkShardRequest.items()[2].getPrimaryResponse());
+                assertTrue(bulkShardRequest.items()[2].getPrimaryResponse().isFailed());
+                var failure = bulkShardRequest.items()[2].getPrimaryResponse().getFailure();
+                assertThat(failure.getCause().getCause().getMessage(), containsString("boom"));
+
+                // item 3
+                assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
+
+                // item 4
+                assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[4].request());
+                assertNull(XContentMapValues.extractValue("obj.field1", actualRequest.sourceAsMap(), EXPLICIT_NULL));
+                assertNull(XContentMapValues.extractValue(InferenceMetadataFieldsMapper.NAME, actualRequest.sourceAsMap(), EXPLICIT_NULL));
+            } finally {
+                chainExecuted.countDown();
+            }
+        };
+        ActionListener actionListener = mock(ActionListener.class);
+        Task task = mock(Task.class);
+
+        Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
+            "obj.field1",
+            new InferenceFieldMetadata("obj.field1", model.getInferenceEntityId(), new String[] { "obj.field1" })
+        );
+        Map<String, Object> sourceWithNull = new HashMap<>();
+        sourceWithNull.put("field1", null);
+
+        BulkItemRequest[] items = new BulkItemRequest[5];
+        items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("obj", sourceWithNull)));
+        items[1] = new BulkItemRequest(1, new IndexRequest("index").source("obj.field1", "I am a success"));
+        items[2] = new BulkItemRequest(2, new IndexRequest("index").source("obj.field1", "I am a failure"));
+        items[3] = new BulkItemRequest(3, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("obj", sourceWithNull))));
+        items[4] = new BulkItemRequest(4, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("field2", "value"))));
+        BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
+        request.setInferenceFieldMap(inferenceFieldMap);
+        filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
+        awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
+    }
+
     @SuppressWarnings({ "unchecked", "rawtypes" })
     public void testManyRandomDocs() throws Exception {
         Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -435,6 +519,53 @@ private static BulkItemRequest[] randomBulkItemRequest(
             new BulkItemRequest(requestId, new IndexRequest("index").source(expectedDocMap, requestContentType)) };
     }
 
+    @SuppressWarnings({ "unchecked" })
+    private static void assertInferenceResults(
+        boolean useLegacyFormat,
+        IndexRequest request,
+        String fieldName,
+        Object expectedOriginalValue,
+        int expectedChunkCount
+    ) {
+        final Map<String, Object> requestMap = request.sourceAsMap();
+        if (useLegacyFormat) {
+            assertThat(
+                XContentMapValues.extractValue(getOriginalTextFieldName(fieldName), requestMap, EXPLICIT_NULL),
+                equalTo(expectedOriginalValue)
+            );
+
+            List chunks = (List) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
+            if (expectedChunkCount > 0) {
+                assertNotNull(chunks);
+                assertThat(chunks.size(), equalTo(expectedChunkCount));
+            } else {
+                // If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not
+                // be transformed, and thus the semantic text field structure should not be created.
+                assertNull(chunks);
+            }
+        } else {
+            assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
+
+            Map inferenceMetadataFields = (Map) XContentMapValues.extractValue(
+                InferenceMetadataFieldsMapper.NAME,
+                requestMap,
+                EXPLICIT_NULL
+            );
+            assertNotNull(inferenceMetadataFields);
+
+            // When using the inference metadata fields format, chunks are mapped by source field. We handle clearing inference results
+            // for a field by emitting an empty chunk list for it. This is done to prevent the clear operation from clearing inference
+            // results for other source fields.
+            List chunks = (List) XContentMapValues.extractValue(
+                getChunksFieldName(fieldName) + "." + fieldName,
+                inferenceMetadataFields,
+                EXPLICIT_NULL
+            );
+            assertNotNull(chunks);
+            assertThat(chunks.size(), equalTo(expectedChunkCount));
+        }
+    }
+
     private static class StaticModel extends TestModel {
         private final Map<String, ChunkedInference> resultMap;
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
index 09073b800f009..ddc697881eccb 100644
--- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
@@ -770,6 +770,35 @@ public void testDenseVectorElementType() throws IOException {
         assertMapperService.accept(byteMapperService, DenseVectorFieldMapper.ElementType.BYTE);
     }
 
+    public void testModelSettingsRequiredWithChunks() throws IOException {
+        // Create inference results where model settings are set to null and chunks are provided
+        Model model = TestModel.createRandomInstance(TaskType.SPARSE_EMBEDDING);
+        SemanticTextField randomSemanticText = randomSemanticText(useLegacyFormat, "field", model, List.of("a"), XContentType.JSON);
+        SemanticTextField inferenceResults = new SemanticTextField(
+            randomSemanticText.useLegacyFormat(),
+            randomSemanticText.fieldName(),
+            randomSemanticText.originalValues(),
+            new SemanticTextField.InferenceResult(
+                randomSemanticText.inference().inferenceId(),
+                null,
+                randomSemanticText.inference().chunks()
+            ),
+            randomSemanticText.contentType()
+        );
+
+        MapperService mapperService = createMapperService(
+            mapping(b -> addSemanticTextMapping(b, "field", model.getInferenceEntityId(), null)),
+            useLegacyFormat
+        );
+        SourceToParse source = source(b -> addSemanticTextInferenceResults(useLegacyFormat, b, List.of(inferenceResults)));
+        DocumentParsingException ex = expectThrows(
+            DocumentParsingException.class,
+            DocumentParsingException.class,
+            () -> mapperService.documentMapper().parse(source)
+        );
+        assertThat(ex.getMessage(), containsString("[model_settings] must be set for field [field] when chunks are provided"));
+    }
+
     private MapperService mapperServiceForFieldWithModelSettings(
         String fieldName,
         String inferenceId,
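The helper's comment above captures the key invariant of the new format:
chunks are keyed by source field, and clearing is expressed as an empty chunk
list for just the cleared key. A map-level sketch of why that leaves copy_to
siblings intact (field names borrowed from the YAML tests below; strings stand
in for chunk objects):

    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class ChunkMapClearDemo {
        public static void main(String[] args) {
            // Shape of _inference_fields.sparse_field.inference.chunks, keyed by source field.
            Map<String, List<String>> chunks = new LinkedHashMap<>();
            chunks.put("sparse_field", List.of("chunk computed from sparse_field"));
            chunks.put("sparse_source_field", List.of("chunk copied in via copy_to"));

            // Explicit null on sparse_field: overwrite only that key with an empty list.
            chunks.put("sparse_field", List.of());

            System.out.println(chunks);
            // {sparse_field=[], sparse_source_field=[chunk copied in via copy_to]}
        }
    }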
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml
index 660d3e37f4242..27c405f6c23bf 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml
@@ -819,84 +819,210 @@ setup:
   - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
   - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 30 }
 
-# TODO: Uncomment this test once we implement a fix
-#---
-#"Bypass inference on bulk update operation":
-#  # Update as upsert
-#  - do:
-#      bulk:
-#        body:
-#          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
-#          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
-#
-#  - match: { errors: false }
-#  - match: { items.0.update.result: "created" }
-#
-#  - do:
-#      bulk:
-#        body:
-#          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
-#          - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}'
-#        refresh: true
-#
-#  - match: { errors: false }
-#  - match: { items.0.update.result: "updated" }
-#
-#  - do:
-#      search:
-#        index: test-index
-#        body:
-#          fields: [ _inference_fields ]
-#          query:
-#            match_all: { }
-#
-#  - match: { hits.total.value: 1 }
-#  - match: { hits.total.relation: eq }
-#
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 }
-#  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
-#  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
-#  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
-#
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 }
-#  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings
-#  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
-#  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 }
-#
-#  - match: { hits.hits.0._source.sparse_field: "inference test" }
-#  - match: { hits.hits.0._source.dense_field: "another inference test" }
-#  - match: { hits.hits.0._source.non_inference_field: "another value" }
-#
-#  - do:
-#      bulk:
-#        body:
-#          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
-#          - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}'
-#        refresh: true
-#
-#  - match: { errors: false }
-#  - match: { items.0.update.result: "updated" }
-#
-#  - do:
-#      search:
-#        index: test-index
-#        body:
-#          fields: [ _inference_fields ]
-#          query:
-#            match_all: { }
-#
-#  - match: { hits.total.value: 1 }
-#  - match: { hits.total.relation: eq }
-#
-#  # TODO: BUG! Setting sparse_field & dense_field to null does not clear _inference_fields
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 0 }
-#
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
-#  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 0 }
-#
-#  - not_exists: hits.hits.0._source.sparse_field
-#  - not_exists: hits.hits.0._source.dense_field
-#  - match: { hits.hits.0._source.non_inference_field: "updated value" }
+---
+"Bypass inference on bulk update operation":
+  # Update as upsert
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "created" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}'
+        refresh: true
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      search:
+        index: test-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
+
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 }
+
+  - match: { hits.hits.0._source.sparse_field: "inference test" }
+  - match: { hits.hits.0._source.dense_field: "another inference test" }
+  - match: { hits.hits.0._source.non_inference_field: "another value" }
+
+---
+"Explicit nulls clear inference results on bulk update operation":
+  - requires:
+      cluster_features: "semantic_text.inference_metadata_fields.explicit_null_fixes"
+      reason: Fixes explicit null handling when using the _inference_fields metafield
+
+  - skip:
+      features: [ "headers" ]
+
+  - do:
+      indices.create:
+        index: test-copy-to-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              sparse_source_field:
+                type: text
+                copy_to: sparse_field
+              dense_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+              dense_source_field:
+                type: text
+                copy_to: dense_field
+              non_inference_field:
+                type: text
+
+  - do:
+      index:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          sparse_field: "inference test"
+          sparse_source_field: "sparse source test"
+          dense_field: "another inference test"
+          dense_source_field: "dense source test"
+          non_inference_field: "non inference test"
+        refresh: true
+
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      search:
+        index: test-copy-to-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings
+  - set: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings: sparse_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 18 }
+
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 2 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 }
+  - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings
+  - set: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings: dense_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 17 }
+
+  - match: { hits.hits.0._source.sparse_field: "inference test" }
+  - match: { hits.hits.0._source.sparse_source_field: "sparse source test" }
+  - match: { hits.hits.0._source.dense_field: "another inference test" }
+  - match: { hits.hits.0._source.dense_source_field: "dense source test" }
+  - match: { hits.hits.0._source.non_inference_field: "non inference test" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-copy-to-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}'
+        refresh: true
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      headers:
+        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
+        Content-Type: application/json
+      search:
+        index: test-copy-to-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings: $sparse_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 18 }
+
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings: $dense_source_field_embeddings }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 17 }
+
+  - not_exists: hits.hits.0._source.sparse_field
+  - match: { hits.hits.0._source.sparse_source_field: "sparse source test" }
+  - not_exists: hits.hits.0._source.dense_field
+  - match: { hits.hits.0._source.dense_source_field: "dense source test" }
+  - match: { hits.hits.0._source.non_inference_field: "updated value" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-copy-to-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_source_field": null, "dense_source_field": null, "non_inference_field": "another value" }, "doc_as_upsert": true}'
+        refresh: true
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      search:
+        index: test-copy-to-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.total.relation: eq }
+
+  - not_exists: hits.hits.0._source._inference_fields
+  - not_exists: hits.hits.0._source.sparse_field
+  - not_exists: hits.hits.0._source.sparse_source_field
+  - not_exists: hits.hits.0._source.dense_field
+  - not_exists: hits.hits.0._source.dense_source_field
+  - match: { hits.hits.0._source.non_inference_field: "another value" }
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml
index 6b494d531b2d1..912cdb5a85d35 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml
@@ -632,6 +632,31 @@ setup:
   - match: { _source.dense_field.inference.chunks.0.text: "another inference test" }
   - match: { _source.non_inference_field: "another value" }
 
+---
+"Explicit nulls clear inference results on bulk update operation":
+  # Update as upsert
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "created" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "inference test" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.sparse_field.inference.chunks.0.text: "inference test" }
+  - match: { _source.dense_field.text: "another inference test" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.inference.chunks.0.text: "another inference test" }
+  - match: { _source.non_inference_field: "non inference test" }
+
+  - do:
+      bulk:
+        body: