Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feature/semantic-text] semantic text copy to support #106689

Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7aaa3b6
Revert "Extract interface from ModelRegistry so it can be used from s…
jimczi Mar 7, 2024
ebc26d2
inference as an action filter
jimczi Mar 14, 2024
86ddc9d
add more tests
jimczi Mar 18, 2024
dd73d01
Merge branch 'feature/semantic-text' into carlosdelest/semantic-text-…
carlosdelest Mar 19, 2024
c5de0da
Merge from feature branch
carlosdelest Mar 19, 2024
cf62b1b
Allow SemanticTextFieldMapper to be a multifield
carlosdelest Mar 19, 2024
f029015
Add multifields / copy_to tests to lookup and metadata
carlosdelest Mar 19, 2024
d3f9d86
First iteration for adding inference support for copy_to / multifields
carlosdelest Mar 19, 2024
140caa3
Add tests for copy_to
carlosdelest Mar 19, 2024
7d1c92a
Spotless
carlosdelest Mar 19, 2024
e023a19
Minor changes from previous PR
carlosdelest Mar 20, 2024
05aa06f
Add getMapper(field) to Mapper, so both field with multifields and ob…
carlosdelest Mar 21, 2024
c80677f
multifields support
carlosdelest Mar 22, 2024
5896c60
Add copy_to support
carlosdelest Mar 22, 2024
df0cc90
Spotless
carlosdelest Mar 22, 2024
6d4bbf3
Fix tests
carlosdelest Mar 22, 2024
068615a
Remove the need to get mappings by using MappingLookup.getFieldType()
carlosdelest Mar 22, 2024
5c3d9c4
Revert "Add getMapper(field) to Mapper, so both field with multifield…
carlosdelest Mar 22, 2024
ace17dd
PR feedback
carlosdelest Mar 26, 2024
8bebb8d
Merge branch 'feature/semantic-text' into carlosdelest/semantic-text-…
carlosdelest Mar 26, 2024
47a22d7
Fix merge with feature branch
carlosdelest Mar 26, 2024
5311424
Fix merge with feature branch
carlosdelest Mar 27, 2024
707b3f1
Remove multifield testing
carlosdelest Mar 27, 2024
5e5b32a
Now, actually drop multifields support
carlosdelest Mar 27, 2024
1abf95c
Merge branch 'feature/semantic-text' into carlosdelest/semantic-text-…
carlosdelest Mar 27, 2024
29dd33e
Fix merge with main
carlosdelest Mar 27, 2024
b17d584
Merge branch 'feature/semantic-text' into carlosdelest/semantic-text-…
carlosdelest Apr 1, 2024
82ffb5b
Fix merge with feature branch
carlosdelest Apr 1, 2024
6b26200
Check that bulk updates use all source fields for updating a semantic…
carlosdelest Apr 1, 2024
bf5b837
Add check for inference results when no value is provided
carlosdelest Apr 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.equalTo;
Expand Down Expand Up @@ -106,6 +107,12 @@ public void testCopyToFieldsParsing() throws Exception {

fieldMapper = mapperService.documentMapper().mappers().getMapper("new_field");
assertThat(fieldMapper.typeName(), equalTo("long"));

MappingLookup mappingLookup = mapperService.mappingLookup();
assertThat(mappingLookup.sourcePaths("another_field"), equalTo(Set.of("copy_test", "int_to_str_test", "another_field")));
assertThat(mappingLookup.sourcePaths("new_field"), equalTo(Set.of("new_field", "int_to_str_test")));
assertThat(mappingLookup.sourcePaths("copy_test"), equalTo(Set.of("copy_test", "cyclic_test")));
assertThat(mappingLookup.sourcePaths("cyclic_test"), equalTo(Set.of("cyclic_test", "copy_test")));
}

public void testCopyToFieldsInnerObjectParsing() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ public void testEmpty() {
assertNotNull(names);
assertThat(names, hasSize(0));

Map<String, String> fieldsForModels = lookup.getInferenceIdsForFields();
assertNotNull(fieldsForModels);
assertTrue(fieldsForModels.isEmpty());
Map<String, String> inferenceForFields = lookup.getInferenceIdsForFields();
assertNotNull(inferenceForFields);
assertTrue(inferenceForFields.isEmpty());
}

public void testAddNewField() {
Expand All @@ -48,9 +48,9 @@ public void testAddNewField() {
assertNull(lookup.get("bar"));
assertEquals(f.fieldType(), lookup.get("foo"));

Map<String, String> fieldsForModels = lookup.getInferenceIdsForFields();
assertNotNull(fieldsForModels);
assertTrue(fieldsForModels.isEmpty());
Map<String, String> inferenceForFields = lookup.getInferenceIdsForFields();
assertNotNull(inferenceForFields);
assertTrue(inferenceForFields.isEmpty());
}

public void testAddFieldAlias() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ public void testSourcePathFields() throws IOException {
final Set<String> fieldsUsingSourcePath = new HashSet<>();
((FieldMapper) mapper).sourcePathUsedBy().forEachRemaining(mapper1 -> fieldsUsingSourcePath.add(mapper1.name()));
assertThat(fieldsUsingSourcePath, equalTo(Set.of("field.subfield1", "field.subfield2")));

assertThat(mapperService.mappingLookup().sourcePaths("field.subfield1"), equalTo(Set.of("field")));
assertThat(mapperService.mappingLookup().sourcePaths("field.subfield2"), equalTo(Set.of("field")));
}

public void testUnknownLegacyFieldsUnderKnownRootField() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ public final void testMinimalIsInvalidInRoutingPath() throws IOException {
}
}

private String minimalIsInvalidRoutingPathErrorMessage(Mapper mapper) {
protected String minimalIsInvalidRoutingPathErrorMessage(Mapper mapper) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to change this back to protected so it can be overridden by our tests

if (mapper instanceof FieldMapper fieldMapper && fieldMapper.fieldType().isDimension() == false) {
return "All fields that match routing_path must be configured with [time_series_dimension: true] "
+ "or flattened fields with a list of dimensions in [time_series_dimensions] and "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,41 +297,47 @@ private Map<String, List<FieldInferenceRequest>> createFieldInferenceRequests(Bu
final Map<String, Object> docMap = indexRequest.sourceAsMap();
boolean hasInput = false;
for (var entry : fieldInferenceMetadata.getFieldInferenceOptions().entrySet()) {
String field = entry.getKey();
String inferenceId = entry.getValue().inferenceId();
var value = XContentMapValues.extractValue(field, docMap);
if (value == null) {
continue;
}
if (inferenceResults.get(item.id()) == null) {
inferenceResults.set(
item.id(),
new FieldInferenceResponseAccumulator(
String fieldName = entry.getKey();
for (var sourceField : entry.getValue().sourceFields()) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now we iterate on all the source fields for retrieving inference - so we're adding an additional loop here


var value = XContentMapValues.extractValue(sourceField, docMap);
if (value == null) {
continue;
}
if (inferenceResults.get(item.id()) == null) {
inferenceResults.set(
item.id(),
Collections.synchronizedList(new ArrayList<>()),
Collections.synchronizedList(new ArrayList<>())
)
);
}
if (value instanceof String valueStr) {
List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
fieldRequests.add(new FieldInferenceRequest(item.id(), field, valueStr));
hasInput = true;
} else {
inferenceResults.get(item.id()).failures.add(
new ElasticsearchStatusException(
"Invalid format for field [{}], expected [String] got [{}]",
RestStatus.BAD_REQUEST,
field,
value.getClass().getSimpleName()
)
);
new FieldInferenceResponseAccumulator(
item.id(),
Collections.synchronizedList(new ArrayList<>()),
Collections.synchronizedList(new ArrayList<>())
)
);
}
if (value instanceof String valueStr) {
List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(
inferenceId,
k -> new ArrayList<>()
);
fieldRequests.add(new FieldInferenceRequest(item.id(), fieldName, valueStr));
hasInput = true;
} else {
inferenceResults.get(item.id()).failures.add(
new ElasticsearchStatusException(
"Invalid format for field [{}], expected [String] got [{}]",
RestStatus.BAD_REQUEST,
fieldName,
value.getClass().getSimpleName()
)
);
}
}
}
if (hasInput == false) {
// remove the existing _inference field (if present) since none of the content require inference.
if (docMap.remove(InferenceMetadataFieldMapper.NAME) != null) {
indexRequest.source(docMap);
if (hasInput == false) {
// remove the existing _inference field (if present) since none of the content require inference.
if (docMap.remove(InferenceMetadataFieldMapper.NAME) != null) {
indexRequest.source(docMap);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ private void parseResultsObject(
}
parser.nextToken();
fieldMapper.parse(context);
// Reset leaf object after parsing the field
context.path().setWithinLeafObject(true);
}
if (visited.containsAll(REQUIRED_SUBFIELDS) == false) {
Set<String> missingSubfields = REQUIRED_SUBFIELDS.stream()
Expand Down Expand Up @@ -380,6 +382,7 @@ public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
return SourceLoader.SyntheticFieldLoader.NOTHING;
}

@SuppressWarnings("unchecked")
public static void applyFieldInference(
Map<String, Object> inferenceMap,
String field,
Expand All @@ -404,11 +407,12 @@ public static void applyFieldInference(
results.getWriteableName()
);
}
Map<String, Object> fieldMap = new LinkedHashMap<>();
fieldMap.put(INFERENCE_ID, model.getInferenceEntityId());

Map<String, Object> fieldMap = (Map<String, Object>) inferenceMap.computeIfAbsent(field, s -> new LinkedHashMap<>());
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiple inference results can be applied to a single field from each source field - so we need to be prepared for that

fieldMap.putAll(new SemanticTextModelSettings(model).asMap());
fieldMap.put(CHUNKS, chunks);
inferenceMap.put(field, fieldMap);
List<Map<String, Object>> fieldChunks = (List<Map<String, Object>>) fieldMap.computeIfAbsent(CHUNKS, k -> new ArrayList<>());
fieldChunks.addAll(chunks);
fieldMap.put(INFERENCE_ID, model.getInferenceEntityId());
}

record SemanticTextMapperContext(MapperBuilderContext context, SemanticTextFieldMapper mapper) {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import static org.hamcrest.CoreMatchers.equalTo;

public class SemanticTextClusterMetadataTests extends ESSingleNodeTestCase {

Expand All @@ -33,13 +36,13 @@ public void testCreateIndexWithSemanticTextField() {
"test",
client().admin().indices().prepareCreate("test").setMapping("field", "type=semantic_text,inference_id=test_model")
);
assertEquals(
assertThat(
indexService.getMetadata().getFieldInferenceMetadata().getFieldInferenceOptions().get("field").inferenceId(),
"test_model"
equalTo("test_model")
);
}

public void testAddSemanticTextField() throws Exception {
public void testSingleSourceSemanticTextField() throws Exception {
final IndexService indexService = createIndex("test", client().admin().indices().prepareCreate("test"));
final MetadataMappingService mappingService = getInstanceFromNode(MetadataMappingService.class);
final MetadataMappingService.PutMappingExecutor putMappingExecutor = mappingService.new PutMappingExecutor();
Expand All @@ -53,10 +56,51 @@ public void testAddSemanticTextField() throws Exception {
putMappingExecutor,
singleTask(request)
);
assertEquals(
resultingState.metadata().index("test").getFieldInferenceMetadata().getFieldInferenceOptions().get("field").inferenceId(),
"test_model"
FieldInferenceMetadata.FieldInferenceOptions fieldInferenceOptions = resultingState.metadata()
.index("test")
.getFieldInferenceMetadata()
.getFieldInferenceOptions()
.get("field");
assertThat(fieldInferenceOptions.inferenceId(), equalTo("test_model"));
assertThat(fieldInferenceOptions.sourceFields(), equalTo(Set.of("field")));
}

/**
 * Verifies that when text fields use {@code copy_to} to feed a semantic_text field,
 * the resulting cluster state records every contributing source field (the semantic
 * field itself plus both copy_to origins) in the field's inference metadata.
 */
public void testCopyToSemanticTextField() throws Exception {
    final IndexService index = createIndex("test", client().admin().indices().prepareCreate("test"));
    final MetadataMappingService metadataMappingService = getInstanceFromNode(MetadataMappingService.class);
    final MetadataMappingService.PutMappingExecutor executor = metadataMappingService.new PutMappingExecutor();
    final ClusterService clusterService = getInstanceFromNode(ClusterService.class);

    // Mapping with one semantic_text target and two text fields copying into it.
    final String mappings = """
        {
          "properties": {
            "semantic": {
              "type": "semantic_text",
              "inference_id": "test_model"
            },
            "copy_origin_1": {
              "type": "text",
              "copy_to": "semantic"
            },
            "copy_origin_2": {
              "type": "text",
              "copy_to": "semantic"
            }
          }
        }
        """;
    final PutMappingClusterStateUpdateRequest request = new PutMappingClusterStateUpdateRequest(mappings);
    request.indices(new Index[] { index.index() });

    final var updatedState = ClusterStateTaskExecutorUtils.executeAndAssertSuccessful(
        clusterService.state(),
        executor,
        singleTask(request)
    );

    // The semantic field's inference options must list the field itself and both copy_to origins.
    final FieldInferenceMetadata.FieldInferenceOptions options = updatedState.metadata()
        .index("test")
        .getFieldInferenceMetadata()
        .getFieldInferenceOptions()
        .get("semantic");
    assertThat(options.inferenceId(), equalTo("test_model"));
    assertThat(options.sourceFields(), equalTo(Set.of("semantic", "copy_origin_1", "copy_origin_2")));
}

private static List<MetadataMappingService.PutMappingClusterStateUpdateTask> singleTask(PutMappingClusterStateUpdateRequest request) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,11 @@ public void testInferenceNotFound() throws Exception {
FieldInferenceMetadata inferenceFields = new FieldInferenceMetadata(
Map.of(
"field1",
new FieldInferenceMetadata.FieldInferenceOptions(model.getInferenceEntityId(), Set.of()),
new FieldInferenceMetadata.FieldInferenceOptions(model.getInferenceEntityId(), Set.of("field1")),
"field2",
new FieldInferenceMetadata.FieldInferenceOptions("inference_0", Set.of()),
new FieldInferenceMetadata.FieldInferenceOptions("inference_0", Set.of("field2")),
"field3",
new FieldInferenceMetadata.FieldInferenceOptions("inference_0", Set.of())
new FieldInferenceMetadata.FieldInferenceOptions("inference_0", Set.of("field3"))
)
);
BulkItemRequest[] items = new BulkItemRequest[10];
Expand All @@ -154,7 +154,7 @@ public void testManyRandomDocs() throws Exception {
for (int i = 0; i < numInferenceFields; i++) {
String field = randomAlphaOfLengthBetween(5, 10);
String inferenceId = randomFrom(inferenceModelMap.keySet());
inferenceFieldsMap.put(field, new FieldInferenceMetadata.FieldInferenceOptions(inferenceId, Set.of()));
inferenceFieldsMap.put(field, new FieldInferenceMetadata.FieldInferenceOptions(inferenceId, Set.of(field)));
}
FieldInferenceMetadata fieldInferenceMetadata = new FieldInferenceMetadata(inferenceFieldsMap);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,38 @@ setup:
id: doc_1
body:
non_inference_field: "non inference test"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any other tests come to mind?

---
"semantic_text copy_to calculate inference for source fields":
  # Create an index with a semantic_text field and a text field whose
  # copy_to directs its content into the semantic_text field, so both
  # values should be sent through inference.
  - do:
      indices.create:
        index: test-copy-to-index
        body:
          mappings:
            properties:
              inference_field:
                type: semantic_text
                inference_id: dense-inference-id
              source_field:
                type: text
                copy_to: inference_field

  # Index a document that provides a value for the semantic field both
  # directly and indirectly via the copy_to source field.
  - do:
      index:
        index: test-copy-to-index
        id: doc_1
        body:
          source_field: "copy_to inference test"
          inference_field: "inference test"

  - do:
      get:
        index: test-copy-to-index
        id: doc_1

  # The original value is preserved in _source, and two inference chunks
  # are expected: one for the direct value, one for the copy_to value.
  - match: { _source.inference_field: "inference test" }
  - length: {_source._inference.inference_field.chunks: 2}
  - exists: _source._inference.inference_field.chunks.0.inference
  - exists: _source._inference.inference_field.chunks.0.text
  - exists: _source._inference.inference_field.chunks.1.inference
  - exists: _source._inference.inference_field.chunks.1.text