-
Notifications
You must be signed in to change notification settings - Fork 24.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Semantic text - field mapper #102971
Semantic text - field mapper #102971
Changes from all commits
ffd86f7
085751e
03423e3
64efbeb
34360e2
8280753
66fa2cc
048779b
55c7ae1
e448d3d
fd235f5
b93cf5d
43c5cdb
c0c1125
4b06655
39e061f
07613ae
d7d8d5e
fd83fe9
d8ae624
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0 and the Server Side Public License, v 1; you may not use this file except | ||
* in compliance with, at your election, the Elastic License 2.0 or the Server | ||
* Side Public License, v 1. | ||
*/ | ||
|
||
package org.elasticsearch.index.mapper; | ||
|
||
/** | ||
* Contract for field types that are associated with an inference model. | ||
* Implementations expose the identifier of the model they reference. | ||
*/ | ||
public interface InferenceModelFieldType { | ||
/** | ||
* Returns the inference model referenced by this field type. | ||
* | ||
* @return the id of the model used by the field type | ||
*/ | ||
String getInferenceModel(); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0; you may not use this file except in compliance with the Elastic License | ||
* 2.0. | ||
*/ | ||
|
||
package org.elasticsearch.xpack.ml; | ||
|
||
import org.elasticsearch.common.util.FeatureFlag; | ||
|
||
/** | ||
* Gate for the in-progress semantic_text functionality. This holder is temporary: once the feature is complete, the flag will be removed. | ||
*/ | ||
public class SemanticTextFeature { | ||
|
||
/** Backing flag, constructed with the name "semantic_text". */ | ||
private static final FeatureFlag SEMANTIC_TEXT_FLAG = new FeatureFlag("semantic_text"); | ||
|
||
private SemanticTextFeature() { | ||
// static access only; never instantiated | ||
} | ||
|
||
/** | ||
* @return the current state of the semantic_text feature flag | ||
*/ | ||
public static boolean isEnabled() { | ||
final boolean enabled = SEMANTIC_TEXT_FLAG.isEnabled(); | ||
return enabled; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0; you may not use this file except in compliance with the Elastic License | ||
* 2.0. | ||
*/ | ||
|
||
package org.elasticsearch.xpack.ml.mapper; | ||
|
||
import org.apache.lucene.search.Query; | ||
import org.elasticsearch.common.Strings; | ||
import org.elasticsearch.index.fielddata.FieldDataContext; | ||
import org.elasticsearch.index.fielddata.IndexFieldData; | ||
import org.elasticsearch.index.mapper.DocumentParserContext; | ||
import org.elasticsearch.index.mapper.FieldMapper; | ||
import org.elasticsearch.index.mapper.InferenceModelFieldType; | ||
import org.elasticsearch.index.mapper.MappedFieldType; | ||
import org.elasticsearch.index.mapper.MapperBuilderContext; | ||
import org.elasticsearch.index.mapper.SimpleMappedFieldType; | ||
import org.elasticsearch.index.mapper.SourceValueFetcher; | ||
import org.elasticsearch.index.mapper.TextSearchInfo; | ||
import org.elasticsearch.index.mapper.ValueFetcher; | ||
import org.elasticsearch.index.query.SearchExecutionContext; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
|
||
/** | ||
* A {@link FieldMapper} for semantic text fields. These fields carry a model id reference that is used for performing inference | ||
* at ingestion and query time. | ||
* For now, it is compatible with text expansion models only, but will be extended to support dense vector models as well. | ||
* This field mapper performs no indexing, as inference results will be included as a different field in the document source, and will | ||
* be indexed using a different field mapper. | ||
*/ | ||
public class SemanticTextFieldMapper extends FieldMapper { | ||
|
||
// Type name used in mappings; also returned by contentType() and SemanticTextFieldType#typeName(). | ||
public static final String CONTENT_TYPE = "semantic_text"; | ||
|
||
// Narrowing cast helper used by the Builder's parameter initializers to read values from an existing mapper. | ||
private static SemanticTextFieldMapper toType(FieldMapper in) { | ||
return (SemanticTextFieldMapper) in; | ||
} | ||
|
||
// Creates a Builder from the field name only (the parser context is ignored). The notInMultiFields(CONTENT_TYPE) | ||
// argument presumably rejects this type inside multi-fields, as the mapper tests expect -- confirm against TypeParser. | ||
public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n), notInMultiFields(CONTENT_TYPE)); | ||
|
||
// Always passes MultiFields.empty() to the superclass: this mapper never carries sub-fields of its own. | ||
private SemanticTextFieldMapper(String simpleName, MappedFieldType mappedFieldType, CopyTo copyTo) { | ||
super(simpleName, mappedFieldType, MultiFields.empty(), copyTo); | ||
} | ||
|
||
/** | ||
* @return a new {@link Builder} with this mapper's simple name, initialised from this mapper | ||
*/ | ||
@Override | ||
public FieldMapper.Builder getMergeBuilder() { | ||
return new Builder(simpleName()).init(this); | ||
} | ||
|
||
/** | ||
* Reads the field's text (or null) from the parser and discards it. | ||
*/ | ||
@Override | ||
protected void parseCreateField(DocumentParserContext context) throws IOException { | ||
// Just parses text - no indexing is performed | ||
context.parser().textOrNull(); | ||
} | ||
|
||
@Override | ||
protected String contentType() { | ||
return CONTENT_TYPE; | ||
} | ||
|
||
// Covariant override: exposes the field type as SemanticTextFieldType via a cast of the superclass value. | ||
@Override | ||
public SemanticTextFieldType fieldType() { | ||
return (SemanticTextFieldType) super.fieldType(); | ||
} | ||
|
||
/** | ||
* Builder for {@link SemanticTextFieldMapper}. Declares two mapping parameters: {@code model_id} and {@code meta}. | ||
*/ | ||
public static class Builder extends FieldMapper.Builder { | ||
|
||
// "model_id" parameter: default is null, and the validator rejects empty values with an IllegalArgumentException. | ||
// NOTE(review): the `false` argument presumably marks the parameter as non-updateable -- confirm against Parameter.stringParam. | ||
private final Parameter<String> modelId = Parameter.stringParam("model_id", false, m -> toType(m).fieldType().modelId, null) | ||
.addValidator(v -> { | ||
if (Strings.isEmpty(v)) { | ||
throw new IllegalArgumentException("field [model_id] must be specified"); | ||
} | ||
}); | ||
|
||
// Metadata parameter created by Parameter.metaParam(). | ||
private final Parameter<Map<String, String>> meta = Parameter.metaParam(); | ||
|
||
public Builder(String name) { | ||
super(name); | ||
} | ||
|
||
@Override | ||
protected Parameter<?>[] getParameters() { | ||
return new Parameter<?>[] { modelId, meta }; | ||
} | ||
|
||
/** | ||
* Builds the mapper together with a {@link SemanticTextFieldType} holding the configured model id and meta. | ||
* The supplied {@code context} is not used. | ||
*/ | ||
@Override | ||
public SemanticTextFieldMapper build(MapperBuilderContext context) { | ||
return new SemanticTextFieldMapper(name(), new SemanticTextFieldType(name(), modelId.getValue(), meta.getValue()), copyTo); | ||
} | ||
} | ||
|
||
/** | ||
* Field type for semantic_text fields. Stores the model id and exposes it through {@link InferenceModelFieldType}. | ||
*/ | ||
public static class SemanticTextFieldType extends SimpleMappedFieldType implements InferenceModelFieldType { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we move this nested class to the bottom of the file please? I know we have it like this in other mappers but it makes the code very hard to follow as can be seen by the accidental introduction of duplicate state across the field and mapper here IMO :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense - done in 4b06655 |
||
|
||
// Model id supplied at construction; also read directly by Builder's model_id parameter initializer. | ||
private final String modelId; | ||
|
||
// NOTE(review): the three `false` flags are presumably isIndexed / isStored / hasDocValues -- confirm against SimpleMappedFieldType. | ||
public SemanticTextFieldType(String name, String modelId, Map<String, String> meta) { | ||
super(name, false, false, false, TextSearchInfo.NONE, meta); | ||
this.modelId = modelId; | ||
} | ||
|
||
@Override | ||
public String typeName() { | ||
return CONTENT_TYPE; | ||
} | ||
|
||
/** | ||
* @return the model id this field type was constructed with | ||
*/ | ||
@Override | ||
public String getInferenceModel() { | ||
return modelId; | ||
} | ||
|
||
/** | ||
* Term queries are not implemented for this field type. | ||
* | ||
* @throws IllegalArgumentException always | ||
*/ | ||
@Override | ||
public Query termQuery(Object value, SearchExecutionContext context) { | ||
throw new IllegalArgumentException("termQuery not implemented yet"); | ||
} | ||
|
||
// Delegates to SourceValueFetcher.toString for this field's name. | ||
@Override | ||
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { | ||
return SourceValueFetcher.toString(name(), context, format); | ||
} | ||
|
||
/** | ||
* Field data is not available for this field type. | ||
* | ||
* @throws IllegalArgumentException always | ||
*/ | ||
@Override | ||
public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) { | ||
throw new IllegalArgumentException("[semantic_text] fields do not support sorting, scripting or aggregating"); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0; you may not use this file except in compliance with the Elastic License | ||
* 2.0. | ||
*/ | ||
|
||
package org.elasticsearch.xpack.ml.mapper; | ||
|
||
import org.apache.lucene.index.IndexableField; | ||
import org.elasticsearch.common.Strings; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.mapper.DocumentMapper; | ||
import org.elasticsearch.index.mapper.MappedFieldType; | ||
import org.elasticsearch.index.mapper.MapperParsingException; | ||
import org.elasticsearch.index.mapper.MapperService; | ||
import org.elasticsearch.index.mapper.MapperTestCase; | ||
import org.elasticsearch.index.mapper.ParsedDocument; | ||
import org.elasticsearch.plugins.Plugin; | ||
import org.elasticsearch.xcontent.XContentBuilder; | ||
import org.elasticsearch.xpack.ml.MachineLearning; | ||
import org.junit.AssumptionViolatedException; | ||
|
||
import java.io.IOException; | ||
import java.util.Collection; | ||
import java.util.List; | ||
|
||
import static java.util.Collections.singletonList; | ||
import static org.hamcrest.Matchers.containsString; | ||
|
||
/** | ||
* Mapper tests for the semantic_text field type: mapping round-trip, required model_id, multi-field rejection | ||
* and rejection of model_id updates. | ||
*/ | ||
public class SemanticTextFieldMapperTests extends MapperTestCase { | ||
|
||
// The minimal mapping must serialize back unchanged, and parsing a document must add no indexable fields for "field". | ||
public void testDefaults() throws Exception { | ||
DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); | ||
assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); | ||
|
||
ParsedDocument doc1 = mapper.parse(source(this::writeField)); | ||
List<IndexableField> fields = doc1.rootDoc().getFields("field"); | ||
|
||
// No indexable fields | ||
assertTrue(fields.isEmpty()); | ||
} | ||
|
||
// A semantic_text mapping without model_id must be rejected with the validator's message. | ||
public void testModelIdNotPresent() throws IOException { | ||
Exception e = expectThrows( | ||
MapperParsingException.class, | ||
() -> createMapperService(fieldMapping(b -> b.field("type", "semantic_text"))) | ||
); | ||
assertThat(e.getMessage(), containsString("field [model_id] must be specified")); | ||
} | ||
|
||
// Declaring semantic_text as a sub-field ("fields") of a text field must fail at mapping time. | ||
public void testCannotBeUsedInMultiFields() { | ||
Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { | ||
b.field("type", "text"); | ||
b.startObject("fields"); | ||
b.startObject("semantic"); | ||
b.field("type", "semantic_text"); | ||
b.endObject(); | ||
b.endObject(); | ||
}))); | ||
assertThat(e.getMessage(), containsString("Field [semantic] of type [semantic_text] can't be used in multifields")); | ||
} | ||
|
||
// Merging a mapping that changes model_id on an existing field must be rejected. | ||
public void testUpdatesToModelIdNotSupported() throws IOException { | ||
MapperService mapperService = createMapperService( | ||
fieldMapping(b -> b.field("type", "semantic_text").field("model_id", "test_model")) | ||
); | ||
Exception e = expectThrows( | ||
IllegalArgumentException.class, | ||
() -> merge(mapperService, fieldMapping(b -> b.field("type", "semantic_text").field("model_id", "another_model"))) | ||
); | ||
assertThat(e.getMessage(), containsString("Cannot update parameter [model_id] from [test_model] to [another_model]")); | ||
} | ||
|
||
// Loads the MachineLearning plugin with empty settings; presumably this is what registers the semantic_text mapper -- confirm. | ||
@Override | ||
protected Collection<? extends Plugin> getPlugins() { | ||
return singletonList(new MachineLearning(Settings.EMPTY)); | ||
} | ||
|
||
// Smallest valid mapping: the type plus the mandatory model_id. | ||
@Override | ||
protected void minimalMapping(XContentBuilder b) throws IOException { | ||
b.field("type", "semantic_text").field("model_id", "test_model"); | ||
} | ||
|
||
@Override | ||
protected Object getSampleValueForDocument() { | ||
return "value"; | ||
} | ||
|
||
@Override | ||
protected boolean supportsIgnoreMalformed() { | ||
return false; | ||
} | ||
|
||
@Override | ||
protected boolean supportsStoredFields() { | ||
return false; | ||
} | ||
|
||
// Intentionally empty: no parameter checks are registered with the checker. | ||
@Override | ||
protected void registerParameters(ParameterChecker checker) throws IOException {} | ||
|
||
// assumeFalse(..., true) always trips, so any base-class test that asks for a random input value is skipped. | ||
@Override | ||
protected Object generateRandomInputValue(MappedFieldType ft) { | ||
assumeFalse("doc_values are not supported in semantic_text", true); | ||
return null; | ||
} | ||
|
||
// Throws an assumption violation instead of returning support; presumably the runner reports dependent tests as skipped. | ||
@Override | ||
protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { | ||
throw new AssumptionViolatedException("not supported"); | ||
} | ||
|
||
// Same approach as syntheticSourceSupport: ingest script support is reported as an assumption violation. | ||
@Override | ||
protected IngestScriptSupport ingestScriptSupport() { | ||
throw new AssumptionViolatedException("not supported"); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding a new MapperPlugin broke this check. Check that an actual filter has been used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't make sense to me. We shouldn't change this. Why don't the other plugins that satisfy the mapping plugin suffer from this?
What is the actual issue here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the heads up Ben!
Not doing this made some tests fail (see this test result).
My understanding was that we were adding another MapperPlugin that was not accounted for - thus the change to understand that the MapperPlugin did something useful.
I've been checking and after merging with main this is no longer an issue, so I've withdrawn the changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, but I still see the change here. Are you going to revert this in a future commit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same case as the other one - fixed in d7d8d5e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK - removing this caused tests to fail. I've looked deeper into this issue:
- `LocalStateCompositeXPackPlugin` is used for testing
- `LocalStateCompositeXPackPlugin` uses a `MapperPlugin` as part of its plugins
- `LocalStateMachineLearning` has as plugins `Security` (which is a `MapperPlugin`) and `MachineLearning`. Making `MachineLearning` another `MapperPlugin` implied that there are 2 `MapperPlugins`
- The check does not look at whether the `getFieldsFilter()` method has been overridden or not, but just at the number of `MapperPlugins`
So I think the changes I've done actually preserve the original intention - we're not looking for how many `MapperPlugins` there are, but how many of them actually override the method (that is, do not return `NOOP_FIELD_FILTER`).
Does it make sense? Happy to discuss if not
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@carlosdelest I see now. Seems good to me
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
++ I say this is ok.