diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 1e623ccbb0456..47cc25ea9cd2b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -38,9 +38,9 @@ public CodecService(@Nullable MapperService mapperService) { codecs.put(BEST_COMPRESSION_CODEC, new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION)); } else { codecs.put(DEFAULT_CODEC, - new PerFieldMappingPostingFormatCodec(Lucene90Codec.Mode.BEST_SPEED, mapperService)); + new PerFieldMappingCodec(Lucene90Codec.Mode.BEST_SPEED, mapperService)); codecs.put(BEST_COMPRESSION_CODEC, - new PerFieldMappingPostingFormatCodec(Lucene90Codec.Mode.BEST_COMPRESSION, mapperService)); + new PerFieldMappingCodec(Lucene90Codec.Mode.BEST_COMPRESSION, mapperService)); } codecs.put(LUCENE_DEFAULT_CODEC, Codec.getDefault()); for (String codec : Codec.availableCodecs()) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMappingCodec.java similarity index 57% rename from server/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java rename to server/src/main/java/org/elasticsearch/index/codec/PerFieldMappingCodec.java index 240728b3eec2f..e11194e0da576 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMappingCodec.java @@ -10,6 +10,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90Codec; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; @@ -17,24 +18,24 @@ import 
org.elasticsearch.index.mapper.MapperService; /** - * {@link PerFieldMappingPostingFormatCodec This postings format} is the default - * {@link PostingsFormat} for Elasticsearch. It utilizes the - * {@link MapperService} to lookup a {@link PostingsFormat} per field. This - * allows users to change the low level postings format for individual fields - * per index in real time via the mapping API. If no specific postings format is - * configured for a specific field the default postings format is used. + * {@link PerFieldMappingCodec This codec} supplies the default + * {@link PostingsFormat} and {@link KnnVectorsFormat} for Elasticsearch. It utilizes the + * {@link MapperService} to look up a {@link PostingsFormat} and {@link KnnVectorsFormat} per field. This + * allows users to change the low level postings format and vectors format for individual fields + * per index in real time via the mapping API. If no specific postings format or vectors format is + * configured for a specific field, the default postings or vectors format is used. 
*/ -public class PerFieldMappingPostingFormatCodec extends Lucene90Codec { +public class PerFieldMappingCodec extends Lucene90Codec { private final MapperService mapperService; private final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat(); static { - assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class) : - "PerFieldMappingPostingFormatCodec must subclass the latest " + "lucene codec: " + Lucene.LATEST_CODEC; + assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingCodec.class) : + "PerFieldMappingCodec must subclass the latest " + "lucene codec: " + Lucene.LATEST_CODEC; } - public PerFieldMappingPostingFormatCodec(Mode compressionMode, MapperService mapperService) { + public PerFieldMappingCodec(Mode compressionMode, MapperService mapperService) { super(compressionMode); this.mapperService = mapperService; } @@ -48,6 +49,15 @@ public PostingsFormat getPostingsFormatForField(String field) { return format; } + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + KnnVectorsFormat format = mapperService.mappingLookup().getKnnVectorsFormatForField(field); + if (format == null) { + return super.getKnnVectorsFormatForField(field); + } + return format; + } + @Override public DocValuesFormat getDocValuesFormatForField(String field) { return docValuesFormat; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java b/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java index 02e1698b7f5d2..29a154e707536 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java @@ -8,6 +8,7 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.elasticsearch.cluster.metadata.DataStream; import 
org.elasticsearch.index.IndexSettings; @@ -228,6 +229,20 @@ public PostingsFormat getPostingsFormat(String field) { return completionFields.contains(field) ? CompletionFieldMapper.postingsFormat() : null; } + /** + * Returns the knn vectors format for a particular field + * @param field the field to retrieve a knn vectors format for + * @return the knn vectors format for the field, or {@code null} if the default format should be used + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + Mapper fieldMapper = fieldMappers.get(field); + if (fieldMapper instanceof VectorFieldMapper) { + return ((VectorFieldMapper) fieldMapper).getKnnVectorsFormatForField(); + } else { + return null; + } + } + void checkLimits(IndexSettings settings) { checkFieldLimit(settings.getMappingTotalFieldsLimit()); checkObjectDepthLimit(settings.getMappingDepthLimit()); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/VectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/VectorFieldMapper.java new file mode 100644 index 0000000000000..2c1c38a4b3927 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/VectorFieldMapper.java @@ -0,0 +1,128 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ +package org.elasticsearch.index.mapper; + +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.xcontent.ToXContent; +import org.elasticsearch.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; + +import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN; +import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH; + +/** + * Base field mapper for vector fields that are indexed for approximate nearest neighbor (ANN) search. + */ + +public abstract class VectorFieldMapper extends FieldMapper { + public static final IndexOptions DEFAULT_INDEX_OPTIONS = new HNSWIndexOptions(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH); + protected final IndexOptions indexOptions; + + protected VectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, MultiFields multiFields, CopyTo copyTo, + IndexOptions indexOptions) { + super(simpleName, mappedFieldType, multiFields, copyTo); + this.indexOptions = indexOptions; + } + + /** + * Returns the knn vectors format that is explicitly configured for this field, or {@code null} if + * no format is configured or if the configured format matches the default format. 
+ * @return the field's knn vectors format, or {@code null} (options unset or equal to the defaults) to use the default format + */ + public KnnVectorsFormat getKnnVectorsFormatForField() { + if (indexOptions == null || DEFAULT_INDEX_OPTIONS.equals(indexOptions)) { + return null; + } else { + HNSWIndexOptions hnswIndexOptions = (HNSWIndexOptions) indexOptions; + return new Lucene90HnswVectorsFormat(hnswIndexOptions.m, hnswIndexOptions.efConstruction); + } + } + + public static IndexOptions parseVectorIndexOptions(String fieldName, Object propNode) { + if (propNode == null) { + return null; + } + Map indexOptionsMap = (Map) propNode; + String type = XContentMapValues.nodeStringValue(indexOptionsMap.remove("type"), "hnsw"); + if (type.equals("hnsw")) { + return HNSWIndexOptions.parseIndexOptions(fieldName, indexOptionsMap); + } else { + throw new MapperParsingException("Unknown vector index options type [" + type + "] for field [" + fieldName + "]"); + } + } + + public abstract static class IndexOptions implements ToXContent { + protected final String type; + public IndexOptions(String type) { + this.type = type; + } + } + + public static class HNSWIndexOptions extends IndexOptions { + private final int m; + private final int efConstruction; + + public HNSWIndexOptions(int m, int efConstruction) { + super("hnsw"); + this.m = m; + this.efConstruction = efConstruction; + } + + public int m() { + return m; + } + + public int efConstruction() { + return efConstruction; + } + + public static IndexOptions parseIndexOptions(String fieldName, Map indexOptionsMap) { + int m = XContentMapValues.nodeIntegerValue(indexOptionsMap.remove("m"), DEFAULT_MAX_CONN); + int efConstruction = XContentMapValues.nodeIntegerValue(indexOptionsMap.remove("ef_construction"), DEFAULT_BEAM_WIDTH); + MappingParser.checkNoRemainingFields(fieldName, indexOptionsMap); + if (m == DEFAULT_MAX_CONN && efConstruction == DEFAULT_BEAM_WIDTH) { + return VectorFieldMapper.DEFAULT_INDEX_OPTIONS; + } else { + return new 
HNSWIndexOptions(m, efConstruction); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + builder.field("type", type); + builder.field("m", m); + builder.field("ef_construction", efConstruction); + builder.endObject(); + return builder; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + HNSWIndexOptions that = (HNSWIndexOptions) o; + return m == that.m && efConstruction == that.efConstruction; + } + + @Override + public int hashCode() { + return Objects.hash(type, m, efConstruction); + } + + @Override + public String toString() { + return "{type=" + type + ", m=" + m + ", ef_construction=" + efConstruction + " }"; + } + } +} diff --git a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java index ec1a49dafe75c..9a666f9057e5d 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java @@ -40,7 +40,7 @@ public class CodecTests extends ESTestCase { public void testResolveDefaultCodecs() throws Exception { CodecService codecService = createCodecService(); - assertThat(codecService.codec("default"), instanceOf(PerFieldMappingPostingFormatCodec.class)); + assertThat(codecService.codec("default"), instanceOf(PerFieldMappingCodec.class)); assertThat(codecService.codec("default"), instanceOf(Lucene90Codec.class)); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java index 15b2af9211f37..a240f1da193b4 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/CompletionFieldMapperTests.java @@ 
-29,6 +29,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.index.codec.PerFieldMappingCodec; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentFactory; @@ -38,7 +39,6 @@ import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.codec.CodecService; -import org.elasticsearch.index.codec.PerFieldMappingPostingFormatCodec; import org.hamcrest.FeatureMatcher; import org.hamcrest.Matcher; import org.hamcrest.Matchers; @@ -122,8 +122,8 @@ public void testPostingsFormat() throws IOException { MapperService mapperService = createMapperService(fieldMapping(this::minimalMapping)); CodecService codecService = new CodecService(mapperService); Codec codec = codecService.codec("default"); - assertThat(codec, instanceOf(PerFieldMappingPostingFormatCodec.class)); - PerFieldMappingPostingFormatCodec perFieldCodec = (PerFieldMappingPostingFormatCodec) codec; + assertThat(codec, instanceOf(PerFieldMappingCodec.class)); + PerFieldMappingCodec perFieldCodec = (PerFieldMappingCodec) codec; assertThat(perFieldCodec.getPostingsFormatForField("field"), instanceOf(Completion90PostingsFormat.class)); } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/10_dense_vector_basic.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/10_dense_vector_basic.yml index 6d3456182901f..952ee4654f040 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/10_dense_vector_basic.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/10_dense_vector_basic.yml @@ -18,6 +18,10 @@ setup: dims: 5 index: true similarity: dot_product + index_options: + type: hnsw + m: 15 + ef_construction: 80 - do: index: index: 
test-index diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml index 6b486c6299c61..9d097c55a4510 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml @@ -19,6 +19,9 @@ setup: dims: 3 index: true similarity: l2_norm + index_options: + type: hnsw + m: 15 --- "Indexing of Dense vectors should error when dims don't match defined in the mapping": diff --git a/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapper.java b/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapper.java index b336cc19466c1..41299c925168a 100644 --- a/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapper.java +++ b/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapper.java @@ -16,6 +16,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; import org.elasticsearch.Version; +import org.elasticsearch.index.mapper.VectorFieldMapper; import org.elasticsearch.xcontent.XContentParser.Token; import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.index.fielddata.IndexFieldData; @@ -47,7 +48,7 @@ /** * A {@link FieldMapper} for indexing a dense vector of floats. 
*/ -public class DenseVectorFieldMapper extends FieldMapper { +public class DenseVectorFieldMapper extends VectorFieldMapper { public static final String CONTENT_TYPE = "dense_vector"; public static short MAX_DIMS_COUNT = 2048; //maximum allowed number of dimensions @@ -73,6 +74,8 @@ public static class Builder extends FieldMapper.Builder { private final Parameter indexed = Parameter.indexParam(m -> toType(m).indexed, false); private final Parameter similarity = Parameter.enumParam( "similarity", false, m -> toType(m).similarity, null, VectorSimilarity.class); + private final Parameter indexOptions = new Parameter<>("index_options", false, () -> null, + (n, c, o) -> VectorFieldMapper.parseVectorIndexOptions(n, o), m -> toType(m).indexOptions); private final Parameter> meta = Parameter.metaParam(); final Version indexVersionCreated; @@ -84,11 +87,13 @@ public Builder(String name, Version indexVersionCreated) { this.indexed.requiresParameters(similarity); this.similarity.setSerializerCheck((id, ic, v) -> v != null); this.similarity.requiresParameters(indexed); + this.indexOptions.requiresParameters(indexed); + this.indexOptions.setSerializerCheck((id, ic, v) -> v != null); } @Override protected List> getParameters() { - return List.of(dims, indexed, similarity, meta); + return List.of(dims, indexed, similarity, indexOptions, meta); } @Override @@ -102,7 +107,8 @@ public DenseVectorFieldMapper build(MapperBuilderContext context) { similarity.getValue(), indexVersionCreated, multiFieldsBuilder.build(this, context), - copyTo.build()); + copyTo.build(), + indexOptions.getValue()); } } @@ -187,10 +193,10 @@ public Query termQuery(Object value, SearchExecutionContext context) { private final VectorSimilarity similarity; private final Version indexCreatedVersion; - private DenseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, int dims, - boolean indexed, VectorSimilarity similarity, - Version indexCreatedVersion, MultiFields multiFields, CopyTo copyTo) 
{ - super(simpleName, mappedFieldType, multiFields, copyTo); + private DenseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, int dims, boolean indexed, + VectorSimilarity similarity, Version indexCreatedVersion, MultiFields multiFields, + CopyTo copyTo, VectorFieldMapper.IndexOptions indexOptions) { + super(simpleName, mappedFieldType, multiFields, copyTo, indexOptions); this.dims = dims; this.indexed = indexed; this.similarity = similarity; diff --git a/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapperTests.java b/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapperTests.java index 43dec82107f79..1e321b68843ec 100644 --- a/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapperTests.java +++ b/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapperTests.java @@ -9,6 +9,9 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.KnnVectorField; import org.apache.lucene.index.IndexableField; @@ -16,6 +19,9 @@ import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; import org.elasticsearch.Version; +import org.elasticsearch.index.codec.CodecService; +import org.elasticsearch.index.codec.PerFieldMappingCodec; +import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.LuceneDocument; @@ -35,15 +41,19 @@ import java.util.Collection; import java.util.List; +import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH; +import 
static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; public class DenseVectorFieldMapperTests extends MapperTestCase { private final boolean indexed; + private final boolean indexOptionsSet; public DenseVectorFieldMapperTests() { this.indexed = randomBoolean(); + this.indexOptionsSet = randomBoolean(); } @Override @@ -56,6 +66,13 @@ protected void minimalMapping(XContentBuilder b) throws IOException { b.field("type", "dense_vector").field("dims", 4); if (indexed) { b.field("index", true).field("similarity", "dot_product"); + if (indexOptionsSet) { + b.startObject("index_options"); + b.field("type", "hnsw"); + b.field("m", 5); + b.field("ef_construction", 50); + b.endObject(); + } } } @@ -86,6 +103,21 @@ protected void registerParameters(ParameterChecker checker) throws IOException { fieldMapping(b -> b.field("type", "dense_vector") .field("dims", 4) .field("index", false))); + checker.registerConflictCheck("index_options", + fieldMapping(b -> b.field("type", "dense_vector") + .field("dims", 4) + .field("index", true) + .field("similarity", "dot_product") + .startObject("index_options") + .field("m", 5) + .field("ef_construction", 80) + .endObject()), + fieldMapping(b -> b.field("type", "dense_vector") + .field("dims", 4) + .field("index", true) + .field("similarity", "dot_product") + .startObject("index_options") + .endObject())); } @Override @@ -203,6 +235,14 @@ public void testInvalidParameters() { .field("dims", 3) .field("similarity", "l2_norm")))); assertThat(e.getMessage(), containsString("Field [similarity] requires field [index] to be configured")); + + e = expectThrows(MapperParsingException.class, + () -> createDocumentMapper(fieldMapping(b -> b + .field("type", "dense_vector") + .field("dims", 3) + .startObject("index_options") + .endObject()))); + assertThat(e.getMessage(), 
containsString("Field [index_options] requires field [index] to be configured")); } public void testAddDocumentsToIndexBefore_V_7_5_0() throws Exception { @@ -288,4 +328,27 @@ public void testCannotBeUsedInMultifields() { }))); assertThat(e.getMessage(), containsString("Field [vectors] of type [dense_vector] can't be used in multifields")); } + + protected void mappingWithCustomIndexOptions(XContentBuilder b, int m, int efConstruction) throws IOException { + b.field("type", "dense_vector"); + b.field("dims", 4); + b.field("index", true); + b.field("similarity", "dot_product"); + b.startObject("index_options"); + b.field("m", m) ; + b.field("ef_construction", efConstruction); + b.endObject(); + } + + public void testKnnVectorsFormat() throws IOException { + int m = randomIntBetween(1, DEFAULT_MAX_CONN + 10); + int efConstruction = randomIntBetween(1, DEFAULT_BEAM_WIDTH + 10); + MapperService mapperService = createMapperService(fieldMapping(b -> mappingWithCustomIndexOptions(b, m, efConstruction))); + CodecService codecService = new CodecService(mapperService); + Codec codec = codecService.codec("default"); + assertThat(codec, instanceOf(PerFieldMappingCodec.class)); + KnnVectorsFormat knnVectorsFormat = ((PerFieldMappingCodec) codec).getKnnVectorsFormatForField("field"); + assertThat(knnVectorsFormat, instanceOf(Lucene90HnswVectorsFormat.class)); + //TODO: add more assertions once LUCENE-10178 is implemented + } }