From 29c39c614da62824ab0ef4716faf0e321c6bc1ed Mon Sep 17 00:00:00 2001 From: Rassyan Date: Tue, 15 Oct 2024 02:59:56 +0800 Subject: [PATCH 1/8] Fix Synthetic Source Handling for `bit` Type in `dense_vector` Field (#114407) **Description:** This PR addresses the issue described in [#114402](https://github.com/elastic/elasticsearch/issues/114402), where the `synthetic_source` feature does not correctly handle the `bit` type in `dense_vector` fields when `index` is set to `false`. The root cause of the issue was that the `bit` type was not properly accounted for: for `bit` vectors, `dims` counts bits while the doc value stores one byte per 8 bits, so the reconstruction expected an array 8 times larger than what the doc value actually holds. This mismatch causes an array out-of-bounds exception when reconstructing the document. **Changes:** - Adjusted the `synthetic_source` logic to correctly handle the `bit` type by ensuring the array size accounts for the 8x difference in dimensions. - Added a YAML test to cover the `bit` type scenario in `dense_vector` fields with `index` set to `false`. 
**Related Issues:** - Closes [#114402](https://github.com/elastic/elasticsearch/issues/114402) - Introduced in [#110059](https://github.com/elastic/elasticsearch/pull/110059) (cherry picked from commit 465c65c02fdca9f3c3faa01a8cb15f9f58dde58e) --- docs/changelog/114407.yaml | 6 +++ .../test/search.vectors/45_knn_search_bit.yml | 51 +++++++++++++++++++ .../ES814ScalarQuantizedVectorsFormat.java | 6 +++ .../vectors/ES815BitFlatVectorsFormat.java | 7 +++ .../vectors/DenseVectorFieldMapper.java | 2 +- .../vectors/DenseVectorFieldMapperTests.java | 15 +++--- 6 files changed, 80 insertions(+), 7 deletions(-) create mode 100644 docs/changelog/114407.yaml diff --git a/docs/changelog/114407.yaml b/docs/changelog/114407.yaml new file mode 100644 index 0000000000000..4c1134a9d3834 --- /dev/null +++ b/docs/changelog/114407.yaml @@ -0,0 +1,6 @@ +pr: 114407 +summary: Fix synthetic source handling for `bit` type in `dense_vector` field +area: Search +type: bug +issues: + - 114402 diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml index ed469ffd7ff16..02576ad1b2b01 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml @@ -354,3 +354,54 @@ setup: dims: 40 index: true similarity: max_inner_product + + +--- +"Search with synthetic source": + - requires: + capabilities: + - method: POST + path: /_search + capabilities: [ bit_dense_vector_synthetic_source ] + test_runner_features: capabilities + reason: "Support for bit dense vector synthetic source capability required" + - do: + indices.create: + index: test_synthetic_source + body: + mappings: + properties: + name: + type: keyword + vector1: + type: dense_vector + element_type: bit + dims: 40 + index: false + vector2: + 
type: dense_vector + element_type: bit + dims: 40 + index: true + similarity: l2_norm + + - do: + index: + index: test_synthetic_source + id: "1" + body: + name: cow.jpg + vector1: [2, -1, 1, 4, -3] + vector2: [2, -1, 1, 4, -3] + + - do: + indices.refresh: {} + + - do: + search: + force_synthetic_source: true + index: test_synthetic_source + + - match: {hits.hits.0._id: "1"} + - match: {hits.hits.0._source.vector1: [2, -1, 1, 4, -3]} + - match: {hits.hits.0._source.vector2: [2, -1, 1, 4, -3]} diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java index c4b52d26fc6e7..1cf5ce3b1a26e 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java @@ -41,6 +41,7 @@ import java.io.IOException; import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL; +import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT; public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat { @@ -281,4 +282,9 @@ public RandomVectorScorer getRandomVectorScorer(VectorSimilarityFunction sim, Ra return delegate.getRandomVectorScorer(sim, values, query); } } + + @Override + public int getMaxDimensions(String fieldName) { + return MAX_DIMS_COUNT; + } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java index de91833c99842..fbededaf65454 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java @@ -24,6 +24,8 @@ import 
java.io.IOException; +import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT; + class ES815BitFlatVectorsFormat extends FlatVectorsFormat { private final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE); @@ -38,6 +40,11 @@ public FlatVectorsReader fieldsReader(SegmentReadState segmentReadState) throws return delegate.fieldsReader(segmentReadState); } + @Override + public int getMaxDimensions(String fieldName) { + return MAX_DIMS_COUNT; + } + static class FlatBitVectorScorer implements FlatVectorsScorer { static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer(); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java index a23d3f22a4d90..afd10be91b3a9 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java @@ -2218,7 +2218,7 @@ public void write(XContentBuilder b) throws IOException { if (indexCreatedVersion.onOrAfter(LITTLE_ENDIAN_FLOAT_STORED_INDEX_VERSION)) { byteBuffer.order(ByteOrder.LITTLE_ENDIAN); } - int dims = fieldType().dims; + int dims = fieldType().elementType == ElementType.BIT ? 
fieldType().dims / Byte.SIZE : fieldType().dims; for (int dim = 0; dim < dims; dim++) { fieldType().elementType.readAndWriteValue(byteBuffer, b); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java index b9557281adfd3..95895ec653f58 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java @@ -1435,24 +1435,27 @@ protected boolean supportsEmptyInputArray() { private static class DenseVectorSyntheticSourceSupport implements SyntheticSourceSupport { private final int dims = between(5, 1000); - private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT); + private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT); private final boolean indexed = randomBoolean(); private final boolean indexOptionsSet = indexed && randomBoolean(); @Override public SyntheticSourceExample example(int maxValues) throws IOException { - Object value = elementType == ElementType.BYTE - ? randomList(dims, dims, ESTestCase::randomByte) - : randomList(dims, dims, ESTestCase::randomFloat); + Object value = switch (elementType) { + case BYTE, BIT: + yield randomList(dims, dims, ESTestCase::randomByte); + case FLOAT: + yield randomList(dims, dims, ESTestCase::randomFloat); + }; return new SyntheticSourceExample(value, value, this::mapping); } private void mapping(XContentBuilder b) throws IOException { b.field("type", "dense_vector"); - b.field("dims", dims); - if (elementType == ElementType.BYTE || randomBoolean()) { + if (elementType == ElementType.BYTE || elementType == ElementType.BIT || randomBoolean()) { b.field("element_type", elementType.toString()); } + b.field("dims", elementType == ElementType.BIT ? 
dims * Byte.SIZE : dims); if (indexed) { b.field("index", true); b.field("similarity", "l2_norm"); From df898ba4da2c0ca49e6af8b08613013ab5408b48 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:52:20 -0400 Subject: [PATCH 2/8] fixing backport of search capabilities --- .../action/search/SearchCapabilities.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java diff --git a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java new file mode 100644 index 0000000000000..4153b9c1a12ce --- /dev/null +++ b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.rest.action.search; + +import java.util.Set; + +/** + * A {@link Set} of "capabilities" supported by the {@link RestSearchAction}. + */ +public final class SearchCapabilities { + + private SearchCapabilities() {} + + /** Support synthetic source with `bit` type in `dense_vector` field when `index` is set to `false`. 
*/ + private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source"; + + public static final Set CAPABILITIES = Set.of( + BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY + ); +} From 2541f00effdae406965530ec9922f2e2b9ad6168 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:53:00 -0400 Subject: [PATCH 3/8] fixing license header --- .../rest/action/search/SearchCapabilities.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java index 4153b9c1a12ce..880a51c40c71b 100644 --- a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java +++ b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java @@ -1,10 +1,9 @@ /* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
*/ package org.elasticsearch.rest.action.search; From 89fe0762cf917c690398a189dc5f93350ad1c29e Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:54:20 -0400 Subject: [PATCH 4/8] adding capabilities to RestSearchAction --- .../elasticsearch/rest/action/search/RestSearchAction.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java b/server/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java index 3dbb98f7a7685..b2525e7a8d167 100644 --- a/server/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java +++ b/server/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java @@ -94,6 +94,11 @@ public List routes() { ); } + @Override + public Set supportedCapabilities() { + return SearchCapabilities.CAPABILITIES; + } + @Override public RestChannelConsumer prepareRequest(final RestRequest request, final NodeClient client) throws IOException { From 426cc91cafb2743668d7ca887c54cc7cdf9f96f7 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:03:24 -0400 Subject: [PATCH 5/8] fixing backport --- .../codec/vectors/ES814ScalarQuantizedVectorsFormat.java | 6 ------ .../index/codec/vectors/ES815BitFlatVectorsFormat.java | 7 ------- 2 files changed, 13 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java index 1cf5ce3b1a26e..c4b52d26fc6e7 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java @@ -41,7 +41,6 @@ import java.io.IOException; import static 
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL; -import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT; public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat { @@ -282,9 +281,4 @@ public RandomVectorScorer getRandomVectorScorer(VectorSimilarityFunction sim, Ra return delegate.getRandomVectorScorer(sim, values, query); } } - - @Override - public int getMaxDimensions(String fieldName) { - return MAX_DIMS_COUNT; - } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java index fbededaf65454..de91833c99842 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorsFormat.java @@ -24,8 +24,6 @@ import java.io.IOException; -import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT; - class ES815BitFlatVectorsFormat extends FlatVectorsFormat { private final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE); @@ -40,11 +38,6 @@ public FlatVectorsReader fieldsReader(SegmentReadState segmentReadState) throws return delegate.fieldsReader(segmentReadState); } - @Override - public int getMaxDimensions(String fieldName) { - return MAX_DIMS_COUNT; - } - static class FlatBitVectorScorer implements FlatVectorsScorer { static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer(); From 282d3bba3275401b1ff2f17eeb0b352ee1d1c303 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 14 Oct 2024 20:37:20 -0400 Subject: [PATCH 6/8] spotless --- .../elasticsearch/rest/action/search/SearchCapabilities.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java index 880a51c40c71b..ce31e5b5e72a1 100644 --- a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java +++ b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java @@ -20,7 +20,5 @@ private SearchCapabilities() {} /** Support synthetic source with `bit` type in `dense_vector` field when `index` is set to `false`. */ private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source"; - public static final Set CAPABILITIES = Set.of( - BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY - ); + public static final Set CAPABILITIES = Set.of(BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY); } From 712858a76ac328e5119132f4f560408f87f51460 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:35:24 -0400 Subject: [PATCH 7/8] muting test for ccs --- qa/ccs-common-rest/build.gradle | 3 ++- .../rest-api-spec/test/search.vectors/45_knn_search_bit.yml | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/qa/ccs-common-rest/build.gradle b/qa/ccs-common-rest/build.gradle index e5e8c5a489d5b..f3e38b6827ef4 100644 --- a/qa/ccs-common-rest/build.gradle +++ b/qa/ccs-common-rest/build.gradle @@ -40,7 +40,8 @@ tasks.named("yamlRestTest") { 'search.aggregation/220_filters_bucket/cache hits', // node_selector? 
'search.aggregation/50_filter/Standard queries get cached', 'search.aggregation/50_filter/Terms lookup gets cached', // terms lookup by "index" doesn't seem to work correctly - 'search.aggregation/70_adjacency_matrix/Terms lookup' // terms lookup by "index" doesn't seem to work correctly + 'search.aggregation/70_adjacency_matrix/Terms lookup', // terms lookup by "index" doesn't seem to work correctly + 'search.vectors/45_knn_search_bit/Search with synthetic source' // capabilities failure ].join(',') } diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml index 02576ad1b2b01..9b14f46da8da8 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/45_knn_search_bit.yml @@ -359,12 +359,12 @@ setup: --- "Search with synthetic source": - requires: + reason: "Support for bit dense vector synthetic source capability required" + test_runner_features: [capabilities] capabilities: - method: POST path: /_search capabilities: [ bit_dense_vector_synthetic_source ] - test_runner_features: capabilities - reason: "Support for bit dense vector synthetic source capability required" - do: indices.create: index: test_synthetic_source From 67018e00f8a306efa992f7986f0b5e406b83e94f Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:31:37 -0400 Subject: [PATCH 8/8] adding capabilities to the ccs test runner --- qa/ccs-common-rest/build.gradle | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/qa/ccs-common-rest/build.gradle b/qa/ccs-common-rest/build.gradle index f3e38b6827ef4..6121f7dcd4f82 100644 --- a/qa/ccs-common-rest/build.gradle +++ b/qa/ccs-common-rest/build.gradle @@ -10,7 +10,7 @@ apply plugin: 
'elasticsearch.internal-yaml-rest-test' restResources { restApi { - include '_common', 'bulk', 'count', 'cluster', 'field_caps', 'get', 'knn_search', 'index', 'indices', 'msearch', + include 'capabilities', '_common', 'bulk', 'count', 'cluster', 'field_caps', 'get', 'knn_search', 'index', 'indices', 'msearch', 'search', 'async_search', 'graph', '*_point_in_time', 'info', 'scroll', 'clear_scroll', 'search_mvt', 'eql', 'sql' } restTests { @@ -40,8 +40,7 @@ tasks.named("yamlRestTest") { 'search.aggregation/220_filters_bucket/cache hits', // node_selector? 'search.aggregation/50_filter/Standard queries get cached', 'search.aggregation/50_filter/Terms lookup gets cached', // terms lookup by "index" doesn't seem to work correctly - 'search.aggregation/70_adjacency_matrix/Terms lookup', // terms lookup by "index" doesn't seem to work correctly - 'search.vectors/45_knn_search_bit/Search with synthetic source' // capabilities failure + 'search.aggregation/70_adjacency_matrix/Terms lookup' // terms lookup by "index" doesn't seem to work correctly ].join(',') }