Skip to content

Commit

Permalink
Fix Synthetic Source Handling for bit Type in dense_vector Field (e…
Browse files Browse the repository at this point in the history
…lastic#114407)

**Description:**

This PR addresses the issue described in [elastic#114402](elastic#114402), where the `synthetic_source` feature does not correctly handle the `bit` type in `dense_vector` fields when `index` is set to `false`. The root cause was that the `bit` type was not properly accounted for: the reconstruction loop expected 8 times as many elements as the doc value actually stores (for `bit` vectors, `dims` counts bits while each stored element is a byte). This mismatch causes an array out-of-bounds exception when reconstructing the document.

**Changes:**

- Adjusted the `synthetic_source` logic to correctly handle the `bit` type by ensuring the array size accounts for the 8x difference in dimensions.
- Added a YAML test to cover the `bit` type scenario in `dense_vector` fields with `index` set to `false`.

**Related Issues:**

- Closes [elastic#114402](elastic#114402)
- Introduced in [elastic#110059](elastic#110059)
  • Loading branch information
Rassyan authored and benwtrent committed Oct 14, 2024
1 parent b0197c8 commit ee6ded6
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 8 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/114407.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 114407
summary: Fix synthetic source handling for `bit` type in `dense_vector` field
area: Search
type: bug
issues:
- 114402
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,54 @@ setup:
dims: 40
index: true
similarity: max_inner_product


---
"Search with synthetic source":
- requires:
capabilities:
- method: POST
path: /_search
capabilities: [ bit_dense_vector_synthetic_source ]
test_runner_features: capabilities
reason: "Support for bit dense vector synthetic source capability required"
- do:
indices.create:
index: test_synthetic_source
body:
mappings:
properties:
name:
type: keyword
vector1:
type: dense_vector
element_type: bit
dims: 40
index: false
vector2:
type: dense_vector
element_type: bit
dims: 40
index: true
similarity: l2_norm

- do:
index:
index: test_synthetic_source
id: "1"
body:
name: cow.jpg
vector1: [2, -1, 1, 4, -3]
vector2: [2, -1, 1, 4, -3]

- do:
indices.refresh: {}

- do:
search:
force_synthetic_source: true
index: test_synthetic_source

- match: {hits.hits.0._id: "1"}
- match: {hits.hits.0._source.vector1: [2, -1, 1, 4, -3]}
- match: {hits.hits.0._source.vector2: [2, -1, 1, 4, -3]}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import java.io.IOException;

import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL;
import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;

public class ES814ScalarQuantizedVectorsFormat extends FlatVectorsFormat {

Expand Down Expand Up @@ -289,4 +290,9 @@ public RandomVectorScorer getRandomVectorScorer(VectorSimilarityFunction sim, Ra
return delegate.getRandomVectorScorer(sim, values, query);
}
}

/**
 * Reports the maximum number of vector dimensions this format accepts.
 *
 * @param fieldName name of the vector field; not consulted, the same limit is returned for every field
 * @return {@code MAX_DIMS_COUNT}, statically imported from {@code DenseVectorFieldMapper}
 */
@Override
public int getMaxDimensions(String fieldName) {
return MAX_DIMS_COUNT;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

import java.io.IOException;

import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;

class ES815BitFlatVectorsFormat extends FlatVectorsFormat {

private final FlatVectorsFormat delegate = new Lucene99FlatVectorsFormat(FlatBitVectorScorer.INSTANCE);
Expand All @@ -43,6 +45,11 @@ public FlatVectorsReader fieldsReader(SegmentReadState segmentReadState) throws
return delegate.fieldsReader(segmentReadState);
}

/**
 * Reports the maximum number of vector dimensions this bit-vector flat format accepts.
 *
 * @param fieldName name of the vector field; not consulted, the same limit is returned for every field
 * @return {@code MAX_DIMS_COUNT}, statically imported from {@code DenseVectorFieldMapper}
 */
@Override
public int getMaxDimensions(String fieldName) {
return MAX_DIMS_COUNT;
}

static class FlatBitVectorScorer implements FlatVectorsScorer {

static final FlatBitVectorScorer INSTANCE = new FlatBitVectorScorer();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2270,7 +2270,7 @@ public void write(XContentBuilder b) throws IOException {
if (indexCreatedVersion.onOrAfter(LITTLE_ENDIAN_FLOAT_STORED_INDEX_VERSION)) {
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
}
int dims = fieldType().dims;
int dims = fieldType().elementType == ElementType.BIT ? fieldType().dims / Byte.SIZE : fieldType().dims;
for (int dim = 0; dim < dims; dim++) {
fieldType().elementType.readAndWriteValue(byteBuffer, b);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ private SearchCapabilities() {}

/** Support regex and range match rules in interval queries. */
private static final String RANGE_REGEX_INTERVAL_QUERY_CAPABILITY = "range_regexp_interval_queries";
/** Support synthetic source with `bit` type in `dense_vector` field when `index` is set to `false`. */
private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source";

public static final Set<String> CAPABILITIES = Set.of(RANGE_REGEX_INTERVAL_QUERY_CAPABILITY);
public static final Set<String> CAPABILITIES = Set.of(
RANGE_REGEX_INTERVAL_QUERY_CAPABILITY,
BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -2040,24 +2040,27 @@ protected boolean supportsEmptyInputArray() {

private static class DenseVectorSyntheticSourceSupport implements SyntheticSourceSupport {
private final int dims = between(5, 1000);
private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT);
private final ElementType elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT);
private final boolean indexed = randomBoolean();
private final boolean indexOptionsSet = indexed && randomBoolean();

@Override
public SyntheticSourceExample example(int maxValues) throws IOException {
Object value = elementType == ElementType.BYTE
? randomList(dims, dims, ESTestCase::randomByte)
: randomList(dims, dims, ESTestCase::randomFloat);
Object value = switch (elementType) {
case BYTE, BIT:
yield randomList(dims, dims, ESTestCase::randomByte);
case FLOAT:
yield randomList(dims, dims, ESTestCase::randomFloat);
};
return new SyntheticSourceExample(value, value, this::mapping);
}

private void mapping(XContentBuilder b) throws IOException {
b.field("type", "dense_vector");
b.field("dims", dims);
if (elementType == ElementType.BYTE || randomBoolean()) {
if (elementType == ElementType.BYTE || elementType == ElementType.BIT || randomBoolean()) {
b.field("element_type", elementType.toString());
}
b.field("dims", elementType == ElementType.BIT ? dims * Byte.SIZE : dims);
if (indexed) {
b.field("index", true);
b.field("similarity", "l2_norm");
Expand Down

0 comments on commit ee6ded6

Please sign in to comment.