From d9f331f18ac4480cc1741a6095575ff5d996f549 Mon Sep 17 00:00:00 2001 From: Anh Dung Bui Date: Thu, 21 Nov 2024 21:08:00 +0900 Subject: [PATCH] Fix doc ord bug & flush writer multiple times --- .../lucene/search/RerankKnnFloatVectorQuery.java | 2 +- .../search/TestRerankKnnFloatVectorQuery.java | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/RerankKnnFloatVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/RerankKnnFloatVectorQuery.java index eb1f65f54863..db8180dfafbf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/RerankKnnFloatVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/RerankKnnFloatVectorQuery.java @@ -73,7 +73,7 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { int docId = iterator.docID(); float[] vectorValue = floatVectorValues.vectorValue(docId); float score = comparer.compare(vectorValue, target); - queue.insertWithOverflow(new ScoreDoc(docId, score)); + queue.insertWithOverflow(new ScoreDoc(leaf.docBase + docId, score)); } } int i = 0; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRerankKnnFloatVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRerankKnnFloatVectorQuery.java index a90494eaf15f..b4ef745ebb5d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRerankKnnFloatVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRerankKnnFloatVectorQuery.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.search; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Random; @@ -46,7 +47,6 @@ public class TestRerankKnnFloatVectorQuery extends LuceneTestCase { VectorSimilarityFunction.COSINE; private Directory directory; private IndexWriterConfig config; - private static final int NUM_VECTORS = 1000; private static final int VECTOR_DIMENSION = 128; @Before @@ -66,15 +66,22 @@ public void testTwoPhaseKnnVectorQuery() throws Exception { Random random = random(); + int numVectors = atLeast(1000); + // Step 1: Index random vectors in quantized format try (IndexWriter writer = new IndexWriter(directory, config)) { - for (int i = 0; i < NUM_VECTORS; i++) { + for (int i = 0; i < numVectors; i++) { float[] vector = randomFloatVector(VECTOR_DIMENSION, random); Document doc = new Document(); doc.add(new IntField("id", i, Field.Store.YES)); doc.add(new KnnFloatVectorField(FIELD, vector, VECTOR_SIMILARITY_FUNCTION)); writer.addDocument(doc); vectors.put(i, vector); + + // flush to create multiple segments + if (random.nextInt(10) == 0) { + writer.flush(); + } } } @@ -93,10 +100,11 @@ public void testTwoPhaseKnnVectorQuery() throws Exception { // Step 3: Verify that TopDocs scores match similarity with unquantized vectors for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document retrievedDoc = searcher.storedFields().document(scoreDoc.doc); - float[] docVector = vectors.get(retrievedDoc.getField("id").numericValue().intValue()); + int id = retrievedDoc.getField("id").numericValue().intValue(); + float[] docVector = vectors.get(id); float expectedScore = VECTOR_SIMILARITY_FUNCTION.compare(targetVector, docVector); Assert.assertEquals( - "Score does not match expected similarity for docId: " + scoreDoc.doc, + "Score does not match expected similarity for doc ord: " + scoreDoc.doc + ", id: " + id, expectedScore, scoreDoc.score, 1e-5);