Add Query for reranking KnnFloatVectorQuery with full-precision vectors #14009

Status: Open · wants to merge 15 commits into base: main
@@ -99,7 +99,7 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
     if (topK.scoreDocs.length == 0) {
       return new MatchNoDocsQuery();
     }
-    return createRewrittenQuery(reader, topK);
+    return createRewrittenQuery(reader, topK.scoreDocs);
   }

   private TopDocs searchLeaf(
@@ -255,18 +255,18 @@ protected TopDocs mergeLeafResults(TopDocs[] perLeafResults) {
     return TopDocs.merge(k, perLeafResults);
   }

-  private Query createRewrittenQuery(IndexReader reader, TopDocs topK) {
-    int len = topK.scoreDocs.length;
+  static Query createRewrittenQuery(IndexReader reader, ScoreDoc[] scoreDocs) {
+    int len = scoreDocs.length;

     assert len > 0;
-    float maxScore = topK.scoreDocs[0].score;
+    float maxScore = scoreDocs[0].score;

-    Arrays.sort(topK.scoreDocs, Comparator.comparingInt(a -> a.doc));
+    Arrays.sort(scoreDocs, Comparator.comparingInt(a -> a.doc));
     int[] docs = new int[len];
     float[] scores = new float[len];
     for (int i = 0; i < len; i++) {
-      docs[i] = topK.scoreDocs[i].doc;
-      scores[i] = topK.scoreDocs[i].score;
+      docs[i] = scoreDocs[i].doc;
+      scores[i] = scoreDocs[i].score;
     }
     int[] segmentStarts = findSegmentStarts(reader.leaves(), docs);
     return new DocAndScoreQuery(docs, scores, maxScore, segmentStarts, reader.getContext().id());
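The refactored `createRewrittenQuery` relies on `findSegmentStarts` to map the globally sorted doc ids back to per-segment ranges. A plain-JDK sketch of that idea (the method name and signature here are illustrative, not Lucene's actual API):

```java
import java.util.Arrays;

public class SegmentStarts {
    // For each segment, whose global doc ids start at docBases[i], find the first
    // index in the sorted global docs array that belongs to that segment. The
    // extra trailing slot makes starts[i + 1] - starts[i] the hit count of segment i.
    static int[] findStarts(int[] docBases, int[] sortedDocs) {
        int[] starts = new int[docBases.length + 1];
        for (int i = 0; i < docBases.length; i++) {
            int pos = Arrays.binarySearch(sortedDocs, docBases[i]);
            starts[i] = pos >= 0 ? pos : -pos - 1; // insertion point if not present
        }
        starts[docBases.length] = sortedDocs.length;
        return starts;
    }

    public static void main(String[] args) {
        // Two segments covering doc ids 0-9 and 10-19; four hits spread across both.
        int[] starts = findStarts(new int[] {0, 10}, new int[] {2, 5, 11, 17});
        System.out.println(Arrays.toString(starts)); // [0, 2, 4]
    }
}
```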
@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;

import static org.apache.lucene.search.AbstractKnnVectorQuery.createRewrittenQuery;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.VectorSimilarityFunction;

/**
* A wrapper of KnnFloatVectorQuery which does full-precision reranking.
*
* @lucene.experimental
*/
public class RerankKnnFloatVectorQuery extends Query {

private final int k;
private final float[] target;
private final KnnFloatVectorQuery query;

/**
* Execute the KnnFloatVectorQuery and re-rank using full-precision vectors
*
* @param query the KNN query to execute as initial phase
* @param target the target of the search
* @param k the number of documents to find
* @throws IllegalArgumentException if <code>k</code> is less than 1
*/
public RerankKnnFloatVectorQuery(KnnFloatVectorQuery query, float[] target, int k) {
this.query = query;
this.target = target;
this.k = k;
}

@Override
public Query rewrite(IndexSearcher indexSearcher) throws IOException {
IndexReader reader = indexSearcher.getIndexReader();
Query rewritten = indexSearcher.rewrite(query);
// short-circuit: don't re-rank if we already got all possible results
if (query.getK() <= k) {
return rewritten;
}
Weight weight = indexSearcher.createWeight(rewritten, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
HitQueue queue = new HitQueue(k, false);
for (var leaf : reader.leaves()) {
Review comment (Contributor): Should this be switched to parallel execution similar to AbstractKnnVectorQuery?
Reply (dungba88, author, Nov 27, 2024): Good question. I used a single thread as a simple first version to benchmark latency, since multi-threading can add overhead of its own. This class only loads vectors and computes similarity for a small candidate set (k * oversample), so it is not as CPU-intensive as AbstractKnnVectorQuery.

I'll also try multi-threading and rerun the benchmark. Since the benchmark below shows the re-ranking phase adds only a trivial amount of latency, it might not help much. Also, the benchmark code seems to force-merge, so there is only a single segment; we need to change it so that there are multiple segments.

Review comment on lines +63 to +64 (shubhamvishu, Nov 28, 2024): Here we have access to IndexSearcher#getTaskExecutor and could use it to parallelize the work across segments (like we did earlier with some other query rewrites). But the HitQueue here isn't thread-safe. I don't know whether making insertWithOverflow thread-safe and adding concurrency would really help, since the added cost looks cheap. Or maybe it would?

Reply (author): That's right. To apply parallelism we need a per-segment queue, then merge them as in AbstractKnnVectorQuery.mergeLeafResults. The added latency is already low, but I still want to try it and see if it helps.
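The per-segment scheme discussed above (one queue per segment, merged at the end) can be sketched with plain JDK types. ExecutorService stands in for IndexSearcher#getTaskExecutor and the scoring function is a placeholder, so none of this is Lucene API:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.function.IntToDoubleFunction;

public class PerSegmentTopK {
    record Hit(int doc, float score) {}

    // Collect top-k per segment in parallel, then merge the per-segment results.
    static List<Hit> topK(List<int[]> segments, int k, IntToDoubleFunction scoreFn)
            throws InterruptedException, ExecutionException {
        ExecutorService pool = Executors.newFixedThreadPool(Math.min(4, segments.size()));
        try {
            List<Future<List<Hit>>> futures = new ArrayList<>();
            for (int[] segment : segments) {
                futures.add(pool.submit(() -> {
                    // Min-heap of size k, local to this segment: thread-safe by isolation.
                    PriorityQueue<Hit> queue =
                        new PriorityQueue<>(Comparator.comparingDouble(Hit::score));
                    for (int doc : segment) {
                        queue.offer(new Hit(doc, (float) scoreFn.applyAsDouble(doc)));
                        if (queue.size() > k) queue.poll();
                    }
                    return new ArrayList<>(queue);
                }));
            }
            // Merge phase: concatenate per-segment results and keep the global top-k.
            List<Hit> all = new ArrayList<>();
            for (Future<List<Hit>> f : futures) all.addAll(f.get());
            all.sort(Comparator.comparingDouble(Hit::score).reversed());
            return all.subList(0, Math.min(k, all.size()));
        } finally {
            pool.shutdown();
        }
    }

    public static void main(String[] args) throws Exception {
        List<int[]> segments = List.of(new int[] {1, 2, 3}, new int[] {10, 11, 12});
        List<Hit> top = topK(segments, 2, doc -> doc); // score = doc id, for the demo
        System.out.println(top); // docs 12 and 11 score highest
    }
}
```

Because each task owns its queue, no shared structure needs locking; only the final merge touches all results, mirroring AbstractKnnVectorQuery.mergeLeafResults.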

Scorer scorer = weight.scorer(leaf);
if (scorer == null) {
continue;
}
FloatVectorValues floatVectorValues = leaf.reader().getFloatVectorValues(query.getField());
if (floatVectorValues == null) {
continue;
}
FieldInfo fi = leaf.reader().getFieldInfos().fieldInfo(query.getField());
if (fi == null) {
continue;
}
VectorSimilarityFunction comparer = fi.getVectorSimilarityFunction();
DocIdSetIterator iterator = scorer.iterator();
while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int docId = iterator.docID();
float[] vectorValue = floatVectorValues.vectorValue(docId);
float score = comparer.compare(vectorValue, target);
queue.insertWithOverflow(new ScoreDoc(leaf.docBase + docId, score));
}
}
ScoreDoc[] scoreDocs = new ScoreDoc[queue.size()];
if (scoreDocs.length == 0) {
return new MatchNoDocsQuery();
}
// Pop in ascending-score order and fill from the back: createRewrittenQuery expects
// scoreDocs[0] to hold the maximum score, and the queue's iterator gives no ordering guarantee
for (int i = scoreDocs.length - 1; i >= 0; i--) {
scoreDocs[i] = queue.pop();
}
return createRewrittenQuery(reader, scoreDocs);
}

@Override
public int hashCode() {
int result = Arrays.hashCode(target);
result = 31 * result + Objects.hash(query, k);
return result;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
RerankKnnFloatVectorQuery that = (RerankKnnFloatVectorQuery) o;
return Objects.equals(query, that.query) && Arrays.equals(target, that.target) && k == that.k;
}

@Override
public void visit(QueryVisitor visitor) {
query.visit(visitor);
}

@Override
public String toString(String field) {
return getClass().getSimpleName() + ":" + query.toString(field) + "[" + k + "]";
}
}
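To make the two-phase shape concrete: the inner KnnFloatVectorQuery supplies an oversampled candidate list, and the class above rescores those candidates exactly and keeps the best k. A minimal plain-Java sketch of that second phase (names are illustrative, not Lucene API):

```java
import java.util.Arrays;
import java.util.Comparator;
import java.util.function.IntToDoubleFunction;

public class RerankSketch {
    // Re-rank an oversampled candidate list with an exact scoring function, keeping
    // only the top k. This is the pattern RerankKnnFloatVectorQuery applies with
    // full-precision vectors after a quantized first pass.
    static int[] rerank(int[] candidates, IntToDoubleFunction exactScore, int k) {
        Integer[] boxed = Arrays.stream(candidates).boxed().toArray(Integer[]::new);
        Arrays.sort(boxed, Comparator.comparingDouble(
            (Integer doc) -> exactScore.applyAsDouble(doc)).reversed());
        int[] top = new int[Math.min(k, boxed.length)];
        for (int i = 0; i < top.length; i++) top[i] = boxed[i];
        return top;
    }

    public static void main(String[] args) {
        // Five oversampled candidates from the approximate pass; exact score = 100 - doc.
        int[] top2 = rerank(new int[] {7, 3, 9, 1, 5}, doc -> 100 - doc, 2);
        System.out.println(Arrays.toString(top2)); // [1, 3]
    }
}
```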
@@ -0,0 +1,124 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;

import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

public class TestRerankKnnFloatVectorQuery extends LuceneTestCase {

private static final String FIELD = "vector";
private static final VectorSimilarityFunction VECTOR_SIMILARITY_FUNCTION =
VectorSimilarityFunction.COSINE;
private static final int NUM_VECTORS = 1000;
private static final int VECTOR_DIMENSION = 128;

private Directory directory;
private IndexWriterConfig config;

@Before
@Override
public void setUp() throws Exception {
super.setUp();
directory = new ByteBuffersDirectory();

// Set up the IndexWriterConfig to use quantized vector storage
config = new IndexWriterConfig();
config.setCodec(
TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswScalarQuantizedVectorsFormat()));
}

@Test
public void testRerankKnnFloatVectorQuery() throws Exception {
Map<Integer, float[]> vectors = new HashMap<>();

Random random = random();

int numVectors = atLeast(NUM_VECTORS);
int numSegments = random.nextInt(2, 10);

// Step 1: Index random vectors in quantized format
try (IndexWriter writer = new IndexWriter(directory, config)) {
for (int j = 0; j < numSegments; j++) {
for (int i = 0; i < numVectors; i++) {
float[] vector = randomFloatVector(VECTOR_DIMENSION, random);
Document doc = new Document();
int id = j * numVectors + i;
doc.add(new IntField("id", id, Field.Store.YES));
doc.add(new KnnFloatVectorField(FIELD, vector, VECTOR_SIMILARITY_FUNCTION));
writer.addDocument(doc);
vectors.put(id, vector);
}
// flush once per batch, not per document, so the index ends up with about numSegments segments
writer.flush();
}
}

// Step 2: Run RerankKnnFloatVectorQuery with a random target vector
try (IndexReader reader = DirectoryReader.open(directory)) {
IndexSearcher searcher = new IndexSearcher(reader);
float[] targetVector = randomFloatVector(VECTOR_DIMENSION, random);
int k = 10;
double oversample = random.nextFloat(1.5f, 3.0f);

KnnFloatVectorQuery knnQuery =
new KnnFloatVectorQuery(FIELD, targetVector, k + (int) (k * oversample));
RerankKnnFloatVectorQuery query = new RerankKnnFloatVectorQuery(knnQuery, targetVector, k);
TopDocs topDocs = searcher.search(query, k);

// Step 3: Verify that TopDocs scores match similarity with unquantized vectors
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
Document retrievedDoc = searcher.storedFields().document(scoreDoc.doc);
int id = retrievedDoc.getField("id").numericValue().intValue();
float[] docVector = vectors.get(id);
assert docVector != null : "Vector for id " + id + " not found";
float expectedScore = VECTOR_SIMILARITY_FUNCTION.compare(targetVector, docVector);
Assert.assertEquals(
"Score does not match expected similarity for doc ord: " + scoreDoc.doc + ", id: " + id,
expectedScore,
scoreDoc.score,
1e-5);
}
Review comment on lines +107 to +113: We can test that the results are sorted by exact distance. Maybe we can also test that the same query with oversample gives results "at least as good or better" than without oversample, where "better" means higher recall. But I'm not sure that's deterministic.

Reply: Thinking again, the docs should be sorted by ord, so my first point should be irrelevant.
}
}

private float[] randomFloatVector(int dimension, Random random) {
float[] vector = new float[dimension];
for (int i = 0; i < dimension; i++) {
vector[i] = random.nextFloat();
}
return vector;
}
}
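The expectedScore assertion in Step 3 depends on how VectorSimilarityFunction.COSINE maps a raw cosine to a score. A plain-Java sketch of that computation follows; the (1 + cos) / 2 scaling into [0, 1] is my understanding of Lucene's convention and should be treated as an assumption:

```java
public class CosineScore {
    // Cosine similarity scaled into [0, 1], mirroring (as an assumption) how
    // Lucene's VectorSimilarityFunction.COSINE maps a cosine in [-1, 1] to a score.
    static float compare(float[] a, float[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        double cosine = dot / Math.sqrt(normA * normB);
        return (float) ((1 + cosine) / 2);
    }

    public static void main(String[] args) {
        System.out.println(compare(new float[] {1, 0}, new float[] {1, 0}));  // 1.0 (parallel)
        System.out.println(compare(new float[] {1, 0}, new float[] {-1, 0})); // 0.0 (opposite)
        System.out.println(compare(new float[] {1, 0}, new float[] {0, 1}));  // 0.5 (orthogonal)
    }
}
```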