Have one score definition for cosine similarity

Currently we have different score calculations for cosine similarity. For example, script score, approximate search, and exact search each use a different formula to convert distance into a cosine similarity score that is aligned with the OpenSearch score. To keep this consistent, we will use one definition, the one Lucene uses as its standard definition of cosine similarity, for all search types. Signed-off-by: Vijayan Balasubramanian <[email protected]>
VijayanB · Dec 30, 2024 · 6597eb1 · 6597eb1
1 parent c728f02
commit 6597eb1
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Introduced a writing layer in native engines where relies on the writing interface to process IO. (#2241)[https://github.com/opensearch-project/k-NN/pull/2241]
 - Allow method parameter override for training based indices (#2290) https://github.com/opensearch-project/k-NN/pull/2290]
 - Optimizes lucene query execution to prevent unnecessary rewrites (#2305)[https://github.com/opensearch-project/k-NN/pull/2305]
+- Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357]
 ### Bug Fixes
 * Fixing the bug when a segment has no vector field present for disk based vector search (#2282)[https://github.com/opensearch-project/k-NN/pull/2282]
 * Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315]

diff --git a/src/main/java/org/opensearch/knn/index/SpaceType.java b/src/main/java/org/opensearch/knn/index/SpaceType.java
@@ -60,9 +60,21 @@ public float scoreToDistanceTranslation(float score) {
         }
     },
     COSINESIMIL("cosinesimil") {
+        /**
+         * Cosine similarity has a range of [-1, 1], where -1 means the vectors point in diametrically opposite directions and 1
+         * means they point in the same direction and are perfectly similar. In Lucene, scores have to be in the range of
+         * [0, Float.MAX_VALUE]. Hence, to move the range from [-1, 1] into [0, Float.MAX_VALUE], we convert using the following
+         * formula, which is adopted by Lucene as mentioned here:
+         * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
+         * We expect raw score = 1 - cosine(x, y). If the underlying library returns a different range, or something other than the
+         * expected raw score, it should override this method to either provide a valid range or convert the raw score to 1 - cosine and call this method.
+         *
+         * @param rawScore score returned from underlying library
+         * @return Lucene scaled score
+         */
         @Override
         public float scoreTranslation(float rawScore) {
-            return 1 / (1 + rawScore);
+            return Math.max((2.0F - rawScore) / 2.0F, 0.0F);
         }
 
         @Override

diff --git a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java
@@ -144,7 +144,12 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) {
         protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery) {
             SpaceType.COSINESIMIL.validateVector(processedQuery);
             float qVectorSquaredMagnitude = getVectorMagnitudeSquared(processedQuery);
-            return (float[] q, float[] v) -> 1 + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude);
+            // To be consistent, we will be using same formula used by lucene as mentioned below
+            // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
+            return (float[] q, float[] v) -> Math.max(
+                (1.0F + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude)) / 2.0F,
+                0.0F
+            );
         }
     }
 

diff --git a/src/test/java/org/opensearch/knn/index/NmslibIT.java b/src/test/java/org/opensearch/knn/index/NmslibIT.java
@@ -195,6 +195,64 @@ public void testEndToEnd() throws Exception {
         fail("Graphs are not getting evicted");
     }
 
+    public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception {
+        String indexName = "test-index-1";
+        String fieldName = "test-field-1";
+        SpaceType spaceType = SpaceType.COSINESIMIL;
+        Integer dimension = testData.indexData.vectors[0].length;
+
+        // Create an index
+        XContentBuilder builder = XContentFactory.jsonBuilder()
+            .startObject()
+            .startObject("properties")
+            .startObject(fieldName)
+            .field("type", "knn_vector")
+            .field("dimension", dimension)
+            .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, spaceType.getValue())
+            .startObject(KNNConstants.KNN_METHOD)
+            .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW)
+            .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName())
+            .endObject()
+            .endObject()
+            .endObject()
+            .endObject();
+
+        Map<String, Object> mappingMap = xContentBuilderToMap(builder);
+        String mapping = builder.toString();
+
+        createKnnIndex(indexName, buildKNNIndexSettings(0), mapping);
+
+        // Index one document
+        addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray());
+
+        // Assert we have the right number of documents in the index
+        refreshAllIndices();
+        assertEquals(1, getDocCount(indexName));
+        // update threshold setting to skip building graph
+        updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1));
+        // add duplicate document with different id
+        addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray());
+        assertEquals(2, getDocCount(indexName));
+        final int k = 2;
+        // search index
+        Response response = searchKNNIndex(
+            indexName,
+            KNNQueryBuilder.builder().fieldName(fieldName).vector(testData.queries[0]).k(k).build(),
+            k
+        );
+        String responseBody = EntityUtils.toString(response.getEntity());
+        List<KNNResult> knnResults = parseSearchResponse(responseBody, fieldName);
+        assertEquals(k, knnResults.size());
+
+        List<Float> actualScores = parseSearchResponseScore(responseBody, fieldName);
+
+        // both documents should have identical scores
+        assertEquals(actualScores.get(0), actualScores.get(1), 0.001);
+
+        // Delete index
+        deleteKNNIndex(indexName);
+    }
+
     @SneakyThrows
     private void validateSearch(
         final String indexName,

diff --git a/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java b/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java
@@ -10,6 +10,7 @@
 import java.util.Locale;
 
 import lombok.SneakyThrows;
+import org.apache.lucene.index.VectorSimilarityFunction;
 import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.knn.KNNTestCase;
 import org.opensearch.knn.index.engine.KNNMethodContext;
@@ -86,7 +87,11 @@ public void testCosineSimilarity_whenValid_thenSucceed() {
             getMappingConfigForMethodMapping(knnMethodContext, 3)
         );
         KNNScoringSpace.CosineSimilarity cosineSimilarity = new KNNScoringSpace.CosineSimilarity(arrayListQueryObject, fieldType);
-        assertEquals(2F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F);
+        assertEquals(
+            VectorSimilarityFunction.COSINE.compare(arrayFloat2, arrayFloat),
+            cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat),
+            0.1F
+        );
 
         // invalid zero vector
         final List<Float> queryZeroVector = List.of(0.0f, 0.0f, 0.0f);