From 31afa465f9a9b646b64e626027b9fcb7c168491f Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Wed, 5 Oct 2022 16:40:50 -0400 Subject: [PATCH] Fixed KNN search formulas (#1447) Signed-off-by: Fanit Kolchina Signed-off-by: Fanit Kolchina --- _search-plugins/knn/approximate-knn.md | 32 ++++++++++----------- _search-plugins/knn/knn-score-script.md | 38 ++++++++++++------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md index 75f3bf85fb..cdac0879f5 100644 --- a/_search-plugins/knn/approximate-knn.md +++ b/_search-plugins/knn/approximate-knn.md @@ -279,39 +279,39 @@ GET my-knn-index-1/_search A space corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a greater score equates to a better result. To convert distances to OpenSearch scores, we take 1 / (1 + distance). The spaces that the k-NN plugin supports are listed below. Not every method supports each of these spaces. Be sure to check out [the method documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions) to make sure the space you are interested in is supported. - + - + - - - + + + - - - + + + - - + + - - + + - - + +
spaceTypeDistance FunctionDistance Function (d) OpenSearch Score
l2\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i)^2 \]1 / (1 + Distance Function)l1\[ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n |x_i - y_i| \]\[ score = {1 \over 1 + d } \]
l1\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i) \]1 / (1 + Distance Function)l2\[ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n (x_i - y_i)^2 \]\[ score = {1 \over 1 + d } \]
linf\[ Distance(X, Y) = Max(X_i - Y_i) \]1 / (1 + Distance Function)\[ d(\mathbf{x}, \mathbf{y}) = \max(|x_i - y_i|) \]\[ score = {1 \over 1 + d } \]
cosinesimil\[ 1 - {A · B \over \|A\| · \|B\|} = 1 - - {\sum_{i=1}^n (A_i · B_i) \over \sqrt{\sum_{i=1}^n A_i^2} · \sqrt{\sum_{i=1}^n B_i^2}}\] - where \(\|A\|\) and \(\|B\|\) represent normalized vectors.nmslib and faiss:
1 / (1 + Distance Function)
Lucene:
(1 + Distance Function) / 2
\[ d(\mathbf{x}, \mathbf{y}) = 1 - \cos { \theta } = 1 - {\mathbf{x} · \mathbf{y} \over \|\mathbf{x}\| · \|\mathbf{y}\|}\]\[ = 1 - + {\sum_{i=1}^n x_i y_i \over \sqrt{\sum_{i=1}^n x_i^2} · \sqrt{\sum_{i=1}^n y_i^2}}\] + where \(\|\mathbf{x}\|\) and \(\|\mathbf{y}\|\) represent the norms of vectors \(\mathbf{x}\) and \(\mathbf{y}\), respectively.nmslib and faiss:\[ score = {1 \over 1 + d } \]
Lucene:\[ score = {2 - d \over 2}\]
innerproduct (not supported for Lucene)\[ Distance(X, Y) = - {A · B} \]if Distance Function is > or = 0, use 1 / (1 + Distance Function). Otherwise, -Distance Function + 1\[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \]\[ \text{If } d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If } d < 0, score = -d + 1\]
diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md index a02e85defd..1f77d8ff4f 100644 --- a/_search-plugins/knn/knn-score-script.md +++ b/_search-plugins/knn/knn-score-script.md @@ -282,44 +282,44 @@ GET my-long-index/_search A space corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a greater score equates to a better result. The following table illustrates how OpenSearch converts spaces to scores: - + - + - - - + + + - - - + + + - - + + - - + + - - - + + + - - + +
spaceTypeDistance FunctionDistance Function (d) OpenSearch Score
l2\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i)^2 \]1 / (1 + Distance Function)l1\[ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n |x_i - y_i| \]\[ score = {1 \over 1 + d } \]
l1\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i) \]1 / (1 + Distance Function)l2\[ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n (x_i - y_i)^2 \]\[ score = {1 \over 1 + d } \]
linf\[ Distance(X, Y) = Max(X_i - Y_i) \]1 / (1 + Distance Function)\[ d(\mathbf{x}, \mathbf{y}) = \max(|x_i - y_i|) \]\[ score = {1 \over 1 + d } \]
cosinesimil\[ {A · B \over \|A\| · \|B\|} = - {\sum_{i=1}^n (A_i · B_i) \over \sqrt{\sum_{i=1}^n A_i^2} · \sqrt{\sum_{i=1}^n B_i^2}}\] - where \(\|A\|\) and \(\|B\|\) represent normalized vectors.1 + Distance Function\[ d(\mathbf{x}, \mathbf{y}) = \cos \theta = {\mathbf{x} · \mathbf{y} \over \|\mathbf{x}\| · \|\mathbf{y}\|}\]\[ = + {\sum_{i=1}^n x_i y_i \over \sqrt{\sum_{i=1}^n x_i^2} · \sqrt{\sum_{i=1}^n y_i^2}}\] + where \(\|\mathbf{x}\|\) and \(\|\mathbf{y}\|\) represent the norms of vectors \(\mathbf{x}\) and \(\mathbf{y}\), respectively.\[ score = 1 + d \]
innerproduct\[ Distance(X, Y) = \sum_{i=1}^n X_iY_i \]1 / (1 + Distance Function)innerproduct (not supported for Lucene)\[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \]\[ \text{If } d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If } d < 0, score = -d + 1\]
hammingbitDistance = countSetBits(X \(\oplus\) Y) 1 / (1 + Distance Function)\[ d(\mathbf{x}, \mathbf{y}) = \text{countSetBits}(\mathbf{x} \oplus \mathbf{y})\]\[ score = {1 \over 1 + d } \]