From 17e6e6d2042666a7a1bfd315fee8ee2b7c785401 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 3 Sep 2024 03:00:27 -0700 Subject: [PATCH] Added rrf method --- CHANGELOG.md | 1 + README.md | 8 +++++++- examples/hybrid.rb | 5 ++++- lib/searchkick.rb | 1 + lib/searchkick/reranking.rb | 28 ++++++++++++++++++++++++++++ test/hybrid_test.rb | 7 +++++-- 6 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 lib/searchkick/reranking.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c1e6e4b..eeacca7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 5.4.0 (unreleased) - Added experimental `knn` option +- Added experimental `rrf` method - Added experimental support for `_raw` to `where` option - Added warning for `exists` with non-`true` values - Added warning for full reindex and `:queue` mode diff --git a/README.md b/README.md index 1afad076..b5c6300f 100644 --- a/README.md +++ b/README.md @@ -1875,7 +1875,13 @@ semantic_search = Product.search(knn: {field: :embedding, vector: [1, 2, 3]}, li Searchkick.multi_search([keyword_search, semantic_search]) ``` -To combine the results, use a reranking model +To combine the results, use Reciprocal Rank Fusion (RRF) + +```ruby +Searchkick::Reranking.rrf(keyword_search, semantic_search) +``` + +Or a reranking model ```ruby rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1") diff --git a/examples/hybrid.rb b/examples/hybrid.rb index 23a5a90a..2ad943b0 100644 --- a/examples/hybrid.rb +++ b/examples/hybrid.rb @@ -45,7 +45,10 @@ class Document < ActiveRecord::Base Searchkick.multi_search([keyword_search, semantic_search]) -# to combine the results, use a reranking model +# to combine the results, use Reciprocal Rank Fusion (RRF) +p Searchkick::Reranking.rrf(keyword_search, semantic_search).map { |v| v[:result].content } + +# or a reranking model rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1") results = (keyword_search.to_a + 
module Searchkick
  # Helpers for merging several ranked result lists into one ranking.
  module Reranking
    # Combines rankings with Reciprocal Rank Fusion (RRF).
    #
    # Every distinct result receives the sum, over each ranking that contains
    # it, of <tt>1.0 / (k + rank)</tt>, where +rank+ is the result's 1-based
    # position in that ranking. Rankings that do not contain the result
    # contribute nothing to its score.
    #
    # Results are matched across rankings by Hash key equality
    # (+hash+ / +eql?+).
    #
    # @param first_ranking [#to_ary] the first ranked list (at least one is required)
    # @param rankings [Array<#to_ary>] any further ranked lists
    # @param k [Numeric] constant added to every rank before taking the reciprocal
    # @return [Array<Hash>] one <tt>{result:, score:}</tt> hash per distinct
    #   result, ordered by descending score; results with equal scores keep
    #   the order in which they were first seen across the rankings
    def self.rrf(first_ranking, *rankings, k: 60)
      lists = [first_ranking, *rankings].map(&:to_ary)

      # One {result => 1-based rank} table per list. A result repeated inside
      # a single list keeps its first (best) position rather than its last.
      rank_tables =
        lists.map do |list|
          list.each_with_index.each_with_object({}) do |(result, index), table|
            table[result] ||= index + 1
          end
        end

      scored =
        lists.flatten(1).uniq.map do |result|
          score =
            rank_tables.sum(0.0) do |table|
              rank = table[result]
              rank ? 1.0 / (k + rank) : 0.0
            end

          {result: result, score: score}
        end

      # sort_by is not guaranteed to be stable, so ties are broken explicitly
      # on first-seen position to keep the output deterministic.
      scored.each_with_index.sort_by { |entry, index| [-entry[:score], index] }.map(&:first)
    end
  end
end