Skip to content

Commit

Permalink
Improve ranked-search particularly for empty or invalid input
Browse files Browse the repository at this point in the history
  • Loading branch information
wardle committed Nov 7, 2024
1 parent 5936988 commit 7332fab
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 27 deletions.
3 changes: 2 additions & 1 deletion src/com/eldrix/hermes/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,8 @@
[[search]], in which no results would be returned if there is a token that
matches no results, [[ranked-search]] simply scores from best to worst.
This function is most useful for finding best matches, while [[search]]
is best used for autocompletion."
is best used for autocompletion. Unlike [[search]], this function returns
no results if there is no search string, or no tokens in the search string."
[^Svc svc params]
(search/do-ranked-search (.-searcher svc) (make-search-params svc params)))

Expand Down
61 changes: 37 additions & 24 deletions src/com/eldrix/hermes/impl/search.clj
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,14 @@
(let [term (.toString termAtt)]
(recur (.incrementToken tokenStream) (conj result term))))))))

(defn- make-autocomplete-tokens-query
(defn make-autocomplete-tokens-query
"Create a query that matches ALL tokens within `s` for field `field-name`.
This is most appropriate for autocompletion, in which a user would expect
to type and have all tokens considered in search."
to type and have all tokens considered in search. If `s` is nil, or has
no valid tokens (e.g an empty string or only stop characters), then this
returns nil. This permits use as a filter in autocompletion such that an
unfiltered list could be returned. This behaviour is different to
[[make-ranked-search-tokens-query]]."
(^BooleanQuery [field-name s] (make-autocomplete-tokens-query field-name s 0))
(^BooleanQuery [field-name s fuzzy]
(with-open [analyzer (StandardAnalyzer.)]
Expand All @@ -234,23 +238,27 @@
(.build builder))
(first qs)))))))

(defn- make-ranked-search-tokens-query
(defn make-ranked-search-tokens-query
"Create a query that searches using the tokens in `s`. This should be used
for 'ranked' search, in which one is looking for a best match. This means
some tokens may be ignored, but results will be ranked higher if more
tokens match."
tokens match. Unlike [[make-autocomplete-tokens-query]] this will always
return a query, and will return a MatchNoDocsQuery if the `s` is nil, or
has no valid tokens."
(^BooleanQuery [field-name s]
(make-ranked-search-tokens-query field-name s 0))
(^BooleanQuery [field-name s fuzzy]
(with-open [analyzer (StandardAnalyzer.)]
(when s
(let [qs (map #(make-token-query field-name % fuzzy) (tokenize analyzer field-name s))]
(if (pos-int? (count qs))
(let [builder (BooleanQuery$Builder.)]
(doseq [q qs]
(.add builder q BooleanClause$Occur/SHOULD))
(.build builder))
(first qs)))))))
(or
(when s
(let [qs (map #(make-token-query field-name % fuzzy) (tokenize analyzer field-name s))]
(if (pos-int? (count qs))
(let [builder (BooleanQuery$Builder.)]
(doseq [q qs]
(.add builder q BooleanClause$Occur/SHOULD))
(.build builder))
(first qs))))
(MatchNoDocsQuery.))))) ;; fallback to always returning a query that will match no results

(defn q-or [queries]
(lucene/q-or queries))
Expand Down Expand Up @@ -288,12 +296,13 @@
^Query
[{:keys [s query fuzzy show-fsn? inactive-concepts? inactive-descriptions? boost-length? concept-refsets properties]
:or {show-fsn? false, inactive-concepts? false, inactive-descriptions? true, boost-length? true}}]
(let [qb (cond-> (BooleanQuery$Builder.)
(let [s-query (make-autocomplete-tokens-query "nterm" s fuzzy) ;; doesn't matter if s is nil or blank
qb (cond-> (BooleanQuery$Builder.)

s
(.add (make-autocomplete-tokens-query "nterm" s fuzzy) BooleanClause$Occur/MUST)
s-query ;; if we have a search string AND it was turned into a non-nil query, use it
(.add s-query BooleanClause$Occur/MUST)

query
query ;; also bring in any other query
(.add query BooleanClause$Occur/MUST)

(not inactive-concepts?)
Expand Down Expand Up @@ -418,6 +427,8 @@ items."
(when (and (zero? fuzzy) (pos? fallback)) ; only fallback to fuzzy search if no fuzziness requested first time
(do-search searcher (assoc params :fuzzy fallback)))))))

(s/fdef do-ranked-search
:args (s/cat :searcher ::searcher :params ::search-params))
(defn do-ranked-search
"A modified [[do-search]] that returns results in ranked order. `max-hits`
must be specified.
Expand All @@ -427,16 +438,18 @@ items."
```
will return the 'best' match for the search tokens. Unlike [[do-search]],
which is useful for autocompletion, this will not return zero results if
one or more token is not found. Instead, results will be ranked from 'best'
to 'worst'. An important design consideration here was to not alter the
functioning of [[do-search]] with conditionals."
one or more token is not found. Instead, results will be ranked from 'best'
to 'worst'. An important design consideration here was to not alter the
functioning of [[do-search]] with conditionals. Note: no results will be
returned if 's' is nil, or empty, or contains no valid word tokens."
[searcher {:keys [s fuzzy query] :as params}]
(let [q1 (make-ranked-search-tokens-query "nterm" s fuzzy)
q2 (if query (q-and [q1 query]) q1)]
(do-search searcher (-> params ;; alter original parameters to use internal API to do ranked search, not autocompletion
(dissoc :s)
(assoc :boost-length? false
:query q2)))))
q2 (if (and q1 query) (q-and [q1 query]) q1)]
(when q2
(do-search searcher (-> params ;; alter original parameters to use internal API to do ranked search, not autocompletion
(dissoc :s)
(assoc :boost-length? false
:query q2))))))

(defn q-self
"Returns a query that will only return documents for the concept specified."
Expand Down
26 changes: 24 additions & 2 deletions test/com/eldrix/hermes/search_test.clj
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
(ns com.eldrix.hermes.search-test
(:require [clojure.core.async :as async]
[clojure.spec.gen.alpha :as gen]
[clojure.test :refer [deftest is]]
[clojure.test :refer [deftest is testing]]
[com.eldrix.hermes.core :as hermes]
[com.eldrix.hermes.impl.lucene :as lucene]
[com.eldrix.hermes.impl.search :as search]))
[com.eldrix.hermes.impl.search :as search])
(:import (org.apache.lucene.search Query)))

(def example-results-1
[{:id 464271012
Expand Down Expand Up @@ -70,6 +71,27 @@
(with-open [svc (hermes/open "snomed.db")]
(is (empty? (hermes/search svc {:constraint "<24700007" :query (search/q-concept-id 24700007)})))))

(deftest ^:live test-token-queries
(let [ss (gen/sample (gen/string) 1000)]
(doseq [s ss]
(let [q1 (search/make-autocomplete-tokens-query "nterm" s)
q2 (search/make-ranked-search-tokens-query "nterm" s)]
(is (or (nil? q1) (instance? Query q1)) "Autocomplete tokenisation should work with arbitrary string input")
(is (instance? Query q2) "Ranked search tokenisation should work with arbitrary string input and always return a query")))))

(deftest ^:live empty-search
(with-open [svc (hermes/open "snomed.db")]
(testing "autocompletion"
(is (seq (hermes/search svc {:max-hits 1 :constraint "<14679004"}))
"For autocompletion, with no search string, result should be unfiltered sequence")
(is (seq (hermes/search svc {:s " " :max-hits 1 :constraint "<14679004"}))
"For autocompletion, when search term resolves to zero tokens, result should be unfiltered sequence"))
(testing "ranked search"
(is (empty? (hermes/ranked-search svc {:max-hits 1 :constraint "<14679004"}))
"For ranked search, with no search string, result should be empty AND it must not throw an 'null query' exception")
(is (empty? (hermes/ranked-search svc {:s " " :max-hits 1 :constraint "<14679004"}))
"For ranked search, when search term resolves to zero tokens, result should be empty AND it must not throw an 'null query' exception"))))

(comment
(def svc (hermes/open "snomed.db"))
(def searcher (.-searcher svc))
Expand Down

0 comments on commit 7332fab

Please sign in to comment.