Skip to content

Commit

Permalink
Fix: exact match should not stem (#145)
Browse files Browse the repository at this point in the history
* failing test for exact search

* add not_stemmed fields to vespa schema

* fix exact search

* add prod vespa schema changes; change rank-profile for exact search

* bump version to 1.9.5
  • Loading branch information
kdutia authored Nov 28, 2024
1 parent b1c8fbd commit f71b720
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 9 deletions.
2 changes: 1 addition & 1 deletion src/cpr_sdk/version.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
_MAJOR = "1"
_MINOR = "9"
_PATCH = "4"
_PATCH = "5"
_SUFFIX = ""

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
Expand Down
2 changes: 1 addition & 1 deletion src/cpr_sdk/vespa.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def build_vespa_request_body(parameters: SearchParameters) -> dict[str, str]:
if parameters.all_results:
pass
elif parameters.exact_match:
vespa_request_body["ranking.profile"] = "exact"
vespa_request_body["ranking.profile"] = "exact_not_stemmed"
elif sensitive:
vespa_request_body["ranking.profile"] = "hybrid_no_closeness"
else:
Expand Down
6 changes: 3 additions & 3 deletions src/cpr_sdk/yql_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ def build_search_term(self) -> str:
if self.params.exact_match:
return """
(
(family_name contains({stem: false}@query_string)) or
(family_description contains({stem: false}@query_string)) or
(text_block contains ({stem: false}@query_string))
(family_name_not_stemmed contains({stem: false}@query_string)) or
(family_description_not_stemmed contains({stem: false}@query_string)) or
(text_block_not_stemmed contains ({stem: false}@query_string))
)
"""
elif self.sensitive:
Expand Down
66 changes: 63 additions & 3 deletions tests/local_vespa/test_app/schemas/document_passage.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
schema document_passage {

# Field declared outside the document block: its content is taken from
# text_block at indexing time, and it is both indexed and added to summaries.
# Stemming is disabled so that queries against this field (the exact-match
# path) are matched on the literal word forms.
field text_block_not_stemmed type string {
indexing: input text_block | summary | index
stemming: none
}

document document_passage {

field search_weights_ref type reference<search_weights> {
Expand Down Expand Up @@ -134,14 +139,55 @@ schema document_passage {
summary concepts {}
}

# Summary class that returns the listed family/document/passage fields plus
# text_block_tokens, an extra summary field sourced from text_block with the
# `tokens` option.
# NOTE(review): presumably used to inspect how text_block was tokenized when
# debugging stemming behaviour -- confirm against the callers that request
# this summary.
document-summary search_summary_with_tokens {
summary family_name {}
summary family_description {}
summary family_import_id {}
summary family_slug {}
summary family_category {}
summary family_publication_ts {}
summary family_geography {}
summary family_geographies {}
summary family_source {}
summary document_import_id {}
summary document_slug {}
summary document_languages {}
summary document_content_type {}
summary document_cdn_object {}
summary document_source_url {}
summary corpus_import_id {}
summary corpus_type_name {}
summary metadata {}
summary text_block {}
summary text_block_id {}
summary text_block_type {}
summary text_block_page {}
summary text_block_coords {}
summary concepts {}
# Derived summary field: reads text_block and applies the `tokens` option.
summary text_block_tokens {
source: text_block
tokens
}
}

rank-profile exact inherits default {
function text_score() {
expression: attribute(passage_weight) * fieldMatch(text_block)
}
first-phase {
expression: text_score()
}
match-features: text_score()
match-features: text_score() fieldMatch(text_block)
}

# Rank profile for exact-match search over passages. The first-phase score is
# the passage_weight attribute multiplied by fieldMatch on the non-stemmed
# copy of the passage text.
rank-profile exact_not_stemmed inherits default {
function text_score() {
expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed)
}
first-phase {
expression: text_score()
}
# NOTE(review): the reported feature below is fieldMatch(text_block), i.e. the
# original field, while text_score() uses text_block_not_stemmed. Confirm
# whether this is intentional or should report the non-stemmed field.
match-features: text_score() fieldMatch(text_block)
}

rank-profile hybrid_no_closeness inherits default {
Expand All @@ -151,7 +197,7 @@ schema document_passage {
first-phase {
expression: text_score()
}
match-features: text_score()
match-features: text_score() bm25(text_block)
}

rank-profile hybrid inherits default {
Expand All @@ -164,6 +210,20 @@ schema document_passage {
first-phase {
expression: text_score()
}
match-features: text_score()
match-features: text_score() bm25(text_block) closeness(text_embedding)
}

# Hybrid rank profile whose BM25 contribution is scaled by a query-supplied
# weight: score = passage_weight * (bm25_weight * bm25(text_block)
#                                   + closeness(text_embedding)).
rank-profile hybrid_custom_weight inherits default {
# Query inputs: a 768-dimensional float embedding and the BM25 multiplier.
inputs {
query(query_embedding) tensor<float>(x[768])
query(bm25_weight) double
}
function text_score() {
expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding))
}
first-phase {
expression: text_score()
}
# Expose the combined score and both of its components with each hit.
match-features: text_score() bm25(text_block) closeness(text_embedding)
}
}
92 changes: 92 additions & 0 deletions tests/local_vespa/test_app/schemas/family_document.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
schema family_document {

# Field declared outside the document block: indexed from family_name_index
# with stemming disabled, so exact-match queries on the family name are
# matched on literal word forms. Index only (not part of any summary here).
field family_name_not_stemmed type string {
indexing: input family_name_index | index
stemming: none
}

# Field declared outside the document block: indexed from
# family_description_index with stemming disabled, so exact-match queries on
# the family description are matched on literal word forms. Index only.
field family_description_not_stemmed type string {
indexing: input family_description_index | index
stemming: none
}

document family_document {

field search_weights_ref type reference<search_weights> {
Expand Down Expand Up @@ -170,6 +180,19 @@ schema family_document {
}
match-features: name_score() description_score()
}

# Rank profile for exact-match search over family documents. Scores the
# non-stemmed name and description fields with fieldMatch, each scaled by its
# weight attribute, and sums the two in the first phase.
rank-profile exact_not_stemmed inherits default {
function name_score() {
expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed)
}
function description_score() {
expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed)
}
first-phase {
expression: name_score() + description_score()
}
# Expose both component scores with each hit.
match-features: name_score() description_score()
}

rank-profile hybrid_no_closeness inherits default {
function name_score() {
Expand Down Expand Up @@ -199,6 +222,40 @@ schema family_document {
}
match-features: name_score() description_score()
}

# Rank profile that scores family documents with BM25 only: weighted BM25 of
# the name index plus weighted BM25 of the description index.
rank-profile hybrid_no_description_embedding inherits default {
# NOTE(review): query_embedding is declared but not referenced by any
# expression in this profile -- presumably accepted so the same query inputs
# can be sent regardless of profile; confirm.
inputs {
query(query_embedding) tensor<float>(x[768])
}
function name_score() {
expression: attribute(name_weight) * bm25(family_name_index)
}
function description_score() {
expression: attribute(description_weight) * bm25(family_description_index)
}
first-phase {
expression: name_score() + description_score()
}
match-features: name_score() description_score()
}

# Family-document counterpart of the hybrid_custom_weight profile: the score
# is weighted BM25 of the name index plus weighted BM25 of the description
# index.
rank-profile hybrid_custom_weight inherits default {
# NOTE(review): neither query_embedding nor bm25_weight is referenced by the
# expressions below, so the custom weight has no effect on family-document
# scores in this schema -- confirm this is intended.
inputs {
query(query_embedding) tensor<float>(x[768])
query(bm25_weight) double
}
function name_score() {
expression: attribute(name_weight) * bm25(family_name_index)
}
function description_score() {
expression: attribute(description_weight) * bm25(family_description_index)
}
first-phase {
expression: name_score() + description_score()
}
match-features: name_score() description_score()
}


document-summary search_summary {
summary family_name {}
Expand All @@ -223,4 +280,39 @@ schema family_document {
summary collection_title {}
summary collection_summary {}
}

# Summary class that returns the listed family/document fields plus the two
# index fields and a `tokens` view of each of them.
# NOTE(review): presumably used to inspect how the name/description index
# fields were tokenized when debugging stemming -- confirm against callers.
document-summary search_summary_with_tokens {
summary family_name {}
summary family_description {}
summary family_import_id {}
summary family_slug {}
summary family_category {}
summary family_publication_ts {}
summary family_geography {}
summary family_geographies {}
summary family_source {}
summary document_import_id {}
summary document_title {}
summary document_slug {}
summary document_languages {}
summary document_content_type {}
summary document_cdn_object {}
summary document_source_url {}
summary metadata {}
summary corpus_import_id {}
summary corpus_type_name {}
summary collection_title {}
summary collection_summary {}
summary family_name_index {}
# Derived summary field: reads family_name_index and applies `tokens`.
summary family_name_index_tokens {
source: family_name_index
tokens
}
summary family_description_index {}
# Derived summary field: reads family_description_index and applies `tokens`.
summary family_description_index_tokens {
source: family_description_index
tokens
}
# NOTE(review): confirm `from-disk` is still required/supported by the Vespa
# version this test app targets.
from-disk
}
}
14 changes: 14 additions & 0 deletions tests/test_search_adaptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,20 @@ def test_vespa_search_adaptor__works(test_vespa):
assert total_passage_count == response.total_hits


@pytest.mark.vespa
def test_vespa_search_adaptor__exact_search(test_vespa):
    """Exact-match search returns hits, and every passage hit contains the term."""
    search_term = "biodiversity"
    params = SearchParameters(query_string=search_term, exact_match=True)
    response = vespa_search(test_vespa, params)

    assert response.total_hits > 0

    # Flatten the family -> hits nesting; only passage hits carry a text block.
    passage_hits = (
        hit
        for family in response.families
        for hit in family.hits
        if isinstance(hit, Passage)
    )
    for passage in passage_hits:
        assert search_term in passage.text_block.lower()


@pytest.mark.parametrize(
"params",
(
Expand Down
6 changes: 5 additions & 1 deletion tests/test_search_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@
)
def test_build_vespa_request_body(query_type, params):
body = build_vespa_request_body(parameters=params)
assert body["ranking.profile"] == query_type
# Exact-match requests are routed to the non-stemmed rank profile, so the
# expected profile differs from the parametrized query type in that one case.
# The expected value is computed first because a conditional expression binds
# more loosely than `==`: `assert (a == b if cond else "literal")` evaluates
# the bare, always-truthy string in the else branch and can never fail.
expected_profile = "exact_not_stemmed" if query_type == "exact" else query_type
assert body["ranking.profile"] == expected_profile
for key, value in body.items():
assert (
len(value) > 0
Expand Down

0 comments on commit f71b720

Please sign in to comment.