Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vespa schema changes for query control & general quality of life #163

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion src/cpr_sdk/models/search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from datetime import datetime
from typing import List, Literal, Optional, Sequence
from typing import List, Literal, Optional, Sequence, Any

from pydantic import (
AliasChoices,
Expand Down Expand Up @@ -251,6 +251,12 @@ class SearchParameters(BaseModel):
A field and item mapping to search in the concepts field of the document passages.
"""

custom_vespa_request_body: Optional[dict[str, Any]] = None
"""
Extra fields to be added to the vespa request body. Overrides any existing fields,
so can also be used to override YQL or ranking profiles.
"""

@model_validator(mode="after")
def validate(self):
"""Validate against mutually exclusive fields"""
Expand Down Expand Up @@ -408,6 +414,21 @@ def from_vespa_response(cls, response_hit: dict) -> "Hit":
raise ValueError(f"Unknown response type: {response_type}")
return hit

def __eq__(self, other):
    """
    Compare two hits field by field.

    The `relevance` and `rank_features` attributes are left out of the
    comparison as these are dependent on non-deterministic query routing.

    :param other: the object to compare against
    :return: False if `other` is not an instance of this hit's class or any
        compared attribute differs, True otherwise
    """
    if not isinstance(other, self.__class__):
        return False

    skipped = ("relevance", "rank_features")
    # Snapshot the attribute names up front, then stop at the first mismatch.
    names = [name for name in self.__dict__ if name not in skipped]
    for name in names:
        if not (getattr(self, name) == getattr(other, name)):
            return False
    return True


class Document(Hit):
"""A document search result hit."""
Expand Down Expand Up @@ -517,6 +538,20 @@ class Family(BaseModel):
prev_continuation_token: Optional[str] = None
relevance: Optional[float] = None

def __eq__(self, other):
    """
    Check if two Families are equal.

    Ignores relevance as it's dependent on non-deterministic query routing.

    :param other: the object to compare against
    :return: False if `other` is not an instance of this family's class or
        any compared attribute differs, True otherwise
    """
    if not isinstance(other, self.__class__):
        return False

    # Must be a real tuple: `("relevance")` is just the string "relevance",
    # which turns `not in` into a substring test and would silently skip any
    # attribute whose name happens to be a substring of "relevance".
    ignored_fields = ("relevance",)
    fields_to_compare = [f for f in self.__dict__ if f not in ignored_fields]

    return all(getattr(self, f) == getattr(other, f) for f in fields_to_compare)


class SearchResponse(BaseModel):
"""Relevant results, and search response metadata"""
Expand All @@ -529,3 +564,21 @@ class SearchResponse(BaseModel):
continuation_token: Optional[str] = None
this_continuation_token: Optional[str] = None
prev_continuation_token: Optional[str] = None

def __eq__(self, other):
    """
    Check if two search responses are equal.

    The `query_time_ms` and `total_time_ms` attributes are excluded from the
    comparison as they are non-deterministic.

    :param other: the object to compare against
    :return: False if `other` is not an instance of this response's class or
        any compared attribute differs, True otherwise
    """
    if not isinstance(other, self.__class__):
        return False

    timing_fields = ("query_time_ms", "total_time_ms")
    # Snapshot the attribute names up front, then stop at the first mismatch.
    names = [name for name in self.__dict__ if name not in timing_fields]
    for name in names:
        if not (getattr(self, name) == getattr(other, name)):
            return False
    return True
2 changes: 1 addition & 1 deletion src/cpr_sdk/version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
_MAJOR = "1"
_MINOR = "11"
_MINOR = "12"
_PATCH = "0"
_SUFFIX = ""

Expand Down
11 changes: 11 additions & 0 deletions src/cpr_sdk/vespa.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,17 @@ def build_vespa_request_body(parameters: SearchParameters) -> dict[str, str]:
"input.query(query_embedding)"
] = "embed(msmarco-distilbert-dot-v5, @query_string)"

if parameters.custom_vespa_request_body is not None:
overlapping_keys = set(vespa_request_body.keys()) & set(
parameters.custom_vespa_request_body.keys()
)
if overlapping_keys:
_LOGGER.warning(
f"Custom request body contains overlapping keys that will override defaults: {overlapping_keys}"
)
Comment on lines +108 to +114
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This overlapping keys check feels like it could be lifted into a distinct utility method, that would also make it easy to unit test

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An important part of this to me is specifying that the request body contains overlapping keys, which I think we'd then lose if lifting out to a utility method? Wdyt?


vespa_request_body = vespa_request_body | parameters.custom_vespa_request_body
kdutia marked this conversation as resolved.
Show resolved Hide resolved

return vespa_request_body


Expand Down
82 changes: 26 additions & 56 deletions tests/local_vespa/test_app/schemas/document_passage.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
schema document_passage {

field language type string {
indexing: "en" | set_language
kdutia marked this conversation as resolved.
Show resolved Hide resolved
}

field text_block_not_stemmed type string {
indexing: input text_block | summary | index
stemming: none
Expand Down Expand Up @@ -139,91 +143,57 @@ schema document_passage {
summary concepts {}
}

document-summary search_summary_with_tokens {
summary family_name {}
summary family_description {}
summary family_import_id {}
summary family_slug {}
summary family_category {}
summary family_publication_ts {}
summary family_geography {}
summary family_geographies {}
summary family_source {}
summary document_import_id {}
summary document_slug {}
summary document_languages {}
summary document_content_type {}
summary document_cdn_object {}
summary document_source_url {}
summary corpus_import_id {}
summary corpus_type_name {}
summary metadata {}
summary text_block {}
summary text_block_id {}
summary text_block_type {}
summary text_block_page {}
summary text_block_coords {}
summary concepts {}
document-summary search_summary_with_tokens inherits search_summary {
kdutia marked this conversation as resolved.
Show resolved Hide resolved
summary text_block_tokens {
source: text_block
tokens
}
}

rank-profile exact inherits default {
kdutia marked this conversation as resolved.
Show resolved Hide resolved
function text_score() {
expression: attribute(passage_weight) * fieldMatch(text_block)
}
first-phase {
expression: text_score()
}
summary-features: text_score() fieldMatch(text_block)
}

rank-profile exact_not_stemmed inherits default {
function text_score() {
expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed)
expression: fieldMatch(text_block_not_stemmed)
}
first-phase {
expression: text_score()
expression: attribute(passage_weight) * text_score()
}
summary-features: text_score() fieldMatch(text_block)
}

rank-profile hybrid_no_closeness inherits default {
function text_score() {
expression: attribute(passage_weight) * bm25(text_block)
}
first-phase {
expression: text_score()
}
summary-features: text_score() bm25(text_block)
summary-features: attribute(passage_weight) text_score()
}

rank-profile hybrid inherits default {
inputs {
query(query_embedding) tensor<float>(x[768])
query(passage_bm25_weight) double: 1.0
query(passage_closeness_weight) double: 1.0
}
function text_score() {
expression: attribute(passage_weight) * (bm25(text_block) + closeness(text_embedding))
expression: query(passage_bm25_weight) * bm25(text_block) + query(passage_closeness_weight) * closeness(text_embedding)
}
first-phase {
expression: text_score()
expression: attribute(passage_weight) * text_score()
}
summary-features: text_score() bm25(text_block) closeness(text_embedding)
summary-features: text_score() bm25(text_block) closeness(text_embedding) attribute(passage_weight)
}

rank-profile hybrid_custom_weight inherits default {
rank-profile hybrid_nativerank inherits default {
inputs {
query(query_embedding) tensor<float>(x[768])
query(bm25_weight) double
query(passage_nativerank_weight) double: 1.0
query(passage_closeness_weight) double: 1.0
}
function text_score() {
expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding))
expression: query(passage_nativerank_weight) * nativeRank(text_block) + query(passage_closeness_weight) * closeness(text_embedding)
}
first-phase {
expression: text_score()
expression: attribute(passage_weight) * text_score()
}
summary-features: text_score() bm25(text_block) closeness(text_embedding)
summary-features: text_score() nativeRank(text_block) closeness(text_embedding) attribute(passage_weight)
}

rank-profile hybrid_no_closeness inherits hybrid {
inputs {
query(passage_closeness_weight) double: 0.0
}
}

}
101 changes: 25 additions & 76 deletions tests/local_vespa/test_app/schemas/family_document.sd
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
schema family_document {

field language type string {
indexing: "en" | set_language
}

field family_name_not_stemmed type string {
indexing: input family_name_index | index
stemming: none
Expand Down Expand Up @@ -168,95 +172,61 @@ schema family_document {
fields: family_name_index, family_description_index
}

rank-profile exact inherits default {
function name_score() {
expression: attribute(name_weight) * fieldMatch(family_name_index)
}
function description_score() {
expression: attribute(description_weight) * fieldMatch(family_description_index)
}
first-phase {
expression: name_score() + description_score()
}
summary-features: name_score() description_score()
}

rank-profile exact_not_stemmed inherits default {
function name_score() {
expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed)
expression: fieldMatch(family_name_not_stemmed)
}
function description_score() {
expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed)
expression: fieldMatch(family_description_not_stemmed)
}
first-phase {
expression: name_score() + description_score()
expression: attribute(name_weight) * name_score() + attribute(description_weight) * description_score()
}
summary-features: name_score() description_score()
}

rank-profile hybrid_no_closeness inherits default {
function name_score() {
expression: attribute(name_weight) * bm25(family_name_index)
}
function description_score() {
expression: attribute(description_weight) * bm25(family_description_index)
}
first-phase {
expression: name_score() + description_score()
}
summary-features: name_score() description_score()
summary-features: name_score() description_score() attribute(name_weight) attribute(description_weight)
}

rank-profile hybrid inherits default {
inputs {
query(query_embedding) tensor<float>(x[768])
query(description_bm25_weight) double: 1.0
query(description_closeness_weight) double: 1.0
}
function name_score() {
expression: attribute(name_weight) * bm25(family_name_index)
expression: bm25(family_name_index)
}
function description_score() {
    # Weighted sum of the bm25 and closeness features for the description,
    # using the query inputs declared in this profile's inputs block.
    expression: query(description_bm25_weight) * bm25(family_description_index) + query(description_closeness_weight) * closeness(family_description_embedding)
}
first-phase {
expression: name_score() + description_score()
expression: (attribute(name_weight) * name_score()) + (attribute(description_weight) * description_score())
}
summary-features: name_score() description_score()
summary-features: name_score() description_score() bm25(family_name_index) bm25(family_description_index) closeness(family_description_embedding)
}

rank-profile hybrid_no_description_embedding inherits default {
rank-profile hybrid_nativerank inherits default {
inputs {
query(query_embedding) tensor<float>(x[768])
query(description_nativerank_weight) double: 1.0
query(description_closeness_weight) double: 1.0
}
function name_score() {
expression: attribute(name_weight) * bm25(family_name_index)
expression: nativeRank(family_name_index)
}
function description_score() {
expression: attribute(description_weight) * bm25(family_description_index)
expression: query(description_nativerank_weight) * nativeRank(family_description_index) + query(description_closeness_weight) * closeness(family_description_embedding)
}
first-phase {
expression: name_score() + description_score()
expression: (attribute(name_weight) * name_score()) + (attribute(description_weight) * description_score())
}
summary-features: name_score() description_score()
summary-features: name_score() description_score() nativeRank(family_name_index) nativeRank(family_description_index) closeness(family_description_embedding)
}

rank-profile hybrid_custom_weight inherits default {
rank-profile hybrid_no_closeness inherits hybrid {
inputs {
query(query_embedding) tensor<float>(x[768])
query(bm25_weight) double
}
function name_score() {
expression: attribute(name_weight) * bm25(family_name_index)
}
function description_score() {
expression: attribute(description_weight) * bm25(family_description_index)
}
first-phase {
expression: name_score() + description_score()
query(description_closeness_weight) double: 0.0
kdutia marked this conversation as resolved.
Show resolved Hide resolved
}
summary-features: name_score() description_score()
}



document-summary search_summary {
summary family_name {}
summary family_description {}
Expand All @@ -281,28 +251,7 @@ schema family_document {
summary collection_summary {}
}

document-summary search_summary_with_tokens {
summary family_name {}
summary family_description {}
summary family_import_id {}
summary family_slug {}
summary family_category {}
summary family_publication_ts {}
summary family_geography {}
summary family_geographies {}
summary family_source {}
summary document_import_id {}
summary document_title {}
summary document_slug {}
summary document_languages {}
summary document_content_type {}
summary document_cdn_object {}
summary document_source_url {}
summary metadata {}
summary corpus_import_id {}
summary corpus_type_name {}
summary collection_title {}
summary collection_summary {}
document-summary search_summary_with_tokens inherits search_summary {
summary family_name_index {}
summary family_name_index_tokens {
source: family_name_index
Expand Down
Loading
Loading