-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vespa schema changes for query control & general quality of life #163
base: main
Are you sure you want to change the base?
Changes from all commits
9048c5b
9b8ea4a
59084c5
401105e
6a7c6b0
780bfa4
7e2411d
4f238b1
7ab8260
d6e21d8
3aec5fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
_MAJOR = "1" | ||
_MINOR = "11" | ||
_MINOR = "12" | ||
_PATCH = "0" | ||
_SUFFIX = "" | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,15 @@ schema document_passage { | |
stemming: none | ||
} | ||
|
||
field language type string { | ||
indexing: "en" | set_language | ||
} | ||
|
||
field text_block_bolding type string { | ||
indexing: input text_block | summary | index | ||
bolding: true | ||
} | ||
|
||
document document_passage { | ||
|
||
field search_weights_ref type reference<search_weights> { | ||
|
@@ -139,91 +148,57 @@ schema document_passage { | |
summary concepts {} | ||
} | ||
|
||
document-summary search_summary_with_tokens { | ||
summary family_name {} | ||
summary family_description {} | ||
summary family_import_id {} | ||
summary family_slug {} | ||
summary family_category {} | ||
summary family_publication_ts {} | ||
summary family_geography {} | ||
summary family_geographies {} | ||
summary family_source {} | ||
summary document_import_id {} | ||
summary document_slug {} | ||
summary document_languages {} | ||
summary document_content_type {} | ||
summary document_cdn_object {} | ||
summary document_source_url {} | ||
summary corpus_import_id {} | ||
summary corpus_type_name {} | ||
summary metadata {} | ||
summary text_block {} | ||
summary text_block_id {} | ||
summary text_block_type {} | ||
summary text_block_page {} | ||
summary text_block_coords {} | ||
summary concepts {} | ||
document-summary search_summary_with_tokens inherits search_summary { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ohh this is nice! |
||
summary text_block_tokens { | ||
source: text_block | ||
tokens | ||
} | ||
} | ||
|
||
rank-profile exact inherits default { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice cleaning this up now we use exact_not_stemmed instead 🎉 |
||
function text_score() { | ||
expression: attribute(passage_weight) * fieldMatch(text_block) | ||
} | ||
first-phase { | ||
expression: text_score() | ||
} | ||
summary-features: text_score() fieldMatch(text_block) | ||
} | ||
|
||
rank-profile exact_not_stemmed inherits default { | ||
function text_score() { | ||
expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed) | ||
expression: fieldMatch(text_block_not_stemmed) | ||
} | ||
first-phase { | ||
expression: text_score() | ||
expression: attribute(passage_weight) * text_score() | ||
} | ||
summary-features: text_score() fieldMatch(text_block) | ||
} | ||
|
||
rank-profile hybrid_no_closeness inherits default { | ||
function text_score() { | ||
expression: attribute(passage_weight) * bm25(text_block) | ||
} | ||
first-phase { | ||
expression: text_score() | ||
} | ||
summary-features: text_score() bm25(text_block) | ||
summary-features: attribute(passage_weight) text_score() | ||
} | ||
|
||
rank-profile hybrid inherits default { | ||
inputs { | ||
query(query_embedding) tensor<float>(x[768]) | ||
query(passage_bm25_weight) double: 1.0 | ||
query(passage_closeness_weight) double: 1.0 | ||
} | ||
function text_score() { | ||
expression: attribute(passage_weight) * (bm25(text_block) + closeness(text_embedding)) | ||
expression: query(passage_bm25_weight) * bm25(text_block) + query(passage_closeness_weight) * closeness(text_embedding) | ||
} | ||
first-phase { | ||
expression: text_score() | ||
expression: attribute(passage_weight) * text_score() | ||
} | ||
summary-features: text_score() bm25(text_block) closeness(text_embedding) | ||
summary-features: text_score() bm25(text_block) closeness(text_embedding) attribute(passage_weight) | ||
} | ||
|
||
rank-profile hybrid_custom_weight inherits default { | ||
rank-profile hybrid_nativerank inherits default { | ||
inputs { | ||
query(query_embedding) tensor<float>(x[768]) | ||
query(bm25_weight) double | ||
query(passage_nativerank_weight) double: 1.0 | ||
query(passage_closeness_weight) double: 1.0 | ||
} | ||
function text_score() { | ||
expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding)) | ||
expression: query(passage_nativerank_weight) * nativeRank(text_block) + query(passage_closeness_weight) * closeness(text_embedding) | ||
} | ||
first-phase { | ||
expression: text_score() | ||
expression: attribute(passage_weight) * text_score() | ||
} | ||
summary-features: text_score() bm25(text_block) closeness(text_embedding) | ||
summary-features: text_score() nativeRank(text_block) closeness(text_embedding) attribute(passage_weight) | ||
} | ||
|
||
rank-profile hybrid_no_closeness inherits hybrid { | ||
inputs { | ||
query(passage_closeness_weight) double: 0.0 | ||
} | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,20 @@ schema family_document { | |
stemming: none | ||
} | ||
|
||
field family_name_bolding type string { | ||
indexing: input family_name_index | summary | index | ||
bolding: true | ||
} | ||
|
||
field family_description_bolding type string { | ||
indexing: input family_description_index | summary | index | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any reason to hang onto the index attribute for these when they come from an index field? |
||
bolding: true | ||
} | ||
|
||
field language type string { | ||
indexing: "en" | set_language | ||
} | ||
|
||
Comment on lines
+23
to
+26
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As on the document passage, wondering if this needs to be at the top of the doc |
||
document family_document { | ||
|
||
field search_weights_ref type reference<search_weights> { | ||
|
@@ -168,95 +182,61 @@ schema family_document { | |
fields: family_name_index, family_description_index | ||
} | ||
|
||
rank-profile exact inherits default { | ||
function name_score() { | ||
expression: attribute(name_weight) * fieldMatch(family_name_index) | ||
} | ||
function description_score() { | ||
expression: attribute(description_weight) * fieldMatch(family_description_index) | ||
} | ||
first-phase { | ||
expression: name_score() + description_score() | ||
} | ||
summary-features: name_score() description_score() | ||
} | ||
|
||
rank-profile exact_not_stemmed inherits default { | ||
function name_score() { | ||
expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed) | ||
expression: fieldMatch(family_name_not_stemmed) | ||
} | ||
function description_score() { | ||
expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed) | ||
expression: fieldMatch(family_description_not_stemmed) | ||
} | ||
first-phase { | ||
expression: name_score() + description_score() | ||
expression: attribute(name_weight) * name_score() + attribute(description_weight) * description_score() | ||
} | ||
summary-features: name_score() description_score() | ||
} | ||
|
||
rank-profile hybrid_no_closeness inherits default { | ||
function name_score() { | ||
expression: attribute(name_weight) * bm25(family_name_index) | ||
} | ||
function description_score() { | ||
expression: attribute(description_weight) * bm25(family_description_index) | ||
} | ||
first-phase { | ||
expression: name_score() + description_score() | ||
} | ||
summary-features: name_score() description_score() | ||
summary-features: name_score() description_score() attribute(name_weight) attribute(description_weight) | ||
} | ||
|
||
rank-profile hybrid inherits default { | ||
inputs { | ||
query(query_embedding) tensor<float>(x[768]) | ||
query(description_bm25_weight) double: 1.0 | ||
query(description_closeness_weight) double: 1.0 | ||
} | ||
function name_score() { | ||
expression: attribute(name_weight) * bm25(family_name_index) | ||
expression: bm25(family_name_index) | ||
} | ||
function description_score() { | ||
expression: attribute(description_weight) * (bm25(family_description_index) + closeness(family_description_embedding)) | ||
expression: query(description_bm25_weight) * bm25(family_description_index) + query(description_closeness_weight) * closeness(family_description_embedding) | ||
} | ||
first-phase { | ||
expression: name_score() + description_score() | ||
expression: (attribute(name_weight) * name_score()) + (attribute(description_weight) * description_score()) | ||
} | ||
summary-features: name_score() description_score() | ||
summary-features: name_score() description_score() bm25(family_name_index) bm25(family_description_index) closeness(family_description_embedding) | ||
} | ||
|
||
rank-profile hybrid_no_description_embedding inherits default { | ||
rank-profile hybrid_nativerank inherits default { | ||
inputs { | ||
query(query_embedding) tensor<float>(x[768]) | ||
query(description_nativerank_weight) double: 1.0 | ||
query(description_closeness_weight) double: 1.0 | ||
} | ||
function name_score() { | ||
expression: attribute(name_weight) * bm25(family_name_index) | ||
expression: nativeRank(family_name_index) | ||
} | ||
function description_score() { | ||
expression: attribute(description_weight) * bm25(family_description_index) | ||
expression: query(description_nativerank_weight) * nativeRank(family_description_index) + query(description_closeness_weight) * closeness(family_description_embedding) | ||
} | ||
first-phase { | ||
expression: name_score() + description_score() | ||
expression: (attribute(name_weight) * name_score()) + (attribute(description_weight) * description_score()) | ||
} | ||
summary-features: name_score() description_score() | ||
summary-features: name_score() description_score() nativeRank(family_name_index) nativeRank(family_description_index) closeness(family_description_embedding) | ||
} | ||
|
||
rank-profile hybrid_custom_weight inherits default { | ||
rank-profile hybrid_no_closeness inherits hybrid { | ||
inputs { | ||
query(query_embedding) tensor<float>(x[768]) | ||
query(bm25_weight) double | ||
} | ||
function name_score() { | ||
expression: attribute(name_weight) * bm25(family_name_index) | ||
} | ||
function description_score() { | ||
expression: attribute(description_weight) * bm25(family_description_index) | ||
query(description_closeness_weight) double: 0.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Getting my head around this, does setting this to 0.0 make it have no effect? |
||
} | ||
first-phase { | ||
expression: name_score() + description_score() | ||
} | ||
summary-features: name_score() description_score() | ||
} | ||
|
||
|
||
|
||
document-summary search_summary { | ||
summary family_name {} | ||
summary family_description {} | ||
|
@@ -281,28 +261,7 @@ schema family_document { | |
summary collection_summary {} | ||
} | ||
|
||
document-summary search_summary_with_tokens { | ||
summary family_name {} | ||
summary family_description {} | ||
summary family_import_id {} | ||
summary family_slug {} | ||
summary family_category {} | ||
summary family_publication_ts {} | ||
summary family_geography {} | ||
summary family_geographies {} | ||
summary family_source {} | ||
summary document_import_id {} | ||
summary document_title {} | ||
summary document_slug {} | ||
summary document_languages {} | ||
summary document_content_type {} | ||
summary document_cdn_object {} | ||
summary document_source_url {} | ||
summary metadata {} | ||
summary corpus_import_id {} | ||
summary corpus_type_name {} | ||
summary collection_title {} | ||
summary collection_summary {} | ||
document-summary search_summary_with_tokens inherits search_summary { | ||
summary family_name_index {} | ||
summary family_name_index_tokens { | ||
source: family_name_index | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at the docs for this:
https://docs.vespa.ai/en/reference/indexing-language-reference.html#set_language
This feels weird to me to have a document config at a field level, but it is what it is! I'm wondering if we need to move the language field above
text_block_not_stemmed
?