climatepolicyradar · kdutia · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
@@ -1,6 +1,6 @@
 import re
 from datetime import datetime
-from typing import List, Literal, Optional, Sequence
+from typing import List, Literal, Optional, Sequence, Any
 
 from pydantic import (
     AliasChoices,
@@ -251,6 +251,12 @@ class SearchParameters(BaseModel):
     A field and item mapping to search in the concepts field of the document passages.
     """
 
+    custom_vespa_request_body: Optional[dict[str, Any]] = None
+    """
+    Extra fields to be added to the vespa request body. Overrides any existing fields,
+    so can also be used to override YQL or ranking profiles.
+    """
+
     @model_validator(mode="after")
     def validate(self):
         """Validate against mutually exclusive fields"""
@@ -408,6 +414,21 @@ def from_vespa_response(cls, response_hit: dict) -> "Hit":
             raise ValueError(f"Unknown response type: {response_type}")
         return hit
 
+    def __eq__(self, other):
+        """
+        Check if two hits are equal.
+
+        Ignores relevance and rank_features as these are dependent on non-deterministic query routing.
+        """
+        if not isinstance(other, self.__class__):
+            return False
+
+        fields_to_compare = [
+            f for f in self.__dict__.keys() if f not in ("relevance", "rank_features")
+        ]
+
+        return all(getattr(self, f) == getattr(other, f) for f in fields_to_compare)
+
 
 class Document(Hit):
     """A document search result hit."""
@@ -517,6 +538,20 @@ class Family(BaseModel):
     prev_continuation_token: Optional[str] = None
     relevance: Optional[float] = None
 
+    def __eq__(self, other):
+        """
+        Check if two Families are equal.
+
+        Ignores relevance as it's dependent on non-deterministic query routing.
+        """
+
+        if not isinstance(other, self.__class__):
+            return False
+
+        fields_to_compare = [f for f in self.__dict__.keys() if f not in ("relevance")]
+
+        return all(getattr(self, f) == getattr(other, f) for f in fields_to_compare)
+
 
 class SearchResponse(BaseModel):
     """Relevant results, and search response metadata"""
@@ -529,3 +564,21 @@ class SearchResponse(BaseModel):
     continuation_token: Optional[str] = None
     this_continuation_token: Optional[str] = None
     prev_continuation_token: Optional[str] = None
+
+    def __eq__(self, other):
+        """
+        Check if two hits are equal.
+
+        Ignores query time fields as they are non-deterministic.
+        """
+
+        if not isinstance(other, self.__class__):
+            return False
+
+        fields_to_compare = [
+            f
+            for f in self.__dict__.keys()
+            if f not in ("query_time_ms", "total_time_ms")
+        ]
+
+        return all(getattr(self, f) == getattr(other, f) for f in fields_to_compare)
@@ -1,5 +1,5 @@
 _MAJOR = "1"
-_MINOR = "11"
+_MINOR = "12"
 _PATCH = "0"
 _SUFFIX = ""
 

@@ -104,6 +104,17 @@ def build_vespa_request_body(parameters: SearchParameters) -> dict[str, str]:
             "input.query(query_embedding)"
         ] = "embed(msmarco-distilbert-dot-v5, @query_string)"
 
+    if parameters.custom_vespa_request_body is not None:
+        overlapping_keys = set(vespa_request_body.keys()) & set(
+            parameters.custom_vespa_request_body.keys()
+        )
+        if overlapping_keys:
+            _LOGGER.warning(
+                f"Custom request body contains overlapping keys that will override defaults: {overlapping_keys}"
+            )
+
+        vespa_request_body = vespa_request_body | parameters.custom_vespa_request_body
+
     return vespa_request_body
 
 

@@ -1,5 +1,9 @@
 schema document_passage {
 
+    field language type string {
+        indexing: "en" | set_language
+    }
+
     field text_block_not_stemmed type string {
         indexing: input text_block | summary | index
         stemming: none
@@ -139,91 +143,57 @@ schema document_passage {
         summary concepts {}
     }
 
-    document-summary search_summary_with_tokens {
-        summary family_name {}
-        summary family_description {}
-        summary family_import_id {}
-        summary family_slug {}
-        summary family_category {}
-        summary family_publication_ts {}
-        summary family_geography {}
-        summary family_geographies {}
-        summary family_source {}
-        summary document_import_id {}
-        summary document_slug {}
-        summary document_languages {}
-        summary document_content_type {}
-        summary document_cdn_object {}
-        summary document_source_url {}
-        summary corpus_import_id {}
-        summary corpus_type_name {}
-        summary metadata {}
-        summary text_block {}
-        summary text_block_id {}
-        summary text_block_type {}
-        summary text_block_page {}
-        summary text_block_coords {}
-        summary concepts {}
+    document-summary search_summary_with_tokens inherits search_summary {
         summary text_block_tokens {
             source: text_block
             tokens
         }
     }
-
-    rank-profile exact inherits default {
-        function text_score() {
-            expression: attribute(passage_weight) * fieldMatch(text_block)
-        }
-        first-phase {
-            expression: text_score()
-        }
-        summary-features: text_score() fieldMatch(text_block)
-    }
 
     rank-profile exact_not_stemmed inherits default {
         function text_score() {
-            expression: attribute(passage_weight) * fieldMatch(text_block_not_stemmed)
+            expression: fieldMatch(text_block_not_stemmed)
         }
         first-phase {
-            expression: text_score()
+            expression: attribute(passage_weight) * text_score()
         }
-        summary-features: text_score() fieldMatch(text_block)
-    }
-
-    rank-profile hybrid_no_closeness inherits default {
-        function text_score() {
-            expression: attribute(passage_weight) * bm25(text_block)
-        }
-        first-phase {
-            expression: text_score()
-        }
-        summary-features: text_score() bm25(text_block)
+        summary-features: attribute(passage_weight) text_score()
     }
 
     rank-profile hybrid inherits default {
         inputs {
             query(query_embedding) tensor<float>(x[768])
+            query(passage_bm25_weight) double: 1.0
+            query(passage_closeness_weight) double: 1.0
         }
         function text_score() {
-            expression: attribute(passage_weight) * (bm25(text_block) + closeness(text_embedding))
+            expression: query(passage_bm25_weight) * bm25(text_block) + query(passage_closeness_weight) * closeness(text_embedding)
         }
         first-phase {
-            expression: text_score()
+            expression: attribute(passage_weight) * text_score()
         }
-        summary-features: text_score() bm25(text_block) closeness(text_embedding)
+        summary-features: text_score() bm25(text_block) closeness(text_embedding) attribute(passage_weight)
     }
 
-    rank-profile hybrid_custom_weight inherits default {
+    rank-profile hybrid_nativerank inherits default {
         inputs {
             query(query_embedding) tensor<float>(x[768])
-            query(bm25_weight) double
+            query(passage_nativerank_weight) double: 1.0
+            query(passage_closeness_weight) double: 1.0
         }
         function text_score() {
-            expression: attribute(passage_weight) * (query(bm25_weight) * bm25(text_block) + closeness(text_embedding))
+            expression: query(passage_nativerank_weight) * nativeRank(text_block) + query(passage_closeness_weight) * closeness(text_embedding)
         }
         first-phase {
-            expression: text_score()
+            expression: attribute(passage_weight) * text_score()
         }
-        summary-features: text_score() bm25(text_block) closeness(text_embedding)
+        summary-features: text_score() nativeRank(text_block) closeness(text_embedding) attribute(passage_weight)
     }
+
+    rank-profile hybrid_no_closeness inherits hybrid {
+        inputs {
+            query(passage_closeness_weight) double: 0.0
+        }
+    }
+
 }
@@ -1,5 +1,9 @@
 schema family_document {
 
+    field language type string {
+        indexing: "en" | set_language
+    }
+
     field family_name_not_stemmed type string {
         indexing: input family_name_index | index
         stemming: none
@@ -168,95 +172,61 @@ schema family_document {
         fields: family_name_index, family_description_index
     }
 
-    rank-profile exact inherits default {
-        function name_score() {
-            expression: attribute(name_weight) * fieldMatch(family_name_index)
-        }
-        function description_score() {
-            expression: attribute(description_weight) * fieldMatch(family_description_index)
-        }
-        first-phase {
-            expression: name_score() + description_score()
-        }
-        summary-features: name_score() description_score()
-    }
-
     rank-profile exact_not_stemmed inherits default {
         function name_score() {
-            expression: attribute(name_weight) * fieldMatch(family_name_not_stemmed)
+            expression: fieldMatch(family_name_not_stemmed)
         }
         function description_score() {
-            expression: attribute(description_weight) * fieldMatch(family_description_not_stemmed)
+            expression: fieldMatch(family_description_not_stemmed)
         }
         first-phase {
-            expression: name_score() + description_score()
+            expression: attribute(name_weight) * name_score() + attribute(description_weight) * description_score()
         }
-        summary-features: name_score() description_score()
-    }
-
-    rank-profile hybrid_no_closeness inherits default {
-        function name_score() {
-            expression: attribute(name_weight) * bm25(family_name_index)
-        }
-        function description_score() {
-            expression: attribute(description_weight) * bm25(family_description_index)
-        }
-        first-phase {
-            expression: name_score() + description_score()
-        }
-        summary-features: name_score() description_score()
+        summary-features: name_score() description_score() attribute(name_weight) attribute(description_weight)
     }
 
     rank-profile hybrid inherits default {
         inputs {
             query(query_embedding) tensor<float>(x[768])
+            query(description_bm25_weight) double: 1.0
+            query(description_closeness_weight) double: 1.0
         }
         function name_score() {
-            expression: attribute(name_weight) * bm25(family_name_index)
+            expression: bm25(family_name_index)
         }
         function description_score() {
-            expression: attribute(description_weight) * (bm25(family_description_index) + closeness(family_description_embedding))
+            expression: query(description_bm25_weight) * bm25(family_description_index) + query(description_closeness_weight) * closeness(family_description_embedding)
         }
         first-phase {
-            expression: name_score() + description_score()
+            expression: (attribute(name_weight) * name_score()) + (attribute(description_weight) * description_score())
         }
-        summary-features: name_score() description_score()
+        summary-features: name_score() description_score() bm25(family_name_index) bm25(family_description_index) closeness(family_description_embedding)
     }
 
-    rank-profile hybrid_no_description_embedding inherits default {
+    rank-profile hybrid_nativerank inherits default {
         inputs {
             query(query_embedding) tensor<float>(x[768])
+            query(description_nativerank_weight) double: 1.0
+            query(description_closeness_weight) double: 1.0
         }
         function name_score() {
-            expression: attribute(name_weight) * bm25(family_name_index)
+            expression: nativeRank(family_name_index)
         }
         function description_score() {
-            expression: attribute(description_weight) * bm25(family_description_index)
+            expression: query(description_nativerank_weight) * nativeRank(family_description_index) + query(description_closeness_weight) * closeness(family_description_embedding)
         }
         first-phase {
-            expression: name_score() + description_score()
+            expression: (attribute(name_weight) * name_score()) + (attribute(description_weight) * description_score())
         }
-        summary-features: name_score() description_score()
+        summary-features: name_score() description_score() nativeRank(family_name_index) nativeRank(family_description_index) closeness(family_description_embedding)
     }
 
-    rank-profile hybrid_custom_weight inherits default {
+    rank-profile hybrid_no_closeness inherits hybrid {
         inputs {
-            query(query_embedding) tensor<float>(x[768])
-            query(bm25_weight) double
-        }
-        function name_score() {
-            expression: attribute(name_weight) * bm25(family_name_index)
-        }
-        function description_score() {
-            expression: attribute(description_weight) * bm25(family_description_index)
-        }
-        first-phase {
-            expression: name_score() + description_score()
+            query(description_closeness_weight) double: 0.0
         }
-        summary-features: name_score() description_score()
     }
-
-
+
     document-summary search_summary {
         summary family_name {}
         summary family_description {}
@@ -281,28 +251,7 @@ schema family_document {
         summary collection_summary {}
     }
 
-    document-summary search_summary_with_tokens {
-        summary family_name {}
-        summary family_description {}
-        summary family_import_id {}
-        summary family_slug {}
-        summary family_category {}
-        summary family_publication_ts {}
-        summary family_geography {}
-        summary family_geographies {}
-        summary family_source {}
-        summary document_import_id {}
-        summary document_title {}
-        summary document_slug {}
-        summary document_languages {}
-        summary document_content_type {}
-        summary document_cdn_object {}
-        summary document_source_url {}
-        summary metadata {}
-        summary corpus_import_id {}
-        summary corpus_type_name {}
-        summary collection_title {}
-        summary collection_summary {}
+    document-summary search_summary_with_tokens inherits search_summary {
         summary family_name_index {}
         summary family_name_index_tokens {
             source: family_name_index