Adding one to vespa passage match pages. (#182)

* Adding one to vespa passage match pages.
* Adding explicit test for the response objects.
* Removing redundant test.

---------

Co-authored-by: Mark <[email protected]>
climatepolicyradar · Nov 22, 2023 · b6425c7 · b6425c7
1 parent 80cd88e
commit b6425c7
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 1 deletion.
diff --git a/app/api/api_v1/schemas/search.py b/app/api/api_v1/schemas/search.py
@@ -81,6 +81,15 @@ class SearchResponseDocumentPassage(BaseModel):
     text_block_id: str
     text_block_page: Optional[int]
     text_block_coords: Optional[Sequence[Coord]]
+
+    @validator("text_block_page", always=True)
+    @classmethod
+    def validate_page(cls, value):
+        """PDF page numbers must be incremented from our 0-indexed values."""
+        if value is None:
+            return None
+        return value + 1
+
 
 
 class OpenSearchResponseMatchBase(BaseModel):

diff --git a/tests/unit/app/schemas/test_schemas.py b/tests/unit/app/schemas/test_schemas.py
@@ -1,7 +1,12 @@
 import pytest
 
 from app.api.api_v1.schemas.document import FamilyDocumentResponse
-from app.api.api_v1.schemas.search import SearchResponseFamilyDocument
+from app.api.api_v1.schemas.search import (
+    OpenSearchResponsePassageMatch,
+    SearchResponseDocumentPassage,
+    SearchResponseFamilyDocument,
+    OpenSearchResponseMatchBase,
+)
 
 CLIMATE_LAWS_DOMAIN_PATHS = [
     "climate-laws.org",
@@ -98,3 +103,54 @@ def test_non_climate_laws_source_url_left_in_document(source_domain_path, scheme
         document_role=None,
     )
     assert document_response.source_url == given_url
+
+
+def test_search_responses() -> None:
+    """
+    Check that both passage response models add 1 to text_block_page.
+
+    A 0-indexed page of 0 must come back as 1 from each model.
+    """
+    original_block_page = 0
+
+    original_block_data = {
+        "text": "example text",
+        "text_block_id": "p_0_b_0",
+        "text_block_page": original_block_page,
+        "text_block_coords": None,
+    }
+
+    base_response_data = {
+        "document_name": "Sample Document",
+        "document_geography": "USA",
+        "document_description": "This is a sample document description.",
+        "document_sectors": ["Technology", "Healthcare"],
+        "document_source": "Sample Source",
+        "document_id": "sample_import_id_123",
+        "document_date": "2023-11-22",
+        "document_type": "PDF",
+        "document_source_url": "https://example.com/sample_document",
+        "document_cdn_object": "sample_cdn_object_reference",
+        "document_category": "Sample Category",
+        "document_content_type": "application/pdf",
+        "document_slug": "sample-document",
+    }
+
+    # SearchResponseDocumentPassage is the model used for vespa responses
+    default_passage_response = SearchResponseDocumentPassage.parse_obj(
+        original_block_data
+    )
+
+    assert default_passage_response.text_block_page == original_block_page + 1
+
+    response_base = OpenSearchResponseMatchBase.parse_obj(base_response_data)
+
+    opensearch_passage_response = OpenSearchResponsePassageMatch(
+        **response_base.dict(), **original_block_data
+    )
+
+    assert opensearch_passage_response.text_block_page == original_block_page + 1
+
+    assert opensearch_passage_response.text_block_page == (
+        default_passage_response.text_block_page
+    )