diff --git a/app/api/api_v1/schemas/document.py b/app/api/api_v1/schemas/document.py index 9407f3c3..f532e3b6 100644 --- a/app/api/api_v1/schemas/document.py +++ b/app/api/api_v1/schemas/document.py @@ -5,6 +5,9 @@ from . import CLIMATE_LAWS_MATCH +Json = dict[str, Any] + + class Event(BaseModel): # noqa: D101 name: str description: str @@ -115,7 +118,6 @@ class DocumentParserInput(BaseModel): publication_ts: datetime name: str description: str - postfix: Optional[str] source_url: Optional[str] download_url: Optional[str] @@ -124,18 +126,12 @@ class DocumentParserInput(BaseModel): type: str source: str import_id: str + family_import_id: str category: str - - frameworks: Sequence[str] geography: str - hazards: Sequence[str] - instruments: Sequence[str] - keywords: Sequence[str] languages: Sequence[str] - sectors: Sequence[str] - topics: Sequence[str] - events: Sequence[Event] + metadata: Json def to_json(self) -> Mapping[str, Any]: """Provide a serialisable version of the model""" @@ -144,7 +140,6 @@ def to_json(self) -> Mapping[str, Any]: json_dict["publication_ts"] = ( self.publication_ts.isoformat() if self.publication_ts is not None else None ) - json_dict["events"] = [event.to_json() for event in self.events] return json_dict class Config: # noqa: D106 diff --git a/app/core/ingestion/pipeline.py b/app/core/ingestion/pipeline.py index 50245eca..de6163a9 100644 --- a/app/core/ingestion/pipeline.py +++ b/app/core/ingestion/pipeline.py @@ -1,5 +1,5 @@ from datetime import datetime, timezone -from typing import Sequence, Tuple, cast +from typing import Any, Sequence, Tuple, cast from sqlalchemy.orm import Session @@ -11,22 +11,27 @@ FamilyOrganisation, Geography, ) +from app.db.models.law_policy.metadata import FamilyMetadata def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]: """Generates a complete view of the current document database as pipeline input""" query = ( - db.query(Family, FamilyDocument, Geography, Organisation) + db.query(Family, FamilyDocument, FamilyMetadata, Geography, Organisation) .join(Family, Family.import_id == FamilyDocument.family_import_id) .join( FamilyOrganisation, FamilyOrganisation.family_import_id == Family.import_id ) + .join(FamilyMetadata, Family.import_id == FamilyMetadata.family_import_id) .join(Organisation, Organisation.id == FamilyOrganisation.organisation_id) .join(Geography, Geography.id == Family.geography_id) ) query_result = cast( - Sequence[Tuple[Family, FamilyDocument, Geography, Organisation]], query.all() + Sequence[ + Tuple[Family, FamilyDocument, FamilyMetadata, Geography, Organisation] + ], + query.all(), ) fallback_date = datetime(1900, 1, 1, tzinfo=timezone.utc) documents: Sequence[DocumentParserInput] = [ @@ -36,6 +41,7 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput] category=str(family.family_category), publication_ts=family.published_date or fallback_date, import_id=cast(str, family_document.import_id), + family_import_id=cast(str, family.import_id), source_url=( cast(str, family_document.physical_document.source_url) if family_document.physical_document is not None @@ -54,17 +60,15 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput] else [] ) ], - # TODO: the following are not used & should be removed - events=[], - frameworks=[], - hazards=[], - instruments=[], - keywords=[], - postfix=None, - sectors=[], - topics=[], + metadata=cast(dict[str, Any], family_metadata.value), ) - for family, family_document, geography, organisation in query_result + for ( + family, + family_document, + family_metadata, + geography, + organisation, + ) in query_result ] return documents diff --git a/makefile-docker.defs b/makefile-docker.defs index 9e58ddfa..51f94112 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -77,7 +77,7 @@ setup_test_search_index: docker-compose -f docker-compose.yml -f docker-compose.dev.yml run --rm opensearch-test-loader multielasticdump --direction=load --input=/cpr-backend/tests/data/ --output=${OPENSEARCH_URL} --ignoreType=template test_search: setup_test_search_index - docker-compose -f docker-compose.yml -f docker-compose.dev.yml run --name search_test -v ./data:/data backend pytest -vvv -m 'search' + docker-compose -f docker-compose.yml -f docker-compose.dev.yml run --name search_test -v "${PWD}/data:/data" backend pytest -vvv -m 'search' docker cp search_test:/data/benchmark_browse.txt . docker cp search_test:/data/benchmark_search.txt . docker rm search_test diff --git a/tests/core/validation/test_util.py b/tests/core/validation/test_util.py index df2452b4..03994549 100644 --- a/tests/core/validation/test_util.py +++ b/tests/core/validation/test_util.py @@ -81,24 +81,18 @@ def test_write_documents_to_s3(test_s3_client, mocker): d = DocumentParserInput( publication_ts=datetime.datetime(year=2008, month=12, day=25), name="name", - postfix="pf-A", description="description", source_url=None, download_url=None, type="executive", source="CCLW", import_id="1234-5678", + family_import_id="family_1234-5678", slug="geo_2008_name_1234_5678", category="category", - frameworks=[], geography="GEO", - hazards=[], - instruments=[], - keywords=[], languages=[], - sectors=[], - topics=[], - events=[], + metadata={}, ) upload_file_mock = mocker.patch.object(test_s3_client, "upload_fileobj") diff --git a/tests/routes/test_admin_unfccc.py b/tests/routes/test_admin_unfccc.py index 71336d58..23e87611 100644 --- a/tests/routes/test_admin_unfccc.py +++ b/tests/routes/test_admin_unfccc.py @@ -21,23 +21,24 @@ "publication_ts": "2021-10-25T00:00:00+00:00", "name": "Nationally determined contributions under the Paris Agreement. Revised note by the secretariat", "description": "Nationally determined contributions under the Paris Agreement. Revised note by the secretariat, Synthesis Report from UNFCCC Secretariat in 2021", - "postfix": null, "source_url": "https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf", "download_url": "url of downloaded document", "slug": "Doc-slug", "type": "Synthesis Report", "source": "UNFCCC", "import_id": "UNFCCC.Document.1.0", + "family_import_id": "UNFCCC.family.1.0", "category": "UNFCCC", - "frameworks": [], "geography": "GBR", - "hazards": [], - "instruments": [], - "keywords": [], "languages": [], - "sectors": [], - "topics": [], - "events": [] + "metadata": { + "author": [ + "UNFCCC Secretariat" + ], + "author_type": [ + "Party" + ] + } } } }"""