From 1d42fd904c63c07ba04558f44ca318d80d1c5fbd Mon Sep 17 00:00:00 2001 From: annaCPR Date: Tue, 10 Sep 2024 15:35:13 +0100 Subject: [PATCH] Feature/pdct 1350 ingest events (#211) * Validate import_id and family_import_id before saving documents * Make family_document_import_id optional on ingest events * Ingest events * Validate import id when saving events * Validate family_import_id when saving events * Do not generate an import_id when saving events if one is already provided * Validate event type against corpus taxonomy * Move collection validation logic to a separate service * Move family validation logic to a separate service * Move document validation logic to a separate service * Move event validation logic to a separate service * Switch to using validation service when ingesting collections * Switch to using validation service when ingesting families * Use validation service when ingesting documents * Use validation service when ingesting events * Tidy up tests * More tidy up of test data * Tiny refactor * Return better error on Exception * Bump minor version * Fix mocks in unit tests + tidy up * Add missing docstrings to validation service * Add missing doctypes and tighten return types * Update docstrings * Make test filenames more meaningful * Reference correct test files after rename * Update docstrings, again... 
* Update tests/integration_tests/ingest/test_ingest.py Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> * Update tests/integration_tests/ingest/test_ingest.py Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> * Update tests/integration_tests/ingest/test_ingest.py Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> * Update tests/unit_tests/routers/ingest/test_ingest.py Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> * Update tests/unit_tests/routers/ingest/test_ingest.py Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> * Update tests/integration_tests/ingest/test_ingest.py Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> * Add missing imports and params in docstrings for validation --------- Co-authored-by: Katy Baulch <46493669+katybaulch@users.noreply.github.com> --- app/api/api_v1/routers/ingest.py | 2 +- app/model/event.py | 1 + app/model/ingest.py | 18 +- app/repository/event.py | 9 +- app/service/ingest.py | 135 ++++++---- app/service/validation.py | 131 +++++++++ pyproject.toml | 2 +- .../ingest/{test.json => test_bulk_data.json} | 20 +- ...est_bulk_data_with_invalid_event_type.json | 42 +++ tests/integration_tests/ingest/test_ingest.py | 72 ++++- .../ingest/test_ingest_template.py | 11 +- tests/mocks/services/validation_service.py | 50 ++++ tests/unit_tests/conftest.py | 9 + .../ingest/{test.json => test_bulk_data.json} | 4 +- .../unit_tests/routers/ingest/test_ingest.py | 19 +- .../routers/ingest/test_ingest_template.py | 11 +- .../service/ingest/test_ingest_service.py | 248 ++++-------------- .../validation/test_collection_validation.py | 23 ++ .../validation/test_document_validation.py | 63 +++++ .../validation/test_event_validation.py | 54 ++++ .../validation/test_family_validation.py | 95 +++++++ 21 files changed, 747 insertions(+), 272 deletions(-) create mode 100644 app/service/validation.py rename 
tests/integration_tests/ingest/{test.json => test_bulk_data.json} (73%) create mode 100644 tests/integration_tests/ingest/test_bulk_data_with_invalid_event_type.json create mode 100644 tests/mocks/services/validation_service.py rename tests/unit_tests/routers/ingest/{test.json => test_bulk_data.json} (95%) create mode 100644 tests/unit_tests/service/validation/test_collection_validation.py create mode 100644 tests/unit_tests/service/validation/test_document_validation.py create mode 100644 tests/unit_tests/service/validation/test_event_validation.py create mode 100644 tests/unit_tests/service/validation/test_family_validation.py diff --git a/app/api/api_v1/routers/ingest.py b/app/api/api_v1/routers/ingest.py index ab8b537e..7ff8761d 100644 --- a/app/api/api_v1/routers/ingest.py +++ b/app/api/api_v1/routers/ingest.py @@ -157,5 +157,5 @@ async def ingest(new_data: UploadFile, corpus_import_id: str) -> Json: except Exception as e: _LOGGER.error(e) raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Unexpected error" + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(e) ) diff --git a/app/model/event.py b/app/model/event.py index a8456f84..a16966ce 100644 --- a/app/model/event.py +++ b/app/model/event.py @@ -35,6 +35,7 @@ class EventCreateDTO(BaseModel): generated. 
""" + import_id: Optional[str] = None # From FamilyEvent event_title: str date: datetime diff --git a/app/model/ingest.py b/app/model/ingest.py index 1200c937..073c17d5 100644 --- a/app/model/ingest.py +++ b/app/model/ingest.py @@ -5,6 +5,7 @@ from app.model.collection import CollectionCreateDTO from app.model.document import DocumentCreateDTO +from app.model.event import EventCreateDTO from app.model.family import FamilyCreateDTO from app.model.general import Json @@ -64,11 +65,25 @@ class IngestEventDTO(BaseModel): import_id: str family_import_id: str - family_document_import_id: str + family_document_import_id: Optional[str] = None event_title: str date: datetime event_type_value: str + def to_event_create_dto(self) -> EventCreateDTO: + """ + Convert IngestEventDTO to EventCreateDTO. + + :return EventCreateDTO: Converted EventCreateDTO instance. + """ + return EventCreateDTO( + import_id=self.import_id, + family_import_id=self.family_import_id, + event_title=self.event_title, + date=self.date, + event_type_value=self.event_type_value, + ) + class IngestDocumentDTO(BaseModel): """Representation of a document for ingest.""" @@ -77,7 +92,6 @@ class IngestDocumentDTO(BaseModel): family_import_id: str variant_name: Optional[str] = None metadata: Json - events: list[str] title: str source_url: Optional[AnyHttpUrl] = None user_language_name: Optional[str] diff --git a/app/repository/event.py b/app/repository/event.py index d73cfbbc..5cb60b11 100644 --- a/app/repository/event.py +++ b/app/repository/event.py @@ -69,6 +69,7 @@ def _event_to_dto(family_event_meta: FamilyEventTuple) -> EventReadDTO: def _dto_to_event_dict(dto: EventCreateDTO) -> dict: return { + "import_id": dto.import_id if dto.import_id else None, "family_import_id": dto.family_import_id, "family_document_import_id": dto.family_document_import_id, "date": dto.date, @@ -179,9 +180,11 @@ def create(db: Session, event: EventCreateDTO) -> str: ) org_name = cast(str, org.name) - new_family_event.import_id = 
cast( - Column, generate_import_id(db, CountedEntity.Event, org_name) - ) + + if not new_family_event.import_id: + new_family_event.import_id = cast( + Column, generate_import_id(db, CountedEntity.Event, org_name) + ) db.add(new_family_event) except Exception: diff --git a/app/service/ingest.py b/app/service/ingest.py index 3d4ce109..ef559243 100644 --- a/app/service/ingest.py +++ b/app/service/ingest.py @@ -5,9 +5,8 @@ import of data and other services for validation etc. """ -from typing import Optional +from typing import Any, Optional -from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys from fastapi import HTTPException, status from pydantic import ConfigDict, validate_call from sqlalchemy.orm import Session @@ -15,118 +14,153 @@ import app.clients.db.session as db_session import app.repository.collection as collection_repository import app.repository.document as document_repository +import app.repository.event as event_repository import app.repository.family as family_repository -import app.service.category as category -import app.service.collection as collection import app.service.corpus as corpus import app.service.geography as geography -import app.service.metadata as metadata +import app.service.validation as validation from app.errors import ValidationError -from app.model.ingest import IngestCollectionDTO, IngestDocumentDTO, IngestFamilyDTO -from app.service.collection import validate_import_id +from app.model.ingest import ( + IngestCollectionDTO, + IngestDocumentDTO, + IngestEventDTO, + IngestFamilyDTO, +) @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def save_collections( - collection_data: list[dict], corpus_import_id: str, db: Optional[Session] = None + collection_data: list[dict[str, Any]], + corpus_import_id: str, + db: Optional[Session] = None, ) -> list[str]: """ Creates new collections with the values passed. - :param Session db: The database session to use for saving collections. 
- :param list[dict] collection_data: The data to use for creating collections. + :param list[dict[str, Any]] collection_data: The data to use for creating collections. :param str corpus_import_id: The import_id of the corpus the collections belong to. - :return str: The new import_ids for the saved collections. + :param Optional[Session] db: The database session to use for saving collections or None. + :return list[str]: The new import_ids for the saved collections. """ if db is None: db = db_session.get_db() + validation.validate_collections(collection_data) + collection_import_ids = [] org_id = corpus.get_corpus_org_id(corpus_import_id) + for coll in collection_data: dto = IngestCollectionDTO(**coll).to_collection_create_dto() - if dto.import_id: - validate_import_id(dto.import_id) import_id = collection_repository.create(db, dto, org_id) - collection_import_ids.append(import_id) + return collection_import_ids @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def save_families( - family_data: list[dict], corpus_import_id: str, db: Optional[Session] = None + family_data: list[dict[str, Any]], + corpus_import_id: str, + db: Optional[Session] = None, ) -> list[str]: """ Creates new families with the values passed. - :param Session db: The database session to use for saving families. - :param list[dict] families_data: The data to use for creating families. + :param list[dict[str, Any]] family_data: The data to use for creating families. :param str corpus_import_id: The import_id of the corpus the families belong to. - :return str: The new import_ids for the saved families. + :param Optional[Session] db: The database session to use for saving families or None. + :return list[str]: The new import_ids for the saved families. 
""" if db is None: db = db_session.get_db() + validation.validate_families(family_data, corpus_import_id) + family_import_ids = [] org_id = corpus.get_corpus_org_id(corpus_import_id) + for fam in family_data: + # TODO: Uncomment when implementing feature/pdct-1402-validate-collection-exists-before-creating-family + # collection.validate(collections, db) dto = IngestFamilyDTO( **fam, corpus_import_id=corpus_import_id ).to_family_create_dto(corpus_import_id) - - if dto.import_id: - validate_import_id(dto.import_id) - corpus.validate(db, corpus_import_id) - geo_id = geography.get_id(db, dto.geography) - category.validate(dto.category) - collections = set(dto.collections) - collection.validate_multiple_ids(collections) - # TODO: Uncomment when implementing feature/pdct-1402-validate-collection-exists-before-creating-family - # collection.validate(collections, db) - metadata.validate_metadata(db, corpus_import_id, dto.metadata) - - import_id = family_repository.create(db, dto, geo_id, org_id) + import_id = family_repository.create( + db, dto, geography.get_id(db, dto.geography), org_id + ) family_import_ids.append(import_id) + return family_import_ids @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def save_documents( - document_data: list[dict], + document_data: list[dict[str, Any]], corpus_import_id: str, db: Optional[Session] = None, ) -> list[str]: """ Creates new documents with the values passed. - :param Session db: The database session to use for saving documents. - :param list[dict] document_data: The data to use for creating documents. + :param list[dict[str, Any]] document_data: The data to use for creating documents. :param str corpus_import_id: The import_id of the corpus the documents belong to. - :return str: The new import_ids for the saved documents. + :param Optional[Session] db: The database session to use for saving documents or None. + :return list[str]: The new import_ids for the saved documents. 
""" if db is None: db = db_session.get_db() + validation.validate_documents(document_data, corpus_import_id) + document_import_ids = [] + for doc in document_data: dto = IngestDocumentDTO(**doc).to_document_create_dto() - - if dto.variant_name == "": - raise ValidationError("Variant name is empty") - metadata.validate_metadata( - db, - corpus_import_id, - dto.metadata, - EntitySpecificTaxonomyKeys.DOCUMENT.value, - ) import_id = document_repository.create(db, dto) document_import_ids.append(import_id) + return document_import_ids -def validate_entity_relationships(data: dict) -> None: +@validate_call(config=ConfigDict(arbitrary_types_allowed=True)) +def save_events( + event_data: list[dict[str, Any]], + corpus_import_id: str, + db: Optional[Session] = None, +) -> list[str]: + """ + Creates new events with the values passed. + + :param list[dict[str, Any]] event_data: The data to use for creating events. + :param str corpus_import_id: The import_id of the corpus the events belong to. + :param Optional[Session] db: The database session to use for saving events or None. + :return list[str]: The new import_ids for the saved events. + """ + if db is None: + db = db_session.get_db() + + validation.validate_events(event_data, corpus_import_id) + + event_import_ids = [] + + for event in event_data: + dto = IngestEventDTO(**event).to_event_create_dto() + import_id = event_repository.create(db, dto) + event_import_ids.append(import_id) + + return event_import_ids + + +def validate_entity_relationships(data: dict[str, Any]) -> None: + """ + Validates relationships between entities contained in data based on import_ids. + For documents, it validates that the family the document is linked to exists. + + :param dict[str, Any] data: The data object containing entities to be validated. + :raises ValidationError: raised should there be any unmatched relationships. 
+ """ families = [] if "families" in data: for fam in data["families"]: @@ -144,21 +178,22 @@ def validate_entity_relationships(data: dict) -> None: @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) -def import_data(data: dict, corpus_import_id: str) -> dict: +def import_data(data: dict[str, Any], corpus_import_id: str) -> dict[str, str]: """ Imports data for a given corpus_import_id. - :param dict data: The data to be imported. + :param dict[str, Any] data: The data to be imported. :param str corpus_import_id: The import_id of the corpus the data should be imported into. :raises RepositoryError: raised on a database error. - :raises ValidationError: raised should the import_id be invalid. - :return dict: Import ids of the saved entities. + :raises ValidationError: raised should the data be invalid. + :return dict[str, str]: Import ids of the saved entities. """ db = db_session.get_db() collection_data = data["collections"] if "collections" in data else None family_data = data["families"] if "families" in data else None document_data = data["documents"] if "documents" in data else None + event_data = data["events"] if "events" in data else None if not data: raise HTTPException(status_code=status.HTTP_204_NO_CONTENT) @@ -176,6 +211,8 @@ def import_data(data: dict, corpus_import_id: str) -> dict: response["families"] = save_families(family_data, corpus_import_id, db) if document_data: response["documents"] = save_documents(document_data, corpus_import_id, db) + if event_data: + response["events"] = save_events(event_data, corpus_import_id, db) return response except Exception as e: diff --git a/app/service/validation.py b/app/service/validation.py new file mode 100644 index 00000000..77e7738b --- /dev/null +++ b/app/service/validation.py @@ -0,0 +1,131 @@ +from typing import Any, Optional + +from db_client.functions.corpus_helpers import TaxonomyData, get_taxonomy_from_corpus +from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys + 
+import app.clients.db.session as db_session +import app.service.category as category +import app.service.collection as collection +import app.service.corpus as corpus +import app.service.metadata as metadata +from app.errors import ValidationError +from app.service.collection import validate_import_id + + +def validate_collection(collection: dict[str, Any]) -> None: + """ + Validates a collection. + + :param dict[str, Any] collection: The collection object to be validated. + :raises ValidationError: raised should the data be invalid. + """ + validate_import_id(collection["import_id"]) + + +def validate_collections(collections: list[dict[str, Any]]) -> None: + """ + Validates a list of collections. + + :param list[dict[str, Any]] collections: The list of collection objects to be validated. + """ + for coll in collections: + validate_collection(coll) + + +def validate_family(family: dict[str, Any], corpus_import_id: str) -> None: + """ + Validates a family. + + :param dict[str, Any] family: The family object to be validated. + :param str corpus_import_id: The corpus_import_id to be used for validating the family object. + :raises ValidationError: raised should the data be invalid. + """ + db = db_session.get_db() + + validate_import_id(family["import_id"]) + corpus.validate(db, corpus_import_id) + category.validate(family["category"]) + collections = set(family["collections"]) + collection.validate_multiple_ids(collections) + metadata.validate_metadata(db, corpus_import_id, family["metadata"]) + + +def validate_families(families: list[dict[str, Any]], corpus_import_id: str) -> None: + """ + Validates a list of families. + + :param list[dict[str, Any]] families: The list of family objects to be validated. + :param str corpus_import_id: The corpus_import_id to be used for validating the family objects. 
+ """ + for fam in families: + validate_family(fam, corpus_import_id) + + +def validate_document(document: dict[str, Any], corpus_import_id: str) -> None: + """ + Validates a document. + + :param dict[str, Any] document: The document object to be validated. + :param str corpus_import_id: The corpus_import_id to be used for validating the document object. + :raises ValidationError: raised should the data be invalid. + """ + db = db_session.get_db() + + validate_import_id(document["import_id"]) + validate_import_id(document["family_import_id"]) + if document["variant_name"] == "": + raise ValidationError("Variant name is empty") + metadata.validate_metadata( + db, + corpus_import_id, + document["metadata"], + EntitySpecificTaxonomyKeys.DOCUMENT.value, + ) + + +def validate_documents(documents: list[dict[str, Any]], corpus_import_id: str) -> None: + """ + Validates a list of documents. + + :param list[dict[str, Any]] documents: The list of document objects to be validated. + :param str corpus_import_id: The corpus_import_id to be used for validating the document objects. + """ + for doc in documents: + validate_document(doc, corpus_import_id) + + +def validate_event(event: dict[str, Any], taxonomy: Optional[TaxonomyData]) -> None: + """ + Validates an event. + + :param dict[str, Any] event: The event object to be validated. + :param Optional[TaxonomyData] taxonomy: The taxonomy to be used to validate the type of the event object. + :raises ValidationError: raised should the data be invalid. 
+ """ + validate_import_id(event["import_id"]) + validate_import_id(event["family_import_id"]) + allowed_event_types = taxonomy["event_type"]["allowed_values"] if taxonomy else None + if not allowed_event_types: + raise ValidationError( + f"No allowed event types found for event {event['import_id']}" + ) + if ( + isinstance(allowed_event_types, list) + and event["event_type_value"] not in allowed_event_types + ): + raise ValidationError(f"Event type ['{event['event_type_value']}'] is invalid!") + + +def validate_events(events: list[dict[str, Any]], corpus_import_id: str) -> None: + """ + Validates a list of events. + + :param list[dict[str, Any]] events: The list of event objects to be validated. + :param str corpus_import_id: The corpus_import_id to be used for validating the event objects. + """ + db = db_session.get_db() + + event_taxonomy = get_taxonomy_from_corpus(db, corpus_import_id) + + for ev in events: + validate_event(ev, event_taxonomy) diff --git a/pyproject.toml b/pyproject.toml index 3c9ad6b4..fe1f82ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "admin_backend" -version = "2.14.1" +version = "2.15.0" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }] diff --git a/tests/integration_tests/ingest/test.json b/tests/integration_tests/ingest/test_bulk_data.json similarity index 73% rename from tests/integration_tests/ingest/test.json rename to tests/integration_tests/ingest/test_bulk_data.json index 532c888e..cc591cad 100644 --- a/tests/integration_tests/ingest/test.json +++ b/tests/integration_tests/ingest/test_bulk_data.json @@ -42,7 +42,7 @@ "import_id": "test.new.document.0", "family_import_id": "test.new.family.0", "metadata": { "role": ["MAIN"], "type": ["Law"] }, - "events": [], + "variant_name": "Original Language", "title": "", "user_language_name": "" }, @@ -50,9 +50,25 @@ "import_id": "test.new.document.1", "family_import_id": "test.new.family.1", 
"metadata": { "role": ["MAIN"], "type": ["Law"] }, - "events": [], + "variant_name": "Original Language", "title": "", "user_language_name": "" } + ], + "events": [ + { + "import_id": "test.new.event.0", + "family_import_id": "test.new.family.0", + "event_title": "Test", + "date": "2024-01-01", + "event_type_value": "Amended" + }, + { + "import_id": "test.new.event.1", + "family_import_id": "test.new.family.1", + "event_title": "Test", + "date": "2024-01-01", + "event_type_value": "Amended" + } ] } diff --git a/tests/integration_tests/ingest/test_bulk_data_with_invalid_event_type.json b/tests/integration_tests/ingest/test_bulk_data_with_invalid_event_type.json new file mode 100644 index 00000000..b0c7f0f4 --- /dev/null +++ b/tests/integration_tests/ingest/test_bulk_data_with_invalid_event_type.json @@ -0,0 +1,42 @@ +{ + "collections": [ + { + "import_id": "test.new.collection.0", + "title": "Test title", + "description": "Test description" + } + ], + "families": [ + { + "import_id": "test.new.family.0", + "title": "Test", + "summary": "Test", + "geography": "South Asia", + "category": "UNFCCC", + "metadata": { + "author_type": ["Non-Party"], + "author": ["Test"] + }, + "collections": ["test.new.collection.0"] + } + ], + "documents": [ + { + "import_id": "test.new.document.0", + "family_import_id": "test.new.family.0", + "metadata": { "role": ["MAIN"], "type": ["Law"] }, + "variant_name": "Original Language", + "title": "", + "user_language_name": "" + } + ], + "events": [ + { + "import_id": "test.new.event.0", + "family_import_id": "test.new.family.0", + "event_title": "Test", + "date": "2024-01-01", + "event_type_value": "Invalid" + } + ] +} diff --git a/tests/integration_tests/ingest/test_ingest.py b/tests/integration_tests/ingest/test_ingest.py index b05f1ac1..c243ed86 100644 --- a/tests/integration_tests/ingest/test_ingest.py +++ b/tests/integration_tests/ingest/test_ingest.py @@ -1,3 +1,6 @@ +import os + +from db_client.models.dfce import FamilyEvent from 
db_client.models.dfce.collection import Collection from db_client.models.dfce.family import Family, FamilyDocument from fastapi import status @@ -11,19 +14,28 @@ def test_ingest_when_ok(data_db: Session, client: TestClient, user_header_token) response = client.post( "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", - files={"new_data": open("tests/integration_tests/ingest/test.json", "rb")}, + files={ + "new_data": open( + os.path.join( + "tests", "integration_tests", "ingest", "test_bulk_data.json" + ), + "rb", + ) + }, headers=user_header_token, ) expected_collection_import_ids = ["test.new.collection.0", "test.new.collection.1"] expected_family_import_ids = ["test.new.family.0", "test.new.family.1"] expected_document_import_ids = ["test.new.document.0", "test.new.document.1"] + expected_event_import_ids = ["test.new.event.0", "test.new.event.1"] assert response.status_code == status.HTTP_201_CREATED assert response.json() == { "collections": expected_collection_import_ids, "families": expected_family_import_ids, "documents": expected_document_import_ids, + "events": expected_event_import_ids, } saved_collections = ( @@ -46,17 +58,28 @@ def test_ingest_when_ok(data_db: Session, client: TestClient, user_header_token) for fam in saved_families: assert fam.import_id in expected_family_import_ids - saved_documents = ( + saved_events = ( data_db.query(FamilyDocument) .filter(FamilyDocument.import_id.in_(expected_document_import_ids)) .all() ) - assert len(saved_documents) == 2 - for doc in saved_documents: + assert len(saved_events) == 2 + for doc in saved_events: assert doc.import_id in expected_document_import_ids assert doc.family_import_id in expected_family_import_ids + saved_events = ( + data_db.query(FamilyEvent) + .filter(FamilyEvent.import_id.in_(expected_event_import_ids)) + .all() + ) + + assert len(saved_events) == 2 + for doc in saved_events: + assert doc.import_id in expected_event_import_ids + assert doc.family_import_id in expected_family_import_ids + 
def test_ingest_rollback( client: TestClient, data_db: Session, rollback_collection_repo, user_header_token @@ -65,7 +88,14 @@ def test_ingest_rollback( response = client.post( "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", - files={"new_data": open("tests/integration_tests/ingest/test.json", "rb")}, + files={ + "new_data": open( + os.path.join( + "tests", "integration_tests", "ingest", "test_bulk_data.json" + ), + "rb", + ) + }, headers=user_header_token, ) @@ -84,7 +114,14 @@ def test_ingest_when_corpus_import_id_invalid( invalid_corpus = "test" response = client.post( f"/api/v1/ingest/{invalid_corpus}", - files={"new_data": open("tests/integration_tests/ingest/test.json", "rb")}, + files={ + "new_data": open( + os.path.join( + "tests", "integration_tests", "ingest", "test_bulk_data.json" + ), + "rb", + ) + }, headers=user_header_token, ) @@ -93,3 +130,26 @@ def test_ingest_when_corpus_import_id_invalid( response.json().get("detail") == f"No organisation associated with corpus {invalid_corpus}" ) + + +def test_ingest_events_when_event_type_invalid( + data_db: Session, client: TestClient, user_header_token +): + response = client.post( + "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", + files={ + "new_data": open( + os.path.join( + "tests", + "integration_tests", + "ingest", + "test_bulk_data_with_invalid_event_type.json", + ), + "rb", + ) + }, + headers=user_header_token, + ) + + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert response.json().get("detail") == "Event type ['Invalid'] is invalid!" 
diff --git a/tests/integration_tests/ingest/test_ingest_template.py b/tests/integration_tests/ingest/test_ingest_template.py index 61e9ed1c..51c1ddd7 100644 --- a/tests/integration_tests/ingest/test_ingest_template.py +++ b/tests/integration_tests/ingest/test_ingest_template.py @@ -51,7 +51,11 @@ def test_get_template_unfcc( "family_import_id": {"title": "Family Import Id", "type": "string"}, "family_document_import_id": { "title": "Family Document Import Id", - "type": "string", + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, }, "event_title": {"title": "Event Title", "type": "string"}, "date": { @@ -88,11 +92,6 @@ def test_get_template_unfcc( { "import_id": {"title": "Import Id", "type": "string"}, "family_import_id": {"title": "Family Import Id", "type": "string"}, - "events": { - "items": {"type": "string"}, - "title": "Events", - "type": "array", - }, "variant_name": { "anyOf": [{"type": "string"}, {"type": "null"}], "default": None, diff --git a/tests/mocks/services/validation_service.py b/tests/mocks/services/validation_service.py new file mode 100644 index 00000000..08556f8a --- /dev/null +++ b/tests/mocks/services/validation_service.py @@ -0,0 +1,50 @@ +from pytest import MonkeyPatch + +from app.errors import ValidationError + + +def mock_validation_service(validation_service, monkeypatch: MonkeyPatch, mocker): + validation_service.org_mismatch = False + validation_service.throw_validation_error = False + + def maybe_throw(): + if validation_service.throw_validation_error: + raise ValidationError("Error") + + def mock_validate_collection(_) -> None: + maybe_throw() + + def mock_validate_family(_, __) -> None: + maybe_throw() + + def mock_validate_families(_, __) -> None: + maybe_throw() + + def mock_validate_document(_, __) -> None: + maybe_throw() + + def mock_validate_event(_, __) -> None: + maybe_throw() + + def mock_validate_events(_, __) -> None: + maybe_throw() + + monkeypatch.setattr( + validation_service, 
"validate_collection", mock_validate_collection + ) + mocker.spy(validation_service, "validate_collection") + + monkeypatch.setattr(validation_service, "validate_family", mock_validate_family) + mocker.spy(validation_service, "validate_family") + + monkeypatch.setattr(validation_service, "validate_families", mock_validate_families) + mocker.spy(validation_service, "validate_families") + + monkeypatch.setattr(validation_service, "validate_document", mock_validate_document) + mocker.spy(validation_service, "validate_document") + + monkeypatch.setattr(validation_service, "validate_event", mock_validate_event) + mocker.spy(validation_service, "validate_event") + + monkeypatch.setattr(validation_service, "validate_events", mock_validate_events) + mocker.spy(validation_service, "validate_events") diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index b40345ab..34bd5578 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -22,6 +22,7 @@ import app.service.family as family_service import app.service.taxonomy as taxonomy_service import app.service.token as token_service +import app.service.validation as validation_service from app.clients.aws.client import get_s3_client from app.main import app from app.model.user import UserContext @@ -55,6 +56,7 @@ from tests.mocks.services.document_service import mock_document_service from tests.mocks.services.event_service import mock_event_service from tests.mocks.services.family_service import mock_family_service +from tests.mocks.services.validation_service import mock_validation_service ORG_ID = 1 @@ -213,6 +215,13 @@ def corpus_service_mock(monkeypatch, mocker): yield corpus_service +@pytest.fixture +def validation_service_mock(monkeypatch, mocker): + """Mocks the service for a single test.""" + mock_validation_service(validation_service, monkeypatch, mocker) + yield validation_service + + # ----- User tokens diff --git a/tests/unit_tests/routers/ingest/test.json 
b/tests/unit_tests/routers/ingest/test_bulk_data.json similarity index 95% rename from tests/unit_tests/routers/ingest/test.json rename to tests/unit_tests/routers/ingest/test_bulk_data.json index d5dcee7e..083c6881 100644 --- a/tests/unit_tests/routers/ingest/test.json +++ b/tests/unit_tests/routers/ingest/test_bulk_data.json @@ -42,7 +42,7 @@ "import_id": "test.new.document.0", "family_import_id": "test.new.family.0", "metadata": { "color": ["pink"] }, - "events": [], + "variant_name": "Test", "title": "", "user_language_name": "" }, @@ -50,7 +50,7 @@ "import_id": "test.new.document.1", "family_import_id": "test.new.family.1", "metadata": { "color": ["pink"] }, - "events": [], + "variant_name": "Test", "title": "", "user_language_name": "" } diff --git a/tests/unit_tests/routers/ingest/test_ingest.py b/tests/unit_tests/routers/ingest/test_ingest.py index b9a5c3b9..c1abab52 100644 --- a/tests/unit_tests/routers/ingest/test_ingest.py +++ b/tests/unit_tests/routers/ingest/test_ingest.py @@ -6,6 +6,7 @@ import io import json +import os from fastapi import status from fastapi.testclient import TestClient @@ -31,7 +32,14 @@ def test_ingest_data_when_ok( response = client.post( "/api/v1/ingest/test", - files={"new_data": open("tests/unit_tests/routers/ingest/test.json", "rb")}, + files={ + "new_data": open( + os.path.join( + "tests", "unit_tests", "routers", "ingest", "test_bulk_data.json" + ), + "rb", + ) + }, headers=user_header_token, ) @@ -94,7 +102,14 @@ def test_ingest_data_when_db_error( response = client.post( "/api/v1/ingest/test", - files={"new_data": open("tests/unit_tests/routers/ingest/test.json", "rb")}, + files={ + "new_data": open( + os.path.join( + "tests", "unit_tests", "routers", "ingest", "test_bulk_data.json" + ), + "rb", + ) + }, headers=user_header_token, ) diff --git a/tests/unit_tests/routers/ingest/test_ingest_template.py b/tests/unit_tests/routers/ingest/test_ingest_template.py index a92c0b0d..525871e2 100644 --- 
a/tests/unit_tests/routers/ingest/test_ingest_template.py +++ b/tests/unit_tests/routers/ingest/test_ingest_template.py @@ -60,7 +60,11 @@ def test_ingest_template_when_ok( "family_import_id": {"title": "Family Import Id", "type": "string"}, "family_document_import_id": { "title": "Family Document Import Id", - "type": "string", + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, }, "event_title": {"title": "Event Title", "type": "string"}, "date": { @@ -81,11 +85,6 @@ def test_ingest_template_when_ok( { "import_id": {"title": "Import Id", "type": "string"}, "family_import_id": {"title": "Family Import Id", "type": "string"}, - "events": { - "items": {"type": "string"}, - "title": "Events", - "type": "array", - }, "variant_name": { "anyOf": [{"type": "string"}, {"type": "null"}], "default": None, diff --git a/tests/unit_tests/service/ingest/test_ingest_service.py b/tests/unit_tests/service/ingest/test_ingest_service.py index 44ac5ef0..e3c708cc 100644 --- a/tests/unit_tests/service/ingest/test_ingest_service.py +++ b/tests/unit_tests/service/ingest/test_ingest_service.py @@ -1,3 +1,5 @@ +from datetime import datetime + import pytest import app.service.ingest as ingest_service @@ -7,15 +9,16 @@ def test_ingest_when_ok( corpus_repo_mock, geography_repo_mock, - db_client_metadata_mock, collection_repo_mock, family_repo_mock, document_repo_mock, + event_repo_mock, + validation_service_mock, ): test_data = { "collections": [ { - "import_id": "", + "import_id": "test.new.collection.0", "title": "Test title", "description": "Test description", }, @@ -28,7 +31,7 @@ def test_ingest_when_ok( "geography": "Test", "category": "UNFCCC", "metadata": {"color": ["blue"], "size": [""]}, - "collections": [], + "collections": ["test.new.collection.0"], }, ], "documents": [ @@ -37,19 +40,28 @@ def test_ingest_when_ok( "family_import_id": "test.new.family.0", "variant_name": "Original Language", "metadata": {"color": ["blue"]}, - "events": [], "title": "", 
"source_url": None, "user_language_name": "", } ], + "events": [ + { + "import_id": "test.new.event.0", + "family_import_id": "test.new.family.0", + "event_title": "Test", + "date": datetime.now(), + "event_type_value": "Amended", + } + ], } - assert ingest_service.import_data(test_data, "test") == { + assert { "collections": ["test.new.collection.0"], "families": ["created"], "documents": ["test.new.doc.0"], - } + "events": ["test.new.event.0"], + } == ingest_service.import_data(test_data, "test") def test_ingest_when_db_error(corpus_repo_mock, collection_repo_mock): @@ -58,7 +70,7 @@ def test_ingest_when_db_error(corpus_repo_mock, collection_repo_mock): test_data = { "collections": [ { - "import_id": "", + "import_id": "test.new.collection.0", "title": "Test title", "description": "Test description", } @@ -67,126 +79,26 @@ def test_ingest_when_db_error(corpus_repo_mock, collection_repo_mock): with pytest.raises(RepositoryError) as e: ingest_service.import_data(test_data, "test") - assert e.value.message == "bad collection repo" - - -def test_save_collections_when_import_id_wrong_format(corpus_repo_mock): - invalid_import_id = "invalid" - test_data = [ - { - "import_id": invalid_import_id, - "title": "Test title", - "description": "Test description", - }, - ] - - with pytest.raises(ValidationError) as e: - ingest_service.save_collections(test_data, "test") - expected_msg = "The import id invalid is invalid!" 
- assert e.value.message == expected_msg - - -def test_ingest_families_when_import_id_wrong_format( - corpus_repo_mock, geography_repo_mock, db_client_metadata_mock -): - invalid_import_id = "invalid" - test_data = [ - { - "import_id": invalid_import_id, - "title": "Test", - "summary": "Test", - "geography": "Test", - "category": "UNFCCC", - "metadata": {"color": ["blue"], "size": [""]}, - "collections": [], - }, - ] + assert "bad collection repo" == e.value.message - with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") - expected_msg = "The import id invalid is invalid!" - assert e.value.message == expected_msg - - -def test_ingest_families_when_geography_invalid(corpus_repo_mock, geography_repo_mock): - geography_repo_mock.error = True - test_data = [ - { - "import_id": "test.new.family.0", - "title": "Test", - "summary": "Test", - "geography": "Invalid", - "category": "Test", - "metadata": {}, - "collections": [], - }, - ] - with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") - expected_msg = "The geography value Invalid is invalid!" 
- assert e.value.message == expected_msg - - -def test_ingest_families_when_category_invalid(corpus_repo_mock, geography_repo_mock): - test_data = [ - { - "import_id": "test.new.family.0", - "title": "Test", - "summary": "Test", - "geography": "Test", - "category": "Test", - "metadata": {}, - "collections": [], - }, - ] - - with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") - expected_msg = "Test is not a valid FamilyCategory" - assert e.value.message == expected_msg +def test_save_families_when_corpus_invalid(corpus_repo_mock, validation_service_mock): + corpus_repo_mock.error = True - -def test_ingest_families_when_corpus_invalid(corpus_repo_mock): - corpus_repo_mock.valid = False - - test_data = [ - { - "import_id": "test.new.family.0", - "title": "Test", - "summary": "Test", - "geography": "Test", - "category": "Test", - "metadata": {}, - "collections": [], - }, - ] + test_data = [{"import_id": "test.new.family.0"}] with pytest.raises(ValidationError) as e: ingest_service.save_families(test_data, "test") - expected_msg = "Corpus 'test' not found" - assert e.value.message == expected_msg + assert "No organisation associated with corpus test" == e.value.message -def test_ingest_families_when_collection_ids_invalid( - corpus_repo_mock, geography_repo_mock -): - test_data = [ - { - "import_id": "test.new.family.0", - "title": "Test", - "summary": "Test", - "geography": "Test", - "category": "UNFCCC", - "metadata": {}, - "collections": ["invalid"], - }, - ] +def test_save_families_when_data_invalid(corpus_repo_mock, validation_service_mock): + validation_service_mock.throw_validation_error = True + test_data = [{"import_id": "invalid"}] with pytest.raises(ValidationError) as e: ingest_service.save_families(test_data, "test") - expected_msg = "The import ids are invalid: ['invalid']" - assert e.value.message == expected_msg + assert "Error" == e.value.message # TODO: Uncomment when implementing 
feature/pdct-1402-validate-collection-exists-before-creating-family @@ -214,85 +126,14 @@ def test_ingest_families_when_collection_ids_invalid( # assert e.value.message == expected_msg -def test_ingest_families_when_metadata_not_found( - corpus_repo_mock, geography_repo_mock, collection_repo_mock, db_client_metadata_mock -): - db_client_metadata_mock.bad_taxonomy = True - - test_data = [ - { - "import_id": "test.new.family.0", - "title": "Test", - "summary": "Test", - "geography": "Test", - "category": "UNFCCC", - "metadata": {}, - "collections": ["id.does.not.exist"], - }, - ] +def test_save_documents_when_data_invalid(validation_service_mock): + validation_service_mock.throw_validation_error = True - with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") - expected_msg = "No taxonomy found for corpus" - assert e.value.message == expected_msg - - -def test_ingest_families_when_metadata_invalid( - corpus_repo_mock, geography_repo_mock, collection_repo_mock, db_client_metadata_mock -): - test_data = [ - { - "import_id": "test.new.family.0", - "title": "Test", - "summary": "Test", - "geography": "Test", - "category": "UNFCCC", - "metadata": {}, - "collections": ["id.does.not.exist"], - }, - ] - - with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") - expected_msg = "Metadata validation failed: Missing metadata keys:" - assert expected_msg in e.value.message - - -def test_save_documents_when_variant_empty(): - test_data = [ - { - "import_id": "test.new.document.0", - "family_import_id": "test.new.family.0", - "variant_name": "", - "metadata": {}, - "events": [], - "title": "", - "user_language_name": "", - }, - ] + test_data = [{"import_id": "invalid"}] with pytest.raises(ValidationError) as e: ingest_service.save_documents(test_data, "test") - assert e.value.message == "Variant name is empty" - - -def test_ingest_documents_when_metadata_invalid(db_client_metadata_mock): - test_data = [ - { - 
"import_id": "test.new.document.0", - "family_import_id": "test.new.family.0", - "variant_name": None, - "metadata": {}, - "events": [], - "title": "", - "user_language_name": "", - }, - ] - - with pytest.raises(ValidationError) as e: - ingest_service.save_documents(test_data, "test") - expected_msg = "Metadata validation failed: Missing metadata keys:" - assert expected_msg in e.value.message + assert "Error" == e.value.message def test_validate_entity_relationships_when_no_family_matching_document(): @@ -305,4 +146,27 @@ def test_validate_entity_relationships_when_no_family_matching_document(): with pytest.raises(ValidationError) as e: ingest_service.validate_entity_relationships(test_data) - assert e.value.message == f"No family with id ['{fam_import_id}'] found" + assert f"No family with id ['{fam_import_id}'] found" == e.value.message + + +def test_save_documents_when_no_family(): + fam_import_id = "test.new.family.0" + test_data = { + "documents": [ + {"import_id": "test.new.document.0", "family_import_id": fam_import_id} + ] + } + + with pytest.raises(ValidationError) as e: + ingest_service.import_data(test_data, "test") + assert f"No family with id ['{fam_import_id}'] found" == e.value.message + + +def test_save_events_when_data_invalid(validation_service_mock): + validation_service_mock.throw_validation_error = True + + test_data = [{"import_id": "imvalid"}] + + with pytest.raises(ValidationError) as e: + ingest_service.save_events(test_data, "test") + assert "Error" == e.value.message diff --git a/tests/unit_tests/service/validation/test_collection_validation.py b/tests/unit_tests/service/validation/test_collection_validation.py new file mode 100644 index 00000000..ee3aa39e --- /dev/null +++ b/tests/unit_tests/service/validation/test_collection_validation.py @@ -0,0 +1,23 @@ +import pytest + +import app.service.validation as validation_service +from app.errors import ValidationError + + +def test_validate_collection_when_ok(): + test_collection = { + 
"import_id": "test.new.collection.0", + } + + validation_service.validate_collection(test_collection) + + +def test_validate_collection_when_import_id_invalid(): + invalid_import_id = "invalid" + test_collection = { + "import_id": invalid_import_id, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_collection(test_collection) + assert "The import id invalid is invalid!" == e.value.message diff --git a/tests/unit_tests/service/validation/test_document_validation.py b/tests/unit_tests/service/validation/test_document_validation.py new file mode 100644 index 00000000..c74a72fa --- /dev/null +++ b/tests/unit_tests/service/validation/test_document_validation.py @@ -0,0 +1,63 @@ +import pytest + +import app.service.validation as validation_service +from app.errors import ValidationError + + +def test_validate_document_when_ok(db_client_metadata_mock): + test_document = { + "import_id": "test.new.document.0", + "family_import_id": "test.new.family.0", + "variant_name": "Test", + "metadata": {"color": ["blue"]}, + } + + validation_service.validate_document(test_document, "test") + + +def test_validate_document_when_import_id_wrong_format(): + invalid_import_id = "invalid" + test_document = { + "import_id": invalid_import_id, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_document(test_document, "test") + assert f"The import id {invalid_import_id} is invalid!" == e.value.message + + +def test_validate_document_when_family_import_id_wrong_format(): + invalid_family_import_id = "invalid" + test_document = { + "import_id": "test.new.document.0", + "family_import_id": invalid_family_import_id, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_document(test_document, "test") + assert f"The import id {invalid_family_import_id} is invalid!" 
== e.value.message + + +def test_validate_document_when_variant_empty(): + test_document = { + "import_id": "test.new.document.0", + "family_import_id": "test.new.family.0", + "variant_name": "", + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_document(test_document, "test") + assert "Variant name is empty" == e.value.message + + +def test_validate_document_when_metadata_invalid(db_client_metadata_mock): + test_document = { + "import_id": "test.new.document.0", + "family_import_id": "test.new.family.0", + "variant_name": None, + "metadata": {}, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_document(test_document, "test") + assert "Metadata validation failed: Missing metadata keys:" in e.value.message diff --git a/tests/unit_tests/service/validation/test_event_validation.py b/tests/unit_tests/service/validation/test_event_validation.py new file mode 100644 index 00000000..815a505f --- /dev/null +++ b/tests/unit_tests/service/validation/test_event_validation.py @@ -0,0 +1,54 @@ +import pytest + +import app.service.validation as validation_service +from app.errors import ValidationError + + +def test_validate_event_when_ok(): + test_event = { + "import_id": "test.new.event.0", + "family_import_id": "test.new.family.0", + "event_type_value": "Amended", + } + + validation_service.validate_event( + test_event, {"event_type": {"allowed_values": ["Amended"]}} + ) + + +def test_validate_event_when_import_id_wrong_format(): + invalid_import_id = "invalid" + test_event = { + "import_id": invalid_import_id, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_event(test_event, {}) + assert f"The import id {invalid_import_id} is invalid!" 
== e.value.message + + +def test_validate_event_when_family_import_id_wrong_format(): + invalid_import_id = "invalid" + test_event = { + "import_id": "test.new.event.0", + "family_import_id": invalid_import_id, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_event(test_event, {}) + assert f"The import id {invalid_import_id} is invalid!" == e.value.message + + +def test_validate_event_when_event_type_invalid(): + invalid_event_type = "invalid" + test_event = { + "import_id": "test.new.event.0", + "family_import_id": "test.new.family.0", + "event_type_value": invalid_event_type, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_event( + test_event, {"event_type": {"allowed_values": ["test"]}} + ) + assert f"Event type ['{invalid_event_type}'] is invalid!" == e.value.message diff --git a/tests/unit_tests/service/validation/test_family_validation.py b/tests/unit_tests/service/validation/test_family_validation.py new file mode 100644 index 00000000..9cf668c2 --- /dev/null +++ b/tests/unit_tests/service/validation/test_family_validation.py @@ -0,0 +1,95 @@ +import pytest + +import app.service.validation as validation_service +from app.errors import ValidationError + + +def test_validate_family_when_ok(corpus_repo_mock, db_client_metadata_mock): + test_family = { + "import_id": "test.new.family.0", + "category": "UNFCCC", + "metadata": {"color": ["blue"], "size": [""]}, + "collections": ["test.new.collection.0"], + } + + validation_service.validate_family(test_family, "test") + + +def test_validate_family_when_import_id_invalid(corpus_repo_mock): + invalid_import_id = "invalid" + test_family = { + "import_id": invalid_import_id, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_family(test_family, "test") + assert "The import id invalid is invalid!" 
== e.value.message + + +def test_validate_family_when_corpus_invalid(corpus_repo_mock): + corpus_repo_mock.valid = False + + test_family = { + "import_id": "test.new.family.0", + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_family(test_family, "invalid") + assert "Corpus 'invalid' not found" == e.value.message + + +def test_validate_family_when_category_invalid(corpus_repo_mock, geography_repo_mock): + test_family = { + "import_id": "test.new.family.0", + "category": "Test", + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_family(test_family, "test") + assert "Test is not a valid FamilyCategory" == e.value.message + + +def test_validate_family_when_collection_ids_invalid( + corpus_repo_mock, geography_repo_mock +): + test_family = { + "import_id": "test.new.family.0", + "category": "UNFCCC", + "collections": ["invalid"], + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_family(test_family, "test") + assert "The import ids are invalid: ['invalid']" == e.value.message + + +def test_validate_family_when_metadata_not_found( + corpus_repo_mock, geography_repo_mock, collection_repo_mock, db_client_metadata_mock +): + db_client_metadata_mock.bad_taxonomy = True + + test_family = { + "import_id": "test.new.family.0", + "category": "UNFCCC", + "collections": [], + "metadata": {}, + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_family(test_family, "test") + assert "No taxonomy found for corpus" == e.value.message + + +def test_validate_family_when_metadata_invalid( + corpus_repo_mock, geography_repo_mock, collection_repo_mock, db_client_metadata_mock +): + test_family = { + "import_id": "test.new.family.0", + "category": "UNFCCC", + "metadata": {}, + "collections": [], + } + + with pytest.raises(ValidationError) as e: + validation_service.validate_family(test_family, "test") + assert "Metadata validation failed: Missing metadata keys:" in 
e.value.message