From 682ad4458ae2d32a9d820c548c5b98150a8a4486 Mon Sep 17 00:00:00 2001 From: diversemix Date: Tue, 23 May 2023 12:52:54 +0100 Subject: [PATCH 1/4] Get multiple-collections working --- app/core/ingestion/collection.py | 96 ++++++++++++------- app/core/ingestion/params.py | 2 +- app/core/ingestion/processor.py | 23 ++--- .../ingestion/unfccc/ingest_row_unfccc.py | 3 +- app/core/ingestion/validator.py | 4 +- tests/core/ingestion/test_collection.py | 14 +-- .../core/ingestion/test_unfccc_ingest_row.py | 8 +- 7 files changed, 84 insertions(+), 66 deletions(-) diff --git a/app/core/ingestion/collection.py b/app/core/ingestion/collection.py index e4815b6a..fddf606e 100644 --- a/app/core/ingestion/collection.py +++ b/app/core/ingestion/collection.py @@ -1,4 +1,4 @@ -from typing import Any, Optional +from typing import Any, Optional, cast from sqlalchemy.orm import Session from app.core.ingestion.params import IngestParameters @@ -13,6 +13,29 @@ from app.db.models.law_policy.collection import CollectionFamily, CollectionOrganisation +def handle_cclw_collection_and_link( + db: Session, + params: IngestParameters, + org_id: int, + family_import_id: str, + result: dict[str, Any], +) -> Optional[Collection]: + collection = handle_create_collection( + db, + params.cpr_collection_ids[0], # Only ever one for CCLW + params.collection_name, + params.collection_summary, + org_id, + result, + ) + + if collection is not None: + handle_link_collection_to_family( + db, params.cpr_collection_ids, cast(str, family_import_id), result + ) + return collection + + def create_collection( db: Session, row: CollectionIngestRow, @@ -65,11 +88,12 @@ def create_collection( ) -def handle_collection_and_link( +def handle_create_collection( db: Session, - params: IngestParameters, + collection_id: str, + collection_name: str, + collection_summary: str, org_id: int, - family_import_id: str, result: dict[str, Any], ) -> Optional[Collection]: """ @@ -85,34 +109,27 @@ def handle_collection_and_link( 
:param [dict[str, Any]]: a result dict in which to record what was created. :return [Collection | None]: A collection if one was created, otherwise None. """ - if not params.cpr_collection_id or params.cpr_collection_id == "n/a": + if not collection_id or collection_id == "n/a": return None # First check for the actual collection existing_collection = ( - db.query(Collection) - .filter(Collection.import_id == params.cpr_collection_id) - .one_or_none() + db.query(Collection).filter(Collection.import_id == collection_id).one_or_none() ) if existing_collection is None: - if params.create_collections is False: - id = params.cpr_collection_id - msg = f"Collection {id} is not pre-exsiting so not linking" - raise ValueError(msg) - collection = create( db, Collection, - import_id=params.cpr_collection_id, - title=params.collection_name, - extra={"description": params.collection_summary}, + import_id=collection_id, + title=collection_name, + extra={"description": collection_summary}, ) collection_organisation = create( db, CollectionOrganisation, - collection_import_id=collection.import_id, + collection_import_id=collection_id, organisation_id=org_id, ) @@ -121,30 +138,37 @@ def handle_collection_and_link( else: collection = existing_collection updated = {} - update_if_changed(updated, "title", params.collection_name, collection) - update_if_changed(updated, "description", params.collection_summary, collection) + update_if_changed(updated, "title", collection_name, collection) + update_if_changed(updated, "description", collection_summary, collection) if len(updated) > 0: result["collection"] = updated db.add(collection) db.flush() - # Second check for the family - collection link - existing_link = ( - db.query(CollectionFamily) - .filter_by( - collection_import_id=params.cpr_collection_id, - family_import_id=params.cpr_family_id, - ) - .one_or_none() - ) + return collection - if existing_link is None: - collection_family = create( - db, - CollectionFamily, - 
collection_import_id=collection.import_id, - family_import_id=family_import_id, + +def handle_link_collection_to_family( + db: Session, + collection_ids: list[str], + family_import_id: str, + result: dict[str, Any], +) -> None: + for collection_id in collection_ids: + existing_link = ( + db.query(CollectionFamily) + .filter_by( + collection_import_id=collection_id, + family_import_id=family_import_id, + ) + .one_or_none() ) - result["collection_family"] = to_dict(collection_family) - return collection + if existing_link is None: + collection_family = create( + db, + CollectionFamily, + collection_import_id=collection_id, + family_import_id=family_import_id, + ) + result["collection_family"] = to_dict(collection_family) diff --git a/app/core/ingestion/params.py b/app/core/ingestion/params.py index 78140f51..c6cb34cd 100644 --- a/app/core/ingestion/params.py +++ b/app/core/ingestion/params.py @@ -25,6 +25,6 @@ class IngestParameters: geography: str cpr_document_id: str cpr_family_id: str - cpr_collection_id: str + cpr_collection_ids: list[str] cpr_family_slug: str cpr_document_slug: str diff --git a/app/core/ingestion/processor.py b/app/core/ingestion/processor.py index e2be3cc1..aaecfdcf 100644 --- a/app/core/ingestion/processor.py +++ b/app/core/ingestion/processor.py @@ -4,7 +4,8 @@ from sqlalchemy.orm import Session from app.core.ingestion.collection import ( create_collection, - handle_collection_and_link, + handle_cclw_collection_and_link, + handle_link_collection_to_family, ) from app.core.ingestion.cclw.event import family_event_from_row from app.core.ingestion.family import handle_family_from_params @@ -78,7 +79,7 @@ def add_metadata(db: Session, import_id: str, taxonomy: Taxonomy, taxonomy_id: i geography=row.geography, cpr_document_id=row.cpr_document_id, cpr_family_id=row.cpr_family_id, - cpr_collection_id=row.cpr_collection_id, + cpr_collection_ids=[row.cpr_collection_id], cpr_family_slug=row.cpr_family_slug, cpr_document_slug=row.cpr_document_slug, ) @@ 
-108,7 +109,7 @@ def add_metadata(db: Session, import_id: str, taxonomy: Taxonomy, taxonomy_id: i geography=row.geography, cpr_document_id=row.cpr_document_id, cpr_family_id=row.cpr_family_id, - cpr_collection_id=row.cpr_collection_id, + cpr_collection_ids=row.cpr_collection_id, cpr_family_slug=row.cpr_family_slug, cpr_document_slug=row.cpr_document_slug, ) @@ -138,12 +139,8 @@ def ingest_cclw_document_row( ) params = build_params_from_cclw(row) family = handle_family_from_params(db, params, context.org_id, result) - handle_collection_and_link( - db, - params, - context.org_id, - cast(str, family.import_id), - result, + handle_cclw_collection_and_link( + db, params, context.org_id, cast(str, family.import_id), result ) _LOGGER.info( @@ -181,12 +178,8 @@ def ingest_unfccc_document_row( params = build_params_from_unfccc(row) family = handle_family_from_params(db, params, context.org_id, result) - handle_collection_and_link( - db, - params, - context.org_id, - cast(str, family.import_id), - result, + handle_link_collection_to_family( + db, params.cpr_collection_ids, cast(str, family.import_id), result ) ctx = cast(UNFCCCIngestContext, context) diff --git a/app/core/ingestion/unfccc/ingest_row_unfccc.py b/app/core/ingestion/unfccc/ingest_row_unfccc.py index cb93a050..a4fe9ef6 100644 --- a/app/core/ingestion/unfccc/ingest_row_unfccc.py +++ b/app/core/ingestion/unfccc/ingest_row_unfccc.py @@ -55,7 +55,7 @@ class UNFCCCDocumentIngestRow(BaseIngestRow): document_variant: str language: list[str] - cpr_collection_id: str + cpr_collection_id: list[str] cpr_document_id: str cpr_family_id: str cpr_family_slug: str @@ -63,7 +63,6 @@ class UNFCCCDocumentIngestRow(BaseIngestRow): cpr_document_status: str download_url: str - # FIXME: Where is the summary from? 
family_summary: str = "summary" VALID_COLUMNS: ClassVar[set[str]] = VALID_DOCUMENT_COLUMN_NAMES diff --git a/app/core/ingestion/validator.py b/app/core/ingestion/validator.py index b668a80a..581cf03e 100644 --- a/app/core/ingestion/validator.py +++ b/app/core/ingestion/validator.py @@ -138,8 +138,8 @@ def validate_unfccc_document_row( errors, ) - # Add to the collections that are referenced so we can valiate later - context.collection_ids_referenced.append(row.cpr_collection_id) + # Add to the collections that are referenced so we can validate later + context.collection_ids_referenced.extend(row.cpr_collection_id) if len(errors) > 0: context.results += errors diff --git a/tests/core/ingestion/test_collection.py b/tests/core/ingestion/test_collection.py index 45fb1c4b..106d47d8 100644 --- a/tests/core/ingestion/test_collection.py +++ b/tests/core/ingestion/test_collection.py @@ -1,8 +1,10 @@ from typing import cast from sqlalchemy.orm import Session -from app.core.ingestion.collection import handle_collection_and_link from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow -from app.core.ingestion.processor import build_params_from_cclw +from app.core.ingestion.processor import ( + build_params_from_cclw, + handle_cclw_collection_and_link, +) from app.core.ingestion.utils import get_or_create from app.db.models.law_policy.collection import ( Collection, @@ -43,7 +45,7 @@ def test_handle_collection_from_row__creates(test_db: Session): result = {} row, family = db_setup(test_db) - collection = handle_collection_and_link( + collection = handle_cclw_collection_and_link( test_db, build_params_from_cclw(row), 1, cast(str, family.import_id), result ) assert collection @@ -76,7 +78,7 @@ def test_handle_collection_from_row__updates(test_db: Session): first_result = {} row, family = db_setup(test_db) - handle_collection_and_link( + handle_cclw_collection_and_link( test_db, build_params_from_cclw(row), 1, @@ -87,7 +89,7 @@ def 
test_handle_collection_from_row__updates(test_db: Session): result = {} row.collection_name = "new name" row.collection_summary = "new summary" - collection = handle_collection_and_link( + collection = handle_cclw_collection_and_link( test_db, build_params_from_cclw(row), 1, cast(str, family.import_id), result ) assert collection @@ -112,7 +114,7 @@ def test_handle_collection_from_row__ignores(test_db: Session): row, family = db_setup(test_db) row.cpr_collection_id = "n/a" - collection = handle_collection_and_link( + collection = handle_cclw_collection_and_link( test_db, build_params_from_cclw(row), 1, cast(str, family.import_id), result ) diff --git a/tests/core/ingestion/test_unfccc_ingest_row.py b/tests/core/ingestion/test_unfccc_ingest_row.py index 4fd4dd7c..bf5f038d 100644 --- a/tests/core/ingestion/test_unfccc_ingest_row.py +++ b/tests/core/ingestion/test_unfccc_ingest_row.py @@ -34,7 +34,7 @@ document_role="MAIN", document_variant="Original Language", language=["en"], - cpr_collection_id="id1", + cpr_collection_id=["id1"], cpr_document_id="cpr_document_id", cpr_family_id="cpr_family_id", cpr_family_slug="cpr_family_slug", @@ -73,7 +73,7 @@ def test_ingest_single_collection_and_document(test_db: Session): document_row = DOC_ROW result = ingest_unfccc_document_row(test_db, context, document_row) - assert len(result) == 8 + assert len(result) == 7 def test_ingest_blank_geo(test_db: Session): @@ -95,7 +95,7 @@ def test_ingest_blank_geo(test_db: Session): document_row.geography_iso = "" result = ingest_unfccc_document_row(test_db, context, document_row) - assert len(result) == 8 + assert len(result) == 7 assert 1 == test_db.query(Family).count() family = test_db.query(Family).first() @@ -125,7 +125,7 @@ def test_ingest_international_geo(test_db: Session): document_row.geography_iso = "INT" result = ingest_unfccc_document_row(test_db, context, document_row) - assert len(result) == 8 + assert len(result) == 7 assert 1 == test_db.query(Family).count() family = 
test_db.query(Family).first() From c6857cc6818219a659995056ea8ac238aa6a1fb6 Mon Sep 17 00:00:00 2001 From: diversemix Date: Tue, 23 May 2023 13:00:52 +0100 Subject: [PATCH 2/4] add test for 2 collections --- .../core/ingestion/test_unfccc_ingest_row.py | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/tests/core/ingestion/test_unfccc_ingest_row.py b/tests/core/ingestion/test_unfccc_ingest_row.py index bf5f038d..1bcc45f5 100644 --- a/tests/core/ingestion/test_unfccc_ingest_row.py +++ b/tests/core/ingestion/test_unfccc_ingest_row.py @@ -9,7 +9,7 @@ CollectionIngestRow, UNFCCCDocumentIngestRow, ) -from app.db.models.law_policy.collection import CollectionOrganisation +from app.db.models.law_policy.collection import CollectionFamily, CollectionOrganisation from app.db.models.law_policy.family import Family from app.db.models.law_policy.geography import GEO_INTERNATIONAL, GEO_NONE, Geography @@ -76,6 +76,57 @@ def test_ingest_single_collection_and_document(test_db: Session): assert len(result) == 7 +def test_ingest_two_collections_and_document(test_db: Session): + populate_for_ingest(test_db) + test_db.commit() + context = initialise_context(test_db, "UNFCCC") + + # Act - create collections + collection_row = CollectionIngestRow( + row_number=1, + cpr_collection_id="id1", + collection_name="collection-title", + collection_summary="collection-description", + ) + ingest_collection_row(test_db, context, collection_row) + collection_row2 = CollectionIngestRow( + row_number=2, + cpr_collection_id="id2", + collection_name="collection-title2", + collection_summary="collection-description2", + ) + ingest_collection_row(test_db, context, collection_row2) + + # Act - create document + document_row = DOC_ROW + document_row.cpr_collection_id = ["id1", "id2"] + result = ingest_unfccc_document_row(test_db, context, document_row) + + assert len(result) == 7 + assert ( + test_db.query(CollectionOrganisation) + 
.filter(CollectionOrganisation.collection_import_id == "id1") + .one() + ) + assert ( + test_db.query(CollectionOrganisation) + .filter(CollectionOrganisation.collection_import_id == "id2") + .one() + ) + assert ( + test_db.query(CollectionFamily) + .filter(CollectionFamily.collection_import_id == "id1") + .filter(CollectionFamily.family_import_id == "cpr_family_id") + .one() + ) + assert ( + test_db.query(CollectionFamily) + .filter(CollectionFamily.collection_import_id == "id2") + .filter(CollectionFamily.family_import_id == "cpr_family_id") + .one() + ) + + def test_ingest_blank_geo(test_db: Session): populate_for_ingest(test_db) test_db.commit() From 6a1569f593e3033e90077a1f8ee3a1a7917586c7 Mon Sep 17 00:00:00 2001 From: diversemix Date: Tue, 23 May 2023 14:27:14 +0100 Subject: [PATCH 3/4] deepcopy the DOC_ROW object --- tests/core/ingestion/test_unfccc_ingest_row.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/core/ingestion/test_unfccc_ingest_row.py b/tests/core/ingestion/test_unfccc_ingest_row.py index 1bcc45f5..734d493f 100644 --- a/tests/core/ingestion/test_unfccc_ingest_row.py +++ b/tests/core/ingestion/test_unfccc_ingest_row.py @@ -1,3 +1,4 @@ +import copy from datetime import datetime from sqlalchemy.orm import Session from app.core.ingestion.processor import ( @@ -70,7 +71,7 @@ def test_ingest_single_collection_and_document(test_db: Session): ) # Act - create document - document_row = DOC_ROW + document_row = copy.deepcopy(DOC_ROW) result = ingest_unfccc_document_row(test_db, context, document_row) assert len(result) == 7 @@ -96,9 +97,10 @@ def test_ingest_two_collections_and_document(test_db: Session): collection_summary="collection-description2", ) ingest_collection_row(test_db, context, collection_row2) + assert 2 == test_db.query(Collection).count() # Act - create document - document_row = DOC_ROW + document_row = copy.deepcopy(DOC_ROW) document_row.cpr_collection_id = ["id1", "id2"] result = 
ingest_unfccc_document_row(test_db, context, document_row) @@ -132,7 +134,7 @@ def test_ingest_blank_geo(test_db: Session): test_db.commit() context = initialise_context(test_db, "UNFCCC") - # Act - create collection + # Arrange - create collection collection_row = CollectionIngestRow( row_number=1, cpr_collection_id="id1", @@ -142,7 +144,7 @@ def test_ingest_blank_geo(test_db: Session): result = ingest_collection_row(test_db, context, collection_row) # Act - create document - document_row = DOC_ROW + document_row = copy.deepcopy(DOC_ROW) document_row.geography_iso = "" result = ingest_unfccc_document_row(test_db, context, document_row) @@ -162,7 +164,7 @@ def test_ingest_international_geo(test_db: Session): test_db.commit() context = initialise_context(test_db, "UNFCCC") - # Act - create collection + # Arrange - create collection collection_row = CollectionIngestRow( row_number=1, cpr_collection_id="id1", @@ -172,10 +174,11 @@ def test_ingest_international_geo(test_db: Session): result = ingest_collection_row(test_db, context, collection_row) # Act - create document - document_row = DOC_ROW + document_row = copy.deepcopy(DOC_ROW) document_row.geography_iso = "INT" result = ingest_unfccc_document_row(test_db, context, document_row) + test_db.commit() assert len(result) == 7 assert 1 == test_db.query(Family).count() From 4c7f61f05f8bfbf862efc07f2518a79516390377 Mon Sep 17 00:00:00 2001 From: diversemix Date: Tue, 23 May 2023 15:30:11 +0100 Subject: [PATCH 4/4] make the unfccc validation testable and write tests --- app/api/api_v1/routers/unfccc_ingest.py | 95 +++---------------- app/core/ingestion/unfccc/validate.py | 83 ++++++++++++++++ ...idate_row.py => test_validate_cclw_row.py} | 0 .../ingestion/test_validate_unfccc_csv.py | 32 +++++++ .../ingestion/test_validate_unfccc_row.py | 47 +++++++++ 5 files changed, 175 insertions(+), 82 deletions(-) create mode 100644 app/core/ingestion/unfccc/validate.py rename tests/core/ingestion/{test_validate_row.py => 
test_validate_cclw_row.py} (100%) create mode 100644 tests/core/ingestion/test_validate_unfccc_csv.py create mode 100644 tests/core/ingestion/test_validate_unfccc_row.py diff --git a/app/api/api_v1/routers/unfccc_ingest.py b/app/api/api_v1/routers/unfccc_ingest.py index 65942c41..deaf6fa6 100644 --- a/app/api/api_v1/routers/unfccc_ingest.py +++ b/app/api/api_v1/routers/unfccc_ingest.py @@ -29,18 +29,15 @@ get_collection_ingestor, initialise_context, get_unfccc_document_ingestor, - get_document_validator, ) from app.core.ingestion.reader import get_file_contents, read +from app.core.ingestion.unfccc.validate import validate_unfccc_csv from app.core.ingestion.utils import ( - IngestContext, - Result, ResultType, UNFCCCIngestContext, ) from app.core.ingestion.utils import ( ValidationResult, - get_result_counts, ) from app.core.validation.types import ImportSchemaMismatchError from app.core.validation.util import ( @@ -159,13 +156,16 @@ def validate_unfccc_law_policy( all_results = [] try: - _, _, message = _validate_unfccc_csv( - unfccc_data_csv, - collection_csv, + docs = get_file_contents(unfccc_data_csv) + collections = get_file_contents(collection_csv) + message = validate_unfccc_csv( + docs, + collections, db, cast(UNFCCCIngestContext, context), all_results, ) + _LOGGER.info(message) except ImportSchemaMismatchError as e: _LOGGER.exception( "Provided CSV failed law & policy schema validation", @@ -237,13 +237,16 @@ def ingest_unfccc_law_policy( # PHASE 1 - Validate try: - documents_file_contents, collection_file_contents, _ = _validate_unfccc_csv( - unfccc_data_csv, - collection_csv, + collection_file_contents = get_file_contents(collection_csv) + documents_file_contents = get_file_contents(unfccc_data_csv) + message = validate_unfccc_csv( + documents_file_contents, + collection_file_contents, db, cast(UNFCCCIngestContext, context), all_results, ) + _LOGGER.info(message) except ImportSchemaMismatchError as e: _LOGGER.exception( "Provided CSV failed law & policy 
schema validation", @@ -362,75 +365,3 @@ def ingest_unfccc_law_policy( import_s3_prefix=s3_prefix, detail=None, # TODO: add detail? ) - - -def _validate_unfccc_csv( - unfccc_data_csv: UploadFile, - collection_csv: UploadFile, - db: Session, - context: UNFCCCIngestContext, - all_results: list[Result], -) -> tuple[str, str, str]: - """ - Validates the csv file - - :param UploadFile law_policy_csv: incoming file to validate - :param Session db: connection to the database - :param IngestContext context: the ingest context - :param list[Result] all_results: the results - :return tuple[str, str]: the file contents of the csv and the summary message - """ - - # First read all the ids in the collection_csv - def collate_ids(context: IngestContext, row: CollectionIngestRow) -> None: - ctx = cast(UNFCCCIngestContext, context) - ctx.collection_ids_defined.append(row.cpr_collection_id) - - collection_file_contents = get_file_contents(collection_csv) - read(collection_file_contents, context, CollectionIngestRow, collate_ids) - - # Now do the validation of the documents - documents_file_contents = get_file_contents(unfccc_data_csv) - validator = get_document_validator(db, context) - read(documents_file_contents, context, UNFCCCDocumentIngestRow, validator) - # Get the rows here as this is the length of results - rows = len(context.results) - - # Check the set of defined collections against those referenced - defined = set(context.collection_ids_defined) - referenced = set(context.collection_ids_referenced) - - defined_not_referenced = defined.difference(referenced) - - if len(defined_not_referenced) > 0: - # Empty collections are allowed, but need reporting - context.results.append( - Result( - ResultType.OK, - "The following Collection IDs were " - + f"defined and not referenced: {list(defined_not_referenced)}", - ) - ) - - referenced_not_defined = referenced.difference(defined) - if len(referenced_not_defined) > 0: - context.results.append( - Result( - ResultType.ERROR, - "The 
following Collection IDs were " - f"referenced and not defined: {list(referenced_not_defined)}", - ) - ) - - _, fails, resolved = get_result_counts(context.results) - all_results.extend(context.results) - - context.results = [] - message = ( - f"UNFCCC validation result: {rows} Rows, {fails} Failures, " - f"{resolved} Resolved" - ) - - _LOGGER.info(message) - - return documents_file_contents, collection_file_contents, message diff --git a/app/core/ingestion/unfccc/validate.py b/app/core/ingestion/unfccc/validate.py new file mode 100644 index 00000000..480b43a6 --- /dev/null +++ b/app/core/ingestion/unfccc/validate.py @@ -0,0 +1,83 @@ +from typing import cast +from sqlalchemy.orm import Session +from app.core.ingestion.processor import get_document_validator +from app.core.ingestion.unfccc.ingest_row_unfccc import ( + CollectionIngestRow, + UNFCCCDocumentIngestRow, +) +from app.core.ingestion.utils import ( + IngestContext, + Result, + ResultType, + UNFCCCIngestContext, + get_result_counts, +) +from app.core.ingestion.reader import read + + +def validate_unfccc_csv( + documents_file_contents: str, + collection_file_contents: str, + db: Session, + context: UNFCCCIngestContext, + all_results: list[Result], +) -> str: + """ + Validates the documents csv contents against the collection csv contents + + :param str documents_file_contents: contents of the documents csv to validate + :param Session db: connection to the database + :param IngestContext context: the ingest context + :param list[Result] all_results: the results + :return str: the summary message + """ + + # First read all the ids in the collection_csv + def collate_ids(context: IngestContext, row: CollectionIngestRow) -> None: + ctx = cast(UNFCCCIngestContext, context) + ctx.collection_ids_defined.append(row.cpr_collection_id) + + read(collection_file_contents, context, CollectionIngestRow, collate_ids) + + # Now do the validation of the documents + validator = get_document_validator(db, context) + read(documents_file_contents, 
context, UNFCCCDocumentIngestRow, validator) + # Get the rows here as this is the length of results + rows = len(context.results) + + # Check the set of defined collections against those referenced + defined = set(context.collection_ids_defined) + referenced = set(context.collection_ids_referenced) + + defined_not_referenced = defined.difference(referenced) + + if len(defined_not_referenced) > 0: + # Empty collections are allowed, but need reporting + context.results.append( + Result( + ResultType.OK, + "The following Collection IDs were " + + f"defined and not referenced: {list(defined_not_referenced)}", + ) + ) + + referenced_not_defined = referenced.difference(defined) + if len(referenced_not_defined) > 0: + context.results.append( + Result( + ResultType.ERROR, + "The following Collection IDs were " + f"referenced and not defined: {list(referenced_not_defined)}", + ) + ) + + _, fails, resolved = get_result_counts(context.results) + all_results.extend(context.results) + + context.results = [] + message = ( + f"UNFCCC validation result: {rows} Rows, {fails} Failures, " + f"{resolved} Resolved" + ) + + return message diff --git a/tests/core/ingestion/test_validate_row.py b/tests/core/ingestion/test_validate_cclw_row.py similarity index 100% rename from tests/core/ingestion/test_validate_row.py rename to tests/core/ingestion/test_validate_cclw_row.py diff --git a/tests/core/ingestion/test_validate_unfccc_csv.py b/tests/core/ingestion/test_validate_unfccc_csv.py new file mode 100644 index 00000000..69f56dfa --- /dev/null +++ b/tests/core/ingestion/test_validate_unfccc_csv.py @@ -0,0 +1,32 @@ +from typing import cast +from app.core.ingestion.processor import initialise_context +from app.core.ingestion.unfccc.validate import validate_unfccc_csv +from app.core.ingestion.utils import UNFCCCIngestContext +from tests.core.ingestion.helpers import populate_for_ingest + + +ONE_UNFCCC_ROW = """Category,Submission Type,Family Name,Document Title,Documents,Author,Author 
Type,Geography,Geography ISO,Date,Document Role,Document Variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug,CPR Document Status +UNFCCC,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.Found1;UNFCCC.Collection.Found2,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug, +""" + + +TWO_COLLECTION_ROW = """CPR Collection ID,Collection name,Collection summary +UNFCCC.Collection.Found1,Collection One,Everything to do with testing +UNFCCC.Collection.Found2,Collection One,Everything to do with testing +""" + + +def test_validate_unfccc_csv(test_db): + results = [] + populate_for_ingest(test_db) + test_db.commit() + ctx = initialise_context(test_db, "UNFCCC") + message = validate_unfccc_csv( + ONE_UNFCCC_ROW, + TWO_COLLECTION_ROW, + test_db, + cast(UNFCCCIngestContext, ctx), + results, + ) + + assert message == "UNFCCC validation result: 1 Rows, 0 Failures, 0 Resolved" diff --git a/tests/core/ingestion/test_validate_unfccc_row.py b/tests/core/ingestion/test_validate_unfccc_row.py new file mode 100644 index 00000000..aa69b797 --- /dev/null +++ b/tests/core/ingestion/test_validate_unfccc_row.py @@ -0,0 +1,47 @@ +from datetime import datetime +from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow +from app.core.ingestion.utils import ResultType, UNFCCCIngestContext +from app.core.ingestion.validator import ( + validate_unfccc_document_row, +) +from app.core.organisation import get_organisation_taxonomy + +from tests.core.ingestion.helpers import ( + populate_for_ingest, +) + + +def test_validate_row__multiple_collection_ids(test_db): + context = 
UNFCCCIngestContext() + populate_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + + row = UNFCCCDocumentIngestRow( + row_number=1, + category="UNFCCC", + submission_type="Plan", + family_name="family_name", + document_title="document_title", + documents="documents", + author="author", + author_type="Party", + geography="GBR", + geography_iso="GBR", + date=datetime.now(), + document_role="MAIN", + document_variant="Original Language", + language=["en"], + cpr_collection_id=["id1", "id2"], + cpr_document_id="cpr_document_id", + cpr_family_id="cpr_family_id", + cpr_family_slug="cpr_family_slug", + cpr_document_slug="cpr_document_slug", + cpr_document_status="PUBLISHED", + download_url="download_url", + ) + + validate_unfccc_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.OK