diff --git a/alembic/versions/0013_families_and_collections.py b/alembic/versions/0013_families_and_collections.py index a14d345e..0173fcae 100644 --- a/alembic/versions/0013_families_and_collections.py +++ b/alembic/versions/0013_families_and_collections.py @@ -85,7 +85,7 @@ def upgrade(): sa.Column('family_import_id', sa.Text(), nullable=False), sa.Column('physical_document_id', sa.Integer(), nullable=False), sa.Column('import_id', sa.Text(), nullable=False), - sa.Column('variant_name', sa.Text(), nullable=False), + sa.Column('variant_name', sa.Text(), nullable=True), sa.Column('document_status', sa.Enum('CREATED', 'PUBLISHED', 'DELETED', name='documentstatus'), nullable=False), sa.Column('document_type', sa.Text(), nullable=True), sa.Column('document_role', sa.Text(), nullable=True), diff --git a/app/api/api_v1/schemas/document.py b/app/api/api_v1/schemas/document.py index b034fd50..bb45051e 100644 --- a/app/api/api_v1/schemas/document.py +++ b/app/api/api_v1/schemas/document.py @@ -65,7 +65,7 @@ class FamilyDocumentResponse(BaseModel): """Response for a FamilyDocument, without any family information""" import_id: str - variant: str + variant: Optional[str] slug: str # What follows is off PhysicalDocument title: str @@ -74,7 +74,8 @@ class FamilyDocumentResponse(BaseModel): source_url: Optional[str] content_type: Optional[str] language: str - document_type: str + document_type: Optional[str] + document_role: Optional[str] class FamilyContext(BaseModel): diff --git a/app/core/ingestion/family.py b/app/core/ingestion/family.py index dbf541ad..72e98f6e 100644 --- a/app/core/ingestion/family.py +++ b/app/core/ingestion/family.py @@ -1,4 +1,4 @@ -from typing import Any, cast +from typing import Any, Optional, cast from sqlalchemy.orm import Session from app.core.ingestion.ingest_row import DocumentIngestRow @@ -13,12 +13,10 @@ FamilyCategory, Family, FamilyDocument, - FamilyDocumentType, FamilyOrganisation, FamilyStatus, Geography, Slug, - Variant, ) @@ -103,13 +101,8 @@ def _maybe_create_family_document( existing_document: Document, result: dict[str, Any], ) -> FamilyDocument: - # FIXME: these should come from well-known values, not whatever is in the CSV - variant_name = get_or_create( - db, Variant, variant_name=row.document_role, extra={"description": ""} - ).variant_name - document_type = get_or_create( - db, FamilyDocumentType, name=row.document_type, extra={"description": ""} - ).name + def none_if_empty(data: str) -> Optional[str]: + return data if data != "" else None family_document = ( db.query(FamilyDocument).filter_by(import_id=row.cpr_document_id).one_or_none() @@ -124,9 +117,10 @@ def _maybe_create_family_document( family_import_id=family.import_id, physical_document_id=physical_document.id, import_id=row.cpr_document_id, - variant_name=variant_name, + variant_name=none_if_empty(row.document_variant), document_status=DocumentStatus.PUBLISHED, - document_type=document_type, + document_type=none_if_empty(row.document_type), + document_role=none_if_empty(row.document_role), ) db.add(family_document) db.flush() diff --git a/app/core/ingestion/ingest_row.py b/app/core/ingestion/ingest_row.py index 55f6bb34..0be8bbba 100644 --- a/app/core/ingestion/ingest_row.py +++ b/app/core/ingestion/ingest_row.py @@ -18,6 +18,7 @@ "Family name", "Family summary", "Document role", + "Document variant", "Geography ISO", "Documents", "Category", @@ -126,6 +127,7 @@ class DocumentIngestRow(BaseIngestRow): family_name: str family_summary: str document_role: str + document_variant: str geography_iso: str documents: str category: FamilyCategory diff --git a/app/core/ingestion/metadata.py b/app/core/ingestion/metadata.py index 6d2cde66..79667dde 100644 --- a/app/core/ingestion/metadata.py +++ b/app/core/ingestion/metadata.py @@ -18,8 +18,6 @@ "keyword": "keywords", } -MAP_OF_STR_VALUES = {} - @dataclass(config=ConfigDict(validate_assignment=True, extra=Extra.forbid)) class TaxonomyEntry: diff --git a/app/core/ingestion/processor.py b/app/core/ingestion/processor.py index 0fab0c66..abf7853c 100644 --- a/app/core/ingestion/processor.py +++ b/app/core/ingestion/processor.py @@ -155,6 +155,8 @@ def get_dfc_validator(db: Session, context: IngestContext) -> ProcessFunc: def process(context: IngestContext, row: DocumentIngestRow) -> None: """Processes the row into the db.""" - validate_document_row(context=context, taxonomy=taxonomy, row=row) + _LOGGER.info(f"Validating document row: {row.row_number}") + with db.begin(): + validate_document_row(db=db, context=context, taxonomy=taxonomy, row=row) return process diff --git a/app/core/ingestion/validator.py b/app/core/ingestion/validator.py index 64495739..2796bb1f 100644 --- a/app/core/ingestion/validator.py +++ b/app/core/ingestion/validator.py @@ -1,9 +1,34 @@ +from sqlalchemy.orm import Session from app.core.ingestion.ingest_row import DocumentIngestRow, EventIngestRow from app.core.ingestion.metadata import build_metadata, Taxonomy from app.core.ingestion.utils import IngestContext, Result, ResultType +from app.db.models.law_policy.family import ( + FamilyDocumentRole, + FamilyDocumentType, + Variant, +) +from app.db.session import Base + +DbTable = Base +CheckResult = Result + + +def _check_value_in_db( + row_num: int, db: Session, value: str, model: DbTable +) -> CheckResult: + if value != "": + val = db.query(model).get(value) + if val is None: + result = Result( + ResultType.ERROR, + f"Row {row_num}: Not found in db {model.__tablename__}={value}", + ) + return result + return Result() def validate_document_row( + db: Session, context: IngestContext, row: DocumentIngestRow, taxonomy: Taxonomy, @@ -15,13 +40,34 @@ def validate_document_row( :param [DocumentIngestRow] row: DocumentIngestRow object from the current CSV row. :param [Taxonomy] taxonomy: the Taxonomy against which metadata should be validated. """ + + errors = [] + n = row.row_number + result = _check_value_in_db(n, db, row.document_type, FamilyDocumentType) + if result.type != ResultType.OK: + errors.append(result) + + result = _check_value_in_db(n, db, row.document_role, FamilyDocumentRole) + if result.type != ResultType.OK: + errors.append(result) + + result = _check_value_in_db(n, db, row.document_variant, Variant) + if result.type != ResultType.OK: + errors.append(result) + result, _ = build_metadata(taxonomy, row) - context.results.append(result) + if result.type != ResultType.OK: + errors.append(result) + + if len(errors) > 0: + context.results += errors + else: + context.results.append(Result()) def validate_event_row(context: IngestContext, row: EventIngestRow) -> None: """ - Validate the consituent elements that represent this event row. + Validate the constituent elements that represent this event row. :param [IngestContext] context: The ingest context. :param [DocumentIngestRow] row: DocumentIngestRow object from the current CSV row. diff --git a/app/data_migrations/__init__.py b/app/data_migrations/__init__.py index 64164f0b..ba672600 100644 --- a/app/data_migrations/__init__.py +++ b/app/data_migrations/__init__.py @@ -2,6 +2,7 @@ from .populate_category import populate_category from .populate_document_type import populate_document_type from .populate_document_role import populate_document_role +from .populate_document_variant import populate_document_variant from .populate_event_type import populate_event_type from .populate_framework import populate_framework from .populate_geo_statistics import populate_geo_statistics diff --git a/app/data_migrations/data/document_variant_data.json b/app/data_migrations/data/document_variant_data.json new file mode 100644 index 00000000..a358cd3e --- /dev/null +++ b/app/data_migrations/data/document_variant_data.json @@ -0,0 +1,10 @@ +[ + { + "variant_name": "Original Language", + "description": "Original Language" + }, + { + "variant_name": "Translation", + "description": "Translation" + } +] diff --git a/app/data_migrations/populate_document_variant.py b/app/data_migrations/populate_document_variant.py new file mode 100644 index 00000000..3c6bfcea --- /dev/null +++ b/app/data_migrations/populate_document_variant.py @@ -0,0 +1,19 @@ +import json + +from sqlalchemy.orm import Session + +from app.db.models.law_policy.family import Variant +from .utils import has_rows, load_list + + +def populate_document_variant(db: Session) -> None: + """Populates the document_type table with pre-defined data.""" + + if has_rows(db, Variant): + return + + with open( + "app/data_migrations/data/document_variant_data.json" + ) as document_variant_file: + document_variant_data = json.load(document_variant_file) + load_list(db, Variant, document_variant_data) diff --git a/app/db/crud/document.py b/app/db/crud/document.py index 8f60ea22..f8c3bc83 100644 --- a/app/db/crud/document.py +++ b/app/db/crud/document.py @@ -83,7 +83,7 @@ def get_family_document_and_context( published_date=family.published_date, last_updated_date=family.last_updated_date, ) - document = FamilyDocumentResponse( + response = FamilyDocumentResponse( import_id=document.import_id, variant=document.variant_name, slug=_get_slug_for_family_document_import_id(db, document.import_id), @@ -94,9 +94,10 @@ def get_family_document_and_context( content_type=physical_document.content_type, language=_get_language_for_phys_doc(db, physical_document.id), document_type=document.document_type, + document_role=document.document_role, ) - return FamilyDocumentWithContextResponse(family=family, document=document) + return FamilyDocumentWithContextResponse(family=family, document=response) def _get_language_for_phys_doc(db: Session, physical_document_id: str) -> str: @@ -123,9 +124,9 @@ def get_family_and_documents( db_objects = ( db.query(Family, Geography, FamilyMetadata, FamilyOrganisation, Organisation) .filter(Family.import_id == import_id) - .filter(Family.geography_id == Geography.id) - .filter(import_id == FamilyMetadata.family_import_id) - .filter(import_id == FamilyOrganisation.family_import_id) + .join(Geography, Family.geography_id == Geography.id) + .join(FamilyMetadata, import_id == FamilyMetadata.family_import_id) + .join(FamilyOrganisation, import_id == FamilyOrganisation.family_import_id) .filter(FamilyOrganisation.organisation_id == Organisation.id) ).one_or_none() @@ -197,9 +198,9 @@ def _get_collections_for_family_import_id( families=[ LinkableFamily(slug=data[0], title=data[1], description=data[2]) for data in db.query(Slug.name, Family.title, Family.description) - .filter(Slug.family_import_id == Family.import_id) - .filter(CollectionFamily.family_import_id == Family.import_id) .filter(CollectionFamily.collection_import_id == c.import_id) + .filter(CollectionFamily.family_import_id == Family.import_id) + .join(Slug, Slug.family_import_id == Family.import_id) .all() ], ) @@ -246,6 +247,7 @@ def _get_documents_for_family_import_id( content_type=pd.content_type, language=_get_language_for_phys_doc(db, pd.id), document_type=d.document_type, + document_role=d.document_role, ) for d, pd in db_documents ] diff --git a/app/db/models/law_policy/family.py b/app/db/models/law_policy/family.py index d767c877..4e1cf2af 100644 --- a/app/db/models/law_policy/family.py +++ b/app/db/models/law_policy/family.py @@ -146,7 +146,7 @@ class FamilyDocument(Base): ) import_id = sa.Column(sa.Text, primary_key=True) - variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=False) + variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=True) document_status = sa.Column( sa.Enum(DocumentStatus), default=DocumentStatus.CREATED, diff --git a/app/initial_data.py b/app/initial_data.py index 3e13548f..8619ac62 100644 --- a/app/initial_data.py +++ b/app/initial_data.py @@ -16,6 +16,7 @@ populate_category, populate_document_type, populate_document_role, + populate_document_variant, populate_event_type, populate_framework, populate_geo_statistics, @@ -40,6 +41,7 @@ def run_data_migrations(db): populate_category(db) populate_document_type(db) populate_document_role(db) + populate_document_variant(db) populate_event_type(db) populate_framework(db) populate_geography(db) diff --git a/tests/core/ingestion/helpers.py b/tests/core/ingestion/helpers.py index b3e19ebc..fc7b5b45 100644 --- a/tests/core/ingestion/helpers.py +++ b/tests/core/ingestion/helpers.py @@ -4,6 +4,8 @@ from datetime import datetime from app.data_migrations import ( populate_category, + populate_document_role, + populate_document_variant, populate_document_type, populate_event_type, populate_geography, @@ -14,13 +16,13 @@ from app.db.models.deprecated.document import Document from app.db.models.law_policy.family import Slug -THREE_DOC_ROWS = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug -1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,DZA,http://place1|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1 -1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,http://place1|en;http://place2|fr,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2,DocSlug2 -1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3,DocSlug3 +THREE_DOC_ROWS = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Document variant,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug +1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,DZA,http://place1|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1 +1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,http://place1|en;http://place2|fr,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2,DocSlug2 +1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3,DocSlug3 """ -THREE_DOC_ROWS_MISSING_FIELD = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug +THREE_DOC_ROWS_MISSING_FIELD = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Document variant,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug 1001,0,Test1,FALSE,FALSE,N/A,N/A,N/A,Title1,Fam1,Summary1,,MAIN,,DZA,,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,N/A,FamSlug1 1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2 1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3 @@ -45,6 +47,7 @@ "Document Type", "Document role", "Document title", + "Document variant", "Documents", "Family name", "Family summary", @@ -131,6 +134,8 @@ def init_for_ingest(test_db: Session): populate_document_type(test_db) populate_event_type(test_db) populate_language(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) test_db.flush() test_db.add( Document( diff --git a/tests/core/ingestion/test_validate_row.py b/tests/core/ingestion/test_validate_row.py index 0ba6926c..a3129b63 100644 --- a/tests/core/ingestion/test_validate_row.py +++ b/tests/core/ingestion/test_validate_row.py @@ -15,7 +15,7 @@ def test_validate_row__good_data(test_db): _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(context=context, row=row, taxonomy=taxonomy) + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -29,7 +29,7 @@ def test_validate_row__bad_data(test_db): row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.sectors = ["fish"] - validate_document_row(context=context, row=row, taxonomy=taxonomy) + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -43,8 +43,92 @@ def test_validate_row__resolvable_data(test_db): row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.sectors = ["TranSPORtation"] - validate_document_row(context=context, row=row, taxonomy=taxonomy) + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 assert context.results[0].type == ResultType.RESOLVED + + +def test_validate_row__bad_document_type(test_db): + context = IngestContext(org_id=1, results=[]) + init_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row.document_type = "fish" + + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.ERROR + + +def test_validate_row__good_document_type(test_db): + context = IngestContext(org_id=1, results=[]) + init_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row.document_type = "Order" + + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.OK + + +def test_validate_row__bad_document_role(test_db): + context = IngestContext(org_id=1, results=[]) + init_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row.document_role = "fish" + + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.ERROR + + +def test_validate_row__good_document_role(test_db): + context = IngestContext(org_id=1, results=[]) + init_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row.document_role = "MAIN" + + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.OK + + +def test_validate_row__bad_document_variant(test_db): + context = IngestContext(org_id=1, results=[]) + init_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row.document_variant = "fish" + + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.ERROR + + +def test_validate_row__good_document_variant(test_db): + context = IngestContext(org_id=1, results=[]) + init_for_ingest(test_db) + _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) + row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row.document_variant = "Translation" + + validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + + assert context.results + assert len(context.results) == 1 + assert context.results[0].type == ResultType.OK diff --git a/tests/routes/document_helpers.py b/tests/routes/document_helpers.py new file mode 100644 index 00000000..29f672d7 --- /dev/null +++ b/tests/routes/document_helpers.py @@ -0,0 +1,148 @@ +from datetime import datetime +from app.api.api_v1.routers.admin import _start_ingest +from app.data_migrations import ( + populate_document_role, + populate_document_type, + populate_document_variant, + populate_event_type, + populate_taxonomy, +) +from app.db.models.deprecated.document import ( + Category, + Document, + DocumentType, + Framework, + Hazard, + Instrument, + Keyword, + Response, + Sector, +) +from app.db.models.deprecated.source import Source +from app.db.models.document.physical_document import Language +from app.db.models.law_policy.geography import Geography + + +ONE_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug,Document variant +1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1,Translation +""" + +ONE_EVENT_ROW = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status +1101,Legislation,1001,Title1,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.1101.0,CCLW.family.1001.0,OK +""" + +TWO_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug,Document variant +1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1, +2002,0,Test2,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title2,Fam2,Summary2,,MAIN,,GEO,http://another_somewhere|en,executive,03/03/2024|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.2.2,CCLW.family.2002.0,CPR.Collection.1,FamSlug2,DocSlug2, +""" + +TWO_DFC_ROW_ONE_LANGUAGE = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug,Document variant +1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,English,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1, +2002,0,Test2,FALSE,FALSE,N/A,Collection2,CollectionSummary2,Title2,Fam2,Summary2,,MAIN,,GEO,http://another_somewhere|en,executive,03/03/2024|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.2.2,CCLW.family.2002.0,CPR.Collection.2,FamSlug2,DocSlug2, +""" + +TWO_EVENT_ROWS = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status +1101,Legislation,1001,Title1,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.1101.0,CCLW.family.1001.0,OK +2202,Legislation,2002,Title2,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.2202.0,CCLW.family.2002.0,OK +""" + + +def setup_with_docs(test_db, mocker): + mock_s3 = mocker.patch("app.core.aws.S3Client") + + populate_taxonomy(test_db) + populate_event_type(test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) + test_db.commit() + + populate_old_documents(test_db) + + _start_ingest(test_db, mock_s3, "s3_prefix", ONE_DFC_ROW, ONE_EVENT_ROW) + test_db.commit() + + +def setup_with_two_docs( + test_db, mocker, doc_data=TWO_DFC_ROW, event_data=TWO_EVENT_ROWS +): + mock_s3 = mocker.patch("app.core.aws.S3Client") + + populate_taxonomy(test_db) + populate_event_type(test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) + test_db.commit() + + populate_old_documents(test_db) + + _start_ingest(test_db, mock_s3, "s3_prefix", doc_data, event_data) + test_db.commit() + + +def populate_old_documents(test_db): + test_db.add(Source(name="CCLW")) + test_db.add( + Geography( + display_value="geography", slug="geography", value="GEO", type="country" + ) + ) + test_db.add(DocumentType(name="doctype", description="doctype")) + test_db.add(Language(language_code="LAN", name="language")) + test_db.add(Category(name="Policy", description="Policy")) + test_db.add(Keyword(name="keyword1", description="keyword1")) + test_db.add(Keyword(name="keyword2", description="keyword2")) + test_db.add(Hazard(name="hazard1", description="hazard1")) + test_db.add(Hazard(name="hazard2", description="hazard2")) + test_db.add(Response(name="topic", description="topic")) + test_db.add(Framework(name="framework", description="framework")) + + test_db.commit() + existing_doc_import_id = "CCLW.executive.1.2" + test_db.add(Instrument(name="instrument", description="instrument", source_id=1)) + test_db.add(Sector(name="sector", description="sector", source_id=1)) + test_db.add( + Document( + publication_ts=datetime(year=2014, month=1, day=1), + name="test", + description="test description", + source_url="http://somewhere", + source_id=1, + url="", + cdn_object="", + md5_sum=None, + content_type=None, + slug="geography_2014_test_1_2", + import_id=existing_doc_import_id, + geography_id=1, + type_id=1, + category_id=1, + ) + ) + test_db.commit() + test_db.add( + Document( + publication_ts=datetime(year=2014, month=1, day=1), + name="test", + description="test description", + source_url="http://another_somewhere", + source_id=1, + url="", + cdn_object="", + md5_sum=None, + content_type=None, + slug="geography_2014_test_2_2", + import_id="CCLW.executive.2.2", + geography_id=1, + type_id=1, + category_id=1, + ) + ) + test_db.commit() + + +def populate_languages(test_db): + test_db.add(Language(language_code="eng", name="English")) + test_db.add(Language(language_code="fra", name="French")) + test_db.commit() diff --git a/tests/routes/test_admin.py b/tests/routes/test_admin.py index bb5dba1a..d58d7476 100644 --- a/tests/routes/test_admin.py +++ b/tests/routes/test_admin.py @@ -5,6 +5,11 @@ import pytest from app.api.api_v1.routers.admin import ACCOUNT_ACTIVATION_EXPIRE_MINUTES +from app.data_migrations import ( + populate_document_role, + populate_document_type, + populate_document_variant, +) from app.data_migrations.populate_taxonomy import populate_taxonomy from app.db.models.deprecated import ( Source, @@ -281,8 +286,8 @@ def test_reset_password( assert mock_send_email.call_count == 2 -ONE_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug -1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1 +ONE_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug,Document variant +1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1,Translation """ TWO_EVENT_ROWS = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status @@ -297,6 +302,9 @@ def test_validate_bulk_ingest_cclw_law_policy( test_db, ): populate_taxonomy(db=test_db) + populate_document_type(db=test_db) + populate_document_role(db=test_db) + populate_document_variant(db=test_db) test_db.commit() law_policy_csv_file = BytesIO(ONE_DFC_ROW.encode("utf8")) files = { @@ -331,6 +339,9 @@ def test_bulk_ingest_cclw_law_policy_preexisting_db_objects( mock_write_csv_to_s3 = mocker.patch("app.api.api_v1.routers.admin.write_csv_to_s3") populate_taxonomy(db=test_db) + populate_document_type(db=test_db) + populate_document_role(db=test_db) + populate_document_variant(db=test_db) test_db.add(Source(name="CCLW")) test_db.add( Geography( diff --git a/tests/routes/test_documents.py b/tests/routes/test_documents.py index 5ca17941..482385ca 100644 --- a/tests/routes/test_documents.py +++ b/tests/routes/test_documents.py @@ -1,148 +1,19 @@ -from datetime import datetime from typing import Callable, Generator - -import pytest from sqlalchemy.orm import Session - from fastapi.testclient import TestClient from pytest_mock import MockerFixture -from app.api.api_v1.routers.admin import _start_ingest -from app.data_migrations import populate_event_type, populate_taxonomy -from app.db.models.deprecated.document import ( - Category, - Document, - DocumentType, - Framework, - Hazard, - Instrument, - Keyword, - Response, - Sector, -) -from app.db.models.deprecated.source import Source -from app.db.models.document.physical_document import Language, PhysicalDocument from app.db.models.law_policy.family import Family, FamilyEvent -from app.db.models.law_policy.geography import Geography - -METADATA_COUNT = 6 - -ONE_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug -1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1 -""" - -ONE_EVENT_ROW = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status -1101,Legislation,1001,Title1,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.1101.0,CCLW.family.1001.0,OK -""" - -TWO_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug -1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1 -2002,0,Test2,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title2,Fam2,Summary2,,MAIN,,GEO,http://another_somewhere|en,executive,03/03/2024|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.2.2,CCLW.family.2002.0,CPR.Collection.1,FamSlug2,DocSlug2 -""" - -TWO_DFC_ROW_ONE_LANGUAGE = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug -1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,English,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1 -2002,0,Test2,FALSE,FALSE,N/A,Collection2,CollectionSummary2,Title2,Fam2,Summary2,,MAIN,,GEO,http://another_somewhere|en,executive,03/03/2024|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.2.2,CCLW.family.2002.0,CPR.Collection.2,FamSlug2,DocSlug2 -""" - -TWO_EVENT_ROWS = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status -1101,Legislation,1001,Title1,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.1101.0,CCLW.family.1001.0,OK -2202,Legislation,2002,Title2,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.2202.0,CCLW.family.2002.0,OK -""" - - -def setup_with_docs(test_db, mocker): - mock_s3 = mocker.patch("app.core.aws.S3Client") - - populate_taxonomy(test_db) - populate_event_type(test_db) - test_db.commit() - - populate_old_documents(test_db) - - _start_ingest(test_db, mock_s3, "s3_prefix", ONE_DFC_ROW, ONE_EVENT_ROW) - test_db.commit() - - -def setup_with_two_docs( - test_db, mocker, doc_data=TWO_DFC_ROW, event_data=TWO_EVENT_ROWS -): - mock_s3 = mocker.patch("app.core.aws.S3Client") - - populate_taxonomy(test_db) - populate_event_type(test_db) - test_db.commit() - - populate_old_documents(test_db) - - _start_ingest(test_db, mock_s3, "s3_prefix", doc_data, event_data) - test_db.commit() - - -def populate_old_documents(test_db): - test_db.add(Source(name="CCLW")) - test_db.add( - Geography( - display_value="geography", slug="geography", value="GEO", type="country" - ) - ) - test_db.add(DocumentType(name="doctype", description="doctype")) - test_db.add(Language(language_code="LAN", name="language")) - test_db.add(Category(name="Policy", description="Policy")) - test_db.add(Keyword(name="keyword1", description="keyword1")) - test_db.add(Keyword(name="keyword2", description="keyword2")) - test_db.add(Hazard(name="hazard1", description="hazard1")) - test_db.add(Hazard(name="hazard2", description="hazard2")) - test_db.add(Response(name="topic", description="topic")) - test_db.add(Framework(name="framework", description="framework")) - - test_db.commit() - existing_doc_import_id = "CCLW.executive.1.2" - test_db.add(Instrument(name="instrument", description="instrument", source_id=1)) - test_db.add(Sector(name="sector", description="sector", source_id=1)) - test_db.add( - Document( - publication_ts=datetime(year=2014, month=1, day=1), - name="test", - description="test description", - source_url="http://somewhere", - source_id=1, - url="", - cdn_object="", - md5_sum=None, - content_type=None, - slug="geography_2014_test_1_2", - import_id=existing_doc_import_id, - geography_id=1, - type_id=1, - category_id=1, - ) - ) - test_db.commit() - test_db.add( - Document( - publication_ts=datetime(year=2014, month=1, day=1), - name="test", - description="test description", - source_url="http://another_somewhere", - source_id=1, - url="", - cdn_object="", - md5_sum=None, - content_type=None, - slug="geography_2014_test_2_2", - import_id="CCLW.executive.2.2", - geography_id=1, - type_id=1, - category_id=1, - ) - ) - test_db.commit() - +from tests.routes.document_helpers import ( + TWO_DFC_ROW_ONE_LANGUAGE, + populate_languages, + setup_with_docs, + setup_with_two_docs, +) -def populate_languages(test_db): - test_db.add(Language(language_code="eng", name="English")) - test_db.add(Language(language_code="fra", name="French")) - test_db.commit() +N_METADATA_KEYS = 6 +N_FAMILY_KEYS = 14 +N_FAMILY_OVERVIEW_KEYS = 7 +N_DOCUMENT_KEYS = 11 def test_documents_family_slug_returns_not_found( @@ -161,15 +32,38 @@ def test_documents_family_slug_returns_not_found( assert response.status_code == 404 -def test_documents_family_slug_preexisting_objects( +def test_documents_family_slug_returns_correct_family( + client: TestClient, + test_db: Session, + mocker: Callable[..., Generator[MockerFixture, None, None]], +): + setup_with_two_docs(test_db, mocker) + + # Test associations + response = client.get( + "/api/v1/documents/FamSlug1?group_documents=True", + ) + + json_response = response.json() + assert response.status_code == 200 + assert json_response["import_id"] == "CCLW.family.1001.0" + + # Ensure a different family is returned + response = client.get( + "/api/v1/documents/FamSlug2?group_documents=True", + ) + + json_response = response.json() + assert response.status_code == 200 + assert json_response["import_id"] == "CCLW.family.2002.0" + + +def test_documents_family_slug_returns_correct_json( client: TestClient, test_db: Session, mocker: Callable[..., Generator[MockerFixture, None, None]], ): setup_with_two_docs(test_db, mocker) - assert test_db.query(PhysicalDocument).count() == 2 - assert test_db.query(Family).count() == 2 - assert test_db.query(FamilyEvent).count() == 2 # Test associations response = client.get( @@ -178,7 +72,7 @@ def test_documents_family_slug_preexisting_objects( json_response = response.json() assert response.status_code == 200 - assert len(json_response) == 14 + assert len(json_response) == N_FAMILY_KEYS assert json_response["organisation"] == "CCLW" assert json_response["import_id"] == "CCLW.family.1001.0" assert json_response["title"] == "Fam1" @@ -189,7 +83,7 @@ def test_documents_family_slug_preexisting_objects( assert json_response["published_date"] == "2019-12-25T00:00:00+00:00" assert json_response["last_updated_date"] == "2019-12-25T00:00:00+00:00" - assert len(json_response["metadata"]) == METADATA_COUNT + assert len(json_response["metadata"]) == N_METADATA_KEYS assert json_response["metadata"]["keyword"] == ["Energy Supply"] assert json_response["slug"] == "FamSlug1" @@ -210,17 +104,6 @@ def test_documents_family_slug_preexisting_objects( {"title": "Fam2", "slug": "FamSlug2", "description": "Summary2"}, ] - # Ensure a different family is returned - response = client.get( - "/api/v1/documents/FamSlug2?group_documents=True", - ) - json_response = response.json() - assert response.status_code == 200 - assert len(json_response) == 14 - assert json_response["organisation"] == "CCLW" - assert json_response["title"] == "Fam2" - assert json_response["import_id"] == "CCLW.family.2002.0" - def test_documents_doc_slug_returns_not_found( client: TestClient, @@ -244,9 +127,6 @@ def test_documents_doc_slug_preexisting_objects( mocker: Callable[..., Generator[MockerFixture, None, None]], ): setup_with_two_docs(test_db, mocker) - assert test_db.query(PhysicalDocument).count() == 2 - assert test_db.query(Family).count() == 2 - assert test_db.query(FamilyEvent).count() == 2 # Test associations response = client.get( @@ -258,7 +138,7 @@ def test_documents_doc_slug_preexisting_objects( family = json_response["family"] assert family - assert len(family.keys()) == 7 + assert len(family.keys()) == N_FAMILY_OVERVIEW_KEYS assert family["title"] == "Fam2" assert family["import_id"] == "CCLW.family.2002.0" assert family["geography"] == "GEO" @@ -269,19 +149,20 @@ def test_documents_doc_slug_preexisting_objects( doc = json_response["document"] assert doc - assert len(doc) == 10 + assert len(doc) == N_DOCUMENT_KEYS assert doc["import_id"] == "CCLW.executive.2.2" - assert doc["variant"] == "MAIN" + assert doc["variant"] is None assert doc["slug"] == "DocSlug2" assert doc["title"] == "Title2" assert doc["md5_sum"] is None assert doc["cdn_object"] == "https://cdn.climatepolicyradar.org/" + assert doc["content_type"] is None assert doc["source_url"] == "http://another_somewhere" assert doc["language"] == "" assert doc["document_type"] == "Order" + assert doc["document_role"] == "MAIN" -@pytest.mark.languages def test_physical_doc_languages( client: TestClient, test_db: Session, @@ -290,11 +171,6 @@ def test_physical_doc_languages( populate_languages(test_db) setup_with_two_docs(test_db, mocker, doc_data=TWO_DFC_ROW_ONE_LANGUAGE) - assert test_db.query(Language).count() == 3 - assert test_db.query(PhysicalDocument).count() == 2 - assert test_db.query(Family).count() == 2 - assert test_db.query(FamilyEvent).count() == 2 - response = client.get( "/api/v1/documents/DocSlug1?group_documents=True", )