diff --git a/alembic/versions/0016_add_unfccc.py b/alembic/versions/0016_add_unfccc.py
new file mode 100644
index 00000000..273258ce
--- /dev/null
+++ b/alembic/versions/0016_add_unfccc.py
@@ -0,0 +1,30 @@
+"""add the UNFCCC value to the familycategory enum
+
+Revision ID: 0016
+Revises: 0015
+Create Date: Manually see commit
+
+"""
+from alembic import op
+from alembic.op import execute
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '0016'
+down_revision = '0015'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    execute("ALTER TYPE familycategory ADD VALUE 'UNFCCC'")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    # Downgrading is not that simple, see: https://stackoverflow.com/a/56777227
+    raise ValueError("We really don't want to be here")
+    # ### end Alembic commands ###
diff --git a/app/api/api_v1/routers/admin.py b/app/api/api_v1/routers/admin.py
index 966fc3fc..c5b1ae66 100644
--- a/app/api/api_v1/routers/admin.py
+++ b/app/api/api_v1/routers/admin.py
@@ -1,402 +1,26 @@
 import logging
-from typing import Union
-from sqlalchemy.orm import Session
-from app.core.aws import S3Client
 from fastapi import (
     APIRouter,
-    BackgroundTasks,
     Depends,
     HTTPException,
     Request,
-    UploadFile,
     status,
 )
 from sqlalchemy import update
-from app.core.aws import S3Document
 from app.api.api_v1.schemas.document import (
-    BulkIngestResult,
     DocumentUpdateRequest,
 )
 from app.core.auth import get_superuser_details
-from app.core.aws import get_s3_client
-from app.core.ingestion.ingest_row import DocumentIngestRow, EventIngestRow
-from app.core.ingestion.pipeline import generate_pipeline_ingest_input
-from app.core.ingestion.processor import (
-    initialise_context,
-    get_dfc_ingestor,
-    get_dfc_validator,
-    get_event_ingestor,
-)
-from app.core.ingestion.reader import get_file_contents, read
-from app.core.ingestion.utils import (
-    IngestContext,
-    Result,
-    ResultType,
-    ValidationResult,
-    get_result_counts,
-)
-from app.core.ingestion.validator import validate_event_row
 from app.core.validation import IMPORT_ID_MATCHER
-from app.core.validation.types import ImportSchemaMismatchError
-from app.core.validation.util import (
-    get_new_s3_prefix,
-    write_csv_to_s3,
-    write_documents_to_s3,
-    write_ingest_results_to_s3,
-)
 from app.db.models.document.physical_document import PhysicalDocument
 from app.db.models.law_policy.family import FamilyDocument, Slug
 from app.db.session import get_db
 _LOGGER = logging.getLogger(__name__)
-admin_users_router = r = APIRouter()
-
-# TODO: revisit activation timeout
-ACCOUNT_ACTIVATION_EXPIRE_MINUTES = 4 * 7 * 24 * 60  # 4 weeks
-
-
-def _start_ingest(
-    db: Session,
-    s3_client: S3Client,
-    s3_prefix: str,
-    documents_file_contents: str,
-    events_file_contents: str,
-):
-    context = None
-    # TODO: add a way for a user to monitor progress of the ingest
-    try:
-        context = initialise_context(db)
-        document_ingestor = get_dfc_ingestor(db)
-        read(documents_file_contents, context, DocumentIngestRow, document_ingestor)
-        event_ingestor = get_event_ingestor(db)
-        read(events_file_contents, context, EventIngestRow, event_ingestor)
-    except Exception as e:
-        # This is a background task, so do not raise
-        _LOGGER.exception(
-            "Unexpected error on ingest", extra={"props": {"errors": str(e)}}
-        )
-
-    try:
-        if context is not None:
-            write_ingest_results_to_s3(
-                s3_client=s3_client,
-
s3_prefix=s3_prefix, - results=context.results, - ) - except Exception as e: - _LOGGER.exception( - "Unexpected error writing ingest results to s3", - extra={"props": {"errors": str(e)}}, - ) - - try: - pipeline_ingest_input = generate_pipeline_ingest_input(db) - write_documents_to_s3( - s3_client=s3_client, - s3_prefix=s3_prefix, - documents=pipeline_ingest_input, - ) - except Exception as e: - _LOGGER.exception( - "Unexpected error writing pipeline input document to s3", - extra={"props": {"errors": str(e)}}, - ) - - -@r.post( - "/bulk-ingest/validate/cclw/law-policy", - response_model=ValidationResult, - status_code=status.HTTP_200_OK, -) -def validate_law_policy( - request: Request, - law_policy_csv: UploadFile, - db=Depends(get_db), - current_user=Depends(get_superuser_details), -): - """ - Validates the provided CSV into the document / family / collection schema. - - :param [Request] request: Incoming request (UNUSED). - :param [UploadFile] law_policy_csv: CSV file to ingest. - :param [Session] db: Database connection. - Defaults to Depends(get_db). - :param [JWTUser] current_user: Current user. - Defaults to Depends(get_current_active_superuser). - :return [str]: A path to an s3 object containing document updates to be processed - by the ingest pipeline. - :raises HTTPException: The following HTTPExceptions are raised on errors: - 400 If the provided CSV file fails schema validation - 422 On failed validation on the input CSV (results included) - 500 On an unexpected error - """ - - _LOGGER.info( - f"Superuser '{current_user.email}' triggered Bulk Document Validation for " - "CCLW Law & Policy data" - ) - - try: - context = initialise_context(db) - except Exception as e: - _LOGGER.exception( - "Failed to create ingest context", extra={"props": {"errors": str(e)}} - ) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e - - all_results = [] - - try: - _, message = _validate_law_policy_csv(law_policy_csv, db, context, all_results) - except ImportSchemaMismatchError as e: - _LOGGER.exception( - "Provided CSV failed law & policy schema validation", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e - except Exception as e: - _LOGGER.exception( - "Unexpected error, validating law & policy CSV on ingest", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e - - # Intended output for this is the console - so for now just format it up for that. - errors = [r for r in all_results if r.type == ResultType.ERROR] - return ValidationResult(message=message, errors=errors) - - -@r.post( - "/bulk-ingest/cclw/law-policy", - response_model=BulkIngestResult, - status_code=status.HTTP_202_ACCEPTED, -) -def ingest_law_policy( - request: Request, - law_policy_csv: UploadFile, - events_csv: UploadFile, - background_tasks: BackgroundTasks, - db=Depends(get_db), - current_user=Depends(get_superuser_details), - s3_client=Depends(get_s3_client), -): - """ - Ingest the provided CSV into the document / family / collection schema. - - :param [Request] request: Incoming request (UNUSED). - :param [UploadFile] law_policy_csv: CSV file containing documents to ingest. - :param [UploadFile] events_csv: CSV file containing events to ingest. - :param [BackgroundTasks] background_tasks: Tasks API to start ingest task. - :param [Session] db: Database connection. - Defaults to Depends(get_db). - :param [JWTUser] current_user: Current user. 
- Defaults to Depends(get_current_active_superuser). - :param [S3Client] s3_client: S3 connection. - Defaults to Depends(get_s3_client). - :return [str]: A path to an s3 object containing document updates to be processed - by the ingest pipeline. - :raises HTTPException: The following HTTPExceptions are raised on errors: - 400 If the provided CSV file fails schema validation - 422 On failed validation on the input CSV (results included) - 500 On an unexpected error - """ - # TODO: Combine with event import? refactor out shared structure? - - _LOGGER.info( - f"Superuser '{current_user.email}' triggered Bulk Document Ingest for " - "CCLW Law & Policy data" - ) - - try: - context = initialise_context(db) - except Exception as e: - _LOGGER.exception( - "Failed to create ingest context", extra={"props": {"errors": str(e)}} - ) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e - - all_results = [] - - # PHASE 1 - Validate - try: - documents_file_contents, _ = _validate_law_policy_csv( - law_policy_csv, db, context, all_results - ) - except ImportSchemaMismatchError as e: - _LOGGER.exception( - "Provided CSV failed law & policy schema validation", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e - except Exception as e: - _LOGGER.exception( - "Unexpected error, validating law & policy CSV on ingest", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e - - try: - events_file_contents = get_file_contents(events_csv) - read(events_file_contents, context, EventIngestRow, validate_event_row) - rows, fails, resolved = get_result_counts(context.results) - all_results.extend(context.results) - context.results = all_results - - _LOGGER.info( - f"Events validation result: {rows} Rows, {fails} Failures, " - f"{resolved} Resolved" - ) - except ImportSchemaMismatchError as e: - _LOGGER.exception( - "Provided CSV failed events schema validation", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e - except Exception as e: - _LOGGER.exception( - "Unexpected error, validating events CSV on ingest", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e - - # If we have any validation errors then raise - validation_errors = [r for r in context.results if r.type == ResultType.ERROR] - if validation_errors: - _LOGGER.error( - "Ingest failed validation (results attached)", - extra={"errors": validation_errors}, - ) - error_details = [e.details for e in validation_errors] - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=error_details - ) - - # PHASE 2 - Validation completed without errors, so store the ingest files. 
This - # will let us investigate errors later - s3_prefix = get_new_s3_prefix() - try: - result_documents: Union[S3Document, bool] = write_csv_to_s3( - s3_client=s3_client, - s3_prefix=s3_prefix, - s3_content_label="documents", - file_contents=documents_file_contents, - ) - result_events: Union[S3Document, bool] = write_csv_to_s3( - s3_client=s3_client, - s3_prefix=s3_prefix, - s3_content_label="events", - file_contents=events_file_contents, - ) - if ( - type(result_documents) is bool - ): # S3Client returns False if the object was not created - _LOGGER.error( - "Write Bulk Document Ingest CSV to S3 Failed.", - extra={ - "props": { - "superuser_email": current_user.email, - } - }, - ) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Unexpected error, fail to write Bulk Document Ingest CSV to S3", - ) - if ( - type(result_events) is bool - ): # S3Client returns False if the object was not created - _LOGGER.error( - "Write Bulk Event Ingest CSV to S3 Failed.", - extra={ - "props": { - "superuser_email": current_user.email, - } - }, - ) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Unexpected error, fail to write Bulk Event Ingest CSV to S3", - ) - else: - documents_csv_s3_location = str(result_documents.url) - events_csv_s3_location = str(result_events.url) - _LOGGER.info( - "Write Event Ingest CSV complete.", - extra={ - "props": { - "superuser_email": current_user.email, - "documents_csv_s3_location": documents_csv_s3_location, - "events_csv_s3_location": events_csv_s3_location, - } - }, - ) - except Exception as e: - _LOGGER.exception( - "Unexpected error, writing Bulk Document Ingest CSV content to S3", - extra={"props": {"errors": str(e)}}, - ) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e - - # PHASE 3 - Start the ingest (kick off background task to do the actual ingest) - background_tasks.add_task( - _start_ingest, - db, - s3_client, - s3_prefix, - documents_file_contents, - events_file_contents, - ) - - _LOGGER.info( - "Background Bulk Document/Event Ingest Task added", - extra={ - "props": { - "superuser_email": current_user.email, - "documents_csv_s3_location": documents_csv_s3_location, - "events_csv_s3_location": events_csv_s3_location, - } - }, - ) - - # TODO: Add some way the caller can monitor processing pipeline... - return BulkIngestResult( - import_s3_prefix=s3_prefix, - detail=None, # TODO: add detail? 
- ) - - -def _validate_law_policy_csv( - law_policy_csv: UploadFile, - db: Session, - context: IngestContext, - all_results: list[Result], -) -> tuple[str, str]: - """ - Validates the csv file - - :param UploadFile law_policy_csv: incoming file to validate - :param Session db: connection to the database - :param IngestContext context: the ingest context - :param list[Result] all_results: the results - :return tuple[str, str]: the file contents of the csv and the summary message - """ - documents_file_contents = get_file_contents(law_policy_csv) - validator = get_dfc_validator(db, context) - read(documents_file_contents, context, DocumentIngestRow, validator) - rows, fails, resolved = get_result_counts(context.results) - all_results.extend(context.results) - context.results = [] - message = ( - f"Law & Policy validation result: {rows} Rows, {fails} Failures, " - f"{resolved} Resolved" - ) - - _LOGGER.info(message) - - return documents_file_contents, message +admin_document_router = r = APIRouter() @r.put("/documents/{import_id_or_slug}", status_code=status.HTTP_200_OK) diff --git a/app/api/api_v1/routers/cclw_ingest.py b/app/api/api_v1/routers/cclw_ingest.py new file mode 100644 index 00000000..cb46dafe --- /dev/null +++ b/app/api/api_v1/routers/cclw_ingest.py @@ -0,0 +1,484 @@ +import logging +from typing import Union +from sqlalchemy.orm import Session +from app.core.aws import S3Client + +from fastapi import ( + APIRouter, + BackgroundTasks, + Depends, + HTTPException, + Request, + UploadFile, + status, +) +from sqlalchemy import update +from app.core.aws import S3Document + +from app.api.api_v1.schemas.document import ( + BulkIngestResult, + DocumentUpdateRequest, +) +from app.core.auth import get_superuser_details +from app.core.aws import get_s3_client +from app.core.ingestion.cclw.ingest_row_cclw import ( + CCLWDocumentIngestRow, + EventIngestRow, +) +from app.core.ingestion.cclw.pipeline import generate_pipeline_ingest_input +from app.core.ingestion.processor import ( + initialise_context, + get_document_ingestor, + get_document_validator, + get_event_ingestor, +) +from app.core.ingestion.cclw.reader import get_file_contents, read +from app.core.ingestion.utils import ( + IngestContext, + Result, + ResultType, + ValidationResult, + get_result_counts, +) +from app.core.ingestion.validator import validate_event_row +from app.core.validation import IMPORT_ID_MATCHER +from app.core.validation.types import ImportSchemaMismatchError +from app.core.validation.util import ( + get_new_s3_prefix, + write_csv_to_s3, + write_documents_to_s3, + write_ingest_results_to_s3, +) +from app.db.models.document.physical_document import PhysicalDocument +from app.db.models.law_policy.family import FamilyDocument, Slug +from app.db.session import get_db + +_LOGGER = logging.getLogger(__name__) + +cclw_ingest_router = r = APIRouter() + + +def _start_ingest( + db: Session, + s3_client: S3Client, + s3_prefix: str, + documents_file_contents: str, + events_file_contents: str, +): + context = None + # TODO: add a way for a user to monitor progress of the ingest + try: + context = initialise_context(db, "CCLW") + document_ingestor = get_document_ingestor(db, context) + read(documents_file_contents, context, CCLWDocumentIngestRow, document_ingestor) + event_ingestor = get_event_ingestor(db) + read(events_file_contents, context, EventIngestRow, event_ingestor) + except Exception as e: + # This is a background task, so do not raise + _LOGGER.exception( + "Unexpected error on ingest", extra={"props": {"errors": 
str(e)}} + ) + + try: + if context is not None: + write_ingest_results_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + results=context.results, + ) + except Exception as e: + _LOGGER.exception( + "Unexpected error writing ingest results to s3", + extra={"props": {"errors": str(e)}}, + ) + + try: + pipeline_ingest_input = generate_pipeline_ingest_input(db) + write_documents_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + documents=pipeline_ingest_input, + ) + except Exception as e: + _LOGGER.exception( + "Unexpected error writing pipeline input document to s3", + extra={"props": {"errors": str(e)}}, + ) + + +@r.post( + "/bulk-ingest/validate/cclw", + response_model=ValidationResult, + status_code=status.HTTP_200_OK, +) +def validate_law_policy( + request: Request, + law_policy_csv: UploadFile, + db=Depends(get_db), + current_user=Depends(get_superuser_details), +): + """ + Validates the provided CSV into the document / family / collection schema. + + :param [Request] request: Incoming request (UNUSED). + :param [UploadFile] law_policy_csv: CSV file to ingest. + :param [Session] db: Database connection. + Defaults to Depends(get_db). + :param [JWTUser] current_user: Current user. + Defaults to Depends(get_current_active_superuser). + :return [str]: A path to an s3 object containing document updates to be processed + by the ingest pipeline. + :raises HTTPException: The following HTTPExceptions are raised on errors: + 400 If the provided CSV file fails schema validation + 422 On failed validation on the input CSV (results included) + 500 On an unexpected error + """ + + _LOGGER.info( + f"Superuser '{current_user.email}' triggered Bulk Document Validation for " + "CCLW Law & Policy data" + ) + + try: + context = initialise_context(db, "CCLW") + except Exception as e: + _LOGGER.exception( + "Failed to create ingest context", extra={"props": {"errors": str(e)}} + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + all_results = [] + + try: + _, message = _validate_cclw_csv(law_policy_csv, db, context, all_results) + except ImportSchemaMismatchError as e: + _LOGGER.exception( + "Provided CSV failed law & policy schema validation", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e + except Exception as e: + _LOGGER.exception( + "Unexpected error, validating law & policy CSV on ingest", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + # Intended output for this is the console - so for now just format it up for that. + errors = [r for r in all_results if r.type == ResultType.ERROR] + return ValidationResult(message=message, errors=errors) + + +@r.post( + "/bulk-ingest/cclw", + response_model=BulkIngestResult, + status_code=status.HTTP_202_ACCEPTED, +) +def ingest_law_policy( + request: Request, + law_policy_csv: UploadFile, + events_csv: UploadFile, + background_tasks: BackgroundTasks, + db=Depends(get_db), + current_user=Depends(get_superuser_details), + s3_client=Depends(get_s3_client), +): + """ + Ingest the provided CSV into the document / family / collection schema. + + :param [Request] request: Incoming request (UNUSED). + :param [UploadFile] law_policy_csv: CSV file containing documents to ingest. + :param [UploadFile] events_csv: CSV file containing events to ingest. + :param [BackgroundTasks] background_tasks: Tasks API to start ingest task. + :param [Session] db: Database connection. 
+ Defaults to Depends(get_db). + :param [JWTUser] current_user: Current user. + Defaults to Depends(get_current_active_superuser). + :param [S3Client] s3_client: S3 connection. + Defaults to Depends(get_s3_client). + :return [str]: A path to an s3 object containing document updates to be processed + by the ingest pipeline. + :raises HTTPException: The following HTTPExceptions are raised on errors: + 400 If the provided CSV file fails schema validation + 422 On failed validation on the input CSV (results included) + 500 On an unexpected error + """ + # TODO: Combine with event import? refactor out shared structure? + + _LOGGER.info( + f"Superuser '{current_user.email}' triggered Bulk Document Ingest for " + "CCLW Law & Policy data" + ) + + try: + context = initialise_context(db, "CCLW") + except Exception as e: + _LOGGER.exception( + "Failed to create ingest context", extra={"props": {"errors": str(e)}} + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + all_results = [] + + # PHASE 1 - Validate + try: + documents_file_contents, _ = _validate_cclw_csv( + law_policy_csv, db, context, all_results + ) + except ImportSchemaMismatchError as e: + _LOGGER.exception( + "Provided CSV failed law & policy schema validation", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e + except Exception as e: + _LOGGER.exception( + "Unexpected error, validating law & policy CSV on ingest", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + try: + events_file_contents = get_file_contents(events_csv) + read(events_file_contents, context, EventIngestRow, validate_event_row) + rows, fails, resolved = get_result_counts(context.results) + all_results.extend(context.results) + context.results = all_results + + _LOGGER.info( + f"Events validation result: {rows} Rows, {fails} Failures, " + f"{resolved} Resolved" + ) + except ImportSchemaMismatchError as e: + _LOGGER.exception( + "Provided CSV failed events schema validation", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e + except Exception as e: + _LOGGER.exception( + "Unexpected error, validating events CSV on ingest", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + # If we have any validation errors then raise + validation_errors = [r for r in context.results if r.type == ResultType.ERROR] + if validation_errors: + _LOGGER.error( + "Ingest failed validation (results attached)", + extra={"errors": validation_errors}, + ) + error_details = [e.details for e in validation_errors] + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=error_details + ) + + # PHASE 2 - Validation completed without errors, so store the ingest files. 
This + # will let us investigate errors later + s3_prefix = get_new_s3_prefix() + try: + result_documents: Union[S3Document, bool] = write_csv_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + s3_content_label="documents", + file_contents=documents_file_contents, + ) + result_events: Union[S3Document, bool] = write_csv_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + s3_content_label="events", + file_contents=events_file_contents, + ) + if ( + type(result_documents) is bool + ): # S3Client returns False if the object was not created + _LOGGER.error( + "Write Bulk Document Ingest CSV to S3 Failed.", + extra={ + "props": { + "superuser_email": current_user.email, + } + }, + ) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Unexpected error, fail to write Bulk Document Ingest CSV to S3", + ) + if ( + type(result_events) is bool + ): # S3Client returns False if the object was not created + _LOGGER.error( + "Write Bulk Event Ingest CSV to S3 Failed.", + extra={ + "props": { + "superuser_email": current_user.email, + } + }, + ) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Unexpected error, fail to write Bulk Event Ingest CSV to S3", + ) + else: + documents_csv_s3_location = str(result_documents.url) + events_csv_s3_location = str(result_events.url) + _LOGGER.info( + "Write Event CCLW Ingest CSV complete.", + extra={ + "props": { + "superuser_email": current_user.email, + "documents_csv_s3_location": documents_csv_s3_location, + "events_csv_s3_location": events_csv_s3_location, + } + }, + ) + except Exception as e: + _LOGGER.exception( + "Unexpected error, writing Bulk Document Ingest CSV content to S3", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + # PHASE 3 - Start the ingest (kick off background task to do the actual ingest) + background_tasks.add_task( + _start_ingest, + db, + s3_client, + s3_prefix, + documents_file_contents, + events_file_contents, + ) + + _LOGGER.info( + "Background Bulk Document/Event Ingest Task added", + extra={ + "props": { + "superuser_email": current_user.email, + "documents_csv_s3_location": documents_csv_s3_location, + "events_csv_s3_location": events_csv_s3_location, + } + }, + ) + + # TODO: Add some way the caller can monitor processing pipeline... + return BulkIngestResult( + import_s3_prefix=s3_prefix, + detail=None, # TODO: add detail? 
+ ) + + +def _validate_cclw_csv( + law_policy_csv: UploadFile, + db: Session, + context: IngestContext, + all_results: list[Result], +) -> tuple[str, str]: + """ + Validates the csv file + + :param UploadFile law_policy_csv: incoming file to validate + :param Session db: connection to the database + :param IngestContext context: the ingest context + :param list[Result] all_results: the results + :return tuple[str, str]: the file contents of the csv and the summary message + """ + documents_file_contents = get_file_contents(law_policy_csv) + validator = get_document_validator(db, context) + read(documents_file_contents, context, CCLWDocumentIngestRow, validator) + rows, fails, resolved = get_result_counts(context.results) + all_results.extend(context.results) + context.results = [] + message = ( + f"Law & Policy validation result: {rows} Rows, {fails} Failures, " + f"{resolved} Resolved" + ) + + _LOGGER.info(message) + + return documents_file_contents, message + + +@r.put("/documents/{import_id_or_slug}", status_code=status.HTTP_200_OK) +async def update_document( + request: Request, + import_id_or_slug: str, + meta_data: DocumentUpdateRequest, + db=Depends(get_db), + current_user=Depends(get_superuser_details), +): + # TODO: As this grows move it out into the crud later. + + _LOGGER.info( + f"Superuser '{current_user.email}' called update_document", + extra={ + "props": { + "superuser_email": current_user.email, + "import_id_or_slug": import_id_or_slug, + "meta_data": meta_data, + } + }, + ) + + # First query the FamilyDocument + query = db.query(FamilyDocument) + if IMPORT_ID_MATCHER.match(import_id_or_slug) is not None: + family_document = query.filter( + FamilyDocument.import_id == import_id_or_slug + ).one_or_none() + _LOGGER.info("update_document called with import_id") + else: + family_document = ( + query.join(Slug, Slug.family_document_import_id == FamilyDocument.import_id) + .filter(Slug.name == import_id_or_slug) + .one_or_none() + ) + _LOGGER.info("update_document called with slug") + + # Check we have found one + if family_document is None: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + ) + + # Get the physical document to update + physical_document = family_document.physical_document + + # Note this code relies on the fields being the same as the db column names + num_changed = db.execute( + update(PhysicalDocument) + .values(meta_data.dict()) + .where(PhysicalDocument.id == physical_document.id) + ).rowcount + + if num_changed == 0: + _LOGGER.info("update_document complete - nothing changed") + return physical_document # Nothing to do - as should be idempotent + + if num_changed > 1: + # This should never happen due to table uniqueness constraints + # TODO Rollback + raise HTTPException( + detail=( + f"There was more than one document identified by {import_id_or_slug}. " + "This should not happen!!!" 
+ ), + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + db.commit() + db.refresh(physical_document) + _LOGGER.info( + "Call to update_document complete", + extra={ + "props": { + "superuser_email": current_user.email, + "num_changed": num_changed, + "import_id": family_document.import_id, + "md5_sum": physical_document.md5_sum, + "content_type": physical_document.content_type, + "cdn_object": physical_document.cdn_object, + } + }, + ) + return physical_document diff --git a/app/api/api_v1/routers/unfccc_ingest.py b/app/api/api_v1/routers/unfccc_ingest.py new file mode 100644 index 00000000..f4e4bdab --- /dev/null +++ b/app/api/api_v1/routers/unfccc_ingest.py @@ -0,0 +1,430 @@ +import logging +from typing import Union, cast +from sqlalchemy.orm import Session +from app.core.aws import S3Client + +from fastapi import ( + APIRouter, + BackgroundTasks, + Depends, + HTTPException, + Request, + UploadFile, + status, +) +from app.core.aws import S3Document + +from app.api.api_v1.schemas.document import ( + BulkIngestResult, +) +from app.core.auth import get_superuser_details +from app.core.aws import get_s3_client +from app.core.ingestion.unfccc.ingest_row_unfccc import ( + CollectonIngestRow, + UNFCCCDocumentIngestRow, +) + +from app.core.ingestion.unfccc.pipeline import generate_pipeline_ingest_input +from app.core.ingestion.processor import ( + get_collection_ingestor, + initialise_context, + get_document_ingestor, + get_document_validator, +) +from app.core.ingestion.unfccc.reader import get_file_contents, read +from app.core.ingestion.utils import ( + IngestContext, + Result, + ResultType, + UNFCCCIngestContext, +) +from app.core.ingestion.utils import ( + ValidationResult, + get_result_counts, +) +from app.core.validation.types import ImportSchemaMismatchError +from app.core.validation.util import ( + get_new_s3_prefix, + write_csv_to_s3, + write_documents_to_s3, + write_ingest_results_to_s3, +) +from app.db.session import get_db + +_LOGGER = logging.getLogger(__name__) + +unfccc_ingest_router = r = APIRouter() + + +def _start_ingest( + db: Session, + s3_client: S3Client, + s3_prefix: str, + documents_file_contents: str, + collection_file_contents: str, +): + context = None + # TODO: add a way for a user to monitor progress of the ingest + try: + context = initialise_context(db, "UNFCCC") + # First the collections.... 
+ collection_ingestor = get_collection_ingestor(db) + read(collection_file_contents, context, CollectonIngestRow, collection_ingestor) + + # FIXME: Write a unfccc ingestor + document_ingestor = get_document_ingestor(db, context) + read( + documents_file_contents, context, UNFCCCDocumentIngestRow, document_ingestor + ) + except Exception as e: + # This is a background task, so do not raise + _LOGGER.exception( + "Unexpected error on ingest", extra={"props": {"errors": str(e)}} + ) + + try: + if context is not None: + write_ingest_results_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + results=context.results, + ) + except Exception as e: + _LOGGER.exception( + "Unexpected error writing ingest results to s3", + extra={"props": {"errors": str(e)}}, + ) + + try: + pipeline_ingest_input = generate_pipeline_ingest_input(db) + write_documents_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + documents=pipeline_ingest_input, + ) + except Exception as e: + _LOGGER.exception( + "Unexpected error writing pipeline input document to s3", + extra={"props": {"errors": str(e)}}, + ) + + +@r.post( + "/bulk-ingest/validate/unfccc", + response_model=ValidationResult, + status_code=status.HTTP_200_OK, +) +def validate_unfccc_law_policy( + request: Request, + unfccc_data_csv: UploadFile, + collection_csv: UploadFile, + db=Depends(get_db), + current_user=Depends(get_superuser_details), +): + """ + Validates the provided CSV into the document / family / collection schema. + + :param [Request] request: Incoming request (UNUSED). + :param [UploadFile] law_policy_csv: CSV file to ingest. + :param [Session] db: Database connection. + Defaults to Depends(get_db). + :param [JWTUser] current_user: Current user. + Defaults to Depends(get_current_active_superuser). + :return [str]: A path to an s3 object containing document updates to be processed + by the ingest pipeline. + :raises HTTPException: The following HTTPExceptions are raised on errors: + 400 If the provided CSV file fails schema validation + 422 On failed validation on the input CSV (results included) + 500 On an unexpected error + """ + + _LOGGER.info( + f"Superuser '{current_user.email}' triggered Bulk Document Validation for " + "UNFCCC Law & Policy data" + ) + + try: + context = initialise_context(db, "UNFCCC") + except Exception as e: + _LOGGER.exception( + "Failed to create ingest context", extra={"props": {"errors": str(e)}} + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + all_results = [] + + try: + _, _, message = _validate_unfccc_csv( + unfccc_data_csv, + collection_csv, + db, + cast(UNFCCCIngestContext, context), + all_results, + ) + except ImportSchemaMismatchError as e: + _LOGGER.exception( + "Provided CSV failed law & policy schema validation", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e + except Exception as e: + _LOGGER.exception( + "Unexpected error, validating law & policy CSV on ingest", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + # Intended output for this is the console - so for now just format it up for that. 
+ errors = [r for r in all_results if r.type == ResultType.ERROR] + return ValidationResult(message=message, errors=errors) + + +@r.post( + "/bulk-ingest/unfccc", + response_model=BulkIngestResult, + status_code=status.HTTP_202_ACCEPTED, +) +def ingest_unfccc_law_policy( + request: Request, + unfccc_data_csv: UploadFile, + collection_csv: UploadFile, + background_tasks: BackgroundTasks, + db=Depends(get_db), + current_user=Depends(get_superuser_details), + s3_client=Depends(get_s3_client), +): + """ + Ingest the provided CSV into the document / family / collection schema. + + :param [Request] request: Incoming request (UNUSED). + :param [UploadFile] unfccc_data_csv: CSV file containing documents to ingest. + :param [UploadFile] collection_csv: CSV file containing collection to ingest. + :param [BackgroundTasks] background_tasks: Tasks API to start ingest task. + :param [Session] db: Database connection. + Defaults to Depends(get_db). + :param [JWTUser] current_user: Current user. + Defaults to Depends(get_current_active_superuser). + :param [S3Client] s3_client: S3 connection. + Defaults to Depends(get_s3_client). + :return [str]: A path to an s3 object containing document updates to be processed + by the ingest pipeline. + :raises HTTPException: The following HTTPExceptions are raised on errors: + 400 If the provided CSV file fails schema validation + 422 On failed validation on the input CSV (results included) + 500 On an unexpected error + """ + # TODO: Combine with event import? refactor out shared structure? + + _LOGGER.info( + f"Superuser '{current_user.email}' triggered Bulk Document Ingest for " + "UNFCCC Law & Policy data" + ) + + try: + context = initialise_context(db, "UNFCCC") + except Exception as e: + _LOGGER.exception( + "Failed to create ingest context", extra={"props": {"errors": str(e)}} + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + all_results = [] + + # PHASE 1 - Validate + try: + documents_file_contents, collection_file_contents, _ = _validate_unfccc_csv( + unfccc_data_csv, + collection_csv, + db, + cast(UNFCCCIngestContext, context), + all_results, + ) + except ImportSchemaMismatchError as e: + _LOGGER.exception( + "Provided CSV failed law & policy schema validation", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) from e + except Exception as e: + _LOGGER.exception( + "Unexpected error, validating law & policy CSV on ingest", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + # If we have any validation errors then raise + validation_errors = [r for r in context.results if r.type == ResultType.ERROR] + if validation_errors: + _LOGGER.error( + "Ingest failed validation (results attached)", + extra={"errors": validation_errors}, + ) + error_details = [e.details for e in validation_errors] + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=error_details + ) + + # PHASE 2 - Validation completed without errors, so store the ingest files. 
This + # will let us investigate errors later + s3_prefix = get_new_s3_prefix() + try: + result_documents: Union[S3Document, bool] = write_csv_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + s3_content_label="documents", + file_contents=documents_file_contents, + ) + result_collections: Union[S3Document, bool] = write_csv_to_s3( + s3_client=s3_client, + s3_prefix=s3_prefix, + s3_content_label="collections", + file_contents=collection_file_contents, + ) + + if ( + type(result_documents) is bool + ): # S3Client returns False if the object was not created + _LOGGER.error( + "Write Bulk Document Ingest CSV to S3 Failed.", + extra={ + "props": { + "superuser_email": current_user.email, + } + }, + ) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Unexpected error, fail to write Bulk Document Ingest CSV to S3", + ) + if ( + type(result_collections) is bool + ): # S3Client returns False if the object was not created + _LOGGER.error( + "Write Bulk Collections Ingest CSV to S3 Failed.", + extra={ + "props": { + "superuser_email": current_user.email, + } + }, + ) + msg = "Unexpected error, fail to write Bulk Collections Ingest CSV to S3" + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=msg, + ) + else: + documents_csv_s3_location = str(result_documents.url) + collections_csv_s3_location = str(result_collections.url) + _LOGGER.info( + "Write Event UNFCCC Ingest CSV complete.", + extra={ + "props": { + "superuser_email": current_user.email, + "documents_csv_s3_location": documents_csv_s3_location, + "collections_csv_s3_location": collections_csv_s3_location, + } + }, + ) + except Exception as e: + _LOGGER.exception( + "Unexpected error, writing Bulk Document Ingest CSV content to S3", + extra={"props": {"errors": str(e)}}, + ) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) from e + + # PHASE 3 - Start the ingest (kick off background task to do the actual ingest) + background_tasks.add_task( + _start_ingest, + db, + s3_client, + s3_prefix, + documents_file_contents, + collection_file_contents, + ) + + _LOGGER.info( + "Background Bulk Document/Event Ingest Task added", + extra={ + "props": { + "superuser_email": current_user.email, + "documents_csv_s3_location": documents_csv_s3_location, + } + }, + ) + + # TODO: Add some way the caller can monitor processing pipeline... + return BulkIngestResult( + import_s3_prefix=s3_prefix, + detail=None, # TODO: add detail? 
+ ) + + +def _validate_unfccc_csv( + unfccc_data_csv: UploadFile, + collection_csv: UploadFile, + db: Session, + context: UNFCCCIngestContext, + all_results: list[Result], +) -> tuple[str, str, str]: + """ + Validates the csv file + + :param UploadFile law_policy_csv: incoming file to validate + :param Session db: connection to the database + :param IngestContext context: the ingest context + :param list[Result] all_results: the results + :return tuple[str, str]: the file contents of the csv and the summary message + """ + + # First read all the ids in the collection_csv + def collate_ids(context: IngestContext, row: CollectonIngestRow) -> None: + ctx = cast(UNFCCCIngestContext, context) + ctx.collection_ids_defined.append(row.cpr_collection_id) + + collection_file_contents = get_file_contents(collection_csv) + read(collection_file_contents, context, CollectonIngestRow, collate_ids) + + # Now do the validation of the documents + documents_file_contents = get_file_contents(unfccc_data_csv) + validator = get_document_validator(db, context) + read(documents_file_contents, context, UNFCCCDocumentIngestRow, validator) + # Get the rows here as this is the length of results + rows = len(context.results) + + # Check the set of defined collections against those referenced + defined = set(context.collection_ids_defined) + referenced = set(context.collection_ids_referenced) + + defined_not_referenced = defined.difference(referenced) + + if len(defined_not_referenced) > 0: + context.results.append( + Result( + ResultType.ERROR, + "The following Collection IDs were " + + f"defined and not referenced: {list(defined_not_referenced)}", + ) + ) + + referenced_not_defined = referenced.difference(defined) + if len(referenced_not_defined) > 0: + context.results.append( + Result( + ResultType.ERROR, + "The following Collection IDs were " + f"referenced and not defined: {list(referenced_not_defined)}", + ) + ) + + _, fails, resolved = get_result_counts(context.results) + all_results.extend(context.results) + + context.results = [] + message = ( + f"UNFCCC validation result: {rows} Rows, {fails} Failures, " + f"{resolved} Resolved" + ) + + _LOGGER.info(message) + + return documents_file_contents, collection_file_contents, message diff --git a/app/core/ingestion/collection.py b/app/core/ingestion/cclw/collection.py similarity index 62% rename from app/core/ingestion/collection.py rename to app/core/ingestion/cclw/collection.py index 0cb2c652..a4723f61 100644 --- a/app/core/ingestion/collection.py +++ b/app/core/ingestion/cclw/collection.py @@ -1,16 +1,69 @@ from typing import Any, Optional from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.unfccc.ingest_row_unfccc import CollectonIngestRow from app.core.ingestion.utils import create, to_dict, update_if_changed from app.db.models.law_policy import Collection from app.db.models.law_policy.collection import CollectionFamily, CollectionOrganisation +def create_collection( + db: Session, + row: CollectonIngestRow, + org_id: int, + result: dict[str, Any], +) -> Optional[Collection]: + # First check for the actual collection + existing_collection = ( + db.query(Collection) + .filter(Collection.import_id == row.cpr_collection_id) + .one_or_none() + ) + + if existing_collection is None: + collection = create( + db, + Collection, + import_id=row.cpr_collection_id, + title=row.collection_name, + extra={"description": 
row.collection_summary},
+        )
+
+        collection_organisation = create(
+            db,
+            CollectionOrganisation,
+            collection_import_id=collection.import_id,
+            organisation_id=org_id,
+        )
+
+        result["collection_organisation"] = to_dict(collection_organisation)
+        result["collection"] = to_dict(collection)
+
+        return collection
+
+    if existing_collection is not None:
+        # Check it matches
+        # FIXME: also check for the collection-organisation relationship?
+        collection = (
+            db.query(Collection)
+            .filter(Collection.title == row.collection_name)
+            .filter(Collection.description == row.collection_summary)
+            .filter(Collection.import_id == row.cpr_collection_id)
+            .one_or_none()
+        )
+        if collection:
+            return collection
+
+        raise ValueError(
+            f"Collection {row.cpr_collection_id} is pre-existing, and mismatches"
+        )
+
+
 def handle_collection_from_row(
     db: Session,
-    row: DocumentIngestRow,
+    row: CCLWDocumentIngestRow,
     org_id: int,
     family_import_id: str,
     result: dict[str, Any],
diff --git a/app/core/ingestion/event.py b/app/core/ingestion/cclw/event.py
similarity index 97%
rename from app/core/ingestion/event.py
rename to app/core/ingestion/cclw/event.py
index 6bc4e800..8ba96d9f 100644
--- a/app/core/ingestion/event.py
+++ b/app/core/ingestion/cclw/event.py
@@ -5,7 +5,7 @@
 from pydantic.json import pydantic_encoder
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
-from app.core.ingestion.ingest_row import EventIngestRow
+from app.core.ingestion.cclw.ingest_row_cclw import EventIngestRow
 from app.core.ingestion.utils import get_or_create, to_dict
 from app.db.models.law_policy import FamilyEvent
diff --git a/app/core/ingestion/family.py b/app/core/ingestion/cclw/family.py
similarity index 93%
rename from app/core/ingestion/family.py
rename to app/core/ingestion/cclw/family.py
index 08ba3f96..06ee7803 100644
--- a/app/core/ingestion/family.py
+++ b/app/core/ingestion/cclw/family.py
@@ -2,10 +2,10 @@
 from sqlalchemy.orm import Session
-from app.core.ingestion.ingest_row import DocumentIngestRow
-from app.core.ingestion.metadata import add_metadata
+from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow
+from app.core.ingestion.cclw.metadata import add_metadata
 from app.core.organisation import get_organisation_taxonomy
-from app.core.ingestion.physical_document import create_physical_document_from_row
+from app.core.ingestion.cclw.physical_document import create_physical_document_from_row
 from app.core.ingestion.utils import (
     create,
     get_or_create,
@@ -26,7 +26,7 @@
 def handle_family_from_row(
     db: Session,
-    row: DocumentIngestRow,
+    row: CCLWDocumentIngestRow,
     org_id: int,
     result: dict[str, Any],
 ) -> Family:
@@ -50,7 +50,7 @@ def handle_family_from_row(
 def _after_create_family(
-    db: Session, row: DocumentIngestRow, org_id: int, result: dict[str, Any]
+    db: Session, row: CCLWDocumentIngestRow, org_id: int, result: dict[str, Any]
 ):
     def _create_family_links(family: Family):
         family_slug = Slug(name=row.cpr_family_slug, family_import_id=family.import_id)
@@ -72,7 +72,7 @@ def _create_family_links(family: Family):
 def _operate_on_family(
     db: Session,
-    row: DocumentIngestRow,
+    row: CCLWDocumentIngestRow,
     org_id: int,
     result: dict[str, Any],
 ) -> Family:
@@ -116,7 +116,7 @@ def _operate_on_family(
 def handle_family_document_from_row(
     db: Session,
-    row: DocumentIngestRow,
+    row: CCLWDocumentIngestRow,
     family: Family,
     result: dict[str, Any],
 ) -> FamilyDocument:
@@ -209,7 +209,7 @@ def none_if_empty(data: str) -> Optional[str]:
     return family_document
-def
_get_geography(db: Session, row: DocumentIngestRow) -> Geography: +def _get_geography(db: Session, row: CCLWDocumentIngestRow) -> Geography: geography = ( db.query(Geography).filter(Geography.value == row.geography_iso).one_or_none() ) @@ -222,7 +222,7 @@ def _get_geography(db: Session, row: DocumentIngestRow) -> Geography: def _add_family_document_slug( db: Session, - row: DocumentIngestRow, + row: CCLWDocumentIngestRow, family_document: FamilyDocument, result: dict[str, Any], ) -> Slug: diff --git a/app/core/ingestion/ingest_row.py b/app/core/ingestion/cclw/ingest_row_cclw.py similarity index 56% rename from app/core/ingestion/ingest_row.py rename to app/core/ingestion/cclw/ingest_row_cclw.py index 9838fc4b..21521270 100644 --- a/app/core/ingestion/ingest_row.py +++ b/app/core/ingestion/cclw/ingest_row_cclw.py @@ -1,10 +1,9 @@ -import abc -from dataclasses import fields -from datetime import datetime, timezone -from typing import Any, ClassVar, Optional, Sequence +from datetime import datetime +from typing import ClassVar, Optional from pydantic import ConfigDict, Extra from pydantic.dataclasses import dataclass +from app.core.ingestion.ingest_row_base import BaseIngestRow from app.db.models.law_policy import EventStatus, FamilyCategory @@ -50,73 +49,8 @@ VALID_EVENT_COLUMN_NAMES = set(_REQUIRED_EVENT_COLUMNS) -def validate_csv_columns( - column_names: Sequence[str], - valid_column_names: set[str], -) -> list[str]: - """Check that the given set of column names is valid.""" - missing = list(valid_column_names.difference(set(column_names))) - missing.sort() - return missing - - -@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid)) -class BaseIngestRow(abc.ABC): - """Represents a single row of input from a CSV.""" - - row_number: int - - VALID_COLUMNS: ClassVar[set[str]] = set() - - @classmethod - def from_row(cls, row_number: int, data: dict[str, str]): - """Parse a row from a CSV.""" - field_info = cls.field_info() - return cls( - row_number=row_number, - **{ - cls._key(k): cls._parse_str(cls._key(k), v, field_info) - for (k, v) in data.items() - if cls._key(k) in field_info.keys() - }, - ) - - @classmethod - def field_info(cls) -> dict[str, type]: - """Returns an information mapping from field name to expected type.""" - return {field.name: field.type for field in fields(cls)} - - @classmethod - def _parse_str(cls, key: str, value: str, field_info: dict[str, type]) -> Any: - if key not in field_info: - # Let pydantic deal with unexpected fields - return value - - if field_info[key] == datetime: - return datetime.strptime(value, "%Y-%m-%d").replace(tzinfo=timezone.utc) - - if field_info[key] == list[str]: - return [e.strip() for e in value.split(";") if e.strip()] - - if field_info[key] == int: - return int(value) if value else 0 - - if field_info[key] == str: - if (na := str(value).lower()) == "n/a": - return na - else: - return value - - # Let pydantic deal with other field types (e.g. 
str-Enums) - return value - - @staticmethod - def _key(key: str) -> str: - return key.lower().replace(" ", "_") - - @dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid)) -class DocumentIngestRow(BaseIngestRow): +class CCLWDocumentIngestRow(BaseIngestRow): """Represents a single row of input from the documents-families-collections CSV.""" id: str diff --git a/app/core/ingestion/cclw/metadata.py b/app/core/ingestion/cclw/metadata.py new file mode 100644 index 00000000..4a656c88 --- /dev/null +++ b/app/core/ingestion/cclw/metadata.py @@ -0,0 +1,71 @@ +from typing import Union + +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.db.models.law_policy.metadata import FamilyMetadata +from sqlalchemy.orm import Session +from app.core.ingestion.utils import Result, ResultType +from app.core.ingestion.metadata import Taxonomy, MetadataJson, build_metadata_field + + +MAP_OF_LIST_VALUES = { + "sector": "sectors", + "instrument": "instruments", + "framework": "frameworks", + "topic": "responses", + "hazard": "natural_hazards", + "keyword": "keywords", +} + + +def add_metadata( + db: Session, + family_import_id: str, + taxonomy: Taxonomy, + taxonomy_id: int, + row: CCLWDocumentIngestRow, +) -> bool: + result, metadata = build_cclw_metadata(taxonomy, row) + if result.type == ResultType.ERROR: + return False + + db.add( + FamilyMetadata( + family_import_id=family_import_id, + taxonomy_id=taxonomy_id, + value=metadata, + ) + ) + return True + + +def build_cclw_metadata( + taxonomy: Taxonomy, row: CCLWDocumentIngestRow +) -> tuple[Result, MetadataJson]: + detail_list = [] + value: dict[str, Union[str, list[str]]] = {} + num_fails = 0 + num_resolved = 0 + + for tax_key, row_key in MAP_OF_LIST_VALUES.items(): + ingest_values = getattr(row, row_key) + result, field_value = build_metadata_field( + row.row_number, taxonomy, ingest_values, tax_key + ) + + if result.type == ResultType.OK: + value[tax_key] = field_value + elif result.type == ResultType.RESOLVED: + value[tax_key] = field_value + detail_list.append(result.details) + num_resolved += 1 + else: + detail_list.append(result.details) + num_fails += 1 + + row_result_type = ResultType.OK + if num_resolved: + row_result_type = ResultType.RESOLVED + if num_fails: + row_result_type = ResultType.ERROR + + return Result(type=row_result_type, details="\n".join(detail_list)), value diff --git a/app/core/ingestion/physical_document.py b/app/core/ingestion/cclw/physical_document.py similarity index 94% rename from app/core/ingestion/physical_document.py rename to app/core/ingestion/cclw/physical_document.py index d649ee1a..3ba0643e 100644 --- a/app/core/ingestion/physical_document.py +++ b/app/core/ingestion/cclw/physical_document.py @@ -1,7 +1,7 @@ from typing import Any from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow from app.core.ingestion.utils import to_dict from app.db.models.document import PhysicalDocument @@ -10,7 +10,7 @@ def create_physical_document_from_row( db: Session, - row: DocumentIngestRow, + row: CCLWDocumentIngestRow, result: dict[str, Any], ) -> PhysicalDocument: """ diff --git a/app/core/ingestion/pipeline.py b/app/core/ingestion/cclw/pipeline.py similarity index 100% rename from app/core/ingestion/pipeline.py rename to app/core/ingestion/cclw/pipeline.py diff --git a/app/core/ingestion/reader.py b/app/core/ingestion/cclw/reader.py similarity index 95% 
rename from app/core/ingestion/reader.py rename to app/core/ingestion/cclw/reader.py index 2f0b4bc3..0d558942 100644 --- a/app/core/ingestion/reader.py +++ b/app/core/ingestion/cclw/reader.py @@ -3,7 +3,7 @@ from typing import Type from fastapi import UploadFile -from app.core.ingestion.ingest_row import BaseIngestRow, validate_csv_columns +from app.core.ingestion.ingest_row_base import BaseIngestRow, validate_csv_columns from app.core.ingestion.processor import ProcessFunc from app.core.ingestion.utils import IngestContext diff --git a/app/core/ingestion/ingest_row_base.py b/app/core/ingestion/ingest_row_base.py new file mode 100644 index 00000000..7e9a448e --- /dev/null +++ b/app/core/ingestion/ingest_row_base.py @@ -0,0 +1,77 @@ +import abc +from dataclasses import fields +from pydantic.dataclasses import dataclass +from datetime import datetime, timezone +from typing import Any, ClassVar, Sequence +from pydantic import ConfigDict, Extra + + +@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid)) +class BaseIngestRow(abc.ABC): + """Represents a single row of input from a CSV.""" + + row_number: int + + VALID_COLUMNS: ClassVar[set[str]] = set() + + @classmethod + def from_row(cls, row_number: int, data: dict[str, str]): + """Parse a row from a CSV.""" + field_info = cls.field_info() + return cls( + row_number=row_number, + **{ + cls._key(k): cls._parse_str(cls._key(k), v, field_info) + for (k, v) in data.items() + if cls._key(k) in field_info.keys() + }, + ) + + @classmethod + def field_info(cls) -> dict[str, type]: + """Returns an information mapping from field name to expected type.""" + return {field.name: field.type for field in fields(cls)} + + @classmethod + def _parse_str(cls, key: str, value: str, field_info: dict[str, type]) -> Any: + if key not in field_info: + # Let pydantic deal with unexpected fields + return value + + if field_info[key] == datetime: + if "T" in value: + # this doesn't accept all valid ISO 8601 strings, only ones + # generated by isoformat. So we need some shenanigans. + value_to_use = value.replace("Z", "+00:00") + return datetime.fromisoformat(value_to_use) + else: + return datetime.strptime(value, "%Y-%m-%d").replace(tzinfo=timezone.utc) + + if field_info[key] == list[str]: + return [e.strip() for e in value.split(";") if e.strip()] + + if field_info[key] == int: + return int(value) if value else 0 + + if field_info[key] == str: + if (na := str(value).lower()) == "n/a": + return na + else: + return value + + # Let pydantic deal with other field types (e.g. 
str-Enums) + return value + + @staticmethod + def _key(key: str) -> str: + return key.lower().replace(" ", "_") + + +def validate_csv_columns( + column_names: Sequence[str], + valid_column_names: set[str], +) -> list[str]: + """Check that the given set of column names is valid.""" + missing = list(valid_column_names.difference(set(column_names))) + missing.sort() + return missing diff --git a/app/core/ingestion/metadata.py b/app/core/ingestion/metadata.py index 79667dde..75b6e71c 100644 --- a/app/core/ingestion/metadata.py +++ b/app/core/ingestion/metadata.py @@ -1,22 +1,10 @@ -from typing import Mapping, Sequence, Union +from typing import Any, Mapping, Sequence, Union from pydantic.dataclasses import dataclass from pydantic.config import ConfigDict, Extra -from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow from app.core.ingestion.match import match_unknown_value from app.core.ingestion.utils import Result, ResultType -from app.db.models.law_policy.metadata import FamilyMetadata - -MAP_OF_LIST_VALUES = { - "sector": "sectors", - "instrument": "instruments", - "framework": "frameworks", - "topic": "responses", - "hazard": "natural_hazards", - "keyword": "keywords", -} @dataclass(config=ConfigDict(validate_assignment=True, extra=Extra.forbid)) @@ -31,70 +19,27 @@ class TaxonomyEntry: MetadataJson = Mapping[str, Union[str, Sequence[str]]] -def add_metadata( - db: Session, - family_import_id: str, - taxonomy: Taxonomy, - taxonomy_id: int, - row: DocumentIngestRow, -) -> bool: - result, metadata = build_metadata(taxonomy, row) - if result.type == ResultType.ERROR: - return False - - db.add( - FamilyMetadata( - family_import_id=family_import_id, - taxonomy_id=taxonomy_id, - value=metadata, - ) - ) - return True - - -def build_metadata( - taxonomy: Taxonomy, row: DocumentIngestRow -) -> tuple[Result, MetadataJson]: - detail_list = [] - value: dict[str, Union[str, list[str]]] = {} - num_fails = 0 - num_resolved = 0 - - for tax_key, row_key in MAP_OF_LIST_VALUES.items(): - result, field_value = build_metadata_field(taxonomy, row, tax_key, row_key) - - if result.type == ResultType.OK: - value[tax_key] = field_value - elif result.type == ResultType.RESOLVED: - value[tax_key] = field_value - detail_list.append(result.details) - num_resolved += 1 - else: - detail_list.append(result.details) - num_fails += 1 - - row_result_type = ResultType.OK - if num_resolved: - row_result_type = ResultType.RESOLVED - if num_fails: - row_result_type = ResultType.ERROR - - return Result(type=row_result_type, details="\n".join(detail_list)), value +def resolve_unknown(unknown_set: set[str], allowed_set: set[str]) -> set[str]: + suggestions = set() + for unknown_value in unknown_set: + suggestion = match_unknown_value(unknown_value, allowed_set) + if suggestion: + suggestions.add(suggestion) + return suggestions def build_metadata_field( - taxonomy: Taxonomy, row: DocumentIngestRow, tax_key: str, row_key: str + row_number: int, taxonomy: Taxonomy, ingest_values: Any, tax_key: str ) -> tuple[Result, list[str]]: - ingest_values = getattr(row, row_key) + if type(ingest_values) == str: + ingest_values = [ingest_values] row_set = set(ingest_values) allowed_set: set[str] = set(taxonomy[tax_key].allowed_values) allow_blanks = taxonomy[tax_key].allow_blanks if len(row_set) == 0: if not allow_blanks: - details = ( - f"Row {row.row_number} is blank for {tax_key} - which is not allowed." - ) + details = f"Row {row_number} is blank for {tax_key} - which is not allowed." 
return Result(type=ResultType.ERROR, details=details), [] return Result(), [] # field is blank and allowed @@ -105,14 +50,14 @@ def build_metadata_field( resolved_set = resolve_unknown(unknown_set, allowed_set) if len(resolved_set) == len(unknown_set): - details = f"Row {row.row_number} RESOLVED: {resolved_set}" + details = f"Row {row_number} RESOLVED: {resolved_set}" vals = row_set.difference(unknown_set).union(resolved_set) return Result(type=ResultType.RESOLVED, details=details), list(vals) # If we get here we have not managed to resolve the unknown values. details = ( - f"Row {row.row_number} has value(s) for '{tax_key}' that is/are " + f"Row {row_number} has value(s) for '{tax_key}' that is/are " f"unrecognised: '{unknown_set}' " ) @@ -120,12 +65,3 @@ def build_metadata_field( details += f"able to resolve: {resolved_set}" return Result(type=ResultType.ERROR, details=details), [] - - -def resolve_unknown(unknown_set: set[str], allowed_set: set[str]) -> set[str]: - suggestions = set() - for unknown_value in unknown_set: - suggestion = match_unknown_value(unknown_value, allowed_set) - if suggestion: - suggestions.add(suggestion) - return suggestions diff --git a/app/core/ingestion/processor.py b/app/core/ingestion/processor.py index 62b3c1e1..4d834c06 100644 --- a/app/core/ingestion/processor.py +++ b/app/core/ingestion/processor.py @@ -2,17 +2,33 @@ from typing import Any, Callable, TypeVar, cast from sqlalchemy.orm import Session -from app.core.ingestion.collection import handle_collection_from_row -from app.core.ingestion.event import family_event_from_row -from app.core.ingestion.family import handle_family_from_row -from app.core.ingestion.ingest_row import ( - BaseIngestRow, - DocumentIngestRow, +from app.core.ingestion.cclw.collection import ( + create_collection, + handle_collection_from_row, +) +from app.core.ingestion.cclw.event import family_event_from_row +from app.core.ingestion.cclw.family import handle_family_from_row +from app.core.ingestion.cclw.ingest_row_cclw import ( + CCLWDocumentIngestRow, EventIngestRow, ) +from app.core.ingestion.ingest_row_base import BaseIngestRow +from app.core.ingestion.unfccc.ingest_row_unfccc import ( + CollectonIngestRow, + UNFCCCDocumentIngestRow, +) from app.core.organisation import get_organisation_taxonomy -from app.core.ingestion.utils import IngestContext, Result, ResultType -from app.core.ingestion.validator import validate_document_row +from app.core.ingestion.utils import ( + CCLWIngestContext, + IngestContext, + Result, + ResultType, + UNFCCCIngestContext, +) +from app.core.ingestion.validator import ( + validate_cclw_document_row, + validate_unfccc_document_row, +) from app.db.models.app.users import Organisation _LOGGER = logging.getLogger(__name__) @@ -23,8 +39,8 @@ ProcessFunc = Callable[[IngestContext, _RowType], None] -def ingest_document_row( - db: Session, context: IngestContext, row: DocumentIngestRow +def ingest_cclw_document_row( + db: Session, context: IngestContext, row: CCLWDocumentIngestRow ) -> dict[str, Any]: """ Create the constituent elements in the database that represent this row. @@ -63,6 +79,42 @@ def ingest_document_row( return result +def ingest_unfccc_document_row( + db: Session, context: IngestContext, row: UNFCCCDocumentIngestRow +) -> dict[str, Any]: + """ + Create the constituent elements in the database that represent this row. + + :param [Session] db: the connection to the database. 
+ :param [UNFCCCDocumentIngestRow] row: the UNFCCCDocumentIngestRow object of the current CSV row + :returns [dict[str, Any]]: a result dictionary describing what was created + """ + result = {} + import_id = row.cpr_document_id + + _LOGGER.info( + f"Ingest starting for row {row.row_number}.", + extra={ + "props": { + "row_number": row.row_number, + "import_id": import_id, + } + }, + ) + + # FIXME: Implement here + return result + + +def ingest_collection_row( + db: Session, context: IngestContext, row: CollectonIngestRow +) -> dict[str, Any]: + result = {} + with db.begin(): + create_collection(db, row, context.org_id, result) + return result + + + def ingest_event_row( db: Session, context: IngestContext, row: EventIngestRow ) -> dict[str, Any]: @@ -78,15 +130,23 @@ def ingest_event_row( return result -def initialise_context(db: Session) -> IngestContext: +def initialise_context(db: Session, org_name: str) -> IngestContext: """ Initialise the database :return [IngestContext]: The organisation that will be used for the ingest. """ with db.begin(): - organisation = db.query(Organisation).filter_by(name="CCLW").one() - return IngestContext(org_id=cast(int, organisation.id), results=[]) + organisation = db.query(Organisation).filter_by(name=org_name).one() + if org_name == "CCLW": + return CCLWIngestContext( + org_name=org_name, org_id=cast(int, organisation.id), results=[] + ) + if org_name == "UNFCCC": + return UNFCCCIngestContext( + org_name=org_name, org_id=cast(int, organisation.id), results=[] + ) + raise ValueError(f"Code not in sync with data - org {org_name} unknown to code") def get_event_ingestor(db: Session) -> ProcessFunc: @@ -106,20 +166,37 @@ def process(context: IngestContext, row: EventIngestRow) -> None: return process -def get_dfc_ingestor(db: Session) -> ProcessFunc: +def get_collection_ingestor(db: Session) -> ProcessFunc: + """ + Get the ingestion function for ingesting a collection CSV row. + + :return [ProcessFunc]: The function used to ingest the CSV row. + """ + + def process(context: IngestContext, row: CollectonIngestRow) -> None: + """Processes the row into the db.""" + _LOGGER.info(f"Ingesting collection row: {row.row_number}") + + with db.begin(): + ingest_collection_row(db, context, row=row) + + return process + + +def get_document_ingestor(db: Session, context: IngestContext) -> ProcessFunc: """ Get the ingestion function for ingesting a law & policy CSV row. :return [ProcessFunc]: The function used to ingest the CSV row. 
""" - def process(context: IngestContext, row: DocumentIngestRow) -> None: + def cclw_process(context: IngestContext, row: CCLWDocumentIngestRow) -> None: """Processes the row into the db.""" _LOGGER.info(f"Ingesting document row: {row.row_number}") with db.begin(): try: - ingest_document_row(db, context, row=row) + ingest_cclw_document_row(db, context, row=row) except Exception as e: error = Result( ResultType.ERROR, f"Row {row.row_number}: Error {str(e)}" @@ -130,10 +207,32 @@ def process(context: IngestContext, row: DocumentIngestRow) -> None: extra={"props": {"row_number": row.row_number, "error": str(e)}}, ) - return process + def unfccc_process(context: IngestContext, row: UNFCCCDocumentIngestRow) -> None: + """Processes the row into the db.""" + _LOGGER.info(f"Ingesting document row: {row.row_number}") + + with db.begin(): + try: + ingest_unfccc_document_row(db, context, row=row) + except Exception as e: + error = Result( + ResultType.ERROR, f"Row {row.row_number}: Error {str(e)}" + ) + context.results.append(error) + _LOGGER.error( + "Error on ingest", + extra={"props": {"row_number": row.row_number, "error": str(e)}}, + ) + + if context.org_name == "CCLW": + return cclw_process + elif context.org_name == "UNFCCC": + return unfccc_process + raise ValueError(f"Unknown org {context.org_name} for validation.") -def get_dfc_validator(db: Session, context: IngestContext) -> ProcessFunc: + +def get_document_validator(db: Session, context: IngestContext) -> ProcessFunc: """ Get the validation function for ingesting a law & policy CSV. @@ -143,10 +242,31 @@ def get_dfc_validator(db: Session, context: IngestContext) -> ProcessFunc: with db.begin(): _, taxonomy = get_organisation_taxonomy(db, context.org_id) - def process(context: IngestContext, row: DocumentIngestRow) -> None: + def cclw_process(context: IngestContext, row: CCLWDocumentIngestRow) -> None: """Processes the row into the db.""" _LOGGER.info(f"Validating document row: {row.row_number}") with db.begin(): - validate_document_row(db=db, context=context, taxonomy=taxonomy, row=row) - - return process + validate_cclw_document_row( + db=db, + context=cast(CCLWIngestContext, context), + taxonomy=taxonomy, + row=row, + ) + + def unfccc_process(context: IngestContext, row: UNFCCCDocumentIngestRow) -> None: + """Processes the row into the db.""" + _LOGGER.info(f"Validating document row: {row.row_number}") + with db.begin(): + validate_unfccc_document_row( + db=db, + context=cast(UNFCCCIngestContext, context), + taxonomy=taxonomy, + row=row, + ) + + if context.org_name == "CCLW": + return cclw_process + elif context.org_name == "UNFCCC": + return unfccc_process + + raise ValueError(f"Unknown org {context.org_name} for validation.") diff --git a/app/core/ingestion/unfccc/collection.py b/app/core/ingestion/unfccc/collection.py new file mode 100644 index 00000000..b1c715c5 --- /dev/null +++ b/app/core/ingestion/unfccc/collection.py @@ -0,0 +1,87 @@ +from typing import Any, Optional + +from sqlalchemy.orm import Session +from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow + +from app.db.models.law_policy import Collection + + +def handle_collection_from_row( + db: Session, + row: UNFCCCDocumentIngestRow, + org_id: int, + family_import_id: str, + result: dict[str, Any], +) -> Optional[Collection]: + """ + Creates or Updates the collection part of the schema from the row if needed. 
+ + NOTE: This determines the operation CREATE/UPDATE independently of the + operation being performed on the Family/FamilyDocument structures. + + :param [Session] db: connection to the database. + :param [DocumentIngestRow] row: the row built from the CSV. + :param [int] org_id: the organisation id associated with this row. + :param [str] family_import_id: the family id associated with this row. + :param [dict[str, Any]]: a result dict in which to record what was created. + :return [Collection | None]: A collection if one was created, otherwise None. + """ + pass + # if not row.cpr_collection_id or row.cpr_collection_id == "n/a": + # return None + + # # First check for the actual collection + # existing_collection = ( + # db.query(Collection) + # .filter(Collection.import_id == row.cpr_collection_id) + # .one_or_none() + # ) + + # if existing_collection is None: + # collection = create( + # db, + # Collection, + # import_id=row.cpr_collection_id, + # title=row.collection_name, + # extra={"description": row.collection_summary}, + # ) + + # collection_organisation = create( + # db, + # CollectionOrganisation, + # collection_import_id=collection.import_id, + # organisation_id=org_id, + # ) + + # result["collection_organisation"] = to_dict(collection_organisation) + # result["collection"] = to_dict(collection) + # else: + # collection = existing_collection + # updated = {} + # update_if_changed(updated, "title", row.collection_name, collection) + # update_if_changed(updated, "description", row.collection_summary, collection) + # if len(updated) > 0: + # result["collection"] = updated + # db.add(collection) + # db.flush() + + # # Second check for the family - collection link + # existing_link = ( + # db.query(CollectionFamily) + # .filter_by( + # collection_import_id=row.cpr_collection_id, + # family_import_id=row.cpr_family_id, + # ) + # .one_or_none() + # ) + + # if existing_link is None: + # collection_family = create( + # db, + # CollectionFamily, + # collection_import_id=collection.import_id, + # family_import_id=family_import_id, + # ) + # result["collection_family"] = to_dict(collection_family) + + # return collection diff --git a/app/core/ingestion/unfccc/family.py b/app/core/ingestion/unfccc/family.py new file mode 100644 index 00000000..0d48e7b1 --- /dev/null +++ b/app/core/ingestion/unfccc/family.py @@ -0,0 +1,248 @@ +from typing import Any, Optional, cast + +from sqlalchemy.orm import Session + +from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow +from app.core.ingestion.unfccc.metadata import add_metadata +from app.core.organisation import get_organisation_taxonomy +from app.core.ingestion.unfccc.physical_document import ( + create_physical_document_from_row, +) +from app.core.ingestion.utils import ( + create, + get_or_create, + to_dict, + update_if_changed, +) +from app.db.models.law_policy import ( + DocumentStatus, + FamilyCategory, + Family, + FamilyDocument, + FamilyOrganisation, + FamilyStatus, + Geography, + Slug, +) + + +def handle_family_from_row( + db: Session, + row: UNFCCCDocumentIngestRow, + org_id: int, + result: dict[str, Any], +) -> Family: + """ + Create any Family + other entities and links from the row found in the db. + + :param [Session] db: connection to the database. + :param [int] org_id: the organisation id associated with this row. + :param [IngestRow] row: the row built from the CSV. 
+ :param [dict[str, Any]] result: a result dict in which to track what was created + :raises [ValueError]: When there is an existing family name that only differs by + case or when the geography associated with this row cannot be found in the + database. + :return [Family]: The family that was either retrieved or created + """ + family = _operate_on_family(db, row, org_id, result) + + handle_family_document_from_row(db, row, family, result) + + return family + + +def _after_create_family( + db: Session, row: UNFCCCDocumentIngestRow, org_id: int, result: dict[str, Any] +): + def _create_family_links(family: Family): + family_slug = Slug(name=row.cpr_family_slug, family_import_id=family.import_id) + + db.add(family_slug) + result["family_slug"] = (to_dict(family_slug),) + + family_organisation = FamilyOrganisation( + family_import_id=family.import_id, organisation_id=org_id + ) + db.add(family_organisation) + result["family_organisation"] = to_dict(family_organisation) + + id, taxonomy = get_organisation_taxonomy(db, org_id) + add_metadata(db, cast(str, family.import_id), taxonomy, id, row) + + return _create_family_links + + +def _operate_on_family( + db: Session, + row: UNFCCCDocumentIngestRow, + org_id: int, + result: dict[str, Any], +) -> Family: + # FIXME: Check this: + category = FamilyCategory.UNFCCC + + geography = _get_geography(db, row) + extra = { + "title": row.family_name, + "geography_id": geography.id, + "description": row.family_summary, + "family_category": category, + } + + family = ( + db.query(Family).filter(Family.import_id == row.cpr_family_id).one_or_none() + ) + + if family is None: + family = create( + db, + Family, + import_id=row.cpr_family_id, + extra={**extra, "family_status": FamilyStatus.CREATED}, + after_create=_after_create_family(db, row, org_id, result), + ) + result["family"] = to_dict(family) + else: + updated = {} + + update_if_changed(updated, "title", row.family_name, family) + update_if_changed(updated, "description", row.family_summary, family) + update_if_changed(updated, "family_category", category, family) + + if len(updated) > 0: + db.add(family) + db.flush() + result["family"] = updated + + return family + + +def handle_family_document_from_row( + db: Session, + row: UNFCCCDocumentIngestRow, + family: Family, + result: dict[str, Any], +) -> FamilyDocument: + def none_if_empty(data: str) -> Optional[str]: + return data if data != "" else None + + # NOTE: op is determined by existence or otherwise of FamilyDocument + family_document = ( + db.query(FamilyDocument) + .filter(FamilyDocument.import_id == row.cpr_document_id) + .one_or_none() + ) + + # If the family document exists we can assume that the associated physical + # document and slug have also been created + if family_document is not None: + updated = {} + update_if_changed( + updated, + "family_import_id", + none_if_empty(row.cpr_family_id), + family_document, + ) + update_if_changed( + updated, + "document_type", + none_if_empty(row.submission_type), + family_document, + ) + update_if_changed( + updated, + "document_role", + none_if_empty(row.document_role), + family_document, + ) + update_if_changed( + updated, + "variant_name", + none_if_empty(row.document_variant), + family_document, + ) + if len(updated) > 0: + db.add(family_document) + db.flush() + result["family_document"] = updated + + # Now the physical document + updated = {} + + # If source_url changed then create a new physical_document + if row.documents != family_document.physical_document.source_url: + physical_document = 
create_physical_document_from_row(db, row, result) + family_document.physical_document = physical_document + else: + update_if_changed( + updated, + "title", + row.document_title, + family_document.physical_document, + ) + + if len(updated) > 0: + db.add(family_document.physical_document) + db.flush() + result["physical_document"] = updated + + # Check if slug has changed + existing_slug = ( + db.query(Slug).filter(Slug.name == row.cpr_document_slug).one_or_none() + ) + if existing_slug is None: + _add_family_document_slug(db, row, family_document, result) + else: + physical_document = create_physical_document_from_row(db, row, result) + family_document = FamilyDocument( + family_import_id=family.import_id, + physical_document_id=physical_document.id, + import_id=row.cpr_document_id, + variant_name=none_if_empty(row.document_variant), + document_status=DocumentStatus.PUBLISHED, + document_type=none_if_empty(row.submission_type), + document_role=none_if_empty(row.document_role), + ) + + db.add(family_document) + db.flush() + result["family_document"] = to_dict(family_document) + _add_family_document_slug(db, row, family_document, result) + + return family_document + + +def _get_geography(db: Session, row: UNFCCCDocumentIngestRow) -> Geography: + geography = ( + db.query(Geography).filter(Geography.value == row.geography_iso).one_or_none() + ) + if geography is None: + raise ValueError( + f"Geography value of {row.geography_iso} does not exist in the database." + ) + return geography + + +def _add_family_document_slug( + db: Session, + row: UNFCCCDocumentIngestRow, + family_document: FamilyDocument, + result: dict[str, Any], +) -> Slug: + """ + Adds the slugs for the family and family_document. + + :param [Session] db: connection to the database. + :param [IngestRow] row: the row built from the CSV. + :param [FamilyDocument] family_document: family document associated with this row. + :param [dict[str, Any]] result: a dictionary in which to record what was created. 
+ :return [Slug]: the created slug object + """ + family_document_slug = get_or_create( + db, + Slug, + name=row.cpr_document_slug, + family_document_import_id=family_document.import_id, + ) + result["family_document_slug"] = to_dict(family_document_slug) + return family_document_slug diff --git a/app/core/ingestion/unfccc/ingest_row_unfccc.py b/app/core/ingestion/unfccc/ingest_row_unfccc.py new file mode 100644 index 00000000..90ccb15b --- /dev/null +++ b/app/core/ingestion/unfccc/ingest_row_unfccc.py @@ -0,0 +1,84 @@ +from datetime import datetime +from typing import ClassVar + +from pydantic import ConfigDict, Extra +from pydantic.dataclasses import dataclass +from app.core.ingestion.ingest_row_base import BaseIngestRow + +_REQUIRED_DOCUMENT_COLUMNS = [ + "Category", + "md5sum", + "Submission type", + "Family name", + "Document title", + "Documents", + "Author", + "Author type", + "Geography", + "Geography ISO", + "Date", + "Document role", + "Document variant", + "Language", + "Download URL", + "CPR Collection ID", + "CPR Document ID", + "CPR Family ID", + "CPR Family Slug", + "CPR Document Slug", +] +VALID_DOCUMENT_COLUMN_NAMES = set(_REQUIRED_DOCUMENT_COLUMNS) + +_REQUIRED_COLLECTION_COLUMNS = [ + "CPR Collection ID", + "Collection name", + "Collection summary", +] +VALID_COLLECTION_COLUMN_NAMES = set(_REQUIRED_COLLECTION_COLUMNS) + + +@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid)) +class UNFCCCDocumentIngestRow(BaseIngestRow): + """Represents a single row of input from the documents-families-collections CSV.""" + + category: str + md5sum: str + submission_type: str # aka Document Type for UNFCCC + family_name: str + document_title: str + documents: str + author: str + author_type: str # METADATA + geography: str + geography_iso: str + date: datetime + document_role: str + document_variant: str + language: list[str] + download_url: str + + cpr_collection_id: str + cpr_document_id: str + cpr_family_id: str + cpr_family_slug: str + cpr_document_slug: str + + # FIXME: Where is the summary from? 
+ family_summary: str = "summary" + + VALID_COLUMNS: ClassVar[set[str]] = VALID_DOCUMENT_COLUMN_NAMES + + @staticmethod + def _key(key: str) -> str: + return key.lower().replace(" ", "_") + + +@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.ignore)) +class CollectonIngestRow(BaseIngestRow): + """Represents a single row of input from the collection CSV.""" + + cpr_collection_id: str + collection_name: str + collection_summary: str + + VALID_COLUMNS: ClassVar[set[str]] = VALID_COLLECTION_COLUMN_NAMES diff --git a/app/core/ingestion/unfccc/metadata.py b/app/core/ingestion/unfccc/metadata.py new file mode 100644 index 00000000..3546e58d --- /dev/null +++ b/app/core/ingestion/unfccc/metadata.py @@ -0,0 +1,66 @@ +from typing import Union +from sqlalchemy.orm import Session + +from app.core.ingestion.metadata import MetadataJson, Taxonomy, build_metadata_field +from app.core.ingestion.utils import Result, ResultType +from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow +from app.db.models.law_policy.metadata import FamilyMetadata + + +MAP_OF_LIST_VALUES = { + "author_type": "author_type", +} + + +def add_metadata( + db: Session, + family_import_id: str, + taxonomy: Taxonomy, + taxonomy_id: int, + row: UNFCCCDocumentIngestRow, +) -> bool: + result, metadata = build_unfccc_metadata(taxonomy, row) + if result.type == ResultType.ERROR: + return False + + db.add( + FamilyMetadata( + family_import_id=family_import_id, + taxonomy_id=taxonomy_id, + value=metadata, + ) + ) + return True + + +def build_unfccc_metadata( + taxonomy: Taxonomy, row: UNFCCCDocumentIngestRow +) -> tuple[Result, MetadataJson]: + detail_list = [] + value: dict[str, Union[str, list[str]]] = {} + num_fails = 0 + num_resolved = 0 + + for tax_key, row_key in MAP_OF_LIST_VALUES.items(): + ingest_values = getattr(row, row_key) + result, field_value = build_metadata_field( + row.row_number, taxonomy, ingest_values, tax_key + ) + + if result.type == ResultType.OK: + value[tax_key] = field_value + elif result.type == ResultType.RESOLVED: + value[tax_key] = field_value + detail_list.append(result.details) + num_resolved += 1 + else: + detail_list.append(result.details) + num_fails += 1 + + row_result_type = ResultType.OK + if num_resolved: + row_result_type = ResultType.RESOLVED + if num_fails: + row_result_type = ResultType.ERROR + + return Result(type=row_result_type, details="\n".join(detail_list)), value diff --git a/app/core/ingestion/unfccc/physical_document.py b/app/core/ingestion/unfccc/physical_document.py new file mode 100644 index 00000000..5511da20 --- /dev/null +++ b/app/core/ingestion/unfccc/physical_document.py @@ -0,0 +1,52 @@ +from typing import Any + +from sqlalchemy.orm import Session +from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow +from app.core.ingestion.utils import to_dict + +from app.db.models.document import PhysicalDocument +from app.db.models.document.physical_document import Language, PhysicalDocumentLanguage + + +def create_physical_document_from_row( + db: Session, + row: UNFCCCDocumentIngestRow, + result: dict[str, Any], +) -> PhysicalDocument: + """ + Create the document part of the schema from the row. + + :param [Session] db: connection to the database. + :param [IngestRow] row: the row built from the CSV. + :param [dict[str, Any]] result: a dictionary in which to record what was created. + :return [PhysicalDocument]: the created physical document. 
+ """ + physical_document = PhysicalDocument( + title=row.document_title, + source_url=row.documents, + md5_sum=None, + content_type=None, + cdn_object=None, + ) + db.add(physical_document) + db.flush() + result["physical_document"] = to_dict(physical_document) + + for language in row.language: + lang = db.query(Language).filter(Language.name == language).one_or_none() + if lang is not None: + doc_languages = result.get("language", []) + doc_languages.append(to_dict(lang)) + result["language"] = doc_languages + + physical_document_language = PhysicalDocumentLanguage( + language_id=lang.id, document_id=physical_document.id + ) + db.add(physical_document_language) + db.flush() + + phys_doc_languages = result.get("physical_document_language", []) + phys_doc_languages.append(to_dict(physical_document_language)) + result["physical_document_language"] = phys_doc_languages + + return physical_document diff --git a/app/core/ingestion/unfccc/pipeline.py b/app/core/ingestion/unfccc/pipeline.py new file mode 100644 index 00000000..9d9488fe --- /dev/null +++ b/app/core/ingestion/unfccc/pipeline.py @@ -0,0 +1,79 @@ +from datetime import datetime, timezone +from typing import Sequence, Tuple, cast + +from sqlalchemy.orm import Session + +from app.api.api_v1.schemas.document import DocumentParserInput +from app.db.models.app.users import Organisation +from app.db.models.law_policy.family import ( + Family, + FamilyDocument, + FamilyOrganisation, + Geography, + DocumentStatus, + FamilyStatus, +) + + +def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]: + """Generates a complete view of the current document database as pipeline input""" + query = ( + db.query(Family, FamilyDocument, Geography, Organisation) + .join(Family, Family.import_id == FamilyDocument.family_import_id) + .join( + FamilyOrganisation, FamilyOrganisation.family_import_id == Family.import_id + ) + .join(Organisation, Organisation.id == FamilyOrganisation.organisation_id) + .join(Geography, Geography.id == Family.geography_id) + .filter( + Family.family_status.in_([FamilyStatus.PUBLISHED, FamilyStatus.CREATED]) + ) + .filter( + FamilyDocument.document_status.in_( + [DocumentStatus.CREATED, DocumentStatus.PUBLISHED] + ) + ) + ) + + query_result = cast( + Sequence[Tuple[Family, FamilyDocument, Geography, Organisation]], query.all() + ) + fallback_date = datetime(1900, 1, 1, tzinfo=timezone.utc) + documents: Sequence[DocumentParserInput] = [ + DocumentParserInput( + name=cast(str, family.title), # All documents in a family indexed by title + description=cast(str, family.description), + category=str(family.family_category), + publication_ts=family.published_date or fallback_date, + import_id=cast(str, family_document.import_id), + source_url=( + cast(str, family_document.physical_document.source_url) + if family_document.physical_document is not None + else None + ), + type=cast(str, family_document.document_type or ""), + source=cast(str, organisation.name), + slug=cast(str, family_document.slugs[-1].name), + geography=cast(str, geography.value), + languages=[ + cast(str, lang.name) + for lang in ( + family_document.physical_document.languages + if family_document.physical_document is not None + else [] + ) + ], + # TODO: the following are not used & should be removed + events=[], + frameworks=[], + hazards=[], + instruments=[], + keywords=[], + postfix=None, + sectors=[], + topics=[], + ) + for family, family_document, geography, organisation in query_result + ] + + return documents diff --git 
a/app/core/ingestion/unfccc/reader.py b/app/core/ingestion/unfccc/reader.py new file mode 100644 index 00000000..0d558942 --- /dev/null +++ b/app/core/ingestion/unfccc/reader.py @@ -0,0 +1,53 @@ +import csv +from io import StringIO +from typing import Type + +from fastapi import UploadFile +from app.core.ingestion.ingest_row_base import BaseIngestRow, validate_csv_columns +from app.core.ingestion.processor import ProcessFunc + +from app.core.ingestion.utils import IngestContext +from app.core.validation.types import ImportSchemaMismatchError + + +def get_file_contents(csv_upload: UploadFile) -> str: + """ + Gets the file contents from an UploadFile. + + :param [UploadFile] csv_upload: The UploadFile from an HTTP request. + :return [str]: The contents of the file. + """ + return csv_upload.file.read().decode("utf8") + + +def read( + file_contents: str, + context: IngestContext, + row_type: Type[BaseIngestRow], + process: ProcessFunc, +) -> None: + """ + Read a CSV file and call process() for each row. + + :param [str] file_contents: the content of the imported CSV file. + :param [IngestContext] context: a context to use during import. + :param [Type[BaseIngestRow]] row_type: the type of row expected from the CSV. + :param [ProcessFunc] process: the function to call to process a single row. + """ + reader = csv.DictReader(StringIO(initial_value=file_contents)) + if reader.fieldnames is None: + raise ImportSchemaMismatchError("No fields in CSV!", {}) + + missing_columns = validate_csv_columns( + reader.fieldnames, + row_type.VALID_COLUMNS, + ) + if missing_columns: + raise ImportSchemaMismatchError( + "Field names in CSV did not validate", {"missing": missing_columns} + ) + row_count = 0 + + for row in reader: + row_count += 1 + process(context, row_type.from_row(row_count, row)) diff --git a/app/core/ingestion/utils.py b/app/core/ingestion/utils.py index 7775980c..cc21c17d 100644 --- a/app/core/ingestion/utils.py +++ b/app/core/ingestion/utils.py @@ -1,3 +1,4 @@ +import abc from dataclasses import dataclass import enum from typing import Any, Callable, Optional, TypeVar, cast @@ -210,17 +211,43 @@ def _check_( @dataclass -class IngestContext: +class IngestContext(abc.ABC): """Context used when processing.""" + org_name: str org_id: int results: list[Result] + + +@dataclass +class UNFCCCIngestContext(IngestContext): + """Ingest Context for UNFCCC""" + + collection_ids_defined: list[str] + collection_ids_referenced: list[str] + # Just for families: consistency_validator: ConsistencyValidator - def __init__(self, org_id=1, results=None) -> None: + def __init__(self, org_name="UNFCCC", org_id=2, results=None): + self.collection_ids_defined = [] + self.collection_ids_referenced = [] + self.consistency_validator = ConsistencyValidator() + self.org_name = org_name self.org_id = org_id self.results = [] if results is None else results + + +@dataclass +class CCLWIngestContext(IngestContext): + """Ingest Context for CCLW""" + + consistency_validator: ConsistencyValidator + + def __init__(self, org_name="CCLW", org_id=1, results=None): self.consistency_validator = ConsistencyValidator() + self.org_name = org_name + self.org_id = org_id + self.results = [] if results is None else results @dataclass diff --git a/app/core/ingestion/validator.py b/app/core/ingestion/validator.py index b2b1ba7c..09217714 100644 --- a/app/core/ingestion/validator.py +++ b/app/core/ingestion/validator.py @@ -1,13 +1,21 @@ from sqlalchemy import Column from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row 
import DocumentIngestRow, EventIngestRow -from app.core.ingestion.metadata import build_metadata, Taxonomy +from app.core.ingestion.cclw.ingest_row_cclw import ( + CCLWDocumentIngestRow, + EventIngestRow, +) +from app.core.ingestion.metadata import Taxonomy +from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow +from app.core.ingestion.cclw.metadata import build_cclw_metadata from app.core.ingestion.utils import ( + CCLWIngestContext, IngestContext, Result, ResultType, + UNFCCCIngestContext, ) +from app.core.ingestion.unfccc.metadata import build_unfccc_metadata from app.db.models.law_policy.family import ( FamilyDocumentRole, FamilyDocumentType, @@ -54,10 +62,89 @@ def _check_geo_in_db(row_num: int, db: Session, geo_iso: str) -> CheckResult: return Result() -def validate_document_row( +def validate_unfccc_document_row( + db: Session, + context: UNFCCCIngestContext, + row: UNFCCCDocumentIngestRow, + taxonomy: Taxonomy, +) -> None: + """ + Validate the constituent elements that represent this UNFCCC document row. + + :param [IngestContext] context: The ingest context. + :param [UNFCCCDocumentIngestRow] row: the row object from the current CSV row. + :param [Taxonomy] taxonomy: the Taxonomy against which metadata should be validated. + """ + + errors = [] + n = row.row_number + + # don't validate: md5sum: str + # don't validate: collection_name: str + # don't validate: collection_id: str + # don't validate: family_name: str + # don't validate: document_title: str + # don't validate: documents: str + # don't validate: author: str + # don't validate: geography: str + # don't validate: date: datetime + + # validate: document_role: str + result = _check_value_in_db( + n, db, row.document_role, FamilyDocumentRole, FamilyDocumentRole.name + ) + if result.type != ResultType.OK: + errors.append(result) + + # validate: document_variant: str + result = _check_value_in_db( + n, db, row.document_variant, Variant, Variant.variant_name + ) + if result.type != ResultType.OK: + errors.append(result) + + # validate: geography_iso: str + result = _check_geo_in_db(n, db, row.geography_iso) + if result.type != ResultType.OK: + errors.append(result) + + # validate: Submission type as document type + result = _check_value_in_db( + n, db, row.submission_type, FamilyDocumentType, FamilyDocumentType.name + ) + if result.type != ResultType.OK: + errors.append(result) + + # validate: language: list[str] + + # Check metadata + # validate: author_type: str # METADATA + result, _ = build_unfccc_metadata(taxonomy, row) + if result.type != ResultType.OK: + errors.append(result) + + # Check family + context.consistency_validator.check_family( + row.row_number, + row.cpr_family_id, + row.family_name, + row.family_summary, + errors, + ) + + # Add to the collections that are referenced so we can validate later + context.collection_ids_referenced.append(row.cpr_collection_id) + + if len(errors) > 0: + context.results += errors + else: + context.results.append(Result()) + + +def validate_cclw_document_row( db: Session, - context: IngestContext, - row: DocumentIngestRow, + context: CCLWIngestContext, + row: CCLWDocumentIngestRow, taxonomy: Taxonomy, ) -> None: """ @@ -93,7 +180,7 @@ def validate_document_row( errors.append(result) # Check metadata - result, _ = build_metadata(taxonomy, row) + result, _ = build_cclw_metadata(taxonomy, row) if result.type != ResultType.OK: errors.append(result) diff --git a/app/data_migrations/populate_document_type.py 
b/app/data_migrations/populate_document_type.py index 2e3e4c48..a93f95df 100644 --- a/app/data_migrations/populate_document_type.py +++ b/app/data_migrations/populate_document_type.py @@ -3,17 +3,30 @@ from sqlalchemy.orm import Session from app.db.models.law_policy import FamilyDocumentType -from .utils import has_rows, load_list +from .utils import load_list_idempotent def populate_document_type(db: Session) -> None: """Populates the document_type table with pre-defined data.""" - if has_rows(db, FamilyDocumentType): - return + # This is no longer fixed but additive, + # meaning we will add anything here that is not present in the table with open( "app/data_migrations/data/law_policy/document_type_data.json" - ) as document_type_file: - document_type_data = json.load(document_type_file) - load_list(db, FamilyDocumentType, document_type_data) + ) as submission_type_file: + document_type_data = json.load(submission_type_file) + load_list_idempotent( + db, FamilyDocumentType, FamilyDocumentType.name, "name", document_type_data + ) + + with open( + "app/data_migrations/data/unf3c/submission_type_data.json" + ) as submission_type_file: + submission_type_data = json.load(submission_type_file) + document_type_data = [ + {"name": e["name"], "description": e["name"]} for e in submission_type_data + ] + load_list_idempotent( + db, FamilyDocumentType, FamilyDocumentType.name, "name", document_type_data + ) diff --git a/app/data_migrations/taxonomy_unf3c.py b/app/data_migrations/taxonomy_unf3c.py index d418ffba..1003f419 100644 --- a/app/data_migrations/taxonomy_unf3c.py +++ b/app/data_migrations/taxonomy_unf3c.py @@ -2,12 +2,6 @@ TAXONOMY_DATA = [ - { - "key": "submission_type", - "filename": "app/data_migrations/data/unf3c/submission_type_data.json", - "file_key_path": "name", - "allow_blanks": False, - }, { "key": "author_type", "allow_blanks": False, diff --git a/app/data_migrations/utils.py b/app/data_migrations/utils.py index 7e60fdab..0fc1ad56 100644 --- a/app/data_migrations/utils.py +++ b/app/data_migrations/utils.py @@ -1,4 +1,5 @@ from typing import Mapping, Optional, Sequence, cast +from sqlalchemy import Column from sqlalchemy.orm import Session @@ -52,3 +53,27 @@ def load_list(db: Session, table: AnyModel, data_list: Sequence[Mapping]) -> Non """ for entry in data_list: db.add(table(**entry)) + + +def load_list_idempotent( + db: Session, + table: AnyModel, + unique_column: Column, + data_key: str, + data_list: Sequence[Mapping], +) -> None: + """ + Load a list of data stored as JSON into a database table + + :param [Session] db: An open database session + :param [AnyModel] table: The table (and therefore type) of entries to create + :param [Column] unique_column: The column on the table that has the unique value + :param [str] data_key: The key in the `data_list` objects that relates to the + unique_column. 
+ :param [Sequence[Mapping]] data_list: A list of data objects to load + """ + for entry in data_list: + found = db.query(table).filter(unique_column == entry[data_key]).one_or_none() + if found is None: + db.add(table(**entry)) + db.flush() diff --git a/app/db/models/law_policy/family.py b/app/db/models/law_policy/family.py index 6d9baf26..6c27795b 100644 --- a/app/db/models/law_policy/family.py +++ b/app/db/models/law_policy/family.py @@ -30,6 +30,7 @@ class FamilyCategory(_BaseModelEnum): EXECUTIVE = "Executive" LEGISLATIVE = "Legislative" + UNFCCC = "UNFCCC" class Variant(Base): diff --git a/app/main.py b/app/main.py index 900dfb90..9641d293 100644 --- a/app/main.py +++ b/app/main.py @@ -4,7 +4,7 @@ import json_logging import uvicorn -from fastapi import Depends, FastAPI +from fastapi import APIRouter, Depends, FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi_health import health from fastapi_pagination import add_pagination @@ -12,7 +12,9 @@ from alembic.command import upgrade from alembic.config import Config -from app.api.api_v1.routers.admin import admin_users_router +from app.api.api_v1.routers.cclw_ingest import cclw_ingest_router +from app.api.api_v1.routers.unfccc_ingest import unfccc_ingest_router +from app.api.api_v1.routers.admin import admin_document_router from app.api.api_v1.routers.auth import auth_router from app.api.api_v1.routers.documents import documents_router from app.api.api_v1.routers.lookups import lookups_router @@ -101,9 +103,15 @@ async def root(): return {"message": "CPR API v1"} -# Routers +# Create an admin router that is a combination of: +admin_router = APIRouter() +admin_router.include_router(cclw_ingest_router) +admin_router.include_router(unfccc_ingest_router) +admin_router.include_router(admin_document_router) + +# App Routers app.include_router( - admin_users_router, + admin_router, prefix="/api/v1/admin", tags=["Admin"], dependencies=[Depends(get_superuser_details)], diff --git a/scripts/data_ingest/main.py b/scripts/data_ingest/main.py index 86b8123f..959bc110 100644 --- a/scripts/data_ingest/main.py +++ b/scripts/data_ingest/main.py @@ -12,7 +12,7 @@ ADMIN_EMAIL_ENV = "SUPERUSER_EMAIL" ADMIN_PASSWORD_ENV = "SUPERUSER_PASSWORD" ADMIN_TOKEN_ENV = "SUPERUSER_TOKEN" -BULK_IMPORT_ENDPOINT = "api/v1/admin/bulk-imports/cclw/law-policy" +BULK_IMPORT_ENDPOINT = "api/v1/admin/bulk-imports/cclw" DEFAULT_LOGGING = { diff --git a/scripts/validate_cclw_sheet/validate-ingest.sh b/scripts/validate_cclw_sheet/validate-ingest.sh index a3f3bcd0..c64d218d 100755 --- a/scripts/validate_cclw_sheet/validate-ingest.sh +++ b/scripts/validate_cclw_sheet/validate-ingest.sh @@ -33,7 +33,7 @@ validate_csv() { curl -s \ -H "Authorization: Bearer ${TOKEN}" \ -F "law_policy_csv=@${CSV_FILE}" \ - ${TEST_HOST}/api/v1/admin/bulk-ingest/validate/cclw/law-policy | jq + ${TEST_HOST}/api/v1/admin/bulk-ingest/validate/cclw | jq } echo "Validating as ${USER}" diff --git a/tests/core/ingestion/test_ingest_row.py b/tests/core/ingestion/test_cclw_ingest_row.py similarity index 87% rename from tests/core/ingestion/test_ingest_row.py rename to tests/core/ingestion/test_cclw_ingest_row.py index 2bf827dd..eca0370c 100644 --- a/tests/core/ingestion/test_ingest_row.py +++ b/tests/core/ingestion/test_cclw_ingest_row.py @@ -1,9 +1,9 @@ import pytest from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.processor import ingest_document_row -from app.core.ingestion.utils import IngestContext +from 
app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.processor import ingest_cclw_document_row +from app.core.ingestion.utils import CCLWIngestContext from app.db.models.document.physical_document import PhysicalDocument from app.db.models.law_policy.collection import ( Collection, @@ -31,10 +31,10 @@ def setup_for_update(test_db): - context = IngestContext() - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + context = CCLWIngestContext() + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) populate_for_ingest(test_db) - ingest_document_row(test_db, context, row) + ingest_cclw_document_row(test_db, context, row) return context, row @@ -46,21 +46,21 @@ def assert_dfc(db: Session, n_docs: int, n_families: int, n_collections: int): def test_ingest_row__with_multiple_rows(test_db: Session): - context = IngestContext() - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + context = CCLWIngestContext() + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.cpr_family_id = "CCLW.family.test.1" row.cpr_family_slug = "fam-test-1" populate_for_ingest(test_db) # First row - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert 9 == len(result.keys()) assert_dfc(test_db, 1, 1, 1) # Second row - adds another document to family row.cpr_document_id = "CCLW.doc.test.1" row.cpr_document_slug = "doc-test-1" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert 3 == len(result.keys()) assert_dfc(test_db, 2, 1, 1) @@ -69,21 +69,21 @@ def test_ingest_row__with_multiple_rows(test_db: Session): row.cpr_family_slug = "fam-test-2" row.cpr_document_id = "CCLW.doc.test.2" row.cpr_document_slug = "doc-test-2" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert 7 == len(result.keys()) assert_dfc(test_db, 3, 2, 1) # Forth - adds another document to the family row.cpr_document_id = "CCLW.doc.test.3" row.cpr_document_slug = "doc-test-3" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert 3 == len(result.keys()) assert_dfc(test_db, 4, 2, 1) # Finally change the family id of the document just added row.cpr_family_id = "CCLW.family.test.1" row.cpr_family_slug = "fam-test-1" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert 1 == len(result.keys()) assert_dfc(test_db, 4, 2, 1) @@ -107,10 +107,10 @@ def test_ingest_row__with_multiple_rows(test_db: Session): def test_ingest_row__creates_missing_documents(test_db: Session): - context = IngestContext() - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + context = CCLWIngestContext() + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) populate_for_ingest(test_db) - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) actual_keys = set(result.keys()) expected_keys = set( [ @@ -155,7 +155,7 @@ def test_ingest_row__creates_missing_documents(test_db: Session): def test_ingest_row__idempotent(test_db: Session): context, row = setup_for_update(test_db) - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 0 # Assert db objects @@ -217,7 +217,7 @@ def 
test_ingest_row__updates_collection_name(test_db: Session): context, row = setup_for_update(test_db) row.collection_name = "changed" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "collection" in result assert result["collection"]["title"] == "changed" @@ -236,7 +236,7 @@ def test_ingest_row__updates_collection_summary(test_db: Session): context, row = setup_for_update(test_db) row.collection_summary = "changed" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "collection" in result assert result["collection"]["description"] == "changed" @@ -255,7 +255,7 @@ def test_ingest_row__updates_document_title(test_db: Session): context, row = setup_for_update(test_db) row.document_title = "changed" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "physical_document" in result assert result["physical_document"]["title"] == "changed" @@ -269,7 +269,7 @@ def test_ingest_row__updates_family_name(test_db: Session): context, row = setup_for_update(test_db) row.family_name = "changed" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family" in result assert result["family"]["title"] == "changed" @@ -288,7 +288,7 @@ def test_ingest_row__updates_family_summary(test_db: Session): context, row = setup_for_update(test_db) row.family_summary = "changed" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family" in result assert result["family"]["description"] == "changed" @@ -307,7 +307,7 @@ def test_ingest_row__updates_family_document_role(test_db: Session): context, row = setup_for_update(test_db) row.document_role = "ANNEX" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family_document" in result assert result["family_document"]["document_role"] == "ANNEX" @@ -326,7 +326,7 @@ def test_ingest_row__updates_family_document_variant(test_db: Session): context, row = setup_for_update(test_db) row.document_variant = "Translation" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family_document" in result assert result["family_document"]["variant_name"] == "Translation" @@ -345,7 +345,7 @@ def test_ingest_row__updates_source_url(test_db: Session): context, row = setup_for_update(test_db) row.documents = "https://www.com" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "physical_document" in result assert result["physical_document"]["source_url"] == "https://www.com" @@ -370,7 +370,7 @@ def test_ingest_row__updates_family_category(test_db: Session): context, row = setup_for_update(test_db) row.category = FamilyCategory.LEGISLATIVE - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family" in result assert result["family"]["family_category"] == "Legislative" @@ -389,7 +389,7 @@ def test_ingest_row__updates_family_document_type(test_db: Session): context, row = 
setup_for_update(test_db) row.document_type = "Edict" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family_document" in result assert result["family_document"]["document_type"] == "Edict" @@ -408,7 +408,7 @@ def test_ingest_row__updates_fd_slug(test_db: Session): context, row = setup_for_update(test_db) row.cpr_document_slug = "changed" - result = ingest_document_row(test_db, context, row) + result = ingest_cclw_document_row(test_db, context, row) assert len(result) == 1 assert "family_document_slug" in result assert result["family_document_slug"]["name"] == "changed" @@ -429,7 +429,7 @@ def test_ingest_row__updates_fd_slug(test_db: Session): def test_IngestRow__from_row(): - ingest_row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + ingest_row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) assert ingest_row assert ingest_row.cpr_document_id == "CCLW.executive.1001.0" @@ -437,7 +437,7 @@ def test_IngestRow__from_row(): def test_IngestRow__from_row_raises_when_multi_urls(): - ingest_row = DocumentIngestRow.from_row( + ingest_row = CCLWDocumentIngestRow.from_row( 1, get_doc_ingest_row_data(0, contents=BAD_MULTI_URL) ) diff --git a/tests/core/ingestion/test_collection.py b/tests/core/ingestion/test_collection.py index e0f9872b..c44351cc 100644 --- a/tests/core/ingestion/test_collection.py +++ b/tests/core/ingestion/test_collection.py @@ -1,7 +1,7 @@ from typing import cast from sqlalchemy.orm import Session -from app.core.ingestion.collection import handle_collection_from_row -from app.core.ingestion.ingest_row import DocumentIngestRow +from app.core.ingestion.cclw.collection import handle_collection_from_row +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow from app.core.ingestion.utils import get_or_create from app.db.models.law_policy.collection import ( Collection, @@ -21,7 +21,7 @@ def db_setup(test_db): populate_for_ingest(test_db) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) family = get_or_create( test_db, Family, diff --git a/tests/core/ingestion/test_dfc_processor.py b/tests/core/ingestion/test_dfc_processor.py index bd4d3484..63a6905f 100644 --- a/tests/core/ingestion/test_dfc_processor.py +++ b/tests/core/ingestion/test_dfc_processor.py @@ -1,7 +1,7 @@ -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.processor import get_dfc_ingestor -from app.core.ingestion.reader import read -from app.core.ingestion.utils import IngestContext, ResultType +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.processor import get_document_ingestor +from app.core.ingestion.cclw.reader import read +from app.core.ingestion.utils import CCLWIngestContext, ResultType from app.db.models.law_policy.family import FamilyDocument from tests.core.ingestion.helpers import ( THREE_DOC_ROWS, @@ -10,25 +10,25 @@ ) -def test_dfc_ingestor__three_good_rows(test_db): +def test_cclw_ingestor__three_good_rows(test_db): populate_for_ingest(test_db) test_db.commit() - context = IngestContext() - document_ingestor = get_dfc_ingestor(test_db) + context = CCLWIngestContext() + document_ingestor = get_document_ingestor(test_db, context) - read(THREE_DOC_ROWS, context, DocumentIngestRow, document_ingestor) + read(THREE_DOC_ROWS, context, CCLWDocumentIngestRow, document_ingestor) assert 
len(context.results) == 0 assert 3 == test_db.query(FamilyDocument).count() -def test_dfc_ingestor__second_bad_row(test_db): +def test_cclw_ingestor__second_bad_row(test_db): populate_for_ingest(test_db) test_db.commit() - context = IngestContext() - document_ingestor = get_dfc_ingestor(test_db) + context = CCLWIngestContext() + document_ingestor = get_document_ingestor(test_db, context) - read(THREE_DOC_ROWS_2ND_BAD, context, DocumentIngestRow, document_ingestor) + read(THREE_DOC_ROWS_2ND_BAD, context, CCLWDocumentIngestRow, document_ingestor) assert len(context.results) == 1 assert context.results[0].type == ResultType.ERROR diff --git a/tests/core/ingestion/test_event.py b/tests/core/ingestion/test_event.py index bfa1d35b..37e1253d 100644 --- a/tests/core/ingestion/test_event.py +++ b/tests/core/ingestion/test_event.py @@ -2,9 +2,12 @@ from sqlalchemy.orm import Session -from app.core.ingestion.event import family_event_from_row -from app.core.ingestion.family import handle_family_from_row -from app.core.ingestion.ingest_row import DocumentIngestRow, EventIngestRow +from app.core.ingestion.cclw.event import family_event_from_row +from app.core.ingestion.cclw.family import handle_family_from_row +from app.core.ingestion.cclw.ingest_row_cclw import ( + CCLWDocumentIngestRow, + EventIngestRow, +) from app.db.models.law_policy.family import Family, FamilyEvent from tests.core.ingestion.helpers import ( EVENT_IMPORT_ID, @@ -17,7 +20,7 @@ def test_family_event_from_row(test_db: Session): populate_for_ingest(test_db) - doc_row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + doc_row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) event_row = EventIngestRow.from_row(1, get_event_ingest_row_data(0)) result = {} @@ -38,7 +41,7 @@ def test_family_event_from_row(test_db: Session): def test_family_multiple_events_from_row(test_db: Session): populate_for_ingest(test_db) - doc_row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + doc_row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) event_row_1 = EventIngestRow.from_row(1, get_event_ingest_row_data(0)) event_row_2 = EventIngestRow.from_row(2, get_event_ingest_row_data(1)) diff --git a/tests/core/ingestion/test_family.py b/tests/core/ingestion/test_family.py index 46c86376..50b01319 100644 --- a/tests/core/ingestion/test_family.py +++ b/tests/core/ingestion/test_family.py @@ -1,10 +1,10 @@ from sqlalchemy.orm import Session -from app.core.ingestion.family import ( +from app.core.ingestion.cclw.family import ( handle_family_document_from_row, handle_family_from_row, ) -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.physical_document import create_physical_document_from_row +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.cclw.physical_document import create_physical_document_from_row from app.db.models.law_policy.family import ( DocumentStatus, Family, @@ -26,7 +26,7 @@ def test_family_from_row__creates(test_db: Session): populate_for_ingest(test_db) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) result = {} family = handle_family_from_row(test_db, row, org_id=1, result=result) @@ -58,7 +58,7 @@ def test_family_from_row__creates(test_db: Session): def test_family_from_row__updates(test_db: Session): populate_for_ingest(test_db) result = {} - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = 
CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) # Pre-Add the family category = FamilyCategory(row.category.upper()) test_db.add( @@ -95,7 +95,7 @@ def test_family_from_row__updates(test_db: Session): def test_family_document_from_row__creates(test_db: Session): populate_for_ingest(test_db) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) family = add_a_family(test_db) result = {} family_document = handle_family_document_from_row( @@ -123,7 +123,7 @@ def test_family_document_from_row__creates(test_db: Session): def test_family_document_from_row__updates(test_db: Session): populate_for_ingest(test_db) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) family = add_a_family(test_db) result = {} handle_family_document_from_row(test_db, row, family, result=result) diff --git a/tests/core/ingestion/test_metadata.py b/tests/core/ingestion/test_metadata.py index 75513ba1..c7b6a118 100644 --- a/tests/core/ingestion/test_metadata.py +++ b/tests/core/ingestion/test_metadata.py @@ -1,8 +1,8 @@ import pytest from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.metadata import build_metadata +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.cclw.metadata import build_cclw_metadata from app.core.organisation import get_organisation_taxonomy from app.core.ingestion.utils import ResultType from tests.core.ingestion.helpers import ( @@ -16,7 +16,7 @@ def test_build_metadata__all_fields(test_db): populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, org_id=1) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.responses = ["Loss AND Damage"] row.natural_hazards = ["Flood"] row.sectors = ["TransPortation"] @@ -25,7 +25,7 @@ def test_build_metadata__all_fields(test_db): row.instruments = ["Other|Governance"] row.document_type = "Act" - result, metadata = build_metadata(taxonomy, row) + result, metadata = build_cclw_metadata(taxonomy, row) assert result assert result.type == ResultType.RESOLVED @@ -67,10 +67,10 @@ def test_get_org_taxonomy__raises_on_no_organisation(test_db: Session): def test_build_metadata__error_when_sector_notfound(test_db): populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, org_id=1) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.sectors = ["Medical"] - result, metadata = build_metadata(taxonomy, row) + result, metadata = build_cclw_metadata(taxonomy, row) assert result assert result.type == ResultType.ERROR @@ -86,10 +86,10 @@ def test_build_metadata__error_when_sector_notfound(test_db): def test_build_metadata__reports_when_resolved(test_db): populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, org_id=1) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.sectors = ["Building"] - result, metadata = build_metadata(taxonomy, row) + result, metadata = build_cclw_metadata(taxonomy, row) assert result assert result.type == ResultType.RESOLVED diff --git a/tests/core/ingestion/test_physical_document.py 
b/tests/core/ingestion/test_physical_document.py index 78c1bcd6..5db3e2fb 100644 --- a/tests/core/ingestion/test_physical_document.py +++ b/tests/core/ingestion/test_physical_document.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.physical_document import create_physical_document_from_row +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.cclw.physical_document import create_physical_document_from_row from app.db.models.document import PhysicalDocument from app.db.models.document.physical_document import PhysicalDocumentLanguage from tests.core.ingestion.helpers import ( @@ -13,7 +13,7 @@ def test_physical_document_from_row(test_db: Session): populate_for_ingest(test_db) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.language = ["English", "German"] result = {} diff --git a/tests/core/ingestion/test_pipeline.py b/tests/core/ingestion/test_pipeline.py index 136b019b..111fff57 100644 --- a/tests/core/ingestion/test_pipeline.py +++ b/tests/core/ingestion/test_pipeline.py @@ -5,11 +5,14 @@ import pytest from sqlalchemy.orm import Session -from app.core.ingestion.ingest_row import DocumentIngestRow, EventIngestRow -from app.core.ingestion.pipeline import generate_pipeline_ingest_input -from app.core.ingestion.processor import get_dfc_ingestor, get_event_ingestor -from app.core.ingestion.reader import read -from app.core.ingestion.utils import IngestContext +from app.core.ingestion.cclw.ingest_row_cclw import ( + CCLWDocumentIngestRow, + EventIngestRow, +) +from app.core.ingestion.cclw.pipeline import generate_pipeline_ingest_input +from app.core.ingestion.processor import get_document_ingestor, get_event_ingestor +from app.core.ingestion.cclw.reader import read +from app.core.ingestion.utils import CCLWIngestContext from tests.core.ingestion.helpers import ( FIVE_EVENT_ROWS, THREE_DOC_ROWS, @@ -26,11 +29,11 @@ def _populate_db_for_test( ) -> None: populate_for_ingest(test_db) test_db.commit() - context = IngestContext() - document_ingestor = get_dfc_ingestor(test_db) + context = CCLWIngestContext() + document_ingestor = get_document_ingestor(test_db, context) event_ingestor = get_event_ingestor(test_db) - read(ingest_doc_content, context, DocumentIngestRow, document_ingestor) + read(ingest_doc_content, context, CCLWDocumentIngestRow, document_ingestor) if ingest_event_content is not None: read(ingest_event_content, context, EventIngestRow, event_ingestor) diff --git a/tests/core/ingestion/test_read.py b/tests/core/ingestion/test_read.py index 831f4108..31bb9ea3 100644 --- a/tests/core/ingestion/test_read.py +++ b/tests/core/ingestion/test_read.py @@ -1,8 +1,8 @@ from unittest.mock import MagicMock import pytest -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.reader import read -from app.core.ingestion.utils import IngestContext +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.cclw.reader import read +from app.core.ingestion.utils import CCLWIngestContext from app.core.validation.types import ImportSchemaMismatchError from tests.core.ingestion.helpers import ( ALPHABETICAL_DOC_COLUMNS, @@ -12,11 +12,11 @@ def test_read__raises_with_no_contents(): - context = IngestContext() + context = CCLWIngestContext() process = MagicMock() with pytest.raises(ImportSchemaMismatchError) as 
e_info: contents = "" - read(contents, context, DocumentIngestRow, process) + read(contents, context, CCLWDocumentIngestRow, process) assert len(context.results) == 0 assert ( @@ -27,12 +27,12 @@ def test_read__raises_with_no_contents(): def test_read__raises_with_wrong_fields(): - context = IngestContext() + context = CCLWIngestContext() process = MagicMock() with pytest.raises(ImportSchemaMismatchError) as e_info: contents = """a,b,c 1,2,3""" - read(contents, context, DocumentIngestRow, process) + read(contents, context, CCLWDocumentIngestRow, process) assert len(context.results) == 0 assert ( @@ -43,10 +43,10 @@ def test_read__raises_with_wrong_fields(): def test_read__raises_with_missing_field(): - context = IngestContext() + context = CCLWIngestContext() process = MagicMock() with pytest.raises(ImportSchemaMismatchError) as e_info: - read(THREE_DOC_ROWS_MISSING_FIELD, context, DocumentIngestRow, process) + read(THREE_DOC_ROWS_MISSING_FIELD, context, CCLWDocumentIngestRow, process) assert len(context.results) == 0 assert ( @@ -57,9 +57,9 @@ def test_read__raises_with_missing_field(): def test_read__processes_all_rows(): - context = IngestContext() + context = CCLWIngestContext() process = MagicMock() - read(THREE_DOC_ROWS, context, DocumentIngestRow, process) + read(THREE_DOC_ROWS, context, CCLWDocumentIngestRow, process) expected_rows = 3 assert process.call_count == expected_rows diff --git a/tests/core/ingestion/test_unfccc_ingest_row.py b/tests/core/ingestion/test_unfccc_ingest_row.py new file mode 100644 index 00000000..a44fe0dd --- /dev/null +++ b/tests/core/ingestion/test_unfccc_ingest_row.py @@ -0,0 +1,182 @@ +# from sqlalchemy.orm import Session +# from app.core.ingestion.processor import ingest_unfccc_document_row +# from app.core.ingestion.unfccc.ingest_row_unfccc import UNFCCCDocumentIngestRow +# from app.core.ingestion.utils import UNFCCCIngestContext +# from app.db.models.document.physical_document import PhysicalDocument +# from app.db.models.law_policy.collection import ( +# Collection, +# CollectionFamily, +# CollectionOrganisation, +# ) +# from app.db.models.law_policy.family import ( +# Family, +# FamilyDocument, +# FamilyOrganisation, +# Slug, +# ) +# from tests.core.ingestion.helpers import ( +# COLLECTION_IMPORT_ID, +# DOCUMENT_IMPORT_ID, +# DOCUMENT_TITLE, +# FAMILY_IMPORT_ID, +# SLUG_DOCUMENT_NAME, +# SLUG_FAMILY_NAME, +# get_doc_ingest_row_data, +# populate_for_ingest, +# ) + +# FIXME: All this file needs attention + + +# def setup_for_update(test_db): +# context = UNFCCCIngestContext() +# row = UNFCCCDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) +# populate_for_ingest(test_db) +# ingest_unfccc_document_row(test_db, context, row) +# return context, row + + +# def assert_dfc(db: Session, n_docs: int, n_families: int, n_collections: int): +# assert n_docs == db.query(FamilyDocument).count() +# assert n_docs == db.query(PhysicalDocument).count() +# assert n_families == db.query(Family).count() +# assert n_collections == db.query(Collection).count() + + +# def test_ingest_row__with_multiple_rows(test_db: Session): +# context = UNFCCCIngestContext() +# row = UNFCCCDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) +# row.cpr_family_id = "UNFCCC.family.test.1" +# row.cpr_family_slug = "fam-test-1" +# populate_for_ingest(test_db) + +# # First row +# result = ingest_unfccc_document_row(test_db, context, row) +# assert 9 == len(result.keys()) +# assert_dfc(test_db, 1, 1, 1) + +# # Second row - adds another document to family +# row.cpr_document_id = 
"UNFCCC.doc.test.1" +# row.cpr_document_slug = "doc-test-1" +# result = ingest_unfccc_document_row(test_db, context, row) +# assert 3 == len(result.keys()) +# assert_dfc(test_db, 2, 1, 1) + +# # Third row - adds another family and document +# row.cpr_family_id = "UNFCCC.family.test.2" +# row.cpr_family_slug = "fam-test-2" +# row.cpr_document_id = "UNFCCC.doc.test.2" +# row.cpr_document_slug = "doc-test-2" +# result = ingest_unfccc_document_row(test_db, context, row) +# assert 7 == len(result.keys()) +# assert_dfc(test_db, 3, 2, 1) + +# # Forth - adds another document to the family +# row.cpr_document_id = "UNFCCC.doc.test.3" +# row.cpr_document_slug = "doc-test-3" +# result = ingest_unfccc_document_row(test_db, context, row) +# assert 3 == len(result.keys()) +# assert_dfc(test_db, 4, 2, 1) + +# # Finally change the family id of the document just added +# row.cpr_family_id = "UNFCCC.family.test.1" +# row.cpr_family_slug = "fam-test-1" +# result = ingest_unfccc_document_row(test_db, context, row) +# assert 1 == len(result.keys()) +# assert_dfc(test_db, 4, 2, 1) + +# # Now assert both families have correct documents +# assert ( +# 3 +# == test_db.query(FamilyDocument) +# .filter_by(family_import_id="UNFCCC.family.test.1") +# .count() +# ) +# assert ( +# 1 +# == test_db.query(FamilyDocument) +# .filter_by(family_import_id="UNFCCC.family.test.2") +# .count() +# ) + +# # Now assert collection has 2 families +# assert 1 == test_db.query(Collection).count() +# assert 2 == test_db.query(CollectionFamily).count() + + +# def test_ingest_row__creates_missing_documents(test_db: Session): +# context = UNFCCCIngestContext() +# row = UNFCCCDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) +# populate_for_ingest(test_db) +# result = ingest_unfccc_document_row(test_db, context, row) +# actual_keys = set(result.keys()) +# expected_keys = set( +# [ +# "family_slug", +# "family_organisation", +# "family", +# "physical_document", +# "family_document", +# "family_document_slug", +# "collection", +# "collection_organisation", +# "collection_family", +# ] +# ) +# assert actual_keys.symmetric_difference(expected_keys) == set([]) +# # Assert db objects +# assert test_db.query(Slug).filter_by(name=SLUG_FAMILY_NAME).one() +# assert ( +# test_db.query(FamilyOrganisation) +# .filter_by(family_import_id=FAMILY_IMPORT_ID) +# .one() +# ) +# assert test_db.query(Family).filter_by(import_id=FAMILY_IMPORT_ID).one() +# assert test_db.query(PhysicalDocument).filter_by(title=DOCUMENT_TITLE).one() +# assert test_db.query(FamilyDocument).filter_by(import_id=DOCUMENT_IMPORT_ID).one() +# assert test_db.query(Slug).filter_by(name=SLUG_DOCUMENT_NAME).one() +# assert test_db.query(Collection).filter_by(import_id=COLLECTION_IMPORT_ID).one() +# assert ( +# test_db.query(CollectionOrganisation) +# .filter_by(collection_import_id=COLLECTION_IMPORT_ID) +# .one() +# ) +# assert ( +# test_db.query(CollectionFamily) +# .filter_by( +# collection_import_id=COLLECTION_IMPORT_ID, family_import_id=FAMILY_IMPORT_ID +# ) +# .one() +# ) + + +# def test_ingest_row__idempotent(test_db: Session): +# context, row = setup_for_update(test_db) + +# result = ingest_unfccc_document_row(test_db, context, row) +# assert len(result) == 0 + +# # Assert db objects +# assert test_db.query(Slug).filter_by(name=SLUG_FAMILY_NAME).one() +# assert ( +# test_db.query(FamilyOrganisation) +# .filter_by(family_import_id=FAMILY_IMPORT_ID) +# .one() +# ) +# assert test_db.query(Family).filter_by(import_id=FAMILY_IMPORT_ID).one() +# assert 
test_db.query(PhysicalDocument).filter_by(title=DOCUMENT_TITLE).one() +# assert test_db.query(FamilyDocument).filter_by(import_id=DOCUMENT_IMPORT_ID).one() +# assert test_db.query(Slug).filter_by(name=SLUG_DOCUMENT_NAME).one() +# assert test_db.query(Collection).filter_by(import_id=COLLECTION_IMPORT_ID).one() +# assert ( +# test_db.query(CollectionOrganisation) +# .filter_by(collection_import_id=COLLECTION_IMPORT_ID) +# .one() +# ) +# assert ( +# test_db.query(CollectionFamily) +# .filter_by( +# collection_import_id=COLLECTION_IMPORT_ID, family_import_id=FAMILY_IMPORT_ID +# ) +# .one() +# ) diff --git a/tests/core/ingestion/test_validate_row.py b/tests/core/ingestion/test_validate_row.py index 507a1115..120f18e2 100644 --- a/tests/core/ingestion/test_validate_row.py +++ b/tests/core/ingestion/test_validate_row.py @@ -1,6 +1,6 @@ -from app.core.ingestion.ingest_row import DocumentIngestRow -from app.core.ingestion.utils import IngestContext, ResultType -from app.core.ingestion.validator import validate_document_row +from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow +from app.core.ingestion.utils import CCLWIngestContext, ResultType +from app.core.ingestion.validator import validate_cclw_document_row from app.core.organisation import get_organisation_taxonomy from tests.core.ingestion.helpers import ( @@ -10,13 +10,13 @@ def test_validate_row__fails_bad_geography_iso(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.geography_iso = "XXX" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -25,13 +25,13 @@ def test_validate_row__fails_bad_geography_iso(test_db): def test_validate_row__fails_empty_geography_iso(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.geography_iso = "" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -40,14 +40,14 @@ def test_validate_row__fails_empty_geography_iso(test_db): def test_validate_row__consistent_family_and_collection(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) context.results = [] - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -55,15 +55,15 @@ def test_validate_row__consistent_family_and_collection(test_db): def 
test_validate_row__family_name_change(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) context.results = [] row.family_name = "changed" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -73,15 +73,15 @@ def test_validate_row__family_name_change(test_db): def test_validate_row__family_summary_change(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) context.results = [] row.family_summary = "changed" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -91,15 +91,15 @@ def test_validate_row__family_summary_change(test_db): def test_validate_row__collection_name_change(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) context.results = [] row.collection_name = "changed" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -109,15 +109,15 @@ def test_validate_row__collection_name_change(test_db): def test_validate_row__collection_summary_change(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) context.results = [] row.collection_summary = "changed" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -127,12 +127,12 @@ def test_validate_row__collection_summary_change(test_db): def test_validate_row__good_data(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = 
get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -140,13 +140,13 @@ def test_validate_row__good_data(test_db): def test_validate_row__bad_data(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.sectors = ["fish"] - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -154,13 +154,13 @@ def test_validate_row__bad_data(test_db): def test_validate_row__resolvable_data(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.sectors = ["TranSPORtation"] - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -168,13 +168,13 @@ def test_validate_row__resolvable_data(test_db): def test_validate_row__bad_document_type(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.document_type = "fish" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -182,13 +182,13 @@ def test_validate_row__bad_document_type(test_db): def test_validate_row__good_document_type(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.document_type = "Order" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -196,13 +196,13 @@ def test_validate_row__good_document_type(test_db): def test_validate_row__bad_document_role(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.document_role = "fish" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + 
validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -210,13 +210,13 @@ def test_validate_row__bad_document_role(test_db): def test_validate_row__good_document_role(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.document_role = "MAIN" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -224,13 +224,13 @@ def test_validate_row__good_document_role(test_db): def test_validate_row__bad_document_variant(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.document_variant = "fish" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 @@ -238,13 +238,13 @@ def test_validate_row__bad_document_variant(test_db): def test_validate_row__good_document_variant(test_db): - context = IngestContext() + context = CCLWIngestContext() populate_for_ingest(test_db) _, taxonomy = get_organisation_taxonomy(test_db, context.org_id) - row = DocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) + row = CCLWDocumentIngestRow.from_row(1, get_doc_ingest_row_data(0)) row.document_variant = "Translation" - validate_document_row(test_db, context=context, row=row, taxonomy=taxonomy) + validate_cclw_document_row(test_db, context=context, row=row, taxonomy=taxonomy) assert context.results assert len(context.results) == 1 diff --git a/tests/data_migrations/test_populate_taxonomy.py b/tests/data_migrations/test_populate_taxonomy.py index 244ad407..b8cf9893 100644 --- a/tests/data_migrations/test_populate_taxonomy.py +++ b/tests/data_migrations/test_populate_taxonomy.py @@ -46,13 +46,12 @@ def test_populate_taxonomy_unf3c_correct_counts(test_db): populate_taxonomy(test_db) taxonomy = get_organisation_taxonomy_by_name(test_db, "UNFCCC") - assert 3 == len(taxonomy) + assert 2 == len(taxonomy) assert "event_types" in taxonomy assert 17 == len(taxonomy["event_types"]["allowed_values"]) - assert "submission_type" in taxonomy - assert 24 == len(taxonomy["submission_type"]["allowed_values"]) + assert "submission_type" not in taxonomy assert "author_type" in taxonomy assert 2 == len(taxonomy["author_type"]["allowed_values"]) diff --git a/tests/routes/document_helpers.py b/tests/routes/document_helpers.py index c2b9b4c5..e08531af 100644 --- a/tests/routes/document_helpers.py +++ b/tests/routes/document_helpers.py @@ -1,4 +1,4 @@ -from app.api.api_v1.routers.admin import _start_ingest +from app.api.api_v1.routers.cclw_ingest import _start_ingest from app.data_migrations import ( populate_document_role, populate_document_type, diff --git a/tests/routes/test_admin.py b/tests/routes/test_admin_cclw.py similarity index 82% rename from tests/routes/test_admin.py rename to tests/routes/test_admin_cclw.py index d6257c01..e6a068a3 
100644 --- a/tests/routes/test_admin.py +++ b/tests/routes/test_admin_cclw.py @@ -12,13 +12,13 @@ def test_unauthenticated_ingest(client): - response = client.post("/api/v1/admin/bulk-ingest/cclw/law-policy") + response = client.post("/api/v1/admin/bulk-ingest/cclw") assert response.status_code == 401 def test_unauthorized_ingest(client): response = client.post( - "/api/v1/admin/bulk-ingest/cclw/law-policy", + "/api/v1/admin/bulk-ingest/cclw", ) assert response.status_code == 401 @@ -38,11 +38,11 @@ def test_validate_bulk_ingest_cclw_law_policy( superuser_token_headers, test_db, ): - populate_taxonomy(db=test_db) + populate_taxonomy(test_db) populate_geography(test_db) - populate_document_type(db=test_db) - populate_document_role(db=test_db) - populate_document_variant(db=test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) test_db.commit() law_policy_csv_file = BytesIO(ONE_DFC_ROW.encode("utf8")) files = { @@ -54,7 +54,7 @@ def test_validate_bulk_ingest_cclw_law_policy( ), } response = client.post( - "/api/v1/admin/bulk-ingest/validate/cclw/law-policy", + "/api/v1/admin/bulk-ingest/validate/cclw", files=files, headers=superuser_token_headers, ) @@ -73,14 +73,16 @@ def test_bulk_ingest_cclw_law_policy( test_db, mocker, ): - mock_start_import = mocker.patch("app.api.api_v1.routers.admin._start_ingest") - mock_write_csv_to_s3 = mocker.patch("app.api.api_v1.routers.admin.write_csv_to_s3") - - populate_geography(db=test_db) - populate_taxonomy(db=test_db) - populate_document_type(db=test_db) - populate_document_role(db=test_db) - populate_document_variant(db=test_db) + mock_start_import = mocker.patch("app.api.api_v1.routers.cclw_ingest._start_ingest") + mock_write_csv_to_s3 = mocker.patch( + "app.api.api_v1.routers.cclw_ingest.write_csv_to_s3" + ) + + populate_geography(test_db) + populate_taxonomy(test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) test_db.commit() law_policy_csv_file = BytesIO(ONE_DFC_ROW.encode("utf8")) @@ -100,7 +102,7 @@ def test_bulk_ingest_cclw_law_policy( ), } response = client.post( - "/api/v1/admin/bulk-ingest/cclw/law-policy", + "/api/v1/admin/bulk-ingest/cclw", files=files, headers=superuser_token_headers, ) diff --git a/tests/routes/test_admin_unfccc.py b/tests/routes/test_admin_unfccc.py new file mode 100644 index 00000000..cfaa5ea7 --- /dev/null +++ b/tests/routes/test_admin_unfccc.py @@ -0,0 +1,218 @@ +from io import BytesIO + +from app.data_migrations import ( + populate_document_role, + populate_document_type, + populate_document_variant, + populate_geography, + populate_taxonomy, +) + + +def test_unauthenticated_ingest(client): + response = client.post("/api/v1/admin/bulk-ingest/unfccc") + assert response.status_code == 401 + + +def test_unauthorized_validation(client): + response = client.post( + "/api/v1/admin/bulk-ingest/validate/unfccc", + ) + assert response.status_code == 401 + + +MISSING_COLL_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug +Cat1,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. 
Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug +""" + +ONE_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug +Cat1,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug +""" + +ZERO_COLLECTION_ROW = """CPR Collection ID,Collection name,Collection summary +""" + +ONE_COLLECTION_ROW = """CPR Collection ID,Collection name,Collection summary +UNFCCC.Collection.Found,Collection One,Everything to do with testing +""" + + +def test_validate_unfccc_works( + client, + superuser_token_headers, + test_db, +): + populate_taxonomy(test_db) + populate_geography(test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) + test_db.commit() + unfccc_data_csv = BytesIO(ONE_UNFCCC_ROW.encode("utf8")) + collection_csv = BytesIO(ONE_COLLECTION_ROW.encode("utf8")) + files = { + "unfccc_data_csv": ( + "unfccc_data_csv.csv", + unfccc_data_csv, + "text/csv", + {"Expires": "0"}, + ), + "collection_csv": ( + "collection_csv.csv", + collection_csv, + "text/csv", + {"Expires": "0"}, + ), + } + response = client.post( + "/api/v1/admin/bulk-ingest/validate/unfccc", + files=files, + headers=superuser_token_headers, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json["errors"]) == 0 + assert ( + response_json["message"] + == "UNFCCC validation result: 1 Rows, 0 Failures, 0 Resolved" + ) + + +def test_validate_unfccc_fails_missing_defined_collection( + client, + superuser_token_headers, + test_db, +): + populate_taxonomy(test_db) + populate_geography(test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) + test_db.commit() + unfccc_data_csv = BytesIO(ONE_UNFCCC_ROW.encode("utf8")) + collection_csv = BytesIO(ZERO_COLLECTION_ROW.encode("utf8")) + files = { + "unfccc_data_csv": ( + "unfccc_data_csv.csv", + unfccc_data_csv, + "text/csv", + {"Expires": "0"}, + ), + "collection_csv": ( + "collection_csv.csv", + collection_csv, + "text/csv", + {"Expires": "0"}, + ), + } + response = client.post( + "/api/v1/admin/bulk-ingest/validate/unfccc", + files=files, + headers=superuser_token_headers, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json["errors"]) == 1 + assert response_json["errors"][0] == { + "details": "The following Collection IDs were referenced and not defined: ['UNFCCC.Collection.Found']", + "type": "Error", + } + assert ( + response_json["message"] + == "UNFCCC validation result: 1 Rows, 1 Failures, 0 Resolved" + ) + + +def test_validate_unfccc_fails_missing_referenced_collection( + client, + superuser_token_headers, + test_db, +): + populate_taxonomy(test_db) + populate_geography(test_db) + 
populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) + test_db.commit() + unfccc_data_csv = BytesIO(MISSING_COLL_UNFCCC_ROW.encode("utf8")) + collection_csv = BytesIO(ZERO_COLLECTION_ROW.encode("utf8")) + files = { + "unfccc_data_csv": ( + "unfccc_data_csv.csv", + unfccc_data_csv, + "text/csv", + {"Expires": "0"}, + ), + "collection_csv": ( + "collection_csv.csv", + collection_csv, + "text/csv", + {"Expires": "0"}, + ), + } + response = client.post( + "/api/v1/admin/bulk-ingest/validate/unfccc", + files=files, + headers=superuser_token_headers, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json["errors"]) == 1 + assert response_json["errors"][0] == { + "details": "The following Collection IDs were referenced and not defined: ['UNFCCC.Collection.1']", + "type": "Error", + } + assert ( + response_json["message"] + == "UNFCCC validation result: 1 Rows, 1 Failures, 0 Resolved" + ) + + +def test_ingest_unfccc_works( + client, + superuser_token_headers, + test_db, + mocker, +): + mock_start_import = mocker.patch( + "app.api.api_v1.routers.unfccc_ingest._start_ingest" + ) + mock_write_csv_to_s3 = mocker.patch( + "app.api.api_v1.routers.unfccc_ingest.write_csv_to_s3" + ) + populate_taxonomy(test_db) + populate_geography(test_db) + populate_document_type(test_db) + populate_document_role(test_db) + populate_document_variant(test_db) + test_db.commit() + unfccc_data_csv = BytesIO(ONE_UNFCCC_ROW.encode("utf8")) + collection_csv = BytesIO(ONE_COLLECTION_ROW.encode("utf8")) + files = { + "unfccc_data_csv": ( + "unfccc_data_csv.csv", + unfccc_data_csv, + "text/csv", + {"Expires": "0"}, + ), + "collection_csv": ( + "collection_csv.csv", + collection_csv, + "text/csv", + {"Expires": "0"}, + ), + } + response = client.post( + "/api/v1/admin/bulk-ingest/unfccc", + files=files, + headers=superuser_token_headers, + ) + assert response.status_code == 202 + + mock_start_import.assert_called_once() + + assert mock_write_csv_to_s3.call_count == 2 # writes the unfccc data & collection csvs + call0 = mock_write_csv_to_s3.mock_calls[0] + assert len(call0.kwargs["file_contents"]) == unfccc_data_csv.getbuffer().nbytes + call1 = mock_write_csv_to_s3.mock_calls[1] + assert len(call1.kwargs["file_contents"]) == collection_csv.getbuffer().nbytes diff --git a/tests/routes/test_geography_summaries.py b/tests/routes/test_geography_summaries.py index d4d68de7..18d70ec0 100644 --- a/tests/routes/test_geography_summaries.py +++ b/tests/routes/test_geography_summaries.py @@ -15,12 +15,14 @@ def test_endpoint_returns_families_ok(client): assert resp["family_counts"]["Executive"] == 0 assert resp["family_counts"]["Legislative"] == 0 + assert resp["family_counts"]["UNFCCC"] == 0 assert len(resp["top_families"]["Executive"]) == 0 assert len(resp["top_families"]["Legislative"]) == 0 + assert len(resp["top_families"]["UNFCCC"]) == 0 - assert len(resp["family_counts"]) == 2 - assert len(resp["top_families"]) == 2 + assert len(resp["family_counts"]) == 3 + assert len(resp["top_families"]) == 3 assert len(resp["targets"]) == 0
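Note on the renamed CCLW tests above: they all share the same setup sequence (seed reference data, build a CCLWIngestContext, obtain the ingestor via get_document_ingestor, then read the CSV contents as CCLWDocumentIngestRow). A minimal sketch of that pattern, assuming the test_db session fixture and the helpers these test modules already import, is:

# Sketch only: mirrors the shared setup used by the renamed CCLW ingestion tests.
# Assumes a SQLAlchemy test_db fixture plus helpers from tests.core.ingestion.helpers.
from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow
from app.core.ingestion.cclw.reader import read
from app.core.ingestion.processor import get_document_ingestor
from app.core.ingestion.utils import CCLWIngestContext
from tests.core.ingestion.helpers import THREE_DOC_ROWS, populate_for_ingest


def ingest_three_cclw_docs(test_db):  # hypothetical helper name
    # Seed reference data and commit so the ingestor can resolve lookups.
    populate_for_ingest(test_db)
    test_db.commit()
    # The context carries organisation and result state for the CCLW source.
    context = CCLWIngestContext()
    document_ingestor = get_document_ingestor(test_db, context)
    # read() parses the CSV contents into CCLWDocumentIngestRow instances
    # and hands each row to the ingestor; errors accumulate on context.results.
    read(THREE_DOC_ROWS, context, CCLWDocumentIngestRow, document_ingestor)
    return context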
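The new UNFCCC bulk-ingest endpoint accepts two multipart CSV uploads ("unfccc_data_csv" and "collection_csv") and returns 202 Accepted before the background ingest runs. As an illustration only, an operator-side call might look like the sketch below; the base URL, bearer-token auth and file paths are placeholders and not part of this change.

# Hypothetical client call against /api/v1/admin/bulk-ingest/unfccc.
# Field names and the 202 response come from the tests above; the auth scheme is assumed.
import requests


def bulk_ingest_unfccc(base_url: str, token: str, data_csv: str, collections_csv: str) -> None:
    with open(data_csv, "rb") as data_fh, open(collections_csv, "rb") as coll_fh:
        response = requests.post(
            f"{base_url}/api/v1/admin/bulk-ingest/unfccc",
            files={
                "unfccc_data_csv": ("unfccc_data_csv.csv", data_fh, "text/csv"),
                "collection_csv": ("collection_csv.csv", coll_fh, "text/csv"),
            },
            headers={"Authorization": f"Bearer {token}"},
            timeout=60,
        )
        # The router is expected to respond 202 and start the ingest as a background task.
        response.raise_for_status()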