From da19cf51578d521a58d85025f7c576384b0b8b4f Mon Sep 17 00:00:00 2001 From: Peter Hooper Date: Mon, 22 May 2023 11:36:05 +0100 Subject: [PATCH] SOF-40 Add document_url for the pipeline (#115) * Add a null document_url * add download url to pipeline input file --- app/api/api_v1/routers/unfccc_ingest.py | 4 ++++ app/api/api_v1/schemas/document.py | 2 ++ app/core/ingestion/pipeline.py | 1 + app/core/ingestion/processor.py | 3 +++ app/core/ingestion/unfccc/ingest_row_unfccc.py | 2 +- app/core/ingestion/utils.py | 2 ++ tests/core/validation/test_util.py | 1 + tests/routes/test_admin_unfccc.py | 7 ++++--- 8 files changed, 18 insertions(+), 4 deletions(-) diff --git a/app/api/api_v1/routers/unfccc_ingest.py b/app/api/api_v1/routers/unfccc_ingest.py index 83273437..e7aa6424 100644 --- a/app/api/api_v1/routers/unfccc_ingest.py +++ b/app/api/api_v1/routers/unfccc_ingest.py @@ -96,6 +96,10 @@ def start_unfccc_ingest( try: pipeline_ingest_input = generate_pipeline_ingest_input(db) + ctx = cast(UNFCCCIngestContext, context) + # We now have to populate the download_url values... + for doc in pipeline_ingest_input: + doc.download_url = ctx.download_urls[doc.import_id] write_documents_to_s3( s3_client=s3_client, s3_prefix=s3_prefix, diff --git a/app/api/api_v1/schemas/document.py b/app/api/api_v1/schemas/document.py index a071fb7e..2e2116ba 100644 --- a/app/api/api_v1/schemas/document.py +++ b/app/api/api_v1/schemas/document.py @@ -117,6 +117,8 @@ class DocumentParserInput(BaseModel): description: str postfix: Optional[str] source_url: Optional[str] + download_url: Optional[str] + slug: str type: str diff --git a/app/core/ingestion/pipeline.py b/app/core/ingestion/pipeline.py index 9d9488fe..312b01bc 100644 --- a/app/core/ingestion/pipeline.py +++ b/app/core/ingestion/pipeline.py @@ -51,6 +51,7 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput] if family_document.physical_document is not None else None ), + download_url=None, type=cast(str, family_document.document_type or ""), source=cast(str, organisation.name), slug=cast(str, family_document.slugs[-1].name), diff --git a/app/core/ingestion/processor.py b/app/core/ingestion/processor.py index 17931f26..ba89333c 100644 --- a/app/core/ingestion/processor.py +++ b/app/core/ingestion/processor.py @@ -178,6 +178,9 @@ def ingest_unfccc_document_row( result, ) + ctx = cast(UNFCCCIngestContext, context) + ctx.download_urls[import_id] = row.download_url + _LOGGER.info( f"Ingest complete for row {row.row_number}", extra={"props": {"result": str(result)}}, diff --git a/app/core/ingestion/unfccc/ingest_row_unfccc.py b/app/core/ingestion/unfccc/ingest_row_unfccc.py index 90ccb15b..16a4ff12 100644 --- a/app/core/ingestion/unfccc/ingest_row_unfccc.py +++ b/app/core/ingestion/unfccc/ingest_row_unfccc.py @@ -39,7 +39,7 @@ @dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid)) class UNFCCCDocumentIngestRow(BaseIngestRow): - """Represents a single row of input from the documents-families-collections CSV.""" + """Represents a single row of input from the UNFCCC CSV.""" category: str md5sum: str diff --git a/app/core/ingestion/utils.py b/app/core/ingestion/utils.py index cc21c17d..561764f9 100644 --- a/app/core/ingestion/utils.py +++ b/app/core/ingestion/utils.py @@ -227,11 +227,13 @@ class UNFCCCIngestContext(IngestContext): collection_ids_referenced: list[str] # Just for families: consistency_validator: ConsistencyValidator + download_urls: dict[str, str] # import_id -> url def __init__(self, org_name="UNFCCC", org_id=2, results=None): self.collection_ids_defined = [] self.collection_ids_referenced = [] self.consistency_validator = ConsistencyValidator() + self.download_urls = {} self.org_name = org_name self.org_id = org_id self.results = [] if results is None else results diff --git a/tests/core/validation/test_util.py b/tests/core/validation/test_util.py index c9b79809..476e86c2 100644 --- a/tests/core/validation/test_util.py +++ b/tests/core/validation/test_util.py @@ -84,6 +84,7 @@ def test_write_documents_to_s3(test_s3_client, mocker): postfix="pf-A", description="description", source_url=None, + download_url=None, type="executive", source="CCLW", import_id="1234-5678", diff --git a/tests/routes/test_admin_unfccc.py b/tests/routes/test_admin_unfccc.py index eb247e64..396f23cb 100644 --- a/tests/routes/test_admin_unfccc.py +++ b/tests/routes/test_admin_unfccc.py @@ -19,6 +19,7 @@ "description": "summary", "postfix": null, "source_url": "https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf", + "download_url": "url of downloaded document", "slug": "Doc-slug", "type": "Synthesis Report", "source": "UNFCCC", @@ -51,11 +52,11 @@ def test_unauthorized_validation(client): MISSING_COLL_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug -UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug +UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug """ ONE_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug -UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug +UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug """ ZERO_COLLECTION_ROW = """CPR Collection ID,Collection name,Collection summary @@ -270,4 +271,4 @@ def test_start_unfccc_ingest( documents_call = mock_write_s3.mock_calls[1] content = documents_call.kwargs["bytes_content"].read() - assert content == bytes(EXPECTED_DOCUMENTS, encoding="utf8") + assert content.decode("utf8") == EXPECTED_DOCUMENTS