Skip to content

Commit

Permalink
SOF-40 Add download_url for the pipeline (#115)
Browse files Browse the repository at this point in the history
* Add a null download_url

* Add download_url to the pipeline input file
  • Loading branch information
diversemix authored May 22, 2023
1 parent a4c363c commit da19cf5
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 4 deletions.
4 changes: 4 additions & 0 deletions app/api/api_v1/routers/unfccc_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def start_unfccc_ingest(

try:
pipeline_ingest_input = generate_pipeline_ingest_input(db)
ctx = cast(UNFCCCIngestContext, context)
# We now have to populate the download_url values...
for doc in pipeline_ingest_input:
doc.download_url = ctx.download_urls[doc.import_id]
write_documents_to_s3(
s3_client=s3_client,
s3_prefix=s3_prefix,
Expand Down
2 changes: 2 additions & 0 deletions app/api/api_v1/schemas/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ class DocumentParserInput(BaseModel):
description: str
postfix: Optional[str]
source_url: Optional[str]
download_url: Optional[str]

slug: str

type: str
Expand Down
1 change: 1 addition & 0 deletions app/core/ingestion/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]
if family_document.physical_document is not None
else None
),
download_url=None,
type=cast(str, family_document.document_type or ""),
source=cast(str, organisation.name),
slug=cast(str, family_document.slugs[-1].name),
Expand Down
3 changes: 3 additions & 0 deletions app/core/ingestion/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ def ingest_unfccc_document_row(
result,
)

ctx = cast(UNFCCCIngestContext, context)
ctx.download_urls[import_id] = row.download_url

_LOGGER.info(
f"Ingest complete for row {row.row_number}",
extra={"props": {"result": str(result)}},
Expand Down
2 changes: 1 addition & 1 deletion app/core/ingestion/unfccc/ingest_row_unfccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid))
class UNFCCCDocumentIngestRow(BaseIngestRow):
"""Represents a single row of input from the documents-families-collections CSV."""
"""Represents a single row of input from the UNFCCC CSV."""

category: str
md5sum: str
Expand Down
2 changes: 2 additions & 0 deletions app/core/ingestion/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,13 @@ class UNFCCCIngestContext(IngestContext):
collection_ids_referenced: list[str]
# Just for families:
consistency_validator: ConsistencyValidator
download_urls: dict[str, str] # import_id -> url

def __init__(self, org_name="UNFCCC", org_id=2, results=None):
self.collection_ids_defined = []
self.collection_ids_referenced = []
self.consistency_validator = ConsistencyValidator()
self.download_urls = {}
self.org_name = org_name
self.org_id = org_id
self.results = [] if results is None else results
Expand Down
1 change: 1 addition & 0 deletions tests/core/validation/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def test_write_documents_to_s3(test_s3_client, mocker):
postfix="pf-A",
description="description",
source_url=None,
download_url=None,
type="executive",
source="CCLW",
import_id="1234-5678",
Expand Down
7 changes: 4 additions & 3 deletions tests/routes/test_admin_unfccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"description": "summary",
"postfix": null,
"source_url": "https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf",
"download_url": "url of downloaded document",
"slug": "Doc-slug",
"type": "Synthesis Report",
"source": "UNFCCC",
Expand Down Expand Up @@ -51,11 +52,11 @@ def test_unauthorized_validation(client):


MISSING_COLL_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
"""

ONE_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
"""

ZERO_COLLECTION_ROW = """CPR Collection ID,Collection name,Collection summary
Expand Down Expand Up @@ -270,4 +271,4 @@ def test_start_unfccc_ingest(

documents_call = mock_write_s3.mock_calls[1]
content = documents_call.kwargs["bytes_content"].read()
assert content == bytes(EXPECTED_DOCUMENTS, encoding="utf8")
assert content.decode("utf8") == EXPECTED_DOCUMENTS

0 comments on commit da19cf5

Please sign in to comment.