Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOF-40 Add document_url for the pipeline #115

Merged
merged 2 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions app/api/api_v1/routers/unfccc_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def start_unfccc_ingest(

try:
pipeline_ingest_input = generate_pipeline_ingest_input(db)
ctx = cast(UNFCCCIngestContext, context)
# Fill in each document's download_url from the URLs recorded on the ingest context (keyed by import_id).
for doc in pipeline_ingest_input:
doc.download_url = ctx.download_urls[doc.import_id]
write_documents_to_s3(
s3_client=s3_client,
s3_prefix=s3_prefix,
Expand Down
2 changes: 2 additions & 0 deletions app/api/api_v1/schemas/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ class DocumentParserInput(BaseModel):
description: str
postfix: Optional[str]
source_url: Optional[str]
download_url: Optional[str]

slug: str

type: str
Expand Down
1 change: 1 addition & 0 deletions app/core/ingestion/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]
if family_document.physical_document is not None
else None
),
download_url=None,
type=cast(str, family_document.document_type or ""),
source=cast(str, organisation.name),
slug=cast(str, family_document.slugs[-1].name),
Expand Down
3 changes: 3 additions & 0 deletions app/core/ingestion/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ def ingest_unfccc_document_row(
result,
)

ctx = cast(UNFCCCIngestContext, context)
ctx.download_urls[import_id] = row.download_url

_LOGGER.info(
f"Ingest complete for row {row.row_number}",
extra={"props": {"result": str(result)}},
Expand Down
2 changes: 1 addition & 1 deletion app/core/ingestion/unfccc/ingest_row_unfccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra=Extra.forbid))
class UNFCCCDocumentIngestRow(BaseIngestRow):
"""Represents a single row of input from the documents-families-collections CSV."""
"""Represents a single row of input from the UNFCCC CSV."""

category: str
md5sum: str
Expand Down
2 changes: 2 additions & 0 deletions app/core/ingestion/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,13 @@ class UNFCCCIngestContext(IngestContext):
collection_ids_referenced: list[str]
# Just for families:
consistency_validator: ConsistencyValidator
download_urls: dict[str, str] # import_id -> url

def __init__(self, org_name="UNFCCC", org_id=2, results=None):
self.collection_ids_defined = []
self.collection_ids_referenced = []
self.consistency_validator = ConsistencyValidator()
self.download_urls = {}
self.org_name = org_name
self.org_id = org_id
self.results = [] if results is None else results
Expand Down
1 change: 1 addition & 0 deletions tests/core/validation/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def test_write_documents_to_s3(test_s3_client, mocker):
postfix="pf-A",
description="description",
source_url=None,
download_url=None,
type="executive",
source="CCLW",
import_id="1234-5678",
Expand Down
7 changes: 4 additions & 3 deletions tests/routes/test_admin_unfccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"description": "summary",
"postfix": null,
"source_url": "https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf",
"download_url": "url of downloaded document",
"slug": "Doc-slug",
"type": "Synthesis Report",
"source": "UNFCCC",
Expand Down Expand Up @@ -51,11 +52,11 @@ def test_unauthorized_validation(client):


MISSING_COLL_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.1,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
"""

ONE_UNFCCC_ROW = """Category,md5sum,Submission type,Family name,Document title,Documents,Author,Author type,Geography,Geography ISO,Date,Document role,Document variant,Language,Download URL,CPR Collection ID,CPR Document ID,CPR Document Slug,CPR Family ID,CPR Family Slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
UNFCCC,00254c407297fbb50a77d748b817ee5c,Synthesis Report,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,Nationally determined contributions under the Paris Agreement. Revised note by the secretariat,https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf,UNFCCC Secretariat,Party,UK,GBR,2021-10-25T12:00:00Z,,,en,url of downloaded document,UNFCCC.Collection.Found,UNFCCC.Document.1,Doc-slug,UNFCCC.family.1,Family-slug
"""

ZERO_COLLECTION_ROW = """CPR Collection ID,Collection name,Collection summary
Expand Down Expand Up @@ -270,4 +271,4 @@ def test_start_unfccc_ingest(

documents_call = mock_write_s3.mock_calls[1]
content = documents_call.kwargs["bytes_content"].read()
assert content == bytes(EXPECTED_DOCUMENTS, encoding="utf8")
assert content.decode("utf8") == EXPECTED_DOCUMENTS
joel-wright marked this conversation as resolved.
Show resolved Hide resolved