Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOF-8 Refactor for ingest #114

Merged
merged 40 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
8c5fced
begin re-factor of how taxonomies are populated
diversemix May 15, 2023
f033042
refactor
diversemix May 15, 2023
146c739
add type to arg
diversemix May 15, 2023
2faca37
Add a little more documentation for taxonomy
diversemix May 15, 2023
0c4fa07
More refactoring and add tests
diversemix May 16, 2023
de8bb08
finish tests
diversemix May 16, 2023
e2c727d
Merge branch 'main' into SOF-8/ingest-UNF3C
diversemix May 16, 2023
febd157
First re-factor to accomodate UNFCCC
diversemix May 16, 2023
218eb33
Merge branch 'main' into SOF-8/ingest-UNF3C
diversemix May 16, 2023
76baa6b
get tests working (not)
diversemix May 17, 2023
7b9bb94
lunch time commit
diversemix May 17, 2023
0c672d3
fix datetime issue
diversemix May 17, 2023
21a2b59
more re-factoring
diversemix May 17, 2023
c6b6cd9
end of day commit
diversemix May 17, 2023
3940d7b
get validation working
diversemix May 17, 2023
c874baa
start implementing the collections csv
diversemix May 18, 2023
63d5af4
Validate the Collections CSV
diversemix May 18, 2023
dfe56b0
add more tests
diversemix May 18, 2023
ab7ce36
add database migration
diversemix May 18, 2023
d699923
Update app/core/ingestion/processor.py
diversemix May 18, 2023
c4710d9
remove law-policy from the routes
diversemix May 18, 2023
a32ad26
create sub-modules
diversemix May 18, 2023
af5be24
fully validate collections
diversemix May 18, 2023
ffc4562
update field names for collection csv
diversemix May 18, 2023
af6440d
Add new CSV columns
diversemix May 18, 2023
1211a5b
remove collection_id as no longer used
diversemix May 18, 2023
75623ac
write test but comment out - pivoting
diversemix May 18, 2023
7bace38
remove submission type from taxonomy and add to document type
diversemix May 18, 2023
2196d1c
fix failing populate_taxonomy test
diversemix May 18, 2023
6ecf87a
more ingest work
diversemix May 18, 2023
c48b162
more tests ready for writing ingest code
diversemix May 18, 2023
6c39e99
Allow pre-existing collection if they match
diversemix May 19, 2023
e980654
add download_url
diversemix May 19, 2023
83cb6da
add beginnings of ingest tests for unfccc
diversemix May 19, 2023
16ff44c
first refactor before unfccc ingest
diversemix May 19, 2023
fed205f
Merge branch 'main' into SOF-8/part-2-ingest
diversemix May 19, 2023
fb936d0
more stuff
diversemix May 19, 2023
a6d2e62
finish tests
diversemix May 19, 2023
5b8c57a
finish re-factor
diversemix May 19, 2023
6012322
Add a test for the documents object sent to the pipeline
diversemix May 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions app/api/api_v1/routers/cclw_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@
CCLWDocumentIngestRow,
EventIngestRow,
)
from app.core.ingestion.cclw.pipeline import generate_pipeline_ingest_input
from app.core.ingestion.pipeline import generate_pipeline_ingest_input
from app.core.ingestion.processor import (
initialise_context,
get_document_ingestor,
get_cclw_document_ingestor,
get_document_validator,
get_event_ingestor,
)
from app.core.ingestion.cclw.reader import get_file_contents, read
from app.core.ingestion.reader import get_file_contents, read
from app.core.ingestion.utils import (
IngestContext,
Result,
Expand Down Expand Up @@ -69,7 +69,7 @@ def _start_ingest(
# TODO: add a way for a user to monitor progress of the ingest
try:
context = initialise_context(db, "CCLW")
document_ingestor = get_document_ingestor(db, context)
document_ingestor = get_cclw_document_ingestor(db, context)
read(documents_file_contents, context, CCLWDocumentIngestRow, document_ingestor)
event_ingestor = get_event_ingestor(db)
read(events_file_contents, context, EventIngestRow, event_ingestor)
Expand Down
13 changes: 6 additions & 7 deletions app/api/api_v1/routers/unfccc_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
UNFCCCDocumentIngestRow,
)

from app.core.ingestion.unfccc.pipeline import generate_pipeline_ingest_input
from app.core.ingestion.pipeline import generate_pipeline_ingest_input
from app.core.ingestion.processor import (
get_collection_ingestor,
initialise_context,
get_document_ingestor,
get_unfccc_document_ingestor,
get_document_validator,
)
from app.core.ingestion.unfccc.reader import get_file_contents, read
from app.core.ingestion.reader import get_file_contents, read
from app.core.ingestion.utils import (
IngestContext,
Result,
Expand All @@ -56,7 +56,7 @@
unfccc_ingest_router = r = APIRouter()


def _start_ingest(
def start_unfccc_ingest(
db: Session,
s3_client: S3Client,
s3_prefix: str,
Expand All @@ -71,8 +71,7 @@ def _start_ingest(
collection_ingestor = get_collection_ingestor(db)
read(collection_file_contents, context, CollectonIngestRow, collection_ingestor)

# FIXME: Write a unfccc ingestor
document_ingestor = get_document_ingestor(db, context)
document_ingestor = get_unfccc_document_ingestor(db, context)
read(
documents_file_contents, context, UNFCCCDocumentIngestRow, document_ingestor
)
Expand Down Expand Up @@ -334,7 +333,7 @@ def ingest_unfccc_law_policy(

# PHASE 3 - Start the ingest (kick off background task to do the actual ingest)
background_tasks.add_task(
_start_ingest,
start_unfccc_ingest,
db,
s3_client,
s3_prefix,
Expand Down
2 changes: 1 addition & 1 deletion app/core/ingestion/cclw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
}


def add_metadata(
def add_cclw_metadata(
db: Session,
family_import_id: str,
taxonomy: Taxonomy,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from typing import Any, Optional

from sqlalchemy.orm import Session
from app.core.ingestion.cclw.ingest_row_cclw import CCLWDocumentIngestRow
from app.core.ingestion.params import IngestParameters
from app.core.ingestion.unfccc.ingest_row_unfccc import CollectonIngestRow
from app.core.ingestion.utils import create, to_dict, update_if_changed
from app.core.ingestion.utils import (
create,
to_dict,
update_if_changed,
)

from app.db.models.law_policy import Collection
from app.db.models.law_policy.collection import CollectionFamily, CollectionOrganisation
Expand Down Expand Up @@ -61,9 +65,9 @@ def create_collection(
)


def handle_collection_from_row(
def handle_collection_and_link(
db: Session,
row: CCLWDocumentIngestRow,
params: IngestParameters,
org_id: int,
family_import_id: str,
result: dict[str, Any],
Expand All @@ -81,23 +85,28 @@ def handle_collection_from_row(
:param [dict[str, Any]]: a result dict in which to record what was created.
:return [Collection | None]: A collection if one was created, otherwise None.
"""
if not row.cpr_collection_id or row.cpr_collection_id == "n/a":
if not params.cpr_collection_id or params.cpr_collection_id == "n/a":
return None

# First check for the actual collection
existing_collection = (
db.query(Collection)
.filter(Collection.import_id == row.cpr_collection_id)
.filter(Collection.import_id == params.cpr_collection_id)
.one_or_none()
)

if existing_collection is None:
if params.create_collections is False:
id = params.cpr_collection_id
msg = f"Collection {id} is not pre-exsiting so not linking"
raise ValueError(msg)

collection = create(
db,
Collection,
import_id=row.cpr_collection_id,
title=row.collection_name,
extra={"description": row.collection_summary},
import_id=params.cpr_collection_id,
title=params.collection_name,
extra={"description": params.collection_summary},
)

collection_organisation = create(
Expand All @@ -112,8 +121,8 @@ def handle_collection_from_row(
else:
collection = existing_collection
updated = {}
update_if_changed(updated, "title", row.collection_name, collection)
update_if_changed(updated, "description", row.collection_summary, collection)
update_if_changed(updated, "title", params.collection_name, collection)
update_if_changed(updated, "description", params.collection_summary, collection)
if len(updated) > 0:
result["collection"] = updated
db.add(collection)
Expand All @@ -123,8 +132,8 @@ def handle_collection_from_row(
existing_link = (
db.query(CollectionFamily)
.filter_by(
collection_import_id=row.cpr_collection_id,
family_import_id=row.cpr_family_id,
collection_import_id=params.cpr_collection_id,
family_import_id=params.cpr_family_id,
)
.one_or_none()
)
Expand Down
Loading