Skip to content

Commit

Permalink
Improve ingest template (#210)
Browse files Browse the repository at this point in the history
* Remove events and documents from family template

* Move event metadata from family template to event template

* Feature/pdct 1351 ingest documents (#209)

* Add documents to the import_data response

* Process multiple documents from json

* Check if document data exists in json

* Refactor + give import_id default value on DocumentCreateDTO

* Validate that variant name is not empty when ingesting documents

* Validate metadata when ingesting documents

* Throw validation error when family does not exist for document + fix docstrings

* Make save functions not private so that they can be tested more easily

* Update version

* Update version

* Set family_import_id when saving documents

* Remove unnecessary TODO

* Save documents to db on ingest

* Fix catching RepositoryErrors

* Default import_id to None rather than an empty string when it is not provided on a collection or document DTO at save time

* Tighten up test

* Remove events and documents from families

* Bump patch version
  • Loading branch information
annaCPR authored Sep 5, 2024
1 parent caab984 commit 7b658e3
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 125 deletions.
10 changes: 8 additions & 2 deletions app/api/api_v1/routers/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@ def _get_collection_template() -> dict:
return collection_template


def _get_event_template() -> dict:
def _get_event_template(corpus_type: str) -> dict:
"""
Gets an event template.
:return dict: The event template.
"""
event_schema = IngestEventDTO.model_json_schema(mode="serialization")
event_template = event_schema["properties"]
event_template["event_type_value"] = _get_metadata_template(
corpus_type, CountedEntity.Event
)

return event_template

Expand Down Expand Up @@ -74,8 +77,11 @@ def _get_metadata_template(corpus_type: str, metadata_type: CountedEntity) -> di
return {}
if metadata_type == CountedEntity.Document:
return metadata.pop(EntitySpecificTaxonomyKeys.DOCUMENT.value)
elif metadata_type == CountedEntity.Event:
return metadata.pop(EntitySpecificTaxonomyKeys.EVENT.value)
elif metadata_type == CountedEntity.Family:
metadata.pop(EntitySpecificTaxonomyKeys.DOCUMENT.value)
metadata.pop(EntitySpecificTaxonomyKeys.EVENT.value)
return metadata


Expand Down Expand Up @@ -116,7 +122,7 @@ async def get_ingest_template(corpus_type: str) -> Json:
"collections": [_get_collection_template()],
"families": [_get_family_template(corpus_type)],
"documents": [_get_document_template(corpus_type)],
"events": [_get_event_template()],
"events": [_get_event_template(corpus_type)],
}


Expand Down
9 changes: 5 additions & 4 deletions app/model/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ class IngestFamilyDTO(BaseModel):
category: str
metadata: Json
collections: list[str]
documents: list[str]
events: list[str]
corpus_import_id: str

def to_family_create_dto(self, corpus_import_id: str) -> FamilyCreateDTO:
Expand All @@ -65,6 +63,8 @@ class IngestEventDTO(BaseModel):
"""Representation of an event for ingest."""

import_id: str
family_import_id: str
family_document_import_id: str
event_title: str
date: datetime
event_type_value: str
Expand All @@ -74,14 +74,15 @@ class IngestDocumentDTO(BaseModel):
"""Representation of a document for ingest."""

import_id: str
family_import_id: str
variant_name: Optional[str] = None
metadata: Json
events: list[str]
title: str
source_url: Optional[AnyHttpUrl] = None
user_language_name: Optional[str]

def to_document_create_dto(self, family_import_id) -> DocumentCreateDTO:
def to_document_create_dto(self) -> DocumentCreateDTO:
"""
Convert IngestDocumentDTO to DocumentCreateDTO.
Expand All @@ -90,7 +91,7 @@ def to_document_create_dto(self, family_import_id) -> DocumentCreateDTO:

return DocumentCreateDTO(
import_id=self.import_id,
family_import_id=family_import_id,
family_import_id=self.family_import_id,
variant_name=self.variant_name,
metadata=self.metadata,
title=self.title,
Expand Down
29 changes: 7 additions & 22 deletions app/service/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def save_families(
def save_documents(
document_data: list[dict],
corpus_import_id: str,
family_document_mapping: dict,
db: Optional[Session] = None,
) -> list[str]:
"""
Expand All @@ -112,9 +111,7 @@ def save_documents(

document_import_ids = []
for doc in document_data:
family_import_id = family_document_mapping[doc["import_id"]]

dto = IngestDocumentDTO(**doc).to_document_create_dto(family_import_id)
dto = IngestDocumentDTO(**doc).to_document_create_dto()

if dto.variant_name == "":
raise ValidationError("Variant name is empty")
Expand All @@ -130,28 +127,20 @@ def save_documents(


def validate_entity_relationships(data: dict) -> None:
family_documents = []
families = []
if "families" in data:
for fam in data["families"]:
family_documents.extend(fam["documents"])
families.append(fam["import_id"])

documents = []
if "documents" in data:
for doc in data["documents"]:
documents.append(doc["import_id"])
documents.append(doc["family_import_id"])

family_document_set = set(family_documents)
family_document_set = set(families)
unmatched = [x for x in documents if x not in family_document_set]
if unmatched:
raise ValidationError(f"No family found for document(s): {unmatched}")


def create_family_document_mapping(family_data: dict) -> dict:
family_document_mapping = {}
for fam in family_data:
for doc in fam["documents"]:
family_document_mapping[doc] = fam["import_id"]
return family_document_mapping
raise ValidationError(f"No family with id {unmatched} found")


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
Expand Down Expand Up @@ -179,18 +168,14 @@ def import_data(data: dict, corpus_import_id: str) -> dict:
try:
validate_entity_relationships(data)

family_document_mapping = {}
if collection_data:
response["collections"] = save_collections(
collection_data, corpus_import_id, db
)
if family_data:
response["families"] = save_families(family_data, corpus_import_id, db)
family_document_mapping = create_family_document_mapping(family_data)
if document_data:
response["documents"] = save_documents(
document_data, corpus_import_id, family_document_mapping, db
)
response["documents"] = save_documents(document_data, corpus_import_id, db)

return response
except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "admin_backend"
version = "2.14.0"
version = "2.14.1"
description = ""
authors = ["CPR-dev-team <[email protected]>"]
packages = [{ include = "app" }, { include = "tests" }]
Expand Down
10 changes: 4 additions & 6 deletions tests/integration_tests/ingest/test.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
"author_type": ["Non-Party"],
"author": ["Test"]
},
"collections": ["test.new.collection.0"],
"events": [],
"documents": ["test.new.document.0", "test.new.document.1"]
"collections": ["test.new.collection.0"]
},
{
"import_id": "test.new.family.1",
Expand All @@ -36,21 +34,21 @@
"author_type": ["Party"],
"author": ["Test"]
},
"collections": ["test.new.collection.1"],
"events": [],
"documents": []
"collections": ["test.new.collection.1"]
}
],
"documents": [
{
"import_id": "test.new.document.0",
"family_import_id": "test.new.family.0",
"metadata": { "role": ["MAIN"], "type": ["Law"] },
"events": [],
"title": "",
"user_language_name": ""
},
{
"import_id": "test.new.document.1",
"family_import_id": "test.new.family.1",
"metadata": { "role": ["MAIN"], "type": ["Law"] },
"events": [],
"title": "",
Expand Down
62 changes: 27 additions & 35 deletions tests/integration_tests/ingest/test_ingest_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,65 +37,57 @@ def test_get_template_unfcc(
"allow_blanks": False,
"allowed_values": ["Party", "Non-Party"],
},
"event_type": {
"allow_any": False,
"allow_blanks": True,
"allowed_values": [
"Amended",
"Appealed",
"Closed",
"Declaration Of Climate Emergency",
"Dismissed",
"Entered Into Force",
"Filing",
"Granted",
"Implementation Details",
"International Agreement",
"Net Zero Pledge",
"Other",
"Passed/Approved",
"Repealed/Replaced",
"Set",
"Settled",
"Updated",
],
},
},
"collections": {
"items": {"type": "string"},
"title": "Collections",
"type": "array",
},
"documents": {
"items": {"type": "string"},
"title": "Documents",
"type": "array",
},
"events": {
"items": {"type": "string"},
"title": "Events",
"type": "array",
},
}
],
"events": [
{
"import_id": {"title": "Import Id", "type": "string"},
"family_import_id": {"title": "Family Import Id", "type": "string"},
"family_document_import_id": {
"title": "Family Document Import Id",
"type": "string",
},
"event_title": {"title": "Event Title", "type": "string"},
"date": {
"format": "date-time",
"title": "Date",
"type": "string",
},
"event_type_value": {
"title": "Event Type Value",
"type": "string",
"allow_any": False,
"allow_blanks": True,
"allowed_values": [
"Amended",
"Appealed",
"Closed",
"Declaration Of Climate Emergency",
"Dismissed",
"Entered Into Force",
"Filing",
"Granted",
"Implementation Details",
"International Agreement",
"Net Zero Pledge",
"Other",
"Passed/Approved",
"Repealed/Replaced",
"Set",
"Settled",
"Updated",
],
},
}
],
"documents": [
{
"import_id": {"title": "Import Id", "type": "string"},
"family_import_id": {"title": "Family Import Id", "type": "string"},
"events": {
"items": {"type": "string"},
"title": "Events",
Expand Down
9 changes: 8 additions & 1 deletion tests/mocks/repos/db_client_corpus_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@ def mock_get_taxonomy_by_corpus_type_name(_, __) -> Optional[TaxonomyData]:
"allow_any": False,
"allowed_values": [],
}
return cast(TaxonomyData, {"test": metadata, "_document": {"test": metadata}})
return cast(
TaxonomyData,
{
"test": metadata,
"event_type": {"test": metadata},
"_document": {"test": metadata},
},
)

monkeypatch.setattr(
taxonomy_service,
Expand Down
10 changes: 4 additions & 6 deletions tests/unit_tests/routers/ingest/test.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
"color": ["blue"],
"size": []
},
"collections": ["test.new.collection.0"],
"events": [],
"documents": ["test.new.document.0", "test.new.document.1"]
"collections": ["test.new.collection.0"]
},
{
"import_id": "test.new.family.1",
Expand All @@ -36,21 +34,21 @@
"color": ["pink"],
"size": []
},
"collections": ["test.new.collection.1"],
"events": [],
"documents": ["test.new.document.0"]
"collections": ["test.new.collection.1"]
}
],
"documents": [
{
"import_id": "test.new.document.0",
"family_import_id": "test.new.family.0",
"metadata": { "color": ["pink"] },
"events": [],
"title": "",
"user_language_name": ""
},
{
"import_id": "test.new.document.1",
"family_import_id": "test.new.family.1",
"metadata": { "color": ["pink"] },
"events": [],
"title": "",
Expand Down
23 changes: 11 additions & 12 deletions tests/unit_tests/routers/ingest/test_ingest_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,36 +52,35 @@ def test_ingest_template_when_ok(
"title": "Collections",
"type": "array",
},
"events": {
"items": {"type": "string"},
"title": "Events",
"type": "array",
},
"documents": {
"items": {"type": "string"},
"title": "Documents",
"type": "array",
},
}
],
"events": [
{
"import_id": {"title": "Import Id", "type": "string"},
"family_import_id": {"title": "Family Import Id", "type": "string"},
"family_document_import_id": {
"title": "Family Document Import Id",
"type": "string",
},
"event_title": {"title": "Event Title", "type": "string"},
"date": {
"format": "date-time",
"title": "Date",
"type": "string",
},
"event_type_value": {
"title": "Event Type Value",
"type": "string",
"test": {
"allow_any": False,
"allow_blanks": False,
"allowed_values": [],
},
},
}
],
"documents": [
{
"import_id": {"title": "Import Id", "type": "string"},
"family_import_id": {"title": "Family Import Id", "type": "string"},
"events": {
"items": {"type": "string"},
"title": "Events",
Expand Down
Loading

0 comments on commit 7b658e3

Please sign in to comment.