Feature/pdct 1350 ingest events #211

Merged
merged 34 commits on Sep 10, 2024
Changes from all commits (34 commits)
00a5eaf
Validate import_id and family_import_id before saving documents
annaCPR Sep 5, 2024
ebe78ff
Make family_document_import_id optional on ingest events
annaCPR Sep 9, 2024
7da9c28
Ingest events
annaCPR Sep 9, 2024
2498caf
Validate import id when saving events
annaCPR Sep 9, 2024
f56bbcf
Validate family_import_id when saving events
annaCPR Sep 9, 2024
faee609
Do not generate an import_id when saving events if one is already pro…
annaCPR Sep 9, 2024
4b86125
Validate event type against corpus taxonomy
annaCPR Sep 9, 2024
0544951
Move collection validation logic to a separate service
annaCPR Sep 9, 2024
e1421ae
Move family validation logic to a separate service
annaCPR Sep 9, 2024
78aab80
Move document validation logic to a separate service
annaCPR Sep 9, 2024
80e4af3
Move event validation logic to a separate service
annaCPR Sep 9, 2024
f3aefa1
Switch to using validation service when ingesting collections
annaCPR Sep 9, 2024
101353b
Switch to using validation service when ingesting families
annaCPR Sep 9, 2024
eccee88
Use validation service when ingesting documents
annaCPR Sep 9, 2024
6000836
Use validation service when ingesting events
annaCPR Sep 9, 2024
121775d
Tidy up tests
annaCPR Sep 9, 2024
1468dbd
More tidy up of test data
annaCPR Sep 9, 2024
8856c9a
Tiny refactor
annaCPR Sep 9, 2024
a9cadd9
Return better error on Exception
annaCPR Sep 9, 2024
03ec9eb
Bump minor version
annaCPR Sep 9, 2024
5b9c859
Fix mocks in unit tests + tidy up
annaCPR Sep 10, 2024
e714f25
Add missing docstrings to validation service
annaCPR Sep 10, 2024
f467133
Add missing docstrings and tighten return types
annaCPR Sep 10, 2024
8c51a0a
Update docstrings
annaCPR Sep 10, 2024
2dd50ee
Make test filenames more meaningful
annaCPR Sep 10, 2024
3c5d3f1
Reference correct test files after rename
annaCPR Sep 10, 2024
8954223
Update docstrings, again...
annaCPR Sep 10, 2024
17d067e
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
8671316
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
6e99c08
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
8589f41
Update tests/unit_tests/routers/ingest/test_ingest.py
annaCPR Sep 10, 2024
bb21c69
Update tests/unit_tests/routers/ingest/test_ingest.py
annaCPR Sep 10, 2024
c3a9785
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
123b648
Add missing imports and params in docstrings for validation
annaCPR Sep 10, 2024
2 changes: 1 addition & 1 deletion app/api/api_v1/routers/ingest.py
@@ -157,5 +157,5 @@ async def ingest(new_data: UploadFile, corpus_import_id: str) -> Json:
except Exception as e:
_LOGGER.error(e)
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Unexpected error"
status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(e)
)
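
A quick note on the one-line router change above: the 503 response now carries the underlying exception message instead of a fixed string. A minimal sketch of the pattern in isolation (the wrapper name and the do_ingest callable are invented for illustration; only the HTTPException call mirrors the diff):

from fastapi import HTTPException, status


def ingest_or_503(do_ingest):
    """Illustrative wrapper; do_ingest stands in for the real ingest work."""
    try:
        return do_ingest()
    except Exception as e:
        # Surface the exception text so API clients can see why ingest failed,
        # rather than the previous fixed "Unexpected error" string.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(e)
        )
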
1 change: 1 addition & 0 deletions app/model/event.py
@@ -35,6 +35,7 @@ class EventCreateDTO(BaseModel):
generated.
"""

import_id: Optional[str] = None
# From FamilyEvent
event_title: str
date: datetime
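
With import_id now optional on EventCreateDTO, an id from the bulk-ingest file can be passed through, or left unset so the repository generates one on save. A construction sketch for both cases (the field set mirrors what to_event_create_dto passes in app/model/ingest.py further down; all values are illustrative placeholders rather than real taxonomy entries):

from datetime import datetime

from app.model.event import EventCreateDTO

# Id supplied by the ingest file: the repository should keep it as-is.
pinned = EventCreateDTO(
    import_id="test.new.event.0",
    family_import_id="test.new.family.0",
    event_title="Law passed",
    date=datetime(2024, 9, 10),
    event_type_value="Passed/Approved",
)

# No id supplied: the repository generates one when the event is created.
generated = EventCreateDTO(
    family_import_id="test.new.family.0",
    event_title="Law passed",
    date=datetime(2024, 9, 10),
    event_type_value="Passed/Approved",
)
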
18 changes: 16 additions & 2 deletions app/model/ingest.py
@@ -5,6 +5,7 @@

from app.model.collection import CollectionCreateDTO
from app.model.document import DocumentCreateDTO
from app.model.event import EventCreateDTO
from app.model.family import FamilyCreateDTO
from app.model.general import Json

@@ -64,11 +65,25 @@ class IngestEventDTO(BaseModel):

import_id: str
family_import_id: str
family_document_import_id: str
family_document_import_id: Optional[str] = None
event_title: str
date: datetime
event_type_value: str

def to_event_create_dto(self) -> EventCreateDTO:
"""
Convert IngestEventDTO to EventCreateDTO.

:return EventCreateDTO: Converted EventCreateDTO instance.
"""
return EventCreateDTO(
import_id=self.import_id,
family_import_id=self.family_import_id,
event_title=self.event_title,
date=self.date,
event_type_value=self.event_type_value,
)


class IngestDocumentDTO(BaseModel):
"""Representation of a document for ingest."""
@@ -77,7 +92,6 @@ class IngestDocumentDTO(BaseModel):
family_import_id: str
variant_name: Optional[str] = None
metadata: Json
events: list[str]
title: str
source_url: Optional[AnyHttpUrl] = None
user_language_name: Optional[str]
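
family_document_import_id is now optional on IngestEventDTO, and the new to_event_create_dto converter maps an ingest record onto the EventCreateDTO the repository expects. A hedged sketch of that round trip (the record values are illustrative; Pydantic is assumed to coerce the ISO date string into a datetime):

from app.model.ingest import IngestEventDTO

# One event record as it might appear in the bulk-ingest JSON.
raw_event = {
    "import_id": "test.new.event.0",
    "family_import_id": "test.new.family.0",
    "event_title": "Law passed",
    "date": "2024-09-10T00:00:00",
    "event_type_value": "Passed/Approved",
    # "family_document_import_id" can now be omitted entirely.
}

dto = IngestEventDTO(**raw_event).to_event_create_dto()
assert dto.import_id == "test.new.event.0"
assert dto.family_import_id == "test.new.family.0"
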
9 changes: 6 additions & 3 deletions app/repository/event.py
@@ -69,6 +69,7 @@ def _event_to_dto(family_event_meta: FamilyEventTuple) -> EventReadDTO:

def _dto_to_event_dict(dto: EventCreateDTO) -> dict:
return {
"import_id": dto.import_id if dto.import_id else None,
"family_import_id": dto.family_import_id,
"family_document_import_id": dto.family_document_import_id,
"date": dto.date,
@@ -179,9 +180,11 @@ def create(db: Session, event: EventCreateDTO) -> str:
)

org_name = cast(str, org.name)
new_family_event.import_id = cast(
Column, generate_import_id(db, CountedEntity.Event, org_name)
)

if not new_family_event.import_id:
new_family_event.import_id = cast(
Column, generate_import_id(db, CountedEntity.Event, org_name)
)

db.add(new_family_event)
except Exception:
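
The repository change above makes import_id generation conditional: an id already present on the event is kept, and generate_import_id only runs when none was provided. The decision distils to the tiny, runnable helper below (the helper name is invented for illustration; the real code applies the same check inline on new_family_event.import_id):

from typing import Callable, Optional


def resolve_import_id(provided: Optional[str], generate: Callable[[], str]) -> str:
    # Keep a caller-supplied id; otherwise fall back to the generator.
    return provided if provided else generate()


assert resolve_import_id("test.new.event.0", lambda: "generated.id") == "test.new.event.0"
assert resolve_import_id(None, lambda: "generated.id") == "generated.id"
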
135 changes: 86 additions & 49 deletions app/service/ingest.py
@@ -5,128 +5,162 @@
import of data and other services for validation etc.
"""

from typing import Optional
from typing import Any, Optional

from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys
from fastapi import HTTPException, status
from pydantic import ConfigDict, validate_call
from sqlalchemy.orm import Session

import app.clients.db.session as db_session
import app.repository.collection as collection_repository
import app.repository.document as document_repository
import app.repository.event as event_repository
import app.repository.family as family_repository
import app.service.category as category
import app.service.collection as collection
import app.service.corpus as corpus
import app.service.geography as geography
import app.service.metadata as metadata
import app.service.validation as validation
from app.errors import ValidationError
from app.model.ingest import IngestCollectionDTO, IngestDocumentDTO, IngestFamilyDTO
from app.service.collection import validate_import_id
from app.model.ingest import (
IngestCollectionDTO,
IngestDocumentDTO,
IngestEventDTO,
IngestFamilyDTO,
)


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_collections(
collection_data: list[dict], corpus_import_id: str, db: Optional[Session] = None
collection_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new collections with the values passed.

:param Session db: The database session to use for saving collections.
:param list[dict] collection_data: The data to use for creating collections.
:param list[dict[str, Any]] collection_data: The data to use for creating collections.
:param str corpus_import_id: The import_id of the corpus the collections belong to.
:return str: The new import_ids for the saved collections.
:param Optional[Session] db: The database session to use for saving collections or None.
:return list[str]: The new import_ids for the saved collections.
"""
if db is None:
db = db_session.get_db()

validation.validate_collections(collection_data)

collection_import_ids = []
org_id = corpus.get_corpus_org_id(corpus_import_id)

for coll in collection_data:
dto = IngestCollectionDTO(**coll).to_collection_create_dto()
if dto.import_id:
validate_import_id(dto.import_id)
import_id = collection_repository.create(db, dto, org_id)

collection_import_ids.append(import_id)

return collection_import_ids


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_families(
family_data: list[dict], corpus_import_id: str, db: Optional[Session] = None
family_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new families with the values passed.

:param Session db: The database session to use for saving families.
:param list[dict] families_data: The data to use for creating families.
:param list[dict[str, Any]] family_data: The data to use for creating families.
:param str corpus_import_id: The import_id of the corpus the families belong to.
:return str: The new import_ids for the saved families.
:param Optional[Session] db: The database session to use for saving families or None.
:return list[str]: The new import_ids for the saved families.
"""

if db is None:
db = db_session.get_db()

validation.validate_families(family_data, corpus_import_id)

family_import_ids = []
org_id = corpus.get_corpus_org_id(corpus_import_id)

for fam in family_data:
# TODO: Uncomment when implementing feature/pdct-1402-validate-collection-exists-before-creating-family
# collection.validate(collections, db)
dto = IngestFamilyDTO(
**fam, corpus_import_id=corpus_import_id
).to_family_create_dto(corpus_import_id)

if dto.import_id:
validate_import_id(dto.import_id)
corpus.validate(db, corpus_import_id)
geo_id = geography.get_id(db, dto.geography)
category.validate(dto.category)
collections = set(dto.collections)
collection.validate_multiple_ids(collections)
# TODO: Uncomment when implementing feature/pdct-1402-validate-collection-exists-before-creating-family
# collection.validate(collections, db)
metadata.validate_metadata(db, corpus_import_id, dto.metadata)

import_id = family_repository.create(db, dto, geo_id, org_id)
import_id = family_repository.create(
db, dto, geography.get_id(db, dto.geography), org_id
)
family_import_ids.append(import_id)

return family_import_ids


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_documents(
document_data: list[dict],
document_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new documents with the values passed.

:param Session db: The database session to use for saving documents.
:param list[dict] document_data: The data to use for creating documents.
:param list[dict[str, Any]] document_data: The data to use for creating documents.
:param str corpus_import_id: The import_id of the corpus the documents belong to.
:return str: The new import_ids for the saved documents.
:param Optional[Session] db: The database session to use for saving documents or None.
:return list[str]: The new import_ids for the saved documents.
"""
if db is None:
db = db_session.get_db()

validation.validate_documents(document_data, corpus_import_id)

document_import_ids = []

for doc in document_data:
dto = IngestDocumentDTO(**doc).to_document_create_dto()

if dto.variant_name == "":
raise ValidationError("Variant name is empty")
metadata.validate_metadata(
db,
corpus_import_id,
dto.metadata,
EntitySpecificTaxonomyKeys.DOCUMENT.value,
)
import_id = document_repository.create(db, dto)
document_import_ids.append(import_id)

return document_import_ids


def validate_entity_relationships(data: dict) -> None:
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_events(
event_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new events with the values passed.

:param list[dict[str, Any]] event_data: The data to use for creating events.
:param str corpus_import_id: The import_id of the corpus the events belong to.
:param Optional[Session] db: The database session to use for saving events or None.
:return list[str]: The new import_ids for the saved events.
"""
if db is None:
db = db_session.get_db()

validation.validate_events(event_data, corpus_import_id)

event_import_ids = []

for event in event_data:
dto = IngestEventDTO(**event).to_event_create_dto()
import_id = event_repository.create(db, dto)
event_import_ids.append(import_id)

return event_import_ids
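
The new save_events above follows the same shape as the other save_* helpers in this file: validate the whole batch through the validation service, then convert each record and hand it to the event repository. A hedged usage sketch (the corpus id and event payload are illustrative placeholders; in practice the function is reached via import_data rather than called directly):

from app.service.ingest import save_events

new_ids = save_events(
    event_data=[
        {
            "import_id": "test.new.event.0",
            "family_import_id": "test.new.family.0",
            "event_title": "Law passed",
            "date": "2024-09-10T00:00:00",
            "event_type_value": "Passed/Approved",
        }
    ],
    corpus_import_id="test.corpus.0.n0000",
)
# One import_id per saved event, either echoed from the input or generated.
print(new_ids)
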


def validate_entity_relationships(data: dict[str, Any]) -> None:
"""
Validates relationships between entities contained in data based on import_ids.
For documents, it validates that the family the document is linked to exists.

:param dict[str, Any] data: The data object containing entities to be validated.
:raises ValidationError: raised should there be any unmatched relationships.
"""
families = []
if "families" in data:
for fam in data["families"]:
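
validate_entity_relationships, introduced just above, cross-checks import_ids within a single payload — per its docstring, a document must reference a family that is also present in the data. That amounts to a set-membership test; a standalone illustration of the idea (the helper below is a sketch, not the function's actual body, which is partly collapsed in this diff):

from typing import Any


def find_unmatched_documents(data: dict[str, Any]) -> list[str]:
    # Families declared in the payload...
    family_ids = {fam["import_id"] for fam in data.get("families", [])}
    # ...and documents pointing at a family outside that set.
    return [
        doc["import_id"]
        for doc in data.get("documents", [])
        if doc["family_import_id"] not in family_ids
    ]


payload = {
    "families": [{"import_id": "test.new.family.0"}],
    "documents": [
        {"import_id": "test.new.document.0", "family_import_id": "test.new.family.0"},
        {"import_id": "test.new.document.1", "family_import_id": "test.missing.family.9"},
    ],
}
assert find_unmatched_documents(payload) == ["test.new.document.1"]
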
@@ -144,21 +178,22 @@ def validate_entity_relationships(data: dict) -> None:


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def import_data(data: dict, corpus_import_id: str) -> dict:
def import_data(data: dict[str, Any], corpus_import_id: str) -> dict[str, str]:
"""
Imports data for a given corpus_import_id.

:param dict data: The data to be imported.
:param dict[str, Any] data: The data to be imported.
:param str corpus_import_id: The import_id of the corpus the data should be imported into.
:raises RepositoryError: raised on a database error.
:raises ValidationError: raised should the import_id be invalid.
:return dict: Import ids of the saved entities.
:raises ValidationError: raised should the data be invalid.
:return dict[str, str]: Import ids of the saved entities.
"""
db = db_session.get_db()

collection_data = data["collections"] if "collections" in data else None
family_data = data["families"] if "families" in data else None
document_data = data["documents"] if "documents" in data else None
event_data = data["events"] if "events" in data else None

if not data:
raise HTTPException(status_code=status.HTTP_204_NO_CONTENT)
@@ -176,6 +211,8 @@ def import_data(data: dict, corpus_import_id: str) -> dict:
response["families"] = save_families(family_data, corpus_import_id, db)
if document_data:
response["documents"] = save_documents(document_data, corpus_import_id, db)
if event_data:
response["events"] = save_events(event_data, corpus_import_id, db)

return response
except Exception as e:
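
import_data now dispatches the optional "events" key alongside collections, families and documents, collecting the saved import_ids under the same keys. A shape-only sketch of a call (the ellipses stand for lists of entity dicts shaped like the Ingest*DTO models; this is not a runnable payload, and the corpus id is a placeholder):

from app.service.ingest import import_data

bulk_payload = {
    "collections": [...],  # IngestCollectionDTO-shaped dicts
    "families": [...],     # IngestFamilyDTO-shaped dicts
    "documents": [...],    # IngestDocumentDTO-shaped dicts
    "events": [...],       # IngestEventDTO-shaped dicts, new in this PR
}

result = import_data(bulk_payload, corpus_import_id="test.corpus.0.n0000")
# result maps each key that was supplied to the list of saved import_ids,
# e.g. result["events"] -> ["test.new.event.0", ...]
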