Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/pdct 1350 ingest events #211

Merged
merged 34 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
00a5eaf
Validate import_id and family_import_id before saving documents
annaCPR Sep 5, 2024
ebe78ff
Make fmaily_document_import_id optional on ingest events
annaCPR Sep 9, 2024
7da9c28
Ingest events
annaCPR Sep 9, 2024
2498caf
Validate import id when saving events
annaCPR Sep 9, 2024
f56bbcf
Validate family_import_id when saving events
annaCPR Sep 9, 2024
faee609
Do not generate an import_id when savin gevents if one is already pro…
annaCPR Sep 9, 2024
4b86125
Validate event type against corpus taxonomy
annaCPR Sep 9, 2024
0544951
Move collection validation logic to a separate service
annaCPR Sep 9, 2024
e1421ae
Move family validation logic to a separate service
annaCPR Sep 9, 2024
78aab80
Move document validation logic to a separate service
annaCPR Sep 9, 2024
80e4af3
Move event validation logci to a separate service
annaCPR Sep 9, 2024
f3aefa1
Switch to using validation service when ingesting collections
annaCPR Sep 9, 2024
101353b
Switch to using validation service when ingesting families
annaCPR Sep 9, 2024
eccee88
Use validation service when ingesting documents
annaCPR Sep 9, 2024
6000836
Use validation service when ingesting events
annaCPR Sep 9, 2024
121775d
Tidy up tests
annaCPR Sep 9, 2024
1468dbd
More tidy up of test data
annaCPR Sep 9, 2024
8856c9a
Tiny refactor
annaCPR Sep 9, 2024
a9cadd9
Return better error on Exception
annaCPR Sep 9, 2024
03ec9eb
Bump minor version
annaCPR Sep 9, 2024
5b9c859
Fix mocks in unit tests + tidy up
annaCPR Sep 10, 2024
e714f25
Add missing docstrings to validation service
annaCPR Sep 10, 2024
f467133
Add missing doctypes and tighten return types
annaCPR Sep 10, 2024
8c51a0a
Update docstrings
annaCPR Sep 10, 2024
2dd50ee
Make test filenames more meaningful
annaCPR Sep 10, 2024
3c5d3f1
Reference correct test files after rename
annaCPR Sep 10, 2024
8954223
Update docstrings, again...
annaCPR Sep 10, 2024
17d067e
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
8671316
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
6e99c08
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
8589f41
Update tests/unit_tests/routers/ingest/test_ingest.py
annaCPR Sep 10, 2024
bb21c69
Update tests/unit_tests/routers/ingest/test_ingest.py
annaCPR Sep 10, 2024
c3a9785
Update tests/integration_tests/ingest/test_ingest.py
annaCPR Sep 10, 2024
123b648
Add missing imports and params in docstrings for validation
annaCPR Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 35 additions & 24 deletions app/service/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import of data and other services for validation etc.
"""

from typing import Optional
from typing import Any, Optional

from fastapi import HTTPException, status
from pydantic import ConfigDict, validate_call
Expand All @@ -30,15 +30,17 @@

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_collections(
collection_data: list[dict], corpus_import_id: str, db: Optional[Session] = None
collection_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new collections with the values passed.

:param list[dict] collection_data: The data to use for creating collections.
:param list[dict[str, Any]] collection_data: The data to use for creating collections.
:param str corpus_import_id: The import_id of the corpus the collections belong to.
:param Optional[Session] The database session to use for saving collections.
:return str: The new import_ids for the saved collections.
:param Optional[Session] The database session to use for saving collections or None.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
:return list[str]: The new import_ids for the saved collections.
"""
if db is None:
db = db_session.get_db()
Expand All @@ -58,15 +60,17 @@ def save_collections(

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_families(
family_data: list[dict], corpus_import_id: str, db: Optional[Session] = None
family_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new families with the values passed.

:param list[dict] families_data: The data to use for creating families.
:param list[dict[str, Any]] families_data: The data to use for creating families.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
:param str corpus_import_id: The import_id of the corpus the families belong to.
:param Optional[Session] The database session to use for saving families.
:return str: The new import_ids for the saved families.
:param Optional[Session] The database session to use for saving families or None.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
:return list[str]: The new import_ids for the saved families.
"""

if db is None:
Expand All @@ -93,17 +97,17 @@ def save_families(

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_documents(
document_data: list[dict],
document_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new documents with the values passed.

:param list[dict] document_data: The data to use for creating documents.
:param list[dict[str, Any]] document_data: The data to use for creating documents.
:param str corpus_import_id: The import_id of the corpus the documents belong to.
:param Optional[Session] The database session to use for saving documents.
:return str: The new import_ids for the saved documents.
:param Optional[Session] The database session to use for saving documents or None.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
:return list[str]: The new import_ids for the saved documents.
"""
if db is None:
db = db_session.get_db()
Expand All @@ -122,17 +126,17 @@ def save_documents(

@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def save_events(
event_data: list[dict],
event_data: list[dict[str, Any]],
corpus_import_id: str,
db: Optional[Session] = None,
) -> list[str]:
"""
Creates new events with the values passed.

:param list[dict] event_data: The data to use for creating events.
:param list[dict[str, Any]] event_data: The data to use for creating events.
:param str corpus_import_id: The import_id of the corpus the events belong to.
:param Optional[Session] The database session to use for saving events.
:return str: The new import_ids for the saved events.
:param Optional[Session] The database session to use for saving events or None.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
:return list[str]: The new import_ids for the saved events.
"""
if db is None:
db = db_session.get_db()
Expand All @@ -141,15 +145,22 @@ def save_events(

event_import_ids = []

for ev in event_data:
dto = IngestEventDTO(**ev).to_event_create_dto()
for event in event_data:
dto = IngestEventDTO(**event).to_event_create_dto()
import_id = event_repository.create(db, dto)
event_import_ids.append(import_id)

return event_import_ids


def validate_entity_relationships(data: dict) -> None:
def validate_entity_relationships(data: dict[str, Any]) -> None:
"""
Validates relationships between entities contained in data based on import_ids.
For documents, it validates that the family the document is linked to exists.

:param dict[str, Any] data: The data object containing entities to be validated.
:raises ValidationError: raised should there be any unmatched relationships.
"""
families = []
if "families" in data:
for fam in data["families"]:
Expand All @@ -167,15 +178,15 @@ def validate_entity_relationships(data: dict) -> None:


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def import_data(data: dict, corpus_import_id: str) -> dict:
def import_data(data: dict[str, Any], corpus_import_id: str) -> dict[str, str]:
"""
Imports data for a given corpus_import_id.

:param dict data: The data to be imported.
:param dict[str, Any] data: The data to be imported.
:param str corpus_import_id: The import_id of the corpus the data should be imported into.
:raises RepositoryError: raised on a database error.
:raises ValidationError: raised should the import_id be invalid.
:return dict: Import ids of the saved entities.
:raises ValidationError: raised should the data be invalid.
:return dict[str, str]: Import ids of the saved entities.
"""
db = db_session.get_db()

Expand Down
62 changes: 53 additions & 9 deletions app/service/validation.py
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Any, Optional

from db_client.functions.corpus_helpers import TaxonomyData, get_taxonomy_from_corpus
from db_client.models.dfce.taxonomy_entry import EntitySpecificTaxonomyKeys
Expand All @@ -12,16 +12,33 @@
from app.service.collection import validate_import_id


def validate_collection(collection: dict) -> None:
def validate_collection(collection: dict[str, Any]) -> None:
"""
Validates a collection.

:param dict[str, Any] collection: The collection object to be validated.
:raises ValidationError: raised should the data be invalid.
"""
validate_import_id(collection["import_id"])


def validate_collections(collections: list[dict]) -> None:
def validate_collections(collections: list[dict[str, Any]]) -> None:
"""
Validates a list of collections.

:param list[dict[str, Any]] collections: The list of collection objects to be validated.
"""
for coll in collections:
validate_collection(coll)


def validate_family(family: dict, corpus_import_id: str) -> None:
def validate_family(family: dict[str, Any], corpus_import_id: str) -> None:
"""
Validates a family.

:param dict[str, Any] family: The family object to be validated.
:raises ValidationError: raised should the data be invalid.
"""
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
db = db_session.get_db()

validate_import_id(family["import_id"])
Expand All @@ -32,12 +49,23 @@ def validate_family(family: dict, corpus_import_id: str) -> None:
metadata.validate_metadata(db, corpus_import_id, family["metadata"])


def validate_families(families: list[dict], corpus_import_id: str) -> None:
def validate_families(families: list[dict[str, Any]], corpus_import_id: str) -> None:
"""
Validates a list of families.

:param list[dict[str, Any]] families: The list of family objects to be validated.
"""
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
for fam in families:
validate_family(fam, corpus_import_id)


def validate_document(document: dict, corpus_import_id: str) -> None:
def validate_document(document: dict[str, Any], corpus_import_id: str) -> None:
"""
Validates a document.

:param dict[str, Any] document: The document object to be validated.
:raises ValidationError: raised should the data be invalid.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
"""
db = db_session.get_db()

validate_import_id(document["import_id"])
Expand All @@ -52,12 +80,23 @@ def validate_document(document: dict, corpus_import_id: str) -> None:
)


def validate_documents(documents: list[dict], corpus_import_id: str) -> None:
def validate_documents(documents: list[dict[str, Any]], corpus_import_id: str) -> None:
"""
Validates a list of documents.

:param list[dict[str, Any]] documents: The list of document objects to be validated.
"""
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
for doc in documents:
validate_document(doc, corpus_import_id)


def validate_event(event: dict, taxonomy: Optional[TaxonomyData]) -> None:
def validate_event(event: dict[str, Any], taxonomy: Optional[TaxonomyData]) -> None:
"""
Validates an event.

:param dict[str, Any] event: The event object to be validated.
:raises ValidationError: raised should the data be invalid.
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
"""
validate_import_id(event["import_id"])
validate_import_id(event["family_import_id"])
allowed_event_types = taxonomy["event_type"]["allowed_values"] if taxonomy else None
Expand All @@ -72,7 +111,12 @@ def validate_event(event: dict, taxonomy: Optional[TaxonomyData]) -> None:
raise ValidationError(f"Event type ['{event['event_type_value']}'] is invalid!")


def validate_events(events: list[dict], corpus_import_id: str) -> None:
def validate_events(events: list[dict[str, Any]], corpus_import_id: str) -> None:
"""
Validates a list of events.

:param list[dict[str, Any]] events: The list of event objects to be validated.
"""
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
db = db_session.get_db()

event_taxonomy = get_taxonomy_from_corpus(db, corpus_import_id)
Expand Down
15 changes: 11 additions & 4 deletions tests/integration_tests/ingest/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ def test_ingest_when_ok(data_db: Session, client: TestClient, user_header_token)

response = client.post(
"/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
files={"new_data": open("tests/integration_tests/ingest/test.json", "rb")},
files={
"new_data": open("tests/integration_tests/ingest/test_bulk_data.json", "rb")
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
},
headers=user_header_token,
)

Expand Down Expand Up @@ -79,7 +81,9 @@ def test_ingest_rollback(

response = client.post(
"/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
files={"new_data": open("tests/integration_tests/ingest/test.json", "rb")},
files={
"new_data": open("tests/integration_tests/ingest/test_bulk_data.json", "rb")
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
},
headers=user_header_token,
)

Expand All @@ -98,7 +102,9 @@ def test_ingest_when_corpus_import_id_invalid(
invalid_corpus = "test"
response = client.post(
f"/api/v1/ingest/{invalid_corpus}",
files={"new_data": open("tests/integration_tests/ingest/test.json", "rb")},
files={
"new_data": open("tests/integration_tests/ingest/test_bulk_data.json", "rb")
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
},
headers=user_header_token,
)

Expand All @@ -116,7 +122,8 @@ def test_ingest_events_when_event_type_invalid(
"/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
files={
"new_data": open(
"tests/integration_tests/ingest/test_invalid_event_type.json", "rb"
"tests/integration_tests/ingest/test_bulk_data_with_invalid_event_type.json",
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
"rb",
)
},
headers=user_header_token,
Expand Down
12 changes: 10 additions & 2 deletions tests/unit_tests/routers/ingest/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ def test_ingest_data_when_ok(

response = client.post(
"/api/v1/ingest/test",
files={"new_data": open("tests/unit_tests/routers/ingest/test.json", "rb")},
files={
"new_data": open(
"tests/unit_tests/routers/ingest/test_bulk_data.json", "rb"
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
)
},
headers=user_header_token,
)

Expand Down Expand Up @@ -94,7 +98,11 @@ def test_ingest_data_when_db_error(

response = client.post(
"/api/v1/ingest/test",
files={"new_data": open("tests/unit_tests/routers/ingest/test.json", "rb")},
files={
"new_data": open(
"tests/unit_tests/routers/ingest/test_bulk_data.json", "rb"
annaCPR marked this conversation as resolved.
Show resolved Hide resolved
)
},
headers=user_header_token,
)

Expand Down
Loading