-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reluctantly move ingest code into tests for setup
Unfortunately, we rely on the ingest code for test setup. So although this has been removed from the app, we still need it to maintain test coverage. That being said, we are one step closer to deleting it and no longer need to test it, so this is still progress. The ideal solution here is probably to replace the many fragmented test setups with a single, repurposable setup factory. But that's beyond the scope of the current change.
- Loading branch information
Showing
21 changed files
with
2,483 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import json | ||
import logging | ||
from typing import Any, Optional | ||
|
||
from pydantic.json import pydantic_encoder | ||
from sqlalchemy.exc import IntegrityError | ||
from sqlalchemy.orm import Session | ||
from tests.core.ingestion.legacy_setup.cclw.ingest_row_cclw import EventIngestRow | ||
from tests.core.ingestion.legacy_setup.utils import get_or_create, to_dict | ||
|
||
from app.db.models.law_policy import FamilyEvent | ||
|
||
_LOGGER = logging.getLogger(__name__) | ||
|
||
|
||
def family_event_from_row(
    db: Session,
    row: EventIngestRow,
    result: dict[str, Any],
) -> Optional[FamilyEvent]:
    """
    Get or create the FamilyEvent described by the given events CSV row.

    This is a thin public wrapper around _maybe_create_family_event, which
    swallows IntegrityError and returns None instead of raising.

    :param [Session] db: connection to the database.
    :param [EventIngestRow] row: the row built from the events CSV.
    :param [dict[str, Any]] result: a result dict in which to track what was
        created. Successful events are appended under "family_events" and rows
        that failed are appended under "family_event_errors".
    :return [Optional[FamilyEvent]]: The family event that was either retrieved
        or created, or None when an IntegrityError prevented its creation.
    """
    return _maybe_create_family_event(db, row, result)
|
||
|
||
def _maybe_create_family_event(
    db: Session, row: EventIngestRow, result: dict[str, Any]
) -> Optional[FamilyEvent]:
    """
    Get or create a FamilyEvent for the row and record the outcome in result.

    :param [Session] db: connection to the database.
    :param [EventIngestRow] row: the row built from the events CSV.
    :param [dict[str, Any]] result: mutated in place. On success the event (as
        a dict) is appended to result["family_events"]; on IntegrityError the
        JSON-round-tripped row is appended to result["family_event_errors"].
    :return [Optional[FamilyEvent]]: the retrieved/created event, or None if an
        IntegrityError was raised.
    """
    event_fields = {
        "title": row.title,
        "date": row.date,
        "event_type_name": row.event_type,
        "family_import_id": row.cpr_family_id,
        "family_document_import_id": None,  # TODO: link to documents in future
        "status": row.event_status,
    }
    try:
        family_event = get_or_create(
            db,
            FamilyEvent,
            import_id=row.cpr_event_id,
            extra=event_fields,
        )
        # Serialise before touching `result` so a failure leaves it unchanged.
        serialised_event = to_dict(family_event)
        result.setdefault("family_events", []).append(serialised_event)
        return family_event
    except IntegrityError:
        # Round-trip through JSON so the row becomes plain, loggable data.
        row_dict = json.loads(json.dumps(row, default=pydantic_encoder))
        _LOGGER.exception(
            "Failed to create family event due to foreign key violation",
            extra={"props": {"event_details": row_dict}},
        )
        result.setdefault("family_event_errors", []).append(row_dict)
        return None
117 changes: 117 additions & 0 deletions
117
tests/core/ingestion/legacy_setup/cclw/ingest_row_cclw.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from datetime import datetime | ||
from typing import ClassVar, Optional | ||
|
||
from pydantic import ConfigDict | ||
from pydantic.dataclasses import dataclass | ||
from tests.core.ingestion.legacy_setup.ingest_row_base import BaseIngestRow | ||
|
||
from app.db.models.law_policy import EventStatus, FamilyCategory | ||
|
||
|
||
# Column headings listed as required for the documents CSV. The set built from
# this list is what CCLWDocumentIngestRow exposes as VALID_COLUMNS.
_REQUIRED_DOCUMENT_COLUMNS = [
    "ID",
    "Document ID",
    "Collection name",
    "Collection summary",
    "Document title",
    "Family name",
    "Family summary",
    "Document role",
    "Document variant",
    "Geography ISO",
    "Documents",
    "Category",
    "Sectors",
    "Instruments",
    "Frameworks",
    "Responses",
    "Natural Hazards",
    "Document Type",
    "Language",
    "Keywords",
    "Geography",
    "CPR Document ID",
    "CPR Family ID",
    "CPR Collection ID",
    "CPR Family Slug",
    "CPR Document Slug",
    "CPR Document Status",
]
VALID_DOCUMENT_COLUMN_NAMES = {*_REQUIRED_DOCUMENT_COLUMNS}
|
||
# Column headings listed as required for the events CSV. The set built from
# this list is what EventIngestRow exposes as VALID_COLUMNS.
_REQUIRED_EVENT_COLUMNS = [
    "Id",
    "Event type",
    "Title",
    "Date",
    "CPR Event ID",
    "CPR Family ID",
]
VALID_EVENT_COLUMN_NAMES = {*_REQUIRED_EVENT_COLUMNS}
|
||
|
||
@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra="forbid"))
class CCLWDocumentIngestRow(BaseIngestRow):
    """
    One row of the documents-families-collections CSV.

    The dataclass is frozen and forbids extra fields (see the ConfigDict above).
    """

    id: str
    document_id: str
    collection_name: str
    collection_summary: str
    document_title: str
    family_name: str
    family_summary: str
    document_role: str
    document_variant: str
    geography_iso: str
    documents: str
    category: FamilyCategory
    sectors: list[str]  # METADATA
    instruments: list[str]  # METADATA
    frameworks: list[str]  # METADATA
    responses: list[str]  # METADATA - topics
    natural_hazards: list[str]  # METADATA - hazard
    keywords: list[str]
    document_type: str
    language: list[str]
    geography: str
    cpr_document_id: str
    cpr_family_id: str
    cpr_collection_id: str
    cpr_family_slug: str
    cpr_document_slug: str
    cpr_document_status: str

    VALID_COLUMNS: ClassVar[set[str]] = VALID_DOCUMENT_COLUMN_NAMES

    @staticmethod
    def _key(key: str) -> str:
        """Lower-case the key and swap spaces for underscores."""
        lowered = key.lower()
        return lowered.replace(" ", "_")

    def get_first_url(self) -> Optional[str]:
        """
        Return the URL part of the single entry held in 'documents'.

        The attribute is split on ";" and must yield exactly one entry; the
        text before the first "|" of that entry is returned, or None when that
        text is empty.

        TODO: This could/should be written with more validation.

        :raises [ValueError]: when splitting on ";" yields more than one entry.
        """
        entries = self.documents.split(";")
        if len(entries) == 1:
            url, _, _ = entries[0].partition("|")
            return url if url else None

        raise ValueError(f"Expected 1 document to be parsed from: {self.documents}")
|
||
|
||
@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra="ignore"))
class EventIngestRow(BaseIngestRow):
    """
    One row of the events CSV.

    The dataclass is frozen and ignores extra fields (see the ConfigDict above).
    """

    id: str
    event_type: str
    title: str
    date: datetime
    cpr_event_id: str
    cpr_family_id: str
    # NOTE(review): there is no "Event status" heading in
    # _REQUIRED_EVENT_COLUMNS - confirm how this field gets populated.
    event_status: EventStatus

    VALID_COLUMNS: ClassVar[set[str]] = VALID_EVENT_COLUMN_NAMES
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from typing import Union | ||
|
||
from tests.core.ingestion.legacy_setup.cclw.ingest_row_cclw import CCLWDocumentIngestRow | ||
from app.db.models.law_policy.metadata import FamilyMetadata | ||
from sqlalchemy.orm import Session | ||
from tests.core.ingestion.legacy_setup.utils import Result, ResultType | ||
from tests.core.ingestion.legacy_setup.metadata import ( | ||
Taxonomy, | ||
MetadataJson, | ||
build_metadata_field, | ||
) | ||
|
||
|
||
# Maps each taxonomy key to the CCLWDocumentIngestRow attribute that holds the
# values for it (read with getattr in build_cclw_metadata).
MAP_OF_LIST_VALUES = {
    "sector": "sectors",
    "instrument": "instruments",
    "framework": "frameworks",
    "topic": "responses",
    "hazard": "natural_hazards",
    "keyword": "keywords",
}
|
||
|
||
def add_cclw_metadata(
    db: Session,
    family_import_id: str,
    taxonomy: Taxonomy,
    taxonomy_id: int,
    row: CCLWDocumentIngestRow,
) -> bool:
    """
    Build the metadata for a row and add a FamilyMetadata object to the session.

    Nothing is added when building the metadata reports an error.

    :param [Session] db: connection to the database.
    :param [str] family_import_id: import id stored on the FamilyMetadata.
    :param [Taxonomy] taxonomy: taxonomy passed through to build_cclw_metadata.
    :param [int] taxonomy_id: taxonomy id stored on the FamilyMetadata.
    :param [CCLWDocumentIngestRow] row: the row to build metadata from.
    :return [bool]: False if the build result is an ERROR, otherwise True.
    """
    outcome, metadata_value = build_cclw_metadata(taxonomy, row)
    if outcome.type == ResultType.ERROR:
        return False

    family_metadata = FamilyMetadata(
        family_import_id=family_import_id,
        taxonomy_id=taxonomy_id,
        value=metadata_value,
    )
    db.add(family_metadata)
    return True
|
||
|
||
def build_cclw_metadata(
    taxonomy: Taxonomy, row: CCLWDocumentIngestRow
) -> tuple[Result, MetadataJson]:
    """
    Build the metadata value for a row, one taxonomy key at a time.

    Each entry of MAP_OF_LIST_VALUES is passed to build_metadata_field. Fields
    that come back OK or RESOLVED are stored in the value; details are
    collected for every field that is not OK.

    :param [Taxonomy] taxonomy: taxonomy handed to build_metadata_field.
    :param [CCLWDocumentIngestRow] row: the row whose attributes are read.
    :return [tuple[Result, MetadataJson]]: the overall result (ERROR if any
        field failed, else RESOLVED if any field was resolved, else OK) with
        the collected details joined by newlines, and the metadata value.
    """
    details = []
    metadata: dict[str, Union[str, list[str]]] = {}
    resolved_count = 0
    failure_count = 0

    for tax_key, row_key in MAP_OF_LIST_VALUES.items():
        ingest_values = getattr(row, row_key)
        field_result, field_value = build_metadata_field(
            row.row_number, taxonomy, ingest_values, tax_key
        )

        if field_result.type == ResultType.OK:
            metadata[tax_key] = field_value
            continue

        # Anything other than OK contributes its details to the row result.
        if field_result.type == ResultType.RESOLVED:
            metadata[tax_key] = field_value
            details.append(field_result.details)
            resolved_count += 1
        else:
            details.append(field_result.details)
            failure_count += 1

    # A failure outranks a resolution, which outranks a clean result.
    if failure_count:
        overall_type = ResultType.ERROR
    elif resolved_count:
        overall_type = ResultType.RESOLVED
    else:
        overall_type = ResultType.OK

    return Result(type=overall_type, details="\n".join(details)), metadata
Oops, something went wrong.