Skip to content

Commit

Permalink
Reluctantly move ingest code into tests for setup
Browse files Browse the repository at this point in the history
Unfortunately we rely on the ingest code for test setup. So although this
has been removed from the app, we still need it to maintain test coverage.
That being said, we are one step closer to deleting it and no longer need
to test it, so this is still progress.

The ideal solution here is probably to replace the many fragmented test
setups with a single, repurposable setup factory. But that's beyond the
scope of the current change.
  • Loading branch information
olaughter committed Jan 25, 2024
1 parent f9bdea7 commit 5467c36
Show file tree
Hide file tree
Showing 21 changed files with 2,483 additions and 6 deletions.
68 changes: 68 additions & 0 deletions tests/core/ingestion/legacy_setup/cclw/event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
import logging
from typing import Any, Optional

from pydantic.json import pydantic_encoder
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from tests.core.ingestion.legacy_setup.cclw.ingest_row_cclw import EventIngestRow
from tests.core.ingestion.legacy_setup.utils import get_or_create, to_dict

from app.db.models.law_policy import FamilyEvent

_LOGGER = logging.getLogger(__name__)


def family_event_from_row(
    db: Session,
    row: EventIngestRow,
    result: dict[str, Any],
) -> Optional[FamilyEvent]:
    """
    Get or create the FamilyEvent described by the given events CSV row.

    :param [Session] db: connection to the database.
    :param [EventIngestRow] row: the row built from the events CSV.
    :param [dict[str, Any]] result: a result dict in which to track what was
        created. It is updated in place: under "family_events" on success,
        or under "family_event_errors" when creation fails.
    :return [Optional[FamilyEvent]]: The family event that was either retrieved
        or created, or None when an IntegrityError prevented its creation.
    """
    # All of the work (including error tracking) lives in the helper; this
    # function is the public entry point for event ingestion.
    return _maybe_create_family_event(db, row, result)


def _maybe_create_family_event(
    db: Session, row: EventIngestRow, result: dict[str, Any]
) -> Optional[FamilyEvent]:
    """
    Get or create a FamilyEvent for the row, recording the outcome in result.

    :param [Session] db: connection to the database.
    :param [EventIngestRow] row: the row built from the events CSV.
    :param [dict[str, Any]] result: updated in place; a dict form of the event
        is appended to result["family_events"] on success, and a dict form of
        the row is appended to result["family_event_errors"] on failure.
    :return [Optional[FamilyEvent]]: the event, or None if an IntegrityError
        was raised while getting or creating it.
    """
    try:
        event = get_or_create(
            db,
            FamilyEvent,
            import_id=row.cpr_event_id,
            extra={
                "title": row.title,
                "date": row.date,
                "event_type_name": row.event_type,
                "family_import_id": row.cpr_family_id,
                "family_document_import_id": None,  # TODO: link to documents in future
                "status": row.event_status,
            },
        )
        # Serialise first so the result dict is only touched once we have
        # something to record.
        event_as_dict = to_dict(event)
        result.setdefault("family_events", []).append(event_as_dict)
        return event
    except IntegrityError:
        # Round-trip through JSON to get a plain, loggable dict of the row.
        failed_row = json.loads(json.dumps(row, default=pydantic_encoder))
        _LOGGER.exception(
            "Failed to create family event due to foreign key violation",
            extra={"props": {"event_details": failed_row}},
        )
        result.setdefault("family_event_errors", []).append(failed_row)
        return None
117 changes: 117 additions & 0 deletions tests/core/ingestion/legacy_setup/cclw/ingest_row_cclw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from datetime import datetime
from typing import ClassVar, Optional

from pydantic import ConfigDict
from pydantic.dataclasses import dataclass
from tests.core.ingestion.legacy_setup.ingest_row_base import BaseIngestRow

from app.db.models.law_policy import EventStatus, FamilyCategory


# Column names for the documents CSV. The list keeps the original ordering;
# the derived set is what CCLWDocumentIngestRow exposes as VALID_COLUMNS.
_REQUIRED_DOCUMENT_COLUMNS = [
    "ID", "Document ID",
    "Collection name", "Collection summary",
    "Document title",
    "Family name", "Family summary",
    "Document role", "Document variant",
    "Geography ISO",
    "Documents",
    "Category",
    "Sectors", "Instruments", "Frameworks", "Responses", "Natural Hazards",
    "Document Type",
    "Language",
    "Keywords",
    "Geography",
    "CPR Document ID", "CPR Family ID", "CPR Collection ID",
    "CPR Family Slug", "CPR Document Slug",
    "CPR Document Status",
]
VALID_DOCUMENT_COLUMN_NAMES = set(_REQUIRED_DOCUMENT_COLUMNS)

# Column names for the events CSV. The derived set is what EventIngestRow
# exposes as VALID_COLUMNS.
_REQUIRED_EVENT_COLUMNS = [
    "Id", "Event type", "Title", "Date",
    "CPR Event ID", "CPR Family ID",
]
VALID_EVENT_COLUMN_NAMES = set(_REQUIRED_EVENT_COLUMNS)


@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra="forbid"))
class CCLWDocumentIngestRow(BaseIngestRow):
    """
    A single row of input from the documents-families-collections CSV.

    Each field corresponds to one entry of VALID_DOCUMENT_COLUMN_NAMES,
    lower-cased with spaces replaced by underscores (the form produced by
    ``_key``).
    """

    id: str
    document_id: str

    # Collection
    collection_name: str
    collection_summary: str

    document_title: str

    # Family
    family_name: str
    family_summary: str

    document_role: str
    document_variant: str
    geography_iso: str
    documents: str  # parsed by get_first_url()
    category: FamilyCategory

    # Multi-valued columns
    sectors: list[str]  # METADATA
    instruments: list[str]  # METADATA
    frameworks: list[str]  # METADATA
    responses: list[str]  # METADATA - topics
    natural_hazards: list[str]  # METADATA - hazard
    keywords: list[str]

    document_type: str
    language: list[str]
    geography: str

    # CPR identifiers, slugs and status
    cpr_document_id: str
    cpr_family_id: str
    cpr_collection_id: str
    cpr_family_slug: str
    cpr_document_slug: str
    cpr_document_status: str

    VALID_COLUMNS: ClassVar[set[str]] = VALID_DOCUMENT_COLUMN_NAMES

    @staticmethod
    def _key(key: str) -> str:
        """Convert a column name to field form: lower-case, spaces to underscores."""
        lowered = key.lower()
        return lowered.replace(" ", "_")

    def get_first_url(self) -> Optional[str]:
        """
        Return the URL part of the single entry in the 'documents' attribute.

        The attribute must hold exactly one ';'-separated entry; the URL is
        the text before the first '|' in that entry.

        TODO: This could/should be written with more validation.

        :raises [ValueError]: when 'documents' contains more than one entry.
        :return [Optional[str]]: the URL, or None when it is empty.
        """
        # Any ';' means the value would split into more than one document.
        if ";" in self.documents:
            raise ValueError(f"Expected 1 document to be parsed from: {self.documents}")

        url, _, _ = self.documents.partition("|")
        return url if url else None


@dataclass(config=ConfigDict(frozen=True, validate_assignment=True, extra="ignore"))
class EventIngestRow(BaseIngestRow):
    """
    Represents a single row of input from the events CSV.

    Unlike the document row, this dataclass is configured with extra="ignore".
    """

    id: str
    event_type: str
    title: str
    date: datetime
    cpr_event_id: str
    cpr_family_id: str
    # NOTE(review): there is no matching entry for this field in
    # _REQUIRED_EVENT_COLUMNS - confirm how it is populated.
    event_status: EventStatus

    # Set of column names accepted for the events CSV.
    VALID_COLUMNS: ClassVar[set[str]] = VALID_EVENT_COLUMN_NAMES
75 changes: 75 additions & 0 deletions tests/core/ingestion/legacy_setup/cclw/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Union

from tests.core.ingestion.legacy_setup.cclw.ingest_row_cclw import CCLWDocumentIngestRow
from app.db.models.law_policy.metadata import FamilyMetadata
from sqlalchemy.orm import Session
from tests.core.ingestion.legacy_setup.utils import Result, ResultType
from tests.core.ingestion.legacy_setup.metadata import (
Taxonomy,
MetadataJson,
build_metadata_field,
)


# Maps each taxonomy key to the CCLWDocumentIngestRow attribute that holds
# its list of values. build_cclw_metadata iterates this in insertion order.
MAP_OF_LIST_VALUES = {
    # taxonomy key: row attribute
    "sector": "sectors",
    "instrument": "instruments",
    "framework": "frameworks",
    "topic": "responses",  # the "Responses" column feeds the topic key
    "hazard": "natural_hazards",
    "keyword": "keywords",
}


def add_cclw_metadata(
    db: Session,
    family_import_id: str,
    taxonomy: Taxonomy,
    taxonomy_id: int,
    row: CCLWDocumentIngestRow,
) -> bool:
    """
    Build the metadata for a row and add it to the session as FamilyMetadata.

    :param [Session] db: connection to the database.
    :param [str] family_import_id: import id of the family the metadata is for.
    :param [Taxonomy] taxonomy: the taxonomy the row values are built against.
    :param [int] taxonomy_id: id stored on the created FamilyMetadata.
    :param [CCLWDocumentIngestRow] row: the row supplying the metadata values.
    :return [bool]: False if building the metadata resulted in an error (in
        which case nothing is added to the session), True otherwise.
    """
    build_result, metadata_value = build_cclw_metadata(taxonomy, row)
    build_failed = build_result.type == ResultType.ERROR

    if not build_failed:
        # Only added to the session here; no flush or commit is performed.
        family_metadata = FamilyMetadata(
            family_import_id=family_import_id,
            taxonomy_id=taxonomy_id,
            value=metadata_value,
        )
        db.add(family_metadata)

    return not build_failed


def build_cclw_metadata(
    taxonomy: Taxonomy, row: CCLWDocumentIngestRow
) -> tuple[Result, MetadataJson]:
    """
    Build the metadata value for a row, one taxonomy key at a time.

    Every entry of MAP_OF_LIST_VALUES is passed through build_metadata_field.
    Fields that come back OK or RESOLVED are stored in the returned value;
    fields with any other result type are left out of it.

    :param [Taxonomy] taxonomy: the taxonomy the row values are built against.
    :param [CCLWDocumentIngestRow] row: the row supplying the metadata values.
    :return [tuple[Result, MetadataJson]]: an overall Result and the metadata
        value. The Result type is ERROR if any field failed, otherwise
        RESOLVED if any field was resolved, otherwise OK; its details are the
        newline-joined details of every non-OK field. The value is returned
        (possibly partially filled) even when the Result is an ERROR.
    """
    metadata: dict[str, Union[str, list[str]]] = {}
    messages = []
    any_resolved = False
    any_failed = False

    for tax_key, row_attr in MAP_OF_LIST_VALUES.items():
        raw_values = getattr(row, row_attr)
        field_result, field_value = build_metadata_field(
            row.row_number, taxonomy, raw_values, tax_key
        )

        if field_result.type == ResultType.OK:
            metadata[tax_key] = field_value
            continue

        if field_result.type == ResultType.RESOLVED:
            # Resolved values are still stored, but their details are reported.
            metadata[tax_key] = field_value
            messages.append(field_result.details)
            any_resolved = True
        else:
            messages.append(field_result.details)
            any_failed = True

    # A failure outranks a resolution, which outranks a clean result.
    if any_failed:
        overall_type = ResultType.ERROR
    elif any_resolved:
        overall_type = ResultType.RESOLVED
    else:
        overall_type = ResultType.OK

    return Result(type=overall_type, details="\n".join(messages)), metadata
Loading

0 comments on commit 5467c36

Please sign in to comment.