Skip to content

Commit

Permalink
Update ingest and relax schema for variant (#70)
Browse files Browse the repository at this point in the history
* Only return single slug

* replace one_or_none with first

* Update ingest and relax schema for variant

* re-structure documents tests

* tidy up

* tidy up

* improve query by removing carteasian product

* get working with new variant column

* fix some tests

* fix documents endpoint tests
  • Loading branch information
diversemix authored Mar 24, 2023
1 parent 13c6c61 commit 0460bc6
Show file tree
Hide file tree
Showing 18 changed files with 407 additions and 206 deletions.
2 changes: 1 addition & 1 deletion alembic/versions/0013_families_and_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def upgrade():
sa.Column('family_import_id', sa.Text(), nullable=False),
sa.Column('physical_document_id', sa.Integer(), nullable=False),
sa.Column('import_id', sa.Text(), nullable=False),
sa.Column('variant_name', sa.Text(), nullable=False),
sa.Column('variant_name', sa.Text(), nullable=True),
sa.Column('document_status', sa.Enum('CREATED', 'PUBLISHED', 'DELETED', name='documentstatus'), nullable=False),
sa.Column('document_type', sa.Text(), nullable=True),
sa.Column('document_role', sa.Text(), nullable=True),
Expand Down
5 changes: 3 additions & 2 deletions app/api/api_v1/schemas/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class FamilyDocumentResponse(BaseModel):
"""Response for a FamilyDocument, without any family information"""

import_id: str
variant: str
variant: Optional[str]
slug: str
# What follows is off PhysicalDocument
title: str
Expand All @@ -74,7 +74,8 @@ class FamilyDocumentResponse(BaseModel):
source_url: Optional[str]
content_type: Optional[str]
language: str
document_type: str
document_type: Optional[str]
document_role: Optional[str]


class FamilyContext(BaseModel):
Expand Down
18 changes: 6 additions & 12 deletions app/core/ingestion/family.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, cast
from typing import Any, Optional, cast

from sqlalchemy.orm import Session
from app.core.ingestion.ingest_row import DocumentIngestRow
Expand All @@ -13,12 +13,10 @@
FamilyCategory,
Family,
FamilyDocument,
FamilyDocumentType,
FamilyOrganisation,
FamilyStatus,
Geography,
Slug,
Variant,
)


Expand Down Expand Up @@ -103,13 +101,8 @@ def _maybe_create_family_document(
existing_document: Document,
result: dict[str, Any],
) -> FamilyDocument:
# FIXME: these should come from well-known values, not whatever is in the CSV
variant_name = get_or_create(
db, Variant, variant_name=row.document_role, extra={"description": ""}
).variant_name
document_type = get_or_create(
db, FamilyDocumentType, name=row.document_type, extra={"description": ""}
).name
def none_if_empty(data: str) -> Optional[str]:
return data if data != "" else None

family_document = (
db.query(FamilyDocument).filter_by(import_id=row.cpr_document_id).one_or_none()
Expand All @@ -124,9 +117,10 @@ def _maybe_create_family_document(
family_import_id=family.import_id,
physical_document_id=physical_document.id,
import_id=row.cpr_document_id,
variant_name=variant_name,
variant_name=none_if_empty(row.document_variant),
document_status=DocumentStatus.PUBLISHED,
document_type=document_type,
document_type=none_if_empty(row.document_type),
document_role=none_if_empty(row.document_role),
)
db.add(family_document)
db.flush()
Expand Down
2 changes: 2 additions & 0 deletions app/core/ingestion/ingest_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"Family name",
"Family summary",
"Document role",
"Document variant",
"Geography ISO",
"Documents",
"Category",
Expand Down Expand Up @@ -126,6 +127,7 @@ class DocumentIngestRow(BaseIngestRow):
family_name: str
family_summary: str
document_role: str
document_variant: str
geography_iso: str
documents: str
category: FamilyCategory
Expand Down
2 changes: 0 additions & 2 deletions app/core/ingestion/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
"keyword": "keywords",
}

MAP_OF_STR_VALUES = {}


@dataclass(config=ConfigDict(validate_assignment=True, extra=Extra.forbid))
class TaxonomyEntry:
Expand Down
4 changes: 3 additions & 1 deletion app/core/ingestion/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ def get_dfc_validator(db: Session, context: IngestContext) -> ProcessFunc:

def process(context: IngestContext, row: DocumentIngestRow) -> None:
"""Processes the row into the db."""
validate_document_row(context=context, taxonomy=taxonomy, row=row)
_LOGGER.info(f"Validating document row: {row.row_number}")
with db.begin():
validate_document_row(db=db, context=context, taxonomy=taxonomy, row=row)

return process
50 changes: 48 additions & 2 deletions app/core/ingestion/validator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,34 @@
from sqlalchemy.orm import Session
from app.core.ingestion.ingest_row import DocumentIngestRow, EventIngestRow
from app.core.ingestion.metadata import build_metadata, Taxonomy
from app.core.ingestion.utils import IngestContext, Result, ResultType
from app.db.models.law_policy.family import (
FamilyDocumentRole,
FamilyDocumentType,
Variant,
)
from app.db.session import Base

DbTable = Base
CheckResult = Result


def _check_value_in_db(
row_num: int, db: Session, value: str, model: DbTable
) -> CheckResult:
if value != "":
val = db.query(model).get(value)
if val is None:
result = Result(
ResultType.ERROR,
f"Row {row_num}: Not found in db {model.__tablename__}={value}",
)
return result
return Result()


def validate_document_row(
db: Session,
context: IngestContext,
row: DocumentIngestRow,
taxonomy: Taxonomy,
Expand All @@ -15,13 +40,34 @@ def validate_document_row(
:param [DocumentIngestRow] row: DocumentIngestRow object from the current CSV row.
:param [Taxonomy] taxonomy: the Taxonomy against which metadata should be validated.
"""

errors = []
n = row.row_number
result = _check_value_in_db(n, db, row.document_type, FamilyDocumentType)
if result.type != ResultType.OK:
errors.append(result)

result = _check_value_in_db(n, db, row.document_role, FamilyDocumentRole)
if result.type != ResultType.OK:
errors.append(result)

result = _check_value_in_db(n, db, row.document_variant, Variant)
if result.type != ResultType.OK:
errors.append(result)

result, _ = build_metadata(taxonomy, row)
context.results.append(result)
if result.type != ResultType.OK:
errors.append(result)

if len(errors) > 0:
context.results += errors
else:
context.results.append(Result())


def validate_event_row(context: IngestContext, row: EventIngestRow) -> None:
"""
Validate the consituent elements that represent this event row.
Validate the constituent elements that represent this event row.
:param [IngestContext] context: The ingest context.
:param [DocumentIngestRow] row: DocumentIngestRow object from the current CSV row.
Expand Down
1 change: 1 addition & 0 deletions app/data_migrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .populate_category import populate_category
from .populate_document_type import populate_document_type
from .populate_document_role import populate_document_role
from .populate_document_variant import populate_document_variant
from .populate_event_type import populate_event_type
from .populate_framework import populate_framework
from .populate_geo_statistics import populate_geo_statistics
Expand Down
10 changes: 10 additions & 0 deletions app/data_migrations/data/document_variant_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[
{
"variant_name": "Original Language",
"description": "Original Language"
},
{
"variant_name": "Translation",
"description": "Translation"
}
]
19 changes: 19 additions & 0 deletions app/data_migrations/populate_document_variant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json

from sqlalchemy.orm import Session

from app.db.models.law_policy.family import Variant
from .utils import has_rows, load_list


def populate_document_variant(db: Session) -> None:
"""Populates the document_type table with pre-defined data."""

if has_rows(db, Variant):
return

with open(
"app/data_migrations/data/document_variant_data.json"
) as document_variant_file:
document_variant_data = json.load(document_variant_file)
load_list(db, Variant, document_variant_data)
16 changes: 9 additions & 7 deletions app/db/crud/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_family_document_and_context(
published_date=family.published_date,
last_updated_date=family.last_updated_date,
)
document = FamilyDocumentResponse(
response = FamilyDocumentResponse(
import_id=document.import_id,
variant=document.variant_name,
slug=_get_slug_for_family_document_import_id(db, document.import_id),
Expand All @@ -94,9 +94,10 @@ def get_family_document_and_context(
content_type=physical_document.content_type,
language=_get_language_for_phys_doc(db, physical_document.id),
document_type=document.document_type,
document_role=document.document_role,
)

return FamilyDocumentWithContextResponse(family=family, document=document)
return FamilyDocumentWithContextResponse(family=family, document=response)


def _get_language_for_phys_doc(db: Session, physical_document_id: str) -> str:
Expand All @@ -123,9 +124,9 @@ def get_family_and_documents(
db_objects = (
db.query(Family, Geography, FamilyMetadata, FamilyOrganisation, Organisation)
.filter(Family.import_id == import_id)
.filter(Family.geography_id == Geography.id)
.filter(import_id == FamilyMetadata.family_import_id)
.filter(import_id == FamilyOrganisation.family_import_id)
.join(Geography, Family.geography_id == Geography.id)
.join(FamilyMetadata, import_id == FamilyMetadata.family_import_id)
.join(FamilyOrganisation, import_id == FamilyOrganisation.family_import_id)
.filter(FamilyOrganisation.organisation_id == Organisation.id)
).one_or_none()

Expand Down Expand Up @@ -197,9 +198,9 @@ def _get_collections_for_family_import_id(
families=[
LinkableFamily(slug=data[0], title=data[1], description=data[2])
for data in db.query(Slug.name, Family.title, Family.description)
.filter(Slug.family_import_id == Family.import_id)
.filter(CollectionFamily.family_import_id == Family.import_id)
.filter(CollectionFamily.collection_import_id == c.import_id)
.filter(CollectionFamily.family_import_id == Family.import_id)
.join(Slug, Slug.family_import_id == Family.import_id)
.all()
],
)
Expand Down Expand Up @@ -246,6 +247,7 @@ def _get_documents_for_family_import_id(
content_type=pd.content_type,
language=_get_language_for_phys_doc(db, pd.id),
document_type=d.document_type,
document_role=d.document_role,
)
for d, pd in db_documents
]
Expand Down
2 changes: 1 addition & 1 deletion app/db/models/law_policy/family.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ class FamilyDocument(Base):
)

import_id = sa.Column(sa.Text, primary_key=True)
variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=False)
variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=True)
document_status = sa.Column(
sa.Enum(DocumentStatus),
default=DocumentStatus.CREATED,
Expand Down
2 changes: 2 additions & 0 deletions app/initial_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
populate_category,
populate_document_type,
populate_document_role,
populate_document_variant,
populate_event_type,
populate_framework,
populate_geo_statistics,
Expand All @@ -40,6 +41,7 @@ def run_data_migrations(db):
populate_category(db)
populate_document_type(db)
populate_document_role(db)
populate_document_variant(db)
populate_event_type(db)
populate_framework(db)
populate_geography(db)
Expand Down
15 changes: 10 additions & 5 deletions tests/core/ingestion/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from datetime import datetime
from app.data_migrations import (
populate_category,
populate_document_role,
populate_document_variant,
populate_document_type,
populate_event_type,
populate_geography,
Expand All @@ -14,13 +16,13 @@
from app.db.models.deprecated.document import Document
from app.db.models.law_policy.family import Slug

THREE_DOC_ROWS = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,DZA,http://place1|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,http://place1|en;http://place2|fr,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2,DocSlug2
1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3,DocSlug3
THREE_DOC_ROWS = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Document variant,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,DZA,http://place1|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,http://place1|en;http://place2|fr,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2,DocSlug2
1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3,DocSlug3
"""

THREE_DOC_ROWS_MISSING_FIELD = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug
THREE_DOC_ROWS_MISSING_FIELD = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Document variant,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug
1001,0,Test1,FALSE,FALSE,N/A,N/A,N/A,Title1,Fam1,Summary1,,MAIN,,DZA,,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,N/A,FamSlug1
1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2
1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3
Expand All @@ -45,6 +47,7 @@
"Document Type",
"Document role",
"Document title",
"Document variant",
"Documents",
"Family name",
"Family summary",
Expand Down Expand Up @@ -131,6 +134,8 @@ def init_for_ingest(test_db: Session):
populate_document_type(test_db)
populate_event_type(test_db)
populate_language(test_db)
populate_document_role(test_db)
populate_document_variant(test_db)
test_db.flush()
test_db.add(
Document(
Expand Down
Loading

0 comments on commit 0460bc6

Please sign in to comment.