Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update ingest and relax schema for variant #70

Merged
merged 12 commits into from
Mar 24, 2023
2 changes: 1 addition & 1 deletion alembic/versions/0013_families_and_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def upgrade():
sa.Column('family_import_id', sa.Text(), nullable=False),
sa.Column('physical_document_id', sa.Integer(), nullable=False),
sa.Column('import_id', sa.Text(), nullable=False),
sa.Column('variant_name', sa.Text(), nullable=False),
sa.Column('variant_name', sa.Text(), nullable=True),
sa.Column('document_status', sa.Enum('CREATED', 'PUBLISHED', 'DELETED', name='documentstatus'), nullable=False),
sa.Column('document_type', sa.Text(), nullable=True),
sa.Column('document_role', sa.Text(), nullable=True),
Expand Down
5 changes: 3 additions & 2 deletions app/api/api_v1/schemas/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class FamilyDocumentResponse(BaseModel):
"""Response for a FamilyDocument, without any family information"""

import_id: str
variant: str
variant: Optional[str]
slug: str
# What follows is off PhysicalDocument
title: str
Expand All @@ -74,7 +74,8 @@ class FamilyDocumentResponse(BaseModel):
source_url: Optional[str]
content_type: Optional[str]
language: str
document_type: str
document_type: Optional[str]
document_role: Optional[str]


class FamilyContext(BaseModel):
Expand Down
18 changes: 6 additions & 12 deletions app/core/ingestion/family.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, cast
from typing import Any, Optional, cast

from sqlalchemy.orm import Session
from app.core.ingestion.ingest_row import DocumentIngestRow
Expand All @@ -13,12 +13,10 @@
FamilyCategory,
Family,
FamilyDocument,
FamilyDocumentType,
FamilyOrganisation,
FamilyStatus,
Geography,
Slug,
Variant,
)


Expand Down Expand Up @@ -103,13 +101,8 @@ def _maybe_create_family_document(
existing_document: Document,
result: dict[str, Any],
) -> FamilyDocument:
# FIXME: these should come from well-known values, not whatever is in the CSV
variant_name = get_or_create(
db, Variant, variant_name=row.document_role, extra={"description": ""}
).variant_name
document_type = get_or_create(
db, FamilyDocumentType, name=row.document_type, extra={"description": ""}
).name
def none_if_empty(data: str) -> Optional[str]:
return data if data != "" else None

family_document = (
db.query(FamilyDocument).filter_by(import_id=row.cpr_document_id).one_or_none()
Expand All @@ -124,9 +117,10 @@ def _maybe_create_family_document(
family_import_id=family.import_id,
physical_document_id=physical_document.id,
import_id=row.cpr_document_id,
variant_name=variant_name,
variant_name=none_if_empty(row.document_variant),
document_status=DocumentStatus.PUBLISHED,
document_type=document_type,
document_type=none_if_empty(row.document_type),
document_role=none_if_empty(row.document_role),
)
db.add(family_document)
db.flush()
Expand Down
2 changes: 2 additions & 0 deletions app/core/ingestion/ingest_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"Family name",
"Family summary",
"Document role",
"Document variant",
"Geography ISO",
"Documents",
"Category",
Expand Down Expand Up @@ -126,6 +127,7 @@ class DocumentIngestRow(BaseIngestRow):
family_name: str
family_summary: str
document_role: str
document_variant: str
geography_iso: str
documents: str
category: FamilyCategory
Expand Down
2 changes: 0 additions & 2 deletions app/core/ingestion/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
"keyword": "keywords",
}

MAP_OF_STR_VALUES = {}


@dataclass(config=ConfigDict(validate_assignment=True, extra=Extra.forbid))
class TaxonomyEntry:
Expand Down
4 changes: 3 additions & 1 deletion app/core/ingestion/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ def get_dfc_validator(db: Session, context: IngestContext) -> ProcessFunc:

def process(context: IngestContext, row: DocumentIngestRow) -> None:
"""Processes the row into the db."""
validate_document_row(context=context, taxonomy=taxonomy, row=row)
_LOGGER.info(f"Validating document row: {row.row_number}")
with db.begin():
validate_document_row(db=db, context=context, taxonomy=taxonomy, row=row)

return process
50 changes: 48 additions & 2 deletions app/core/ingestion/validator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,34 @@
from sqlalchemy.orm import Session
from app.core.ingestion.ingest_row import DocumentIngestRow, EventIngestRow
from app.core.ingestion.metadata import build_metadata, Taxonomy
from app.core.ingestion.utils import IngestContext, Result, ResultType
from app.db.models.law_policy.family import (
FamilyDocumentRole,
FamilyDocumentType,
Variant,
)
from app.db.session import Base

DbTable = Base
CheckResult = Result


def _check_value_in_db(
row_num: int, db: Session, value: str, model: DbTable
) -> CheckResult:
if value != "":
val = db.query(model).get(value)
if val is None:
result = Result(
ResultType.ERROR,
f"Row {row_num}: Not found in db {model.__tablename__}={value}",
)
return result
return Result()


def validate_document_row(
db: Session,
context: IngestContext,
row: DocumentIngestRow,
taxonomy: Taxonomy,
Expand All @@ -15,13 +40,34 @@ def validate_document_row(
:param [DocumentIngestRow] row: DocumentIngestRow object from the current CSV row.
:param [Taxonomy] taxonomy: the Taxonomy against which metadata should be validated.
"""

errors = []
n = row.row_number
result = _check_value_in_db(n, db, row.document_type, FamilyDocumentType)
if result.type != ResultType.OK:
errors.append(result)

result = _check_value_in_db(n, db, row.document_role, FamilyDocumentRole)
if result.type != ResultType.OK:
errors.append(result)

result = _check_value_in_db(n, db, row.document_variant, Variant)
if result.type != ResultType.OK:
errors.append(result)

result, _ = build_metadata(taxonomy, row)
context.results.append(result)
if result.type != ResultType.OK:
errors.append(result)

if len(errors) > 0:
context.results += errors
else:
context.results.append(Result())


def validate_event_row(context: IngestContext, row: EventIngestRow) -> None:
"""
Validate the consituent elements that represent this event row.
Validate the constituent elements that represent this event row.

:param [IngestContext] context: The ingest context.
:param [DocumentIngestRow] row: DocumentIngestRow object from the current CSV row.
Expand Down
1 change: 1 addition & 0 deletions app/data_migrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .populate_category import populate_category
from .populate_document_type import populate_document_type
from .populate_document_role import populate_document_role
from .populate_document_variant import populate_document_variant
from .populate_event_type import populate_event_type
from .populate_framework import populate_framework
from .populate_geo_statistics import populate_geo_statistics
Expand Down
10 changes: 10 additions & 0 deletions app/data_migrations/data/document_variant_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[
{
"variant_name": "Original Language",
"description": "Original Language"
},
{
"variant_name": "Translation",
"description": "Translation"
}
]
19 changes: 19 additions & 0 deletions app/data_migrations/populate_document_variant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json

from sqlalchemy.orm import Session

from app.db.models.law_policy.family import Variant
from .utils import has_rows, load_list


def populate_document_variant(db: Session) -> None:
"""Populates the document_type table with pre-defined data."""

if has_rows(db, Variant):
return

with open(
"app/data_migrations/data/document_variant_data.json"
) as document_variant_file:
document_variant_data = json.load(document_variant_file)
load_list(db, Variant, document_variant_data)
16 changes: 9 additions & 7 deletions app/db/crud/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_family_document_and_context(
published_date=family.published_date,
last_updated_date=family.last_updated_date,
)
document = FamilyDocumentResponse(
response = FamilyDocumentResponse(
import_id=document.import_id,
variant=document.variant_name,
slug=_get_slug_for_family_document_import_id(db, document.import_id),
Expand All @@ -94,9 +94,10 @@ def get_family_document_and_context(
content_type=physical_document.content_type,
language=_get_language_for_phys_doc(db, physical_document.id),
document_type=document.document_type,
document_role=document.document_role,
)

return FamilyDocumentWithContextResponse(family=family, document=document)
return FamilyDocumentWithContextResponse(family=family, document=response)


def _get_language_for_phys_doc(db: Session, physical_document_id: str) -> str:
Expand All @@ -123,9 +124,9 @@ def get_family_and_documents(
db_objects = (
db.query(Family, Geography, FamilyMetadata, FamilyOrganisation, Organisation)
.filter(Family.import_id == import_id)
.filter(Family.geography_id == Geography.id)
.filter(import_id == FamilyMetadata.family_import_id)
.filter(import_id == FamilyOrganisation.family_import_id)
.join(Geography, Family.geography_id == Geography.id)
.join(FamilyMetadata, import_id == FamilyMetadata.family_import_id)
.join(FamilyOrganisation, import_id == FamilyOrganisation.family_import_id)
.filter(FamilyOrganisation.organisation_id == Organisation.id)
).one_or_none()

Expand Down Expand Up @@ -197,9 +198,9 @@ def _get_collections_for_family_import_id(
families=[
LinkableFamily(slug=data[0], title=data[1], description=data[2])
for data in db.query(Slug.name, Family.title, Family.description)
.filter(Slug.family_import_id == Family.import_id)
.filter(CollectionFamily.family_import_id == Family.import_id)
.filter(CollectionFamily.collection_import_id == c.import_id)
.filter(CollectionFamily.family_import_id == Family.import_id)
.join(Slug, Slug.family_import_id == Family.import_id)
.all()
],
)
Expand Down Expand Up @@ -246,6 +247,7 @@ def _get_documents_for_family_import_id(
content_type=pd.content_type,
language=_get_language_for_phys_doc(db, pd.id),
document_type=d.document_type,
document_role=d.document_role,
)
for d, pd in db_documents
]
Expand Down
2 changes: 1 addition & 1 deletion app/db/models/law_policy/family.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ class FamilyDocument(Base):
)

import_id = sa.Column(sa.Text, primary_key=True)
variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=False)
variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=True)
document_status = sa.Column(
sa.Enum(DocumentStatus),
default=DocumentStatus.CREATED,
Expand Down
2 changes: 2 additions & 0 deletions app/initial_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
populate_category,
populate_document_type,
populate_document_role,
populate_document_variant,
populate_event_type,
populate_framework,
populate_geo_statistics,
Expand All @@ -40,6 +41,7 @@ def run_data_migrations(db):
populate_category(db)
populate_document_type(db)
populate_document_role(db)
populate_document_variant(db)
populate_event_type(db)
populate_framework(db)
populate_geography(db)
Expand Down
15 changes: 10 additions & 5 deletions tests/core/ingestion/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from datetime import datetime
from app.data_migrations import (
populate_category,
populate_document_role,
populate_document_variant,
populate_document_type,
populate_event_type,
populate_geography,
Expand All @@ -14,13 +16,13 @@
from app.db.models.deprecated.document import Document
from app.db.models.law_policy.family import Slug

THREE_DOC_ROWS = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,DZA,http://place1|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,http://place1|en;http://place2|fr,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2,DocSlug2
1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3,DocSlug3
THREE_DOC_ROWS = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Document variant,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,DZA,http://place1|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,http://place1|en;http://place2|fr,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2,DocSlug2
1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3,DocSlug3
"""

THREE_DOC_ROWS_MISSING_FIELD = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug
THREE_DOC_ROWS_MISSING_FIELD = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Document variant,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug
1001,0,Test1,FALSE,FALSE,N/A,N/A,N/A,Title1,Fam1,Summary1,,MAIN,,DZA,,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1001.0,CCLW.family.1001.0,N/A,FamSlug1
1002,0,Test2,FALSE,FALSE,N/A,N/A,N/A,Title2,Fam2,Summary2,,MAIN,,DZA,,executive,28/04/2013|Law passed||,Energy;LULUCF;Social development;Transportation;Urban;Waste,"Processes, plans and strategies|Governance",Adaptation;Mitigation,Adaptation;Mitigation,,Plan,,,Adaptation;Energy Supply;Energy Demand;Redd+ And Lulucf;Transportation,Algeria,,,CCLW.executive.1002.0,CCLW.family.1002.0,N/A,FamSlug2
1003,0,Test3,FALSE,FALSE,N/A,N/A,N/A,Title3,Fam3,Summary3,,MAIN,,DZA,,executive,08/12/2011|Law passed,Energy,Subsidies|Economic,,Mitigation,,Decree,,,Research And Development;Energy Supply,Algeria,,,CCLW.executive.1003.0,CCLW.family.1003.0,N/A,FamSlug3
Expand All @@ -45,6 +47,7 @@
"Document Type",
"Document role",
"Document title",
"Document variant",
"Documents",
"Family name",
"Family summary",
Expand Down Expand Up @@ -131,6 +134,8 @@ def init_for_ingest(test_db: Session):
populate_document_type(test_db)
populate_event_type(test_db)
populate_language(test_db)
populate_document_role(test_db)
populate_document_variant(test_db)
test_db.flush()
test_db.add(
Document(
Expand Down
Loading