Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update ingest and relax schema for variant #70

Merged
merged 12 commits into from
Mar 24, 2023
2 changes: 1 addition & 1 deletion alembic/versions/0013_families_and_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def upgrade():
sa.Column('family_import_id', sa.Text(), nullable=False),
sa.Column('physical_document_id', sa.Integer(), nullable=False),
sa.Column('import_id', sa.Text(), nullable=False),
sa.Column('variant_name', sa.Text(), nullable=False),
sa.Column('variant_name', sa.Text(), nullable=True),
sa.Column('document_status', sa.Enum('CREATED', 'PUBLISHED', 'DELETED', name='documentstatus'), nullable=False),
sa.Column('document_type', sa.Text(), nullable=True),
sa.Column('document_role', sa.Text(), nullable=True),
Expand Down
5 changes: 3 additions & 2 deletions app/api/api_v1/schemas/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class FamilyDocumentResponse(BaseModel):
"""Response for a FamilyDocument, without any family information"""

import_id: str
variant: str
variant: Optional[str]
slug: str
# What follows is off PhysicalDocument
title: str
Expand All @@ -74,7 +74,8 @@ class FamilyDocumentResponse(BaseModel):
source_url: Optional[str]
content_type: Optional[str]
language: str
document_type: str
document_type: Optional[str]
document_role: Optional[str]


class FamilyContext(BaseModel):
Expand Down
32 changes: 20 additions & 12 deletions app/core/ingestion/family.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, cast
from typing import Any, Optional, cast

from sqlalchemy.orm import Session
from app.core.ingestion.ingest_row import DocumentIngestRow
Expand All @@ -13,12 +13,10 @@
FamilyCategory,
Family,
FamilyDocument,
FamilyDocumentType,
FamilyOrganisation,
FamilyStatus,
Geography,
Slug,
Variant,
)


Expand Down Expand Up @@ -96,20 +94,29 @@ def _create_family_links(family: Family):
return family


def _get_role_and_variant(row: DocumentIngestRow) -> tuple[str, Optional[str]]:
    """Derive the (role, variant) pair from the row's document_role value.

    The document_role text is upper-cased before any matching is done.

    - Text starting with "MAIN" always yields the role "MAIN"; the variant is
      "Original Language" when the text also contains "LANGUAGE VERSION",
      otherwise None.
    - Any other text is returned, upper-cased, as the role; the variant is
      "Translation" when the text contains "ENGLISH TRANSLATION", otherwise
      None.
    """
    role_text = row.document_role.upper()

    if not role_text.startswith("MAIN"):
        # FIXME: Marcus still to determine:
        # REF: https://docs.google.com/spreadsheets/d/1RO7wp2XN4mXsKYJ4IiV7iu2GFY3cJSBWSPqblacMTT4
        variant = "Translation" if "ENGLISH TRANSLATION" in role_text else None
        return (role_text, variant)

    # Every MAIN flavour collapses to the plain "MAIN" role; only the variant
    # distinguishes an explicit original-language version.
    variant = "Original Language" if "LANGUAGE VERSION" in role_text else None
    return ("MAIN", variant)


def _maybe_create_family_document(
db: Session,
row: DocumentIngestRow,
family: Family,
existing_document: Document,
result: dict[str, Any],
) -> FamilyDocument:
# FIXME: these should come from well-known values, not whatever is in the CSV
variant_name = get_or_create(
db, Variant, variant_name=row.document_role, extra={"description": ""}
).variant_name
document_type = get_or_create(
db, FamilyDocumentType, name=row.document_type, extra={"description": ""}
).name
role, variant = _get_role_and_variant(row)

family_document = (
db.query(FamilyDocument).filter_by(import_id=row.cpr_document_id).one_or_none()
Expand All @@ -124,9 +131,10 @@ def _maybe_create_family_document(
family_import_id=family.import_id,
physical_document_id=physical_document.id,
import_id=row.cpr_document_id,
variant_name=variant_name,
variant_name=variant,
document_status=DocumentStatus.PUBLISHED,
document_type=document_type,
document_type=row.document_type,
document_role=role,
)
db.add(family_document)
db.flush()
Expand Down
2 changes: 0 additions & 2 deletions app/core/ingestion/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
"keyword": "keywords",
}

MAP_OF_STR_VALUES = {}


@dataclass(config=ConfigDict(validate_assignment=True, extra=Extra.forbid))
class TaxonomyEntry:
Expand Down
1 change: 1 addition & 0 deletions app/data_migrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .populate_category import populate_category
from .populate_document_type import populate_document_type
from .populate_document_role import populate_document_role
from .populate_document_variant import populate_document_variant
from .populate_event_type import populate_event_type
from .populate_framework import populate_framework
from .populate_geo_statistics import populate_geo_statistics
Expand Down
4 changes: 4 additions & 0 deletions app/data_migrations/data/document_role_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
"name": "MAIN",
"description": "MAIN"
},
{
"name": "ENGLISH TRANSLATION",
"description": "ENGLISH TRANSLATION"
},
joel-wright marked this conversation as resolved.
Show resolved Hide resolved
{
"name": "AMENDMENT",
"description": "AMENDMENT"
Expand Down
10 changes: 10 additions & 0 deletions app/data_migrations/data/document_variant_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[
{
"variant_name": "Original Language",
"description": "Original Language"
},
{
"variant_name": "Translation",
"description": "Translation"
}
]
19 changes: 19 additions & 0 deletions app/data_migrations/populate_document_variant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json

from sqlalchemy.orm import Session

from app.db.models.law_policy.family import Variant
from .utils import has_rows, load_list


def populate_document_variant(db: Session) -> None:
    """Populates the table backing the Variant model with pre-defined data.

    Does nothing when ``has_rows`` reports that Variant rows already exist.
    Otherwise the entries read from ``document_variant_data.json`` are handed
    to ``load_list`` together with the session and the Variant model.
    """

    if has_rows(db, Variant):
        return

    # NOTE: the path is relative, so it is resolved against the current
    # working directory of the process. JSON is read explicitly as UTF-8
    # rather than with the platform's locale-dependent default encoding.
    with open(
        "app/data_migrations/data/document_variant_data.json", encoding="utf-8"
    ) as document_variant_file:
        document_variant_data = json.load(document_variant_file)
        load_list(db, Variant, document_variant_data)
12 changes: 7 additions & 5 deletions app/db/crud/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_family_document_and_context(
published_date=family.published_date,
last_updated_date=family.last_updated_date,
)
document = FamilyDocumentResponse(
response = FamilyDocumentResponse(
import_id=document.import_id,
variant=document.variant_name,
slug=_get_slug_for_family_document_import_id(db, document.import_id),
Expand All @@ -94,9 +94,10 @@ def get_family_document_and_context(
content_type=physical_document.content_type,
language=_get_language_for_phys_doc(db, physical_document.id),
document_type=document.document_type,
document_role=document.document_role,
)

return FamilyDocumentWithContextResponse(family=family, document=document)
return FamilyDocumentWithContextResponse(family=family, document=response)


def _get_language_for_phys_doc(db: Session, physical_document_id: str) -> str:
Expand All @@ -123,9 +124,9 @@ def get_family_and_documents(
db_objects = (
db.query(Family, Geography, FamilyMetadata, FamilyOrganisation, Organisation)
.filter(Family.import_id == import_id)
.filter(Family.geography_id == Geography.id)
.filter(import_id == FamilyMetadata.family_import_id)
.filter(import_id == FamilyOrganisation.family_import_id)
.join(Geography, Family.geography_id == Geography.id)
.join(FamilyMetadata, import_id == FamilyMetadata.family_import_id)
.join(FamilyOrganisation, import_id == FamilyOrganisation.family_import_id)
.filter(FamilyOrganisation.organisation_id == Organisation.id)
).one_or_none()

Expand Down Expand Up @@ -246,6 +247,7 @@ def _get_documents_for_family_import_id(
content_type=pd.content_type,
language=_get_language_for_phys_doc(db, pd.id),
document_type=d.document_type,
document_role=d.document_role,
)
for d, pd in db_documents
]
Expand Down
2 changes: 1 addition & 1 deletion app/db/models/law_policy/family.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ class FamilyDocument(Base):
)

import_id = sa.Column(sa.Text, primary_key=True)
variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=False)
variant_name = sa.Column(sa.ForeignKey(Variant.variant_name), nullable=True)
document_status = sa.Column(
sa.Enum(DocumentStatus),
default=DocumentStatus.CREATED,
Expand Down
2 changes: 2 additions & 0 deletions app/initial_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
populate_category,
populate_document_type,
populate_document_role,
populate_document_variant,
populate_event_type,
populate_framework,
populate_geo_statistics,
Expand All @@ -40,6 +41,7 @@ def run_data_migrations(db):
populate_category(db)
populate_document_type(db)
populate_document_role(db)
populate_document_variant(db)
populate_event_type(db)
populate_framework(db)
populate_geography(db)
Expand Down
4 changes: 4 additions & 0 deletions tests/core/ingestion/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from datetime import datetime
from app.data_migrations import (
populate_category,
populate_document_role,
populate_document_variant,
populate_document_type,
populate_event_type,
populate_geography,
Expand Down Expand Up @@ -131,6 +133,8 @@ def init_for_ingest(test_db: Session):
populate_document_type(test_db)
populate_event_type(test_db)
populate_language(test_db)
populate_document_role(test_db)
populate_document_variant(test_db)
test_db.flush()
test_db.add(
Document(
Expand Down
145 changes: 145 additions & 0 deletions tests/routes/document_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from datetime import datetime
from app.api.api_v1.routers.admin import _start_ingest
from app.data_migrations import (
populate_document_role,
populate_document_type,
populate_event_type,
populate_taxonomy,
)
from app.db.models.deprecated.document import (
Category,
Document,
DocumentType,
Framework,
Hazard,
Instrument,
Keyword,
Response,
Sector,
)
from app.db.models.deprecated.source import Source
from app.db.models.document.physical_document import Language
from app.db.models.law_policy.geography import Geography


# CSV fixture text passed to _start_ingest by the setup helpers below.
# Each constant is a header line followed by its data rows.

# Documents CSV: a single row (CPR Document ID CCLW.executive.1.2,
# CPR Family ID CCLW.family.1001.0, Document role MAIN).
ONE_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
"""

# Events CSV: a single event row for CPR Family ID CCLW.family.1001.0.
ONE_EVENT_ROW = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status
1101,Legislation,1001,Title1,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.1101.0,CCLW.family.1001.0,OK
"""

# Documents CSV: two rows in two different families, both listing
# CPR Collection ID CPR.Collection.1; the Language column is empty in both.
TWO_DFC_ROW = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
2002,0,Test2,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title2,Fam2,Summary2,,MAIN,,GEO,http://another_somewhere|en,executive,03/03/2024|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.2.2,CCLW.family.2002.0,CPR.Collection.1,FamSlug2,DocSlug2
"""

# Documents CSV: two rows where only the first has a Language value
# ("English"); the rows also use different collections
# (CPR.Collection.1 and CPR.Collection.2).
TWO_DFC_ROW_ONE_LANGUAGE = """ID,Document ID,CCLW Description,Part of collection?,Create new family/ies?,Collection ID,Collection name,Collection summary,Document title,Family name,Family summary,Family ID,Document role,Applies to ID,Geography ISO,Documents,Category,Events,Sectors,Instruments,Frameworks,Responses,Natural Hazards,Document Type,Year,Language,Keywords,Geography,Parent Legislation,Comment,CPR Document ID,CPR Family ID,CPR Collection ID,CPR Family Slug,CPR Document Slug
1001,0,Test1,FALSE,FALSE,N/A,Collection1,CollectionSummary1,Title1,Fam1,Summary1,,MAIN,,GEO,http://somewhere|en,executive,02/02/2014|Law passed,Energy,,,Mitigation,,Order,,English,Energy Supply,Algeria,,,CCLW.executive.1.2,CCLW.family.1001.0,CPR.Collection.1,FamSlug1,DocSlug1
2002,0,Test2,FALSE,FALSE,N/A,Collection2,CollectionSummary2,Title2,Fam2,Summary2,,MAIN,,GEO,http://another_somewhere|en,executive,03/03/2024|Law passed,Energy,,,Mitigation,,Order,,,Energy Supply,Algeria,,,CCLW.executive.2.2,CCLW.family.2002.0,CPR.Collection.2,FamSlug2,DocSlug2
"""

# Events CSV: one event row for each of the two families used above
# (CCLW.family.1001.0 and CCLW.family.2002.0).
TWO_EVENT_ROWS = """Id,Eventable type,Eventable Id,Eventable name,Event type,Title,Description,Date,Url,CPR Event ID,CPR Family ID,Event Status
1101,Legislation,1001,Title1,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.1101.0,CCLW.family.1001.0,OK
2202,Legislation,2002,Title2,Passed/Approved,Published,,2019-12-25,,CCLW.legislation_event.2202.0,CCLW.family.2002.0,OK
"""


def setup_with_docs(test_db, mocker):
    """Seed the test database and ingest the one-document CSV fixtures.

    Patches ``app.core.aws.S3Client``, runs the taxonomy, event type,
    document type and document role population steps, adds the legacy rows
    via ``populate_old_documents``, then calls ``_start_ingest`` with
    ONE_DFC_ROW and ONE_EVENT_ROW. The session is committed after the
    population steps and again after the ingest.
    """
    s3_client_mock = mocker.patch("app.core.aws.S3Client")

    for populate in (
        populate_taxonomy,
        populate_event_type,
        populate_document_type,
        populate_document_role,
    ):
        populate(test_db)
    test_db.commit()

    populate_old_documents(test_db)

    _start_ingest(test_db, s3_client_mock, "s3_prefix", ONE_DFC_ROW, ONE_EVENT_ROW)
    test_db.commit()


def setup_with_two_docs(
    test_db, mocker, doc_data=TWO_DFC_ROW, event_data=TWO_EVENT_ROWS
):
    """Seed the test database and ingest the given document/event CSV text.

    ``doc_data`` and ``event_data`` default to the two-document fixtures
    (TWO_DFC_ROW and TWO_EVENT_ROWS) and are passed straight through to
    ``_start_ingest``. Before ingesting, ``app.core.aws.S3Client`` is
    patched, the reference-data population steps are run and committed, and
    the legacy rows are added via ``populate_old_documents``.
    """
    patched_s3 = mocker.patch("app.core.aws.S3Client")

    # Reference data first, committed before the legacy documents go in.
    reference_steps = [
        populate_taxonomy,
        populate_event_type,
        populate_document_type,
        populate_document_role,
    ]
    for step in reference_steps:
        step(test_db)
    test_db.commit()

    populate_old_documents(test_db)

    ingest_args = (test_db, patched_s3, "s3_prefix", doc_data, event_data)
    _start_ingest(*ingest_args)
    test_db.commit()


def populate_old_documents(test_db):
    """Add the legacy (deprecated-model) rows the ingest fixtures refer to.

    Three groups of rows are written, each followed by a commit:

    1. lookup rows: Source, Geography, DocumentType, Language, Category,
       two Keywords, two Hazards, Response and Framework;
    2. an Instrument, a Sector and the Document with import id
       "CCLW.executive.1.2";
    3. the Document with import id "CCLW.executive.2.2".

    NOTE(review): the hard-coded ``source_id``/``geography_id``/``type_id``/
    ``category_id`` values of 1 presumably point at the lookup rows added in
    step 1 - this assumes the test database assigns ids starting at 1.
    """
    lookup_rows = [
        Source(name="CCLW"),
        Geography(
            display_value="geography", slug="geography", value="GEO", type="country"
        ),
        DocumentType(name="doctype", description="doctype"),
        Language(language_code="LAN", name="language"),
        Category(name="Policy", description="Policy"),
        Keyword(name="keyword1", description="keyword1"),
        Keyword(name="keyword2", description="keyword2"),
        Hazard(name="hazard1", description="hazard1"),
        Hazard(name="hazard2", description="hazard2"),
        Response(name="topic", description="topic"),
        Framework(name="framework", description="framework"),
    ]
    for lookup_row in lookup_rows:
        test_db.add(lookup_row)
    test_db.commit()

    test_db.add(Instrument(name="instrument", description="instrument", source_id=1))
    test_db.add(Sector(name="sector", description="sector", source_id=1))

    # The two legacy documents differ only in source URL, slug and import id.
    # Each one is committed on its own; the first commit also carries the
    # Instrument and Sector added just above.
    legacy_documents = (
        ("http://somewhere", "geography_2014_test_1_2", "CCLW.executive.1.2"),
        (
            "http://another_somewhere",
            "geography_2014_test_2_2",
            "CCLW.executive.2.2",
        ),
    )
    for doc_source_url, doc_slug, doc_import_id in legacy_documents:
        test_db.add(
            Document(
                publication_ts=datetime(year=2014, month=1, day=1),
                name="test",
                description="test description",
                source_url=doc_source_url,
                source_id=1,
                url="",
                cdn_object="",
                md5_sum=None,
                content_type=None,
                slug=doc_slug,
                import_id=doc_import_id,
                geography_id=1,
                type_id=1,
                category_id=1,
            )
        )
        test_db.commit()


def populate_languages(test_db):
    """Add the English ("eng") and French ("fra") Language rows and commit."""
    for code, label in (("eng", "English"), ("fra", "French")):
        test_db.add(Language(language_code=code, name=label))
    test_db.commit()
Loading