Skip to content

Commit

Permalink
Update db_state output to include metadata & remove corpus specificity (#164)
Browse files: browse the repository at this point in the history

* Update db_state output to include metadata & remove corpus specificity

* Add family import id to db_state output
  • Loading branch information
Joel Wright authored Aug 15, 2023
1 parent c199d9a commit 44c6cf3
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 40 deletions.
15 changes: 5 additions & 10 deletions app/api/api_v1/schemas/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from . import CLIMATE_LAWS_MATCH


Json = dict[str, Any]


class Event(BaseModel): # noqa: D101
name: str
description: str
Expand Down Expand Up @@ -115,7 +118,6 @@ class DocumentParserInput(BaseModel):
publication_ts: datetime
name: str
description: str
postfix: Optional[str]
source_url: Optional[str]
download_url: Optional[str]

Expand All @@ -124,18 +126,12 @@ class DocumentParserInput(BaseModel):
type: str
source: str
import_id: str
family_import_id: str
category: str

frameworks: Sequence[str]
geography: str
hazards: Sequence[str]
instruments: Sequence[str]
keywords: Sequence[str]
languages: Sequence[str]
sectors: Sequence[str]
topics: Sequence[str]

events: Sequence[Event]
metadata: Json

def to_json(self) -> Mapping[str, Any]:
"""Provide a serialisable version of the model"""
Expand All @@ -144,7 +140,6 @@ def to_json(self) -> Mapping[str, Any]:
json_dict["publication_ts"] = (
self.publication_ts.isoformat() if self.publication_ts is not None else None
)
json_dict["events"] = [event.to_json() for event in self.events]
return json_dict

class Config: # noqa: D106
Expand Down
30 changes: 17 additions & 13 deletions app/core/ingestion/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime, timezone
from typing import Sequence, Tuple, cast
from typing import Any, Sequence, Tuple, cast

from sqlalchemy.orm import Session

Expand All @@ -11,22 +11,27 @@
FamilyOrganisation,
Geography,
)
from app.db.models.law_policy.metadata import FamilyMetadata


def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]:
"""Generates a complete view of the current document database as pipeline input"""
query = (
db.query(Family, FamilyDocument, Geography, Organisation)
db.query(Family, FamilyDocument, FamilyMetadata, Geography, Organisation)
.join(Family, Family.import_id == FamilyDocument.family_import_id)
.join(
FamilyOrganisation, FamilyOrganisation.family_import_id == Family.import_id
)
.join(FamilyMetadata, Family.import_id == FamilyMetadata.family_import_id)
.join(Organisation, Organisation.id == FamilyOrganisation.organisation_id)
.join(Geography, Geography.id == Family.geography_id)
)

query_result = cast(
Sequence[Tuple[Family, FamilyDocument, Geography, Organisation]], query.all()
Sequence[
Tuple[Family, FamilyDocument, FamilyMetadata, Geography, Organisation]
],
query.all(),
)
fallback_date = datetime(1900, 1, 1, tzinfo=timezone.utc)
documents: Sequence[DocumentParserInput] = [
Expand All @@ -36,6 +41,7 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]
category=str(family.family_category),
publication_ts=family.published_date or fallback_date,
import_id=cast(str, family_document.import_id),
family_import_id=cast(str, family.import_id),
source_url=(
cast(str, family_document.physical_document.source_url)
if family_document.physical_document is not None
Expand All @@ -54,17 +60,15 @@ def generate_pipeline_ingest_input(db: Session) -> Sequence[DocumentParserInput]
else []
)
],
# TODO: the following are not used & should be removed
events=[],
frameworks=[],
hazards=[],
instruments=[],
keywords=[],
postfix=None,
sectors=[],
topics=[],
metadata=cast(dict[str, Any], family_metadata.value),
)
for family, family_document, geography, organisation in query_result
for (
family,
family_document,
family_metadata,
geography,
organisation,
) in query_result
]

return documents
2 changes: 1 addition & 1 deletion makefile-docker.defs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ setup_test_search_index:
docker-compose -f docker-compose.yml -f docker-compose.dev.yml run --rm opensearch-test-loader multielasticdump --direction=load --input=/cpr-backend/tests/data/ --output=${OPENSEARCH_URL} --ignoreType=template

test_search: setup_test_search_index
docker-compose -f docker-compose.yml -f docker-compose.dev.yml run --name search_test -v ./data:/data backend pytest -vvv -m 'search'
docker-compose -f docker-compose.yml -f docker-compose.dev.yml run --name search_test -v "${PWD}/data:/data" backend pytest -vvv -m 'search'
docker cp search_test:/data/benchmark_browse.txt .
docker cp search_test:/data/benchmark_search.txt .
docker rm search_test
Expand Down
10 changes: 2 additions & 8 deletions tests/core/validation/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,24 +81,18 @@ def test_write_documents_to_s3(test_s3_client, mocker):
d = DocumentParserInput(
publication_ts=datetime.datetime(year=2008, month=12, day=25),
name="name",
postfix="pf-A",
description="description",
source_url=None,
download_url=None,
type="executive",
source="CCLW",
import_id="1234-5678",
family_import_id="family_1234-5678",
slug="geo_2008_name_1234_5678",
category="category",
frameworks=[],
geography="GEO",
hazards=[],
instruments=[],
keywords=[],
languages=[],
sectors=[],
topics=[],
events=[],
metadata={},
)

upload_file_mock = mocker.patch.object(test_s3_client, "upload_fileobj")
Expand Down
17 changes: 9 additions & 8 deletions tests/routes/test_admin_unfccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,24 @@
"publication_ts": "2021-10-25T00:00:00+00:00",
"name": "Nationally determined contributions under the Paris Agreement. Revised note by the secretariat",
"description": "Nationally determined contributions under the Paris Agreement. Revised note by the secretariat, Synthesis Report from UNFCCC Secretariat in 2021",
"postfix": null,
"source_url": "https://unfccc.int/sites/default/files/resource/cma2021_08r01_S.pdf",
"download_url": "url of downloaded document",
"slug": "Doc-slug",
"type": "Synthesis Report",
"source": "UNFCCC",
"import_id": "UNFCCC.Document.1.0",
"family_import_id": "UNFCCC.family.1.0",
"category": "UNFCCC",
"frameworks": [],
"geography": "GBR",
"hazards": [],
"instruments": [],
"keywords": [],
"languages": [],
"sectors": [],
"topics": [],
"events": []
"metadata": {
"author": [
"UNFCCC Secretariat"
],
"author_type": [
"Party"
]
}
}
}
}"""
Expand Down

0 comments on commit 44c6cf3

Please sign in to comment.