From 7de4e0848b17e459787efc0065da44643471b8d0 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Mon, 21 Oct 2024 17:03:11 -0400 Subject: [PATCH 1/8] added extra metadata fields including harvest_source, and record_id etc --- database/interface.py | 4 ++-- harvester/harvest.py | 3 +-- harvester/utils/ckan_utils.py | 28 +++++++++++++++++++++++----- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/database/interface.py b/database/interface.py index 7bf2213..a5d731c 100644 --- a/database/interface.py +++ b/database/interface.py @@ -249,7 +249,7 @@ def _clear_harvest_records(): ckan = RemoteCKAN(os.getenv("CKAN_API_URL"), apikey=os.getenv("CKAN_API_TOKEN")) - result = ckan.action.package_search(fq=f"owner_org:{organization_id}") + result = ckan.action.package_search(fq=f"harvest_source_id:{source_id}") ckan_datasets = result["count"] start = datetime.now(timezone.utc) retry_count = 0 @@ -258,7 +258,7 @@ def _clear_harvest_records(): # Retry loop to handle timeouts from cloud.gov and CKAN's Solr backend, # ensuring datasets are cleared despite possible interruptions. while ckan_datasets > 0 and retry_count < retry_max: - result = ckan.action.package_search(fq=f"owner_org:{organization_id}") + result = ckan.action.package_search(fq=f"harvest_source_id:{source_id}") ckan_datasets = result["count"] logger.info( f"Attempt {retry_count + 1}: " diff --git a/harvester/harvest.py b/harvester/harvest.py index c7de97d..434686c 100644 --- a/harvester/harvest.py +++ b/harvester/harvest.py @@ -247,7 +247,6 @@ def write_compare_to_db(self) -> dict: "ckan_name": record.ckan_name, } ) - self.internal_records_lookup_table = self.db_interface.add_harvest_records( records ) @@ -515,7 +514,7 @@ def update_self_in_db(self) -> bool: def ckanify_dcatus(self) -> None: try: self.ckanified_metadata = ckanify_dcatus( - self.metadata, self.harvest_source.organization_id + self.metadata, self.harvest_source ) except Exception as e: self.status = "error" diff --git a/harvester/utils/ckan_utils.py b/harvester/utils/ckan_utils.py index d34482e..5817202 100644 --- a/harvester/utils/ckan_utils.py +++ b/harvester/utils/ckan_utils.py @@ -151,7 +151,7 @@ def munge_tag(tag: str) -> str: return tag -def create_ckan_extras(metadata: dict) -> list[dict]: +def create_ckan_extras(metadata: dict, harvest_source) -> list[dict]: extras = [ "accessLevel", "bureauCode", @@ -161,7 +161,25 @@ def create_ckan_extras(metadata: dict) -> list[dict]: "publisher", ] - output = [{"key": "resource-type", "value": "Dataset"}] + output = [ + { + "key": "resource-type", + "value": "Dataset" + }, + { + "key": "harvest_object_id", + "value": harvest_source.internal_records_lookup_table[ + metadata["identifier"]] + }, + { + "key": "harvest_source_id", + "value": harvest_source.id, + }, + { + "key": "harvest_source_title", + "value": harvest_source.name, + } + ] for extra in extras: if extra not in metadata: @@ -283,14 +301,14 @@ def simple_transform(metadata: dict, owner_org: str) -> dict: return output -def ckanify_dcatus(metadata: dict, owner_org: str) -> dict: - ckanified_metadata = simple_transform(metadata, owner_org) +def ckanify_dcatus(metadata: dict, harvest_source) -> dict: + ckanified_metadata = simple_transform(metadata, harvest_source.organization_id) ckanified_metadata["resources"] = create_ckan_resources(metadata) ckanified_metadata["tags"] = ( create_ckan_tags(metadata["keyword"]) if "keyword" in metadata else [] ) - ckanified_metadata["extras"] = create_ckan_extras(metadata) + ckanified_metadata["extras"] = create_ckan_extras(metadata, harvest_source) return ckanified_metadata From 8821bb6a5116ad049e2d57cf2a3c2bd7c9fc09e0 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Tue, 22 Oct 2024 10:38:34 -0400 Subject: [PATCH 2/8] modified test --- tests/integration/harvest/test_ckan_load.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/integration/harvest/test_ckan_load.py b/tests/integration/harvest/test_ckan_load.py index 33583c2..ea5ad15 100644 --- a/tests/integration/harvest/test_ckan_load.py +++ b/tests/integration/harvest/test_ckan_load.py @@ -88,6 +88,18 @@ def test_ckanify_dcatus( harvest_source = HarvestSource(harvest_job.id) harvest_source.prepare_external_data() + records = [( + { + "identifier": 'cftc-dc1', + "harvest_job_id": job_data_dcatus["id"], + "harvest_source_id": job_data_dcatus["harvest_source_id"] + } + )] + interface.add_harvest_records(records) + harvest_source.get_record_changes() + harvest_source.write_compare_to_db() + record_id = harvest_source.internal_records_lookup_table['cftc-dc1'] + expected_result = { "name": "commitment-of-traders", "owner_org": "d925f84d-955b-4cb7-812f-dcfd6681a18f", @@ -110,6 +122,9 @@ def test_ckanify_dcatus( ], "extras": [ {"key": "resource-type", "value": "Dataset"}, + {"key": "harvest_object_id", "value": record_id}, + {"key": "harvest_source_id", "value": "2f2652de-91df-4c63-8b53-bfced20b276b"}, + {"key": "harvest_source_title", "value": "Test Source"}, {"key": "accessLevel", "value": "public"}, {"key": "bureauCode", "value": "339:00"}, {"key": "identifier", "value": "cftc-dc1"}, From 618b5aae3f79a0aa1d9c3a548ec8f5e35d3fc0a6 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Tue, 22 Oct 2024 12:47:10 -0400 Subject: [PATCH 3/8] added new route for raw record data --- app/routes.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/app/routes.py b/app/routes.py index 3a24133..1242b2b 100644 --- a/app/routes.py +++ b/app/routes.py @@ -20,6 +20,7 @@ from . import htmx from .forms import HarvestSourceForm, OrganizationForm from .paginate import Pagination +import json logger = logging.getLogger("harvest_admin") @@ -731,6 +732,17 @@ def get_harvest_record(record_id=None): return db._to_dict(record) +@mod.route("/harvest_record/raw/", methods=["GET"]) +def get_harvest_record_raw(record_id=None): + record = db.get_harvest_record(record_id) + if record: + try: + source_raw_json = json.loads(record.source_raw) + return source_raw_json, 200 + except json.JSONDecodeError: + return {"error": "Invalid JSON format in source_raw"}, 500 + else: + return {"error": "Not Found"}, 404 ### Add record @mod.route("/harvest_record/add", methods=["POST", "GET"]) From 5267b7d51983ac0c038d8c8a16e99655e83693b2 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Tue, 22 Oct 2024 13:19:02 -0400 Subject: [PATCH 4/8] change the raw to end of the route --- app/routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/routes.py b/app/routes.py index 1242b2b..be296e2 100644 --- a/app/routes.py +++ b/app/routes.py @@ -732,7 +732,7 @@ def get_harvest_record(record_id=None): return db._to_dict(record) -@mod.route("/harvest_record/raw/", methods=["GET"]) +@mod.route("/harvest_record//raw", methods=["GET"]) def get_harvest_record_raw(record_id=None): record = db.get_harvest_record(record_id) if record: From e0e0a8d08f65d0187588f2539b3d7ec45f5d2282 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Wed, 23 Oct 2024 17:07:34 -0400 Subject: [PATCH 5/8] add new field for source datajon format --- harvester/utils/ckan_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/harvester/utils/ckan_utils.py b/harvester/utils/ckan_utils.py index 5817202..6024c1c 100644 --- a/harvester/utils/ckan_utils.py +++ b/harvester/utils/ckan_utils.py @@ -171,6 +171,10 @@ def create_ckan_extras(metadata: dict, harvest_source) -> list[dict]: "value": harvest_source.internal_records_lookup_table[ metadata["identifier"]] }, + { + "key": "source_datajson_identifier", # dataset is datajson format or not + "value": True, + }, { "key": "harvest_source_id", "value": harvest_source.id, From 2b152394d20d5efdfb93e0f222a1f1a3888d757a Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Wed, 23 Oct 2024 17:10:39 -0400 Subject: [PATCH 6/8] fix test --- tests/integration/harvest/test_ckan_load.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/harvest/test_ckan_load.py b/tests/integration/harvest/test_ckan_load.py index ea5ad15..a897576 100644 --- a/tests/integration/harvest/test_ckan_load.py +++ b/tests/integration/harvest/test_ckan_load.py @@ -123,6 +123,7 @@ def test_ckanify_dcatus( "extras": [ {"key": "resource-type", "value": "Dataset"}, {"key": "harvest_object_id", "value": record_id}, + {"key": "source_datajson_identifier", "value": True}, {"key": "harvest_source_id", "value": "2f2652de-91df-4c63-8b53-bfced20b276b"}, {"key": "harvest_source_title", "value": "Test Source"}, {"key": "accessLevel", "value": "public"}, From 6c604a34c335073ece521dcda55aa648508d3d07 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Wed, 23 Oct 2024 17:40:36 -0400 Subject: [PATCH 7/8] added type hint --- harvester/harvest.py | 3 ++- harvester/utils/ckan_utils.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/harvester/harvest.py b/harvester/harvest.py index 434686c..efc5db5 100644 --- a/harvester/harvest.py +++ b/harvester/harvest.py @@ -25,7 +25,6 @@ SynchronizeException, ValidationException, ) -from harvester.utils.ckan_utils import add_uuid_to_package_name, ckanify_dcatus from harvester.utils.general_utils import ( dataset_to_hash, download_file, @@ -473,6 +472,7 @@ def validate(self) -> None: ) def create_record(self, retry=False): + from harvester.utils.ckan_utils import add_uuid_to_package_name try: result = ckan.action.package_create(**self.ckanified_metadata) self.ckan_id = result["id"] @@ -512,6 +512,7 @@ def update_self_in_db(self) -> bool: ) def ckanify_dcatus(self) -> None: + from harvester.utils.ckan_utils import ckanify_dcatus try: self.ckanified_metadata = ckanify_dcatus( self.metadata, self.harvest_source diff --git a/harvester/utils/ckan_utils.py b/harvester/utils/ckan_utils.py index 6024c1c..8cbe7a7 100644 --- a/harvester/utils/ckan_utils.py +++ b/harvester/utils/ckan_utils.py @@ -1,5 +1,6 @@ import re import uuid +from harvester.harvest import HarvestSource # all of these are copy/pasted from ckan core # https://github.com/ckan/ckan/blob/master/ckan/lib/munge.py @@ -151,7 +152,7 @@ def munge_tag(tag: str) -> str: return tag -def create_ckan_extras(metadata: dict, harvest_source) -> list[dict]: +def create_ckan_extras(metadata: dict, harvest_source: HarvestSource) -> list[dict]: extras = [ "accessLevel", "bureauCode", @@ -305,7 +306,7 @@ def simple_transform(metadata: dict, owner_org: str) -> dict: return output -def ckanify_dcatus(metadata: dict, harvest_source) -> dict: +def ckanify_dcatus(metadata: dict, harvest_source: HarvestSource) -> dict: ckanified_metadata = simple_transform(metadata, harvest_source.organization_id) ckanified_metadata["resources"] = create_ckan_resources(metadata) From 3dc3c31b8f5783a04021e35be334e43065e8ea28 Mon Sep 17 00:00:00 2001 From: Jin-Sun-tts Date: Wed, 23 Oct 2024 17:58:57 -0400 Subject: [PATCH 8/8] modify test for the path to ckanify_dcatus function --- tests/integration/harvest/test_exception_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/harvest/test_exception_handling.py b/tests/integration/harvest/test_exception_handling.py index 946ff56..5acd2fc 100644 --- a/tests/integration/harvest/test_exception_handling.py +++ b/tests/integration/harvest/test_exception_handling.py @@ -136,7 +136,7 @@ def test_validation_exception( assert interface_record.status == "error" assert interface_errors[0].type == "ValidationException" - @patch("harvester.harvest.ckanify_dcatus", side_effect=Exception("Broken")) + @patch("harvester.utils.ckan_utils.ckanify_dcatus", side_effect=Exception("Broken")) def test_dcatus_to_ckan_exception( self, ckanify_dcatus_mock,