diff --git a/app/routes.py b/app/routes.py index 3a24133..be296e2 100644 --- a/app/routes.py +++ b/app/routes.py @@ -20,6 +20,7 @@ from . import htmx from .forms import HarvestSourceForm, OrganizationForm from .paginate import Pagination +import json logger = logging.getLogger("harvest_admin") @@ -731,6 +732,17 @@ def get_harvest_record(record_id=None): return db._to_dict(record) +@mod.route("/harvest_record//raw", methods=["GET"]) +def get_harvest_record_raw(record_id=None): + record = db.get_harvest_record(record_id) + if record: + try: + source_raw_json = json.loads(record.source_raw) + return source_raw_json, 200 + except json.JSONDecodeError: + return {"error": "Invalid JSON format in source_raw"}, 500 + else: + return {"error": "Not Found"}, 404 ### Add record @mod.route("/harvest_record/add", methods=["POST", "GET"]) diff --git a/database/interface.py b/database/interface.py index 7bf2213..a5d731c 100644 --- a/database/interface.py +++ b/database/interface.py @@ -249,7 +249,7 @@ def _clear_harvest_records(): ckan = RemoteCKAN(os.getenv("CKAN_API_URL"), apikey=os.getenv("CKAN_API_TOKEN")) - result = ckan.action.package_search(fq=f"owner_org:{organization_id}") + result = ckan.action.package_search(fq=f"harvest_source_id:{source_id}") ckan_datasets = result["count"] start = datetime.now(timezone.utc) retry_count = 0 @@ -258,7 +258,7 @@ def _clear_harvest_records(): # Retry loop to handle timeouts from cloud.gov and CKAN's Solr backend, # ensuring datasets are cleared despite possible interruptions. while ckan_datasets > 0 and retry_count < retry_max: - result = ckan.action.package_search(fq=f"owner_org:{organization_id}") + result = ckan.action.package_search(fq=f"harvest_source_id:{source_id}") ckan_datasets = result["count"] logger.info( f"Attempt {retry_count + 1}: " diff --git a/harvester/harvest.py b/harvester/harvest.py index c7de97d..efc5db5 100644 --- a/harvester/harvest.py +++ b/harvester/harvest.py @@ -25,7 +25,6 @@ SynchronizeException, ValidationException, ) -from harvester.utils.ckan_utils import add_uuid_to_package_name, ckanify_dcatus from harvester.utils.general_utils import ( dataset_to_hash, download_file, @@ -247,7 +246,6 @@ def write_compare_to_db(self) -> dict: "ckan_name": record.ckan_name, } ) - self.internal_records_lookup_table = self.db_interface.add_harvest_records( records ) @@ -474,6 +472,7 @@ def validate(self) -> None: ) def create_record(self, retry=False): + from harvester.utils.ckan_utils import add_uuid_to_package_name try: result = ckan.action.package_create(**self.ckanified_metadata) self.ckan_id = result["id"] @@ -513,9 +512,10 @@ def update_self_in_db(self) -> bool: ) def ckanify_dcatus(self) -> None: + from harvester.utils.ckan_utils import ckanify_dcatus try: self.ckanified_metadata = ckanify_dcatus( - self.metadata, self.harvest_source.organization_id + self.metadata, self.harvest_source ) except Exception as e: self.status = "error" diff --git a/harvester/utils/ckan_utils.py b/harvester/utils/ckan_utils.py index d34482e..8cbe7a7 100644 --- a/harvester/utils/ckan_utils.py +++ b/harvester/utils/ckan_utils.py @@ -1,5 +1,6 @@ import re import uuid +from harvester.harvest import HarvestSource # all of these are copy/pasted from ckan core # https://github.com/ckan/ckan/blob/master/ckan/lib/munge.py @@ -151,7 +152,7 @@ def munge_tag(tag: str) -> str: return tag -def create_ckan_extras(metadata: dict) -> list[dict]: +def create_ckan_extras(metadata: dict, harvest_source: HarvestSource) -> list[dict]: extras = [ "accessLevel", "bureauCode", @@ -161,7 +162,29 @@ def create_ckan_extras(metadata: dict) -> list[dict]: "publisher", ] - output = [{"key": "resource-type", "value": "Dataset"}] + output = [ + { + "key": "resource-type", + "value": "Dataset" + }, + { + "key": "harvest_object_id", + "value": harvest_source.internal_records_lookup_table[ + metadata["identifier"]] + }, + { + "key": "source_datajson_identifier", # dataset is datajson format or not + "value": True, + }, + { + "key": "harvest_source_id", + "value": harvest_source.id, + }, + { + "key": "harvest_source_title", + "value": harvest_source.name, + } + ] for extra in extras: if extra not in metadata: @@ -283,14 +306,14 @@ def simple_transform(metadata: dict, owner_org: str) -> dict: return output -def ckanify_dcatus(metadata: dict, owner_org: str) -> dict: - ckanified_metadata = simple_transform(metadata, owner_org) +def ckanify_dcatus(metadata: dict, harvest_source: HarvestSource) -> dict: + ckanified_metadata = simple_transform(metadata, harvest_source.organization_id) ckanified_metadata["resources"] = create_ckan_resources(metadata) ckanified_metadata["tags"] = ( create_ckan_tags(metadata["keyword"]) if "keyword" in metadata else [] ) - ckanified_metadata["extras"] = create_ckan_extras(metadata) + ckanified_metadata["extras"] = create_ckan_extras(metadata, harvest_source) return ckanified_metadata diff --git a/tests/integration/harvest/test_ckan_load.py b/tests/integration/harvest/test_ckan_load.py index 33583c2..a897576 100644 --- a/tests/integration/harvest/test_ckan_load.py +++ b/tests/integration/harvest/test_ckan_load.py @@ -88,6 +88,18 @@ def test_ckanify_dcatus( harvest_source = HarvestSource(harvest_job.id) harvest_source.prepare_external_data() + records = [( + { + "identifier": 'cftc-dc1', + "harvest_job_id": job_data_dcatus["id"], + "harvest_source_id": job_data_dcatus["harvest_source_id"] + } + )] + interface.add_harvest_records(records) + harvest_source.get_record_changes() + harvest_source.write_compare_to_db() + record_id = harvest_source.internal_records_lookup_table['cftc-dc1'] + expected_result = { "name": "commitment-of-traders", "owner_org": "d925f84d-955b-4cb7-812f-dcfd6681a18f", @@ -110,6 +122,10 @@ def test_ckanify_dcatus( ], "extras": [ {"key": "resource-type", "value": "Dataset"}, + {"key": "harvest_object_id", "value": record_id}, + {"key": "source_datajson_identifier", "value": True}, + {"key": "harvest_source_id", "value": "2f2652de-91df-4c63-8b53-bfced20b276b"}, + {"key": "harvest_source_title", "value": "Test Source"}, {"key": "accessLevel", "value": "public"}, {"key": "bureauCode", "value": "339:00"}, {"key": "identifier", "value": "cftc-dc1"}, diff --git a/tests/integration/harvest/test_exception_handling.py b/tests/integration/harvest/test_exception_handling.py index 946ff56..5acd2fc 100644 --- a/tests/integration/harvest/test_exception_handling.py +++ b/tests/integration/harvest/test_exception_handling.py @@ -136,7 +136,7 @@ def test_validation_exception( assert interface_record.status == "error" assert interface_errors[0].type == "ValidationException" - @patch("harvester.harvest.ckanify_dcatus", side_effect=Exception("Broken")) + @patch("harvester.utils.ckan_utils.ckanify_dcatus", side_effect=Exception("Broken")) def test_dcatus_to_ckan_exception( self, ckanify_dcatus_mock,