Skip to content

Commit

Permalink
Merge pull request #103 from GSA/add-extra-metatdata-fields
Browse files Browse the repository at this point in the history
Add extra metadata fields
  • Loading branch information
FuhuXia authored Oct 24, 2024
2 parents 65a69f4 + 3dc3c31 commit af25dca
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 11 deletions.
12 changes: 12 additions & 0 deletions app/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from . import htmx
from .forms import HarvestSourceForm, OrganizationForm
from .paginate import Pagination
import json

logger = logging.getLogger("harvest_admin")

Expand Down Expand Up @@ -731,6 +732,17 @@ def get_harvest_record(record_id=None):

return db._to_dict(record)

@mod.route("/harvest_record/<record_id>/raw", methods=["GET"])
def get_harvest_record_raw(record_id=None):
    """Return a harvest record's ``source_raw`` field parsed as JSON.

    Args:
        record_id: Identifier taken from the URL and passed to
            ``db.get_harvest_record``.

    Returns:
        A ``(body, status)`` tuple:
        * the parsed ``source_raw`` value and 200 on success;
        * ``{"error": "Not Found"}`` and 404 when no record is returned;
        * ``{"error": "Invalid JSON format in source_raw"}`` and 500 when
          ``source_raw`` cannot be parsed.
    """
    record = db.get_harvest_record(record_id)
    if not record:
        return {"error": "Not Found"}, 404

    try:
        source_raw_json = json.loads(record.source_raw)
    except (TypeError, json.JSONDecodeError):
        # json.loads raises TypeError (not JSONDecodeError) when handed a
        # non-string such as None, so catch both to keep returning the JSON
        # error body instead of letting the exception escape the view.
        return {"error": "Invalid JSON format in source_raw"}, 500
    return source_raw_json, 200

### Add record
@mod.route("/harvest_record/add", methods=["POST", "GET"])
Expand Down
4 changes: 2 additions & 2 deletions database/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def _clear_harvest_records():

ckan = RemoteCKAN(os.getenv("CKAN_API_URL"), apikey=os.getenv("CKAN_API_TOKEN"))

result = ckan.action.package_search(fq=f"owner_org:{organization_id}")
result = ckan.action.package_search(fq=f"harvest_source_id:{source_id}")
ckan_datasets = result["count"]
start = datetime.now(timezone.utc)
retry_count = 0
Expand All @@ -258,7 +258,7 @@ def _clear_harvest_records():
# Retry loop to handle timeouts from cloud.gov and CKAN's Solr backend,
# ensuring datasets are cleared despite possible interruptions.
while ckan_datasets > 0 and retry_count < retry_max:
result = ckan.action.package_search(fq=f"owner_org:{organization_id}")
result = ckan.action.package_search(fq=f"harvest_source_id:{source_id}")
ckan_datasets = result["count"]
logger.info(
f"Attempt {retry_count + 1}: "
Expand Down
6 changes: 3 additions & 3 deletions harvester/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
SynchronizeException,
ValidationException,
)
from harvester.utils.ckan_utils import add_uuid_to_package_name, ckanify_dcatus
from harvester.utils.general_utils import (
dataset_to_hash,
download_file,
Expand Down Expand Up @@ -247,7 +246,6 @@ def write_compare_to_db(self) -> dict:
"ckan_name": record.ckan_name,
}
)

self.internal_records_lookup_table = self.db_interface.add_harvest_records(
records
)
Expand Down Expand Up @@ -474,6 +472,7 @@ def validate(self) -> None:
)

def create_record(self, retry=False):
from harvester.utils.ckan_utils import add_uuid_to_package_name
try:
result = ckan.action.package_create(**self.ckanified_metadata)
self.ckan_id = result["id"]
Expand Down Expand Up @@ -513,9 +512,10 @@ def update_self_in_db(self) -> bool:
)

def ckanify_dcatus(self) -> None:
from harvester.utils.ckan_utils import ckanify_dcatus
try:
self.ckanified_metadata = ckanify_dcatus(
self.metadata, self.harvest_source.organization_id
self.metadata, self.harvest_source
)
except Exception as e:
self.status = "error"
Expand Down
33 changes: 28 additions & 5 deletions harvester/utils/ckan_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import uuid
from harvester.harvest import HarvestSource

# all of these are copy/pasted from ckan core
# https://github.com/ckan/ckan/blob/master/ckan/lib/munge.py
Expand Down Expand Up @@ -151,7 +152,7 @@ def munge_tag(tag: str) -> str:
return tag


def create_ckan_extras(metadata: dict) -> list[dict]:
def create_ckan_extras(metadata: dict, harvest_source: HarvestSource) -> list[dict]:
extras = [
"accessLevel",
"bureauCode",
Expand All @@ -161,7 +162,29 @@ def create_ckan_extras(metadata: dict) -> list[dict]:
"publisher",
]

output = [{"key": "resource-type", "value": "Dataset"}]
output = [
{
"key": "resource-type",
"value": "Dataset"
},
{
"key": "harvest_object_id",
"value": harvest_source.internal_records_lookup_table[
metadata["identifier"]]
},
{
"key": "source_datajson_identifier", # dataset is datajson format or not
"value": True,
},
{
"key": "harvest_source_id",
"value": harvest_source.id,
},
{
"key": "harvest_source_title",
"value": harvest_source.name,
}
]

for extra in extras:
if extra not in metadata:
Expand Down Expand Up @@ -283,14 +306,14 @@ def simple_transform(metadata: dict, owner_org: str) -> dict:
return output


def ckanify_dcatus(metadata: dict, owner_org: str) -> dict:
ckanified_metadata = simple_transform(metadata, owner_org)
def ckanify_dcatus(metadata: dict, harvest_source: HarvestSource) -> dict:
ckanified_metadata = simple_transform(metadata, harvest_source.organization_id)

ckanified_metadata["resources"] = create_ckan_resources(metadata)
ckanified_metadata["tags"] = (
create_ckan_tags(metadata["keyword"]) if "keyword" in metadata else []
)
ckanified_metadata["extras"] = create_ckan_extras(metadata)
ckanified_metadata["extras"] = create_ckan_extras(metadata, harvest_source)

return ckanified_metadata

Expand Down
16 changes: 16 additions & 0 deletions tests/integration/harvest/test_ckan_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,18 @@ def test_ckanify_dcatus(
harvest_source = HarvestSource(harvest_job.id)
harvest_source.prepare_external_data()

records = [(
{
"identifier": 'cftc-dc1',
"harvest_job_id": job_data_dcatus["id"],
"harvest_source_id": job_data_dcatus["harvest_source_id"]
}
)]
interface.add_harvest_records(records)
harvest_source.get_record_changes()
harvest_source.write_compare_to_db()
record_id = harvest_source.internal_records_lookup_table['cftc-dc1']

expected_result = {
"name": "commitment-of-traders",
"owner_org": "d925f84d-955b-4cb7-812f-dcfd6681a18f",
Expand All @@ -110,6 +122,10 @@ def test_ckanify_dcatus(
],
"extras": [
{"key": "resource-type", "value": "Dataset"},
{"key": "harvest_object_id", "value": record_id},
{"key": "source_datajson_identifier", "value": True},
{"key": "harvest_source_id", "value": "2f2652de-91df-4c63-8b53-bfced20b276b"},
{"key": "harvest_source_title", "value": "Test Source"},
{"key": "accessLevel", "value": "public"},
{"key": "bureauCode", "value": "339:00"},
{"key": "identifier", "value": "cftc-dc1"},
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/harvest/test_exception_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def test_validation_exception(
assert interface_record.status == "error"
assert interface_errors[0].type == "ValidationException"

@patch("harvester.harvest.ckanify_dcatus", side_effect=Exception("Broken"))
@patch("harvester.utils.ckan_utils.ckanify_dcatus", side_effect=Exception("Broken"))
def test_dcatus_to_ckan_exception(
self,
ckanify_dcatus_mock,
Expand Down

1 comment on commit af25dca

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests Skipped Failures Errors Time
2 0 💤 0 ❌ 0 🔥 7.521s ⏱️

Please sign in to comment.