Skip to content

Commit

Permalink
Merge pull request #1005 from ecds/feature/960-metadata
Browse files Browse the repository at this point in the history
Allow ingest of metadata not in model fields, add custom metadata facets to search (#960)
  • Loading branch information
jayvarner authored Jan 23, 2024
2 parents 10c5978 + b7a51b4 commit 79f7667
Show file tree
Hide file tree
Showing 15 changed files with 253 additions and 66 deletions.
57 changes: 52 additions & 5 deletions apps/iiif/manifests/documents.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Elasticsearch indexing rules for IIIF manifests"""

from html import unescape
from django.conf import settings
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import analyzer
from elasticsearch_dsl import MetaField, Keyword, analyzer
from django.db.models.query import Prefetch
from django.utils.html import strip_tags
from unidecode import unidecode
Expand All @@ -26,6 +27,7 @@ class ManifestDocument(Document):
"""Elasticsearch Document class for IIIF Manifest"""

# fields to map explicitly in Elasticsearch
attribution = fields.TextField()
authors = fields.KeywordField(multi=True) # only used for faceting/filtering
author = fields.TextField() # only used for searching
canvas_set = fields.NestedField(
Expand All @@ -42,6 +44,10 @@ class ManifestDocument(Document):
label = fields.TextField(analyzer=stemmer)
label_alphabetical = fields.KeywordField()
languages = fields.KeywordField(multi=True)
license = fields.TextField()
metadata = fields.NestedField()
published_city = fields.TextField()
publisher = fields.TextField()
summary = fields.TextField(analyzer=stemmer)

class Index:
Expand All @@ -56,19 +62,28 @@ class Django:

# fields to map dynamically in Elasticsearch
fields = [
"attribution",
"created_at",
"date_sort_ascending",
"date_sort_descending",
"license",
"pid",
"published_city",
"published_date",
"publisher",
"viewingdirection",
]
related_models = [Collection, Canvas]

class Meta:
    """Index-level mapping options for this document."""

    # Map dynamically-added string values as Keyword by default, so that
    # custom, dynamically-mapped metadata fields can be used as facets.
    dynamic_templates = MetaField(
        [
            {
                "strings": {
                    "match_mapping_type": "string",
                    "mapping": Keyword().to_dict(),
                },
            },
        ]
    )

def prepare_authors(self, instance):
"""convert authors string into list"""
if instance.author:
Expand All @@ -92,6 +107,38 @@ def prepare_languages(self, instance):
return [lang.name for lang in instance.languages.all()]
return ["[no language]"]

def prepare_metadata(self, instance):
    """Build the custom ``metadata`` index field from ``settings.CUSTOM_METADATA``.

    ``CUSTOM_METADATA`` should be a dict like
    ``{meta_key: {"multi": bool, "separator": str}}``. Every configured key
    appears in the result; its value is ``None`` when the instance has no
    matching metadata.

    :param instance: object whose ``metadata`` attribute is read
    :return: dict mapping each configured key to its value (a list of strings
        when the key is configured as ``multi`` and the value is a string)
    :rtype: dict
    """
    config = getattr(settings, "CUSTOM_METADATA", None)
    if not isinstance(config, dict):
        # setting is missing or malformed: nothing to index
        return {}

    source = instance.metadata
    custom_metadata = {}
    for key, opts in config.items():
        val = None
        # instance.metadata will generally be a list rather than a dict: it maps
        # to the IIIF manifest metadata field, which is a list consisting of
        # dicts like {label: str, value: str}
        if isinstance(source, list):
            # find the first entry whose "label" matches the configured key;
            # entries that are not dicts are skipped instead of raising
            for obj in source:
                if (
                    isinstance(obj, dict)
                    and obj.get("label") == key
                    and "value" in obj
                ):
                    val = obj["value"]
                    break
        elif isinstance(source, dict):
            # in some cases it may be just a dict, so look the key up directly
            val = source.get(key)

        # tolerate a malformed options entry by treating it as "no options"
        options = opts if isinstance(opts, dict) else {}
        # only strings can be split; a value that is already a list (or any
        # other type) is indexed unchanged
        if options.get("multi", False) and isinstance(val, str) and val:
            val = val.split(options.get("separator", ";"))
        custom_metadata[key] = val

    return custom_metadata

def prepare_summary(self, instance):
    """Return the summary with HTML tags stripped and HTML entities unescaped."""
    without_tags = strip_tags(instance.summary)
    return unescape(without_tags)
Expand Down
19 changes: 14 additions & 5 deletions apps/iiif/serializers/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ def start_serialization(self):
def end_serialization(self):
self.stream.write('')

def serialize_metadata(self, obj):
    """Return the metadata on ``obj`` as a list of {label, value} dicts.

    A list is returned unchanged (the common case: it already holds
    {label, value} dicts), a dict is expanded into one {label, value} dict
    per item, and any other value yields an empty list.
    """
    metadata = obj.metadata
    if isinstance(metadata, list):
        return metadata
    if isinstance(metadata, dict):
        pairs = []
        for label, value in metadata.items():
            pairs.append({"label": label, "value": value})
        return pairs
    return []

def get_dump_object(self, obj):
# TODO: Raise error if version is not v2 or v3
if self.version == 'v2' or self.version is None:
Expand Down Expand Up @@ -69,10 +80,6 @@ def get_dump_object(self, obj):
"label": "Publication Date",
"value": obj.published_date
},
{
"label": "Notes",
"value": obj.metadata
},
{
"label": "Record Created",
"value": obj.created_at
Expand All @@ -92,7 +99,9 @@ def get_dump_object(self, obj):
{
"label": "Export Date",
"value": self.exportdate
}
},
# unpack serialized metadata (list of label, value pairs)
*self.serialize_metadata(obj),
],
"description": obj.summary,
"related": obj.related_links,
Expand Down
6 changes: 3 additions & 3 deletions apps/ingest/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .forms import BulkVolumeUploadForm
from .models import Bulk, IngestTaskWatcher, Local, Remote, S3Ingest
from .services import (clean_metadata, create_manifest, get_associated_meta,
get_metadata_from, lowercase_first_line)
get_metadata_from, normalize_header)

LOGGER = logging.getLogger(__name__)
class LocalAdmin(admin.ModelAdmin):
Expand Down Expand Up @@ -253,7 +253,7 @@ def clean(self):
csv_file = self.cleaned_data.get('metadata_spreadsheet')
if csv_file:
reader = csv.DictReader(
lowercase_first_line(
normalize_header(
StringIO(csv_file.read().decode('utf-8'))
),
)
Expand Down Expand Up @@ -284,7 +284,7 @@ def save_model(self, request, obj, form, change):
# Get spreadsheet with metadata to match each volume
obj.metadata_spreadsheet.seek(0)
metadata = csv.DictReader(
lowercase_first_line(
normalize_header(
StringIO(obj.metadata_spreadsheet.read().decode('utf-8'))
),
)
Expand Down
9 changes: 7 additions & 2 deletions apps/ingest/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
from django.forms import ClearableFileInput
from .models import Bulk


class MultipleFileInput(forms.ClearableFileInput):
    """Clearable file input widget with ``allow_multiple_selected`` enabled."""

    # NOTE(review): presumably read by Django's file input widget to permit
    # selecting several files at once -- confirm against the Django version in use
    allow_multiple_selected = True


class BulkVolumeUploadForm(forms.ModelForm):
class Meta:
model = Bulk
fields = ['image_server', 'volume_files', 'collections']
fields = ["image_server", "volume_files", "collections"]
widgets = {
'volume_files': ClearableFileInput(attrs={'allow_multiple_selected': True}),
"volume_files": MultipleFileInput,
}
84 changes: 55 additions & 29 deletions apps/ingest/services.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" Module of service classes and methods for ingest. """
import itertools
import re
from mimetypes import guess_type
from urllib.parse import unquote, urlparse

Expand All @@ -10,20 +11,21 @@


def clean_metadata(metadata):
print(metadata)
"""Remove keys that do not align with Manifest fields.
"""Normalize names of fields that align with Manifest fields.
:param metadata:
:type metadata: tablib.Dataset
:return: Dictionary with keys matching Manifest fields
:rtype: dict
"""
metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()}
fields = [
*(f.name for f in Manifest._meta.get_fields()),
"related", # used for related external links
]
invalid_keys = []
fields = [f.name for f in Manifest._meta.get_fields()]
metadata = {
(
key.casefold().replace(" ", "_")
if key.casefold().replace(" ", "_") in fields
else key
): value for key, value in metadata.items()
}

for key in metadata.keys():
if key != 'metadata' and isinstance(metadata[key], list):
Expand All @@ -33,12 +35,6 @@ def clean_metadata(metadata):
metadata[key] = metadata[key][0][meta_key]
else:
metadata[key] = ', '.join(metadata[key])
if key not in fields:
invalid_keys.append(key)

# TODO: Update this method to allow all "invalid" keys to populate Manifest.metadata JSONField
for invalid_key in invalid_keys:
metadata.pop(invalid_key)

return metadata

Expand All @@ -47,7 +43,7 @@ def create_related_links(manifest, related_str):
Create RelatedLink objects from supplied related links string and associate each with supplied
Manifest. String should consist of semicolon-separated URLs.
:param manifest:
:type related_str: iiif.manifest.models.Manifest
:type manifest: iiif.manifest.models.Manifest
:param related_str:
:type related_str: str
:rtype: None
Expand All @@ -61,6 +57,41 @@ def create_related_links(manifest, related_str):
is_structured_data=False, # assume this is not meant for seeAlso
)

def set_metadata(manifest, metadata):
    """
    Update a Manifest using the supplied metadata dict, then save it.

    The key "related" creates RelatedLinks, keys that name a Manifest field are
    set as attributes, and all other keys are stored in the Manifest.metadata
    JSONField as {label, value} entries.
    :param manifest:
    :type manifest: iiif.manifest.models.Manifest
    :param metadata:
    :type metadata: dict
    :rtype: None
    """
    # set gives constant-time membership tests inside the loop
    fields = {f.name for f in Manifest._meta.get_fields()}
    for key, value in metadata.items():
        if key == "related":
            # add RelatedLinks from metadata spreadsheet key "related"
            create_related_links(manifest, value)
        elif key in fields:
            setattr(manifest, key, value)
        else:
            # all other keys go into Manifest.metadata JSONField
            _set_custom_metadata(manifest, key, value)
    manifest.save()


def _set_custom_metadata(manifest, key, value):
    """Store one {label, value} entry in manifest.metadata, replacing any entry with the same label."""
    existing = manifest.metadata
    if isinstance(existing, list):
        entries = existing
    elif isinstance(existing, dict):
        # convert to list of {label, value} as expected by iiif spec
        entries = [{"label": k, "value": v} for (k, v) in existing.items()]
    else:
        # instantiate as list
        entries = []

    for entry in entries:
        if isinstance(entry, dict) and entry.get("label") == key:
            # update in place so that re-ingesting a manifest does not pile up
            # duplicate labels with stale values
            entry["value"] = value
            break
    else:
        entries.append({"label": key, "value": value})
    manifest.metadata = entries

def create_manifest(ingest):
"""
Create or update a Manifest from supplied metadata and images.
Expand All @@ -81,14 +112,7 @@ def create_manifest(ingest):
manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-'))
else:
manifest = Manifest.objects.create()
for (key, value) in metadata.items():
if key == "related":
# add RelatedLinks from metadata spreadsheet key "related"
create_related_links(manifest, value)
else:
# all other keys should exist as fields on Manifest (for now)
setattr(manifest, key, value)
# TODO: if the key doesn't exist on Manifest model, add it to Manifest.metadata
set_metadata(manifest, metadata)
else:
manifest = Manifest()

Expand Down Expand Up @@ -232,6 +256,7 @@ def get_associated_meta(all_metadata, file):
file_meta = {}
extless_filename = file.name[0:file.name.rindex('.')]
for meta_dict in all_metadata:
metadata_found_filename = None
for key, val in meta_dict.items():
if key.casefold() == 'filename':
metadata_found_filename = val
Expand All @@ -240,9 +265,10 @@ def get_associated_meta(all_metadata, file):
file_meta = meta_dict
return file_meta

def lowercase_first_line(iterator):
    """Return ``iterator`` with its first line normalized (such as a CSV header row).

    The first line has non-ASCII characters dropped, is casefolded, and has
    surrounding whitespace stripped; the remaining lines pass through untouched.
    """
    first_line = next(iterator)
    # drop anything that cannot be encoded as ASCII
    ascii_only = first_line.encode('ascii', 'ignore').decode()
    header = ascii_only.casefold().strip()
    return itertools.chain([header], iterator)
def normalize_header(iterator):
    """Normalize the header row of a metadata CSV.

    The first line has non-ASCII characters dropped and surrounding whitespace
    stripped, and the standalone word "pid" is lowercased whatever its original
    case. All other lines pass through untouched.

    :param iterator: iterator over the lines of the CSV text
    :return: iterator over the normalized header row followed by the remaining
        lines; empty when the input has no lines
    """
    first_line = next(iterator, None)
    if first_line is None:
        # empty input: nothing to normalize, and a bare next() would leak
        # StopIteration to the caller
        return iter(())
    # drop non-ASCII characters and strip whitespace
    header_row = first_line.encode("ascii", "ignore").decode().strip()
    # lowercase the word "pid" in this row so we can access it easily; the word
    # boundaries keep other headers that merely contain those letters intact
    header_row = re.sub(r"\bpid\b", "pid", header_row, flags=re.IGNORECASE)
    return itertools.chain([header_row], iterator)
10 changes: 2 additions & 8 deletions apps/ingest/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from apps.ingest.models import IngestTaskWatcher

from .mail import send_email_on_failure, send_email_on_success
from .services import create_manifest, create_related_links
from .services import create_manifest, create_related_links, set_metadata

# Use `apps.get_model` to avoid circular import error. Because the parameters used to
# create a background task have to be serializable, we can't just pass in the model object.
Expand Down Expand Up @@ -128,13 +128,7 @@ def create_canvases_from_s3_ingest(metadata, ingest_id):
manifest = Manifest.objects.get(pid=pid)
except Manifest.DoesNotExist:
manifest = Manifest.objects.create(pid=pid)
for (key, value) in metadata.items():
if key == "related":
# add RelatedLinks from metadata spreadsheet key "related"
create_related_links(manifest, value)
else:
# all other keys should exist as fields on Manifest (for now)
setattr(manifest, key, value)
set_metadata(manifest, metadata)
# Image server: set from ingest
ingest = S3Ingest.objects.get(pk=ingest_id)
manifest.image_server = ingest.image_server
Expand Down
6 changes: 3 additions & 3 deletions apps/ingest/tests/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,15 +321,15 @@ def test_creating_canvases(self):
# except Local.DoesNotExist:
# pass

def test_it_creates_mainfest_with_metadata_property(self):
def test_it_creates_manifest_with_metadata_property(self):
metadata = {
'pid': '808',
'title': 'Goodie Mob'
'label': 'Goodie Mob'
}
local = self.mock_local('no_meta_file.zip', metadata=metadata)
local.manifest = create_manifest(local)
assert local.manifest.pid == '808'
assert local.manifest.title == 'Goodie Mob'
assert local.manifest.label == 'Goodie Mob'

def test_create_related_links(self):
metadata = {
Expand Down
2 changes: 1 addition & 1 deletion apps/ingest/tests/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from apps.iiif.manifests.tests.factories import ManifestFactory
from .factories import RemoteFactory
from ..services import (clean_metadata, create_manifest, get_associated_meta,
get_metadata_from, lowercase_first_line)
get_metadata_from, normalize_header)


class RemoteTest(TestCase):
Expand Down
2 changes: 1 addition & 1 deletion apps/ingest/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory
from ..models import S3Ingest
from ..services import (clean_metadata, create_manifest, get_associated_meta,
get_metadata_from, lowercase_first_line)
get_metadata_from, normalize_header)

pytestmark = pytest.mark.django_db(transaction=True) # pylint: disable = invalid-name

Expand Down
Loading

0 comments on commit 79f7667

Please sign in to comment.