Skip to content

Commit

Permalink
example: Implement IIIF Content Search for ALTO example
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaiter authored and bitzl committed Jun 17, 2019
1 parent 0278ec9 commit 86214a9
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 121 deletions.
7 changes: 4 additions & 3 deletions example/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ services:
command: pipenv run prod
build: iiif-prezi
volumes:
- ./google1000:/google1000
- ./data:/data
environment:
- CFG_SOLR_HANDLER=http://solr:8983/solr/ocrtest/select
- CFG_SOLR_BASE=http://solr:8983/solr
- CFG_SERVER_NAME=localhost:8181
- CFG_IMAGE_API_BASE=https://ocrhl.jbaiter.de/iiif/image
- CFG_GOOGLE1000_PATH=/google1000
- CFG_GOOGLE1000_PATH=/data/google1000
- CFG_BNL_PATH=/data/bnl_lunion
- CFG_APP_PATH=/iiif/presentation

frontend:
Expand Down
125 changes: 125 additions & 0 deletions example/iiif-prezi/bnl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import copy
from pathlib import Path

import lxml.etree as etree

from common import make_id, MANIFEST_TEMPLATE, CANVAS_TEMPLATE, NSMAP


def _join_url(base_url, path):
    """Append *path* to *base_url* with exactly one slash between them.

    Router-generated paths may or may not carry a leading slash; stripping
    it here keeps the resulting URL free of accidental ``//`` sequences.
    """
    return f'{base_url}/{path.lstrip("/")}'


def _read_issue_metadata(tree):
    """Read the descriptive fields of the issue from the print MODS record.

    Returns a dict with the keys ``newspaper``, ``title``, ``date`` and
    ``publisher``. A value is ``None`` when the corresponding element (or
    the whole ``MODSMD_PRINT`` section) is missing from the METS file.
    """
    field_paths = {
        'newspaper': './/mods:titleInfo/mods:title',
        'title': './/mods:titleInfo/mods:partNumber',
        'date': './/mods:dateIssued',
        'publisher': './/mods:publisher',
    }
    meta_elem = tree.find(".//mets:dmdSec[@ID='MODSMD_PRINT']//mods:mods", namespaces=NSMAP)
    if meta_elem is None:
        return dict.fromkeys(field_paths)
    return {key: meta_elem.findtext(xpath, namespaces=NSMAP)
            for key, xpath in field_paths.items()}


def _issue_label(issue_meta, fallback):
    """Compose ``"<newspaper>: <title> (<date>)"`` from the available fields.

    Missing fields are left out instead of being rendered as the literal
    string ``"None"``; *fallback* is returned when nothing is available.
    """
    label = ': '.join(part for part in (issue_meta['newspaper'], issue_meta['title']) if part)
    if issue_meta['date']:
        label = f"{label} ({issue_meta['date']})" if label else issue_meta['date']
    return label or fallback


def make_manifest(app, issue_id):
    """Build a IIIF Presentation 2 manifest for one newspaper issue.

    The issue is read from ``<BNL_PATH>/<issue_basename>/``: the METS file
    provides metadata, the page list and the logical structure, and the ALTO
    files below ``text/`` provide the pixel boxes used for range regions.

    :param app: application object; must offer a ``config`` mapping and a
                ``url_for(name, **params)`` method with the routes
                ``get_manifest`` and ``search``
    :param issue_id: issue identifier; the first four characters are dropped
                     and the remainder must look like ``<part0>_<part1>``
    :returns: the manifest as a plain ``dict``
    """
    base_dir = Path(app.config.get('BNL_PATH', '../data/bnl_lunion'))
    id_parts = issue_id[4:].split("_")
    issue_basename = f'{id_parts[0]}_newspaper_lunion_{id_parts[1]}'
    issue_path = base_dir / issue_basename
    protocol = app.config.get('PROTOCOL', 'http')
    location = app.config.get('SERVER_NAME', 'localhost:8008')
    app_path = app.config.get('APP_PATH', '')
    image_api_base = app.config.get('IMAGE_API_BASE', 'http://localhost:8080')
    base_url = f'{protocol}://{location}{app_path}'

    # NOTE(review): MANIFEST_TEMPLATE carries a Google Books attribution
    # string; confirm whether it should be overridden for this collection.
    manifest = copy.deepcopy(MANIFEST_TEMPLATE)
    manifest['@id'] = _join_url(
        base_url, app.url_for('get_manifest', volume_id=issue_id))
    manifest['service']['@id'] = _join_url(
        base_url, app.url_for('search', doc_id=issue_id))
    manifest['sequences'][0]['@id'] = make_id(app, issue_id, 'sequence')
    tree = etree.parse(str(issue_path / f'{issue_basename}-mets.xml'))

    # Metadata
    issue_meta = _read_issue_metadata(tree)
    metadata_labels = (
        ('Newspaper Title', 'newspaper'),
        ('Issue Title', 'title'),
        ('Issue Date', 'date'),
        ('Publisher', 'publisher'),
    )
    manifest['metadata'] = [
        {'label': label, 'value': issue_meta[key]}
        for label, key in metadata_labels
        if issue_meta[key]]
    manifest['label'] = _issue_label(issue_meta, issue_id)

    # Canvases
    canvases = manifest['sequences'][0]['canvases']
    for page_elem in tree.xpath(".//mets:structMap[@TYPE='PHYSICAL']//mets:div[@TYPE='PAGE']",
                                namespaces=NSMAP):
        # Each page references several files; the image one has an ID
        # starting with 'IMG', and its technical metadata section uses the
        # same ID with 'IMGPARAM' in place of 'IMG'.
        file_id = next(i for i in page_elem.xpath(".//mets:area/@FILEID", namespaces=NSMAP)
                       if i.startswith('IMG'))
        adm_id = file_id.replace('IMG', 'IMGPARAM')
        img_elem = tree.find(f".//mets:amdSec[@ID='{adm_id}']//mix:BasicImageCharacteristics",
                             namespaces=NSMAP)
        img_width = int(img_elem.findtext('./mix:imageWidth', namespaces=NSMAP))
        img_height = int(img_elem.findtext('./mix:imageHeight', namespaces=NSMAP))
        # File name of the image without directory and extension
        img_fname = tree.xpath(f'.//mets:file[@ADMID="{adm_id}"]/mets:FLocat/@xlink:href',
                               namespaces=NSMAP)[0].split('/')[-1].split('.')[0]
        page_id = page_elem.find('.//mets:area[@BETYPE="IDREF"]', namespaces=NSMAP).attrib['BEGIN']
        # The image identifier is the issue id plus the last dash-separated
        # part of the image file name.
        image_url = f'{image_api_base}/{issue_id}_{img_fname.split("-")[-1]}'

        canvas = copy.deepcopy(CANVAS_TEMPLATE)
        canvas['@id'] = f'{base_url}/{issue_id}/canvas/{page_id}'
        canvas['width'] = img_width
        canvas['height'] = img_height
        image = canvas['images'][0]
        image['on'] = canvas['@id']
        image['resource']['width'] = img_width
        image['resource']['height'] = img_height
        image['resource']['@id'] = f'{image_url}/full/full/0/default.jpg'
        image['resource']['service']['@id'] = image_url
        canvases.append(canvas)

    # Ranges
    # TODO: Yeah, we don't do nested ranges, since it's such a major PITA with
    #       Presentation API 2.0. Will do when 3.0 is stabilized
    # Merge the boxes of all ALTO files of the issue into a single lookup
    # table keyed by element ID.
    alto_boxes = {}
    for alto_path in (issue_path / 'text').glob("*.xml"):
        alto_boxes.update(parse_alto(alto_path))
    toc_elems = tree.xpath(
        './/mets:structMap[@TYPE="LOGICAL"]//mets:div[@TYPE="ISSUE"]'
        '//mets:div[@TYPE="ARTICLE" or @TYPE="SECTION" or @TYPE="ADVERTISEMENT"]',
        namespaces=NSMAP)
    ranges = []
    for elem in toc_elems:
        dtl_id = elem.attrib["ID"]
        iiif_range = {
            '@id': f'{base_url}/{issue_id}/range/{dtl_id}',
            '@type': 'sc:Range',
            'canvases': []
        }
        if elem.attrib['TYPE'] == 'ARTICLE':
            # Articles get their own search service, scoped to the article
            article_id = elem.attrib['DMDID'].replace('MODSMD_ARTICLE', '')
            search_id = f'{issue_id}-{article_id}'
            iiif_range['service'] = {
                "@context": "http://iiif.io/api/search/0/context.json",
                "@id": _join_url(base_url, app.url_for('search', doc_id=search_id)),
                "profile": "http://iiif.io/api/search/0/search"
            }
        if 'LABEL' in elem.attrib:
            iiif_range['label'] = elem.attrib['LABEL']
        else:
            iiif_range['label'] = f"Unnamed {elem.attrib['TYPE'].lower()}"
        for area_elem in elem.findall('.//mets:area[@BETYPE="IDREF"]', namespaces=NSMAP):
            box_id = area_elem.attrib['BEGIN']
            # The part of the box ID before the first underscore is one
            # character followed by a 1-based number, used as canvas index
            # (presumably the page number, e.g. 'P3_...' -- confirm).
            canvas_idx = int(box_id.split("_")[0][1:]) - 1
            canvas_id = canvases[canvas_idx]['@id']
            ocr_box = alto_boxes[box_id]
            canvas_region = f'#xywh={ocr_box[0]},{ocr_box[1]},{ocr_box[2]},{ocr_box[3]}'
            iiif_range['canvases'].append(canvas_id + canvas_region)
        ranges.append(iiif_range)

    # The synthetic top-level range comes first and lists all other ranges
    top_range = {
        '@id': f'{base_url}/{issue_id}/range/top',
        '@type': 'sc:Range',
        'label': 'Table of Contents',
        'viewingHint': 'top',
        'ranges': [r['@id'] for r in ranges]
    }
    manifest['structures'] = [top_range] + ranges
    return manifest


def parse_alto(alto_path):
    """Collect the pixel boxes of all positioned elements in an ALTO file.

    Every element that carries ``ID``, ``HPOS``, ``VPOS``, ``WIDTH`` and
    ``HEIGHT`` attributes is included, regardless of its tag.

    :param alto_path: location of the ALTO XML file (anything ``str()`` can
                      turn into a path)
    :returns: dict mapping element ID to an ``(hpos, vpos, width, height)``
              tuple of ints
    """
    tree = etree.parse(str(alto_path))
    attribs = ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')
    # The ALTO schema types these attributes as floats, so go through
    # float() first; a bare int() would raise on values such as "12.5".
    # Fractions are truncated, whole numbers are unaffected.
    return {e.attrib['ID']: tuple(int(float(e.attrib[x])) for x in attribs)
            for e in tree.xpath('.//*[@ID and @HPOS and @VPOS and @WIDTH and @HEIGHT]')}
62 changes: 62 additions & 0 deletions example/iiif-prezi/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import re

import monsterurl


# XML namespace prefixes used in the XPath expressions of the manifest builders
NSMAP = {
    'mets': 'http://www.loc.gov/METS/',
    'mods': 'http://www.loc.gov/mods/v3',
    'mix': 'http://www.loc.gov/mix/v20',
    'xlink': 'http://www.w3.org/1999/xlink',
    'alto': 'http://www.loc.gov/standards/alto/ns-v3#',
}

# Skeleton of a manifest. Entries set to None and the empty lists are meant
# to be filled in on a deep copy, never on this shared object itself.
MANIFEST_TEMPLATE = {
    "@id": None,
    "@context": "http://iiif.io/api/presentation/2/context.json",
    "@type": "sc:Manifest",
    "metadata": [],
    "attribution": "Provided by Google via Google Books 1000 dataset",
    "service": {
        "@context": "http://iiif.io/api/search/0/context.json",
        "@id": None,
        "profile": "http://iiif.io/api/search/0/search",
    },
    "sequences": [
        {
            "@id": None,
            "@type": "sc:Sequence",
            "label": "Current Page Order",
            "viewingDirection": "left-to-right",
            "viewingHint": "paged",
            "canvases": [],
        },
    ],
}

# Skeleton of a canvas with a single painted image. The -1 / None
# placeholders are overwritten with the real values for each page.
CANVAS_TEMPLATE = {
    "@id": None,
    "@type": "sc:Canvas",
    "height": -1,
    "width": -1,
    "images": [
        {
            "@type": "oa:Annotation",
            "motivation": "sc:painting",
            "resource": {
                "@id": None,
                "@type": "dctypes:Image",
                "format": "image/jpeg",
                "service": {
                    "@context": "http://iiif.io/api/image/2/context.json",
                    "@id": None,
                    "profile": "http://iiif.io/api/image/2/level1.json",
                },
                "height": None,
                "width": None,
            },
            "on": None,
        },
    ],
}


def make_id(app, vol_id, resource_type="annotation"):
    """Mint a URI for a resource of a volume, using a generated name.

    The last path segment comes from ``monsterurl.get_monster()``, converted
    to lower-case words separated by dashes.

    :param app: application object with a ``config`` mapping
    :param vol_id: identifier of the volume the resource belongs to
    :param resource_type: path segment naming the kind of resource
    :returns: ``<protocol>://<server><app path>/<vol_id>/<resource_type>/<name>``
    """
    settings = app.config
    scheme = settings.get('PROTOCOL', 'http')
    host = settings.get('SERVER_NAME', 'localhost:8008')
    prefix = settings.get('APP_PATH', '')

    # Put a dash in front of every capitalised word, in two passes ...
    name = monsterurl.get_monster()
    name = re.sub('(.)([A-Z][a-z]+)', r'\1-\2', name)
    name = re.sub('([a-z0-9])([A-Z])', r'\1-\2', name)
    # ... then collapse doubled dashes and lower-case the result.
    slug = name.replace('--', '-').lower()

    return f'{scheme}://{host}{prefix}/{vol_id}/{resource_type}/{slug}'

45 changes: 45 additions & 0 deletions example/iiif-prezi/gbooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import copy
from pathlib import Path

import lxml.etree as etree

from common import make_id, MANIFEST_TEMPLATE, CANVAS_TEMPLATE


def make_manifest(app, vol_id):
    """Build a IIIF Presentation 2 manifest for a volume from its hOCR file.

    The volume is read from ``<GOOGLE1000_PATH>/Volume_<n>.hocr``, where
    ``<n>`` is the part of *vol_id* after the first colon. ``DC.*`` meta
    tags become manifest metadata and every ``ocr_page`` div becomes a
    canvas.

    :param app: application object; must offer a ``config`` mapping and a
                ``url_for(name, **params)`` method with the routes
                ``get_manifest`` and ``search``
    :param vol_id: volume identifier of the form ``<prefix>:<n>``
    :returns: the manifest as a plain ``dict``
    """
    base_dir = Path(app.config.get('GOOGLE1000_PATH', '../data/google1000'))
    hocr_path = base_dir / f'Volume_{vol_id.split(":")[1]}.hocr'
    protocol = app.config.get('PROTOCOL', 'http')
    location = app.config.get('SERVER_NAME', 'localhost:8008')
    app_path = app.config.get('APP_PATH', '')
    image_api_base = app.config.get('IMAGE_API_BASE', 'http://localhost:8080')
    base_url = f'{protocol}://{location}{app_path}'
    # Router-generated paths may carry a leading slash; strip it so that
    # both URLs are joined with exactly one slash.
    manifest_path = app.url_for('get_manifest', volume_id=vol_id).lstrip('/')
    search_path = app.url_for('search', doc_id=vol_id).lstrip('/')

    manifest = copy.deepcopy(MANIFEST_TEMPLATE)
    manifest['@id'] = f'{base_url}/{manifest_path}'
    manifest['service']['@id'] = f'{base_url}/{search_path}'
    manifest['sequences'][0]['@id'] = make_id(app, vol_id, 'sequence')
    tree = etree.parse(str(hocr_path))

    # Dublin Core meta tags, keyed by their name without the 'DC.' prefix
    metadata = {
        meta_elem.attrib['name'][3:]: meta_elem.attrib['content']
        for meta_elem in tree.findall('.//meta')
        if meta_elem.attrib.get('name', '').startswith('DC.')}
    manifest['label'] = metadata.get('title', vol_id)
    # Presentation API 2 metadata entries are keyed 'label' and 'value'
    manifest['metadata'] = [{'label': k, 'value': v} for k, v in metadata.items()]

    canvases = manifest['sequences'][0]['canvases']
    for page_elem in tree.findall('.//div[@class="ocr_page"]'):
        page_id = page_elem.attrib['id']
        # The number after the last underscore of the page id is 1-based;
        # the image identifiers use its 0-based, four-digit form.
        page_idx = int(page_id.split('_')[-1]) - 1
        image_url = f'{image_api_base}/{vol_id}_{page_idx:04}'
        # The title must consist of one keyword followed by four integers
        # (presumably 'bbox x0 y0 x1 y1' -- confirm); the last two are used
        # as the page dimensions.
        _, _, width, height = (int(x) for x in page_elem.attrib['title'].split(' ')[1:])

        canvas = copy.deepcopy(CANVAS_TEMPLATE)
        canvas['@id'] = f'{base_url}/{vol_id}/canvas/{page_id}'
        canvas['width'] = width
        canvas['height'] = height
        image = canvas['images'][0]
        image['on'] = canvas['@id']
        image['resource']['width'] = width
        image['resource']['height'] = height
        image['resource']['@id'] = f'{image_url}/full/full/0/default.jpg'
        image['resource']['service']['@id'] = image_url
        canvases.append(canvas)
    return manifest
Loading

0 comments on commit 86214a9

Please sign in to comment.