Omeka import (#65)

* feat: remove old data and metadata

* feat: enhance gh action and config-metadata

* fix: artifact handling added

* fix: path fixed

* feat: download all objects

* fix: script fixed

* fix: multiple added

* fix: types and original url fixed

* fix: removed original url for parent

* fix: remove omeka_import branch

* fix: settings added

* fix: fix fullscreen

---------
maehr authored Mar 11, 2024
1 parent 6d889e3 commit ce8de23
Showing 50 changed files with 203 additions and 671 deletions.
54 changes: 33 additions & 21 deletions .github/workflows/jekyll.yml
@@ -1,60 +1,72 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# Sample workflow for building and deploying a Jekyll site to GitHub Pages
name: Deploy Jekyll site to Pages
name: Deploy Jekyll site with Data Processing

on:
  # Runs on pushes targeting the default branch
  push:
    branches:
      - $default-branch

  # Allows you to run this workflow manually from the Actions tab
      - main
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

env:
  OMEKA_API_URL: ${{ secrets.OMEKA_API_URL }}
  KEY_IDENTITY: ${{ secrets.KEY_IDENTITY }}
  KEY_CREDENTIAL: ${{ secrets.KEY_CREDENTIAL }}
  ITEM_SET_ID: ${{ secrets.ITEM_SET_ID }}

jobs:
  # Build job
  data-processing:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - run: pip install pandas requests
      - name: Process Data
        run: python .github/workflows/process_data.py
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: sgb-metadata
          path: _data/sgb-metadata.csv

  build:
    needs: data-processing
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download artifact
        uses: actions/download-artifact@v3
        with:
          name: sgb-metadata
          path: _data
      - name: Setup Ruby
        uses: ruby/setup-ruby@8575951200e472d5f2d95c625da0c7bec8217c42 # v1.161.0
        with:
          bundler-cache: true # runs 'bundle install' and caches installed gems automatically
          cache-version: 0 # Increment this number if you need to re-download cached gems
          bundler-cache: true
          cache-version: 0
      - name: Setup Pages
        id: pages
        uses: actions/configure-pages@v4
      - name: Build with Jekyll
        # Outputs to the './_site' directory by default
        run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}"
        env:
          JEKYLL_ENV: production
      - name: Upload artifact
        # Automatically uploads an artifact from the './_site' directory by default
        uses: actions/upload-pages-artifact@v3

  # Deployment job
  deploy:
    runs-on: ubuntu-latest
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
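
The workflow now splits the pipeline into a data-processing job that generates _data/sgb-metadata.csv and uploads it as an artifact, and a build job that downloads it before running Jekyll. For debugging outside of Actions, the processing step can be reproduced locally along the lines of the sketch below; this is illustrative only, and the endpoint, keys, and item-set ID are placeholders, not values from this repository.

# Hypothetical local run of the data-processing step (all values are placeholders).
# The script reads its configuration from the same four variables that the env
# block above passes in from repository secrets.
import os
import subprocess

os.environ.update({
    "OMEKA_API_URL": "https://omeka.example.org/api/",  # trailing slash needed; the script appends "items"
    "KEY_IDENTITY": "placeholder",
    "KEY_CREDENTIAL": "placeholder",
    "ITEM_SET_ID": "1234",
})
os.makedirs("_data", exist_ok=True)  # the script writes _data/sgb-metadata.csv
subprocess.run(["python", ".github/workflows/process_data.py"], check=True)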
155 changes: 155 additions & 0 deletions .github/workflows/process_data.py
@@ -0,0 +1,155 @@
import os
import pandas as pd
import requests
from urllib.parse import urlparse, parse_qs

OMEKA_API_URL = os.environ.get("OMEKA_API_URL")
KEY_IDENTITY = os.environ.get("KEY_IDENTITY")
KEY_CREDENTIAL = os.environ.get("KEY_CREDENTIAL")
ITEM_SET_ID = os.environ.get("ITEM_SET_ID")


def get_items_from_collection(collection_id):
    url = OMEKA_API_URL + "items"
    all_items = []
    params = {
        "item_set_id": collection_id,
        "key_identity": KEY_IDENTITY,
        "key_credential": KEY_CREDENTIAL,
    }

    while True:
        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        # Add the items from the current page to our list
        all_items.extend(response.json())

        # Check if there is a 'next' page
        links = requests.utils.parse_header_links(response.headers.get("Link", ""))
        next_link = [link for link in links if link["rel"] == "next"]
        if not next_link:
            break

        # Update the URL for the next request
        next_url = next_link[0]["url"]
        url_parsed = urlparse(next_url)
        next_params = parse_qs(url_parsed.query)
        params.update(next_params)

    return all_items


def get_media(item_id):
    url = OMEKA_API_URL + "media/" + str(item_id)
    params = {"key_identity": KEY_IDENTITY, "key_credential": KEY_CREDENTIAL}
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None
    return response.json()


def media_type(media_list, index=None):
    if not media_list:
        return "record"
    if len(media_list) > 1 and index is None:
        media_types = [media.get("o:media_type") for media in media_list]
        if all([media_type.startswith("image") for media_type in media_types]):
            return "multiple"
        return "compound_object"
    media = media_list[index] if index is not None else media_list[0]
    media_type = media.get("o:media_type")
    if media_type.startswith("image"):
        return "image"
    if media_type.startswith("audio"):
        return "audio"
    if media_type.startswith("video"):
        return "video"
    if media_type.startswith("application/pdf"):
        return "pdf"
    return "record"


def map_columns(item_set):
    return_list = []
    for data in item_set:
        media_list = [
            get_media(media_item.get("o:id")) for media_item in data.get("o:media", [])
        ]
        type = media_type(media_list)
        media = media_list[0] if media_list else {}
        media_data = create_media_data_dict(data, media, type)
        return_list.append(media_data)
        if len(media_list) > 1:
            for index, media in enumerate(media_list):
                type_child = media_type(media_list, index)
                media_data = create_media_data_dict(data, media, type_child, index)
                return_list.append(media_data)
    return return_list


def create_media_data_dict(data, media, type, index=None):
    media_id_suffix = f"_{index}" if index is not None else ""
    media_url = (
        media.get("thumbnail_display_urls", {}).get("large")
        if media
        and media.get("thumbnail_display_urls", {}).get("large", "").startswith("http")
        else None
    )
    media_original_url = (
        media.get("o:original_url")
        if media
        and (len(data.get("o:media", [])) > 1 and index is not None)
        and media.get("o:original_url", "").startswith("http")
        and media.get("o:is_public")
        else None
    )
    media_title = (
        media.get("o:title")
        if index is not None
        else data.get("dcterms:title", [{}])[0].get("@value")
    )

    return {
        "objectid": data.get("dcterms:identifier", [{}])[0].get("@value").lower()
        + media_id_suffix,
        "parentid": None
        if index is None
        else data.get("dcterms:identifier", [{}])[0].get("@value").lower(),
        "title": media_title,
        "creator": data.get("dcterms:creator", [{}])[0].get("@value"),
        "date": data.get("dcterms:date", [{}])[0].get("@value"),
        "era": data.get("dcterms:temporal", [{}])[0].get("@value"),
        "description": data.get("dcterms:description", [{}])[0].get("@value"),
        "subject": "; ".join(
            [item.get("@value", "") for item in data.get("dcterms:subject", [])]
        ),
        "publisher": data.get("dcterms:publisher", [{}])[0].get("@value"),
        "source": data.get("dcterms:source", [{}])[0].get("@value"),
        "relation": data.get("dcterms:relation", [{}])[0].get("@value"),
        "hasVersion": data.get("dcterms:hasVersion", [{}])[0].get("@value"),
        "type": data.get("dcterms:type", [{}])[0].get("@id"),
        "format": data.get("dcterms:format", [{}])[0].get("@value"),
        "extent": data.get("dcterms:extent", [{}])[0].get("@value"),
        "language": data.get("dcterms:language", [{}])[0].get("o:label"),
        "rights": "; ".join(
            [item.get("@value", "") for item in data.get("dcterms:rights", [])]
        ),
        "license": data.get("dcterms:license", [{}])[0].get("@value"),
        "isPartOf": data.get("dcterms:isPartOf", [{}])[0].get("@value"),
        "isReferencedBy": data.get("dcterms:isReferencedBy", [{}])[0].get("@value"),
        "display_template": type,
        "object_location": media_original_url,
        "image_small": media_url,
        "image_thumb": media_url,
        "image_alt_text": None,
        "object_transcript": None,
    }


item_set = get_items_from_collection(ITEM_SET_ID)
item_set_mapped = map_columns(item_set)
item_set_mapped_df = pd.DataFrame(item_set_mapped)
item_set_mapped_df.to_csv("_data/sgb-metadata.csv", index=False)
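
To see how the script assigns display templates, media_type() can be exercised with stubbed media records. This is an illustrative sketch, not part of the commit; it assumes the definitions above are in scope (for example, pasted into the same interpreter session), and the media dicts are trimmed to the single key the function inspects.

# Hypothetical media lists, reduced to the "o:media_type" key that media_type() checks.
images_only = [{"o:media_type": "image/jpeg"}, {"o:media_type": "image/png"}]
mixed = [{"o:media_type": "image/jpeg"}, {"o:media_type": "application/pdf"}]

print(media_type(images_only))     # multiple        -> parent row for an all-image item
print(media_type(mixed))           # compound_object -> parent row for mixed attachments
print(media_type(images_only, 1))  # image           -> template for the child row at index 1
print(media_type([]))              # record          -> item without any attached media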
21 changes: 10 additions & 11 deletions _data/config-metadata.csv
@@ -1,21 +1,20 @@
field,display_name,browse_link,external_link,dc_map,schema_map
objectid,Identifikator,,,DCTERMS.identifier,identifier
title,Titel,,,DCTERMS.title,headline
subject,Thema,true,,DCTERMS.subject,keywords
description,Beschreibung,,,DCTERMS.description,description
creator,Ersteller:in,,,DCTERMS.creator,creator
publisher,Rechteinhaber:in,true,,DCTERMS.publisher,publisher
creator,Ersteller*in,,,DCTERMS.creator,creator
date,Datum,,,DCTERMS.date,dateCreated
era,Epoche,true,,DCTERMS.temporal,
description,Beschreibung,,,DCTERMS.description,description
subject,Thema,true,,DCTERMS.subject,keywords
publisher,Rechteinhaber*in,true,,DCTERMS.publisher,publisher
source,Quelle,,,DCTERMS.source,
relation,Link,,true,DCTERMS.relation,
hasVersion,IIIF Manifest,,true,DCTERMS.hasVersion,version
type,Typ,,,DCTERMS.type,
format,Format,,,DCTERMS.format,encodingFormat
extent,Umfang,,,DCTERMS.extent,
source,Quelle,,,DCTERMS.source,
extent,Auflösung,,,DCTERMS.extent,
language,Sprache,true,,DCTERMS.language,inLanguage
relation,Zugeordnete Ressource,,,,
rights,Rechte,,,DCTERMS.rights,usageInfo
license,Lizenz,,true,DCTERMS.license,license
iiif,IIIF-Manifest,,,,
hasVersion,Version,,,DCTERMS.hasVersion,version
isPartOf,Teil von,true,,DCTERMS.isPartOf,isPartOf
isReferencedBy,Referenziert durch,,,DCTERMS.isReferencedBy,
isReferencedBy,Abbildung,,,DCTERMS.isReferencedBy,
isPartOf,Band,,,DCTERMS.isPartOf,isPartOf
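
config-metadata.csv lists the fields CollectionBuilder shows on item pages together with their Dublin Core and schema.org mappings, so each field named here is expected to match a column in the generated sgb-metadata.csv. A small consistency check, illustrative only and not part of the commit, with paths taken from the file headers above:

# Illustrative check: every configured field should be a column in the CSV
# written by process_data.py. Paths follow the repository layout shown above.
import pandas as pd

config = pd.read_csv("_data/config-metadata.csv")
metadata = pd.read_csv("_data/sgb-metadata.csv")

missing = [field for field in config["field"] if field not in metadata.columns]
print("fields missing from sgb-metadata.csv:", missing or "none")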
