Omeka import (#65)

* feat: remove old data and metadata

* feat: enhance gh action and config-metadata

* fix: artifact handling added

* fix: path fixed

* feat: download all objects

* fix: script fixed

* fix: multiple added

* fix: types and original url fixed

* fix: removed original url for parent

* fix: remove omeka_import branch

* fix: settings added

* fix: fix fullscreen

---------
maehr authored Mar 11, 2024
1 parent 6d889e3 commit ce8de23
Showing 50 changed files with 203 additions and 671 deletions.
54 changes: 33 additions & 21 deletions .github/workflows/jekyll.yml
@@ -1,60 +1,72 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# Sample workflow for building and deploying a Jekyll site to GitHub Pages
name: Deploy Jekyll site to Pages
name: Deploy Jekyll site with Data Processing

on:
  # Runs on pushes targeting the default branch
  push:
    branches:
      - $default-branch

  # Allows you to run this workflow manually from the Actions tab
      - main
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

env:
  OMEKA_API_URL: ${{ secrets.OMEKA_API_URL }}
  KEY_IDENTITY: ${{ secrets.KEY_IDENTITY }}
  KEY_CREDENTIAL: ${{ secrets.KEY_CREDENTIAL }}
  ITEM_SET_ID: ${{ secrets.ITEM_SET_ID }}

jobs:
  # Build job
  data-processing:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - run: pip install pandas requests
      - name: Process Data
        run: python .github/workflows/process_data.py
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: sgb-metadata
          path: _data/sgb-metadata.csv

  build:
    needs: data-processing
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download artifact
        uses: actions/download-artifact@v3
        with:
          name: sgb-metadata
          path: _data
      - name: Setup Ruby
        uses: ruby/setup-ruby@8575951200e472d5f2d95c625da0c7bec8217c42 # v1.161.0
        with:
          bundler-cache: true # runs 'bundle install' and caches installed gems automatically
          cache-version: 0 # Increment this number if you need to re-download cached gems
          bundler-cache: true
          cache-version: 0
      - name: Setup Pages
        id: pages
        uses: actions/configure-pages@v4
      - name: Build with Jekyll
        # Outputs to the './_site' directory by default
        run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}"
        env:
          JEKYLL_ENV: production
      - name: Upload artifact
        # Automatically uploads an artifact from the './_site' directory by default
        uses: actions/upload-pages-artifact@v3

  # Deployment job
  deploy:
    runs-on: ubuntu-latest
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
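
The workflow now splits the pipeline into a data-processing job that generates _data/sgb-metadata.csv and uploads it as an artifact, and a build job that downloads it before running Jekyll. For debugging outside of Actions, the processing step can be reproduced locally along the lines of the sketch below; this is illustrative only, and the endpoint, keys, and item-set ID are placeholders, not values from this repository.

# Hypothetical local run of the data-processing step (all values are placeholders).
# The script reads its configuration from the same four variables that the env
# block above passes in from repository secrets.
import os
import subprocess

os.environ.update({
    "OMEKA_API_URL": "https://omeka.example.org/api/",  # trailing slash needed; the script appends "items"
    "KEY_IDENTITY": "placeholder",
    "KEY_CREDENTIAL": "placeholder",
    "ITEM_SET_ID": "1234",
})
os.makedirs("_data", exist_ok=True)  # the script writes _data/sgb-metadata.csv
subprocess.run(["python", ".github/workflows/process_data.py"], check=True)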
155 changes: 155 additions & 0 deletions .github/workflows/process_data.py
@@ -0,0 +1,155 @@
import os
import pandas as pd
import requests
from urllib.parse import urlparse, parse_qs

OMEKA_API_URL = os.environ.get("OMEKA_API_URL")
KEY_IDENTITY = os.environ.get("KEY_IDENTITY")
KEY_CREDENTIAL = os.environ.get("KEY_CREDENTIAL")
ITEM_SET_ID = os.environ.get("ITEM_SET_ID")


def get_items_from_collection(collection_id):
    url = OMEKA_API_URL + "items"
    all_items = []
    params = {
        "item_set_id": collection_id,
        "key_identity": KEY_IDENTITY,
        "key_credential": KEY_CREDENTIAL,
    }

    while True:
        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        # Add the items from the current page to our list
        all_items.extend(response.json())

        # Check if there is a 'next' page
        links = requests.utils.parse_header_links(response.headers.get("Link", ""))
        next_link = [link for link in links if link["rel"] == "next"]
        if not next_link:
            break

        # Update the URL for the next request
        next_url = next_link[0]["url"]
        url_parsed = urlparse(next_url)
        next_params = parse_qs(url_parsed.query)
        params.update(next_params)

    return all_items


def get_media(item_id):
    url = OMEKA_API_URL + "media/" + str(item_id)
    params = {"key_identity": KEY_IDENTITY, "key_credential": KEY_CREDENTIAL}
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None
    return response.json()


def media_type(media_list, index=None):
    if not media_list:
        return "record"
    if len(media_list) > 1 and index is None:
        media_types = [media.get("o:media_type") for media in media_list]
        if all([media_type.startswith("image") for media_type in media_types]):
            return "multiple"
        return "compound_object"
    media = media_list[index] if index is not None else media_list[0]
    media_type = media.get("o:media_type")
    if media_type.startswith("image"):
        return "image"
    if media_type.startswith("audio"):
        return "audio"
    if media_type.startswith("video"):
        return "video"
    if media_type.startswith("application/pdf"):
        return "pdf"
    return "record"


def map_columns(item_set):
    return_list = []
    for data in item_set:
        media_list = [
            get_media(media_item.get("o:id")) for media_item in data.get("o:media", [])
        ]
        type = media_type(media_list)
        media = media_list[0] if media_list else {}
        media_data = create_media_data_dict(data, media, type)
        return_list.append(media_data)
        if len(media_list) > 1:
            for index, media in enumerate(media_list):
                type_child = media_type(media_list, index)
                media_data = create_media_data_dict(data, media, type_child, index)
                return_list.append(media_data)
    return return_list


def create_media_data_dict(data, media, type, index=None):
    media_id_suffix = f"_{index}" if index is not None else ""
    media_url = (
        media.get("thumbnail_display_urls", {}).get("large")
        if media
        and media.get("thumbnail_display_urls", {}).get("large", "").startswith("http")
        else None
    )
    media_original_url = (
        media.get("o:original_url")
        if media
        and (len(data.get("o:media", [])) > 1 and index is not None)
        and media.get("o:original_url", "").startswith("http")
        and media.get("o:is_public")
        else None
    )
    media_title = (
        media.get("o:title")
        if index is not None
        else data.get("dcterms:title", [{}])[0].get("@value")
    )

    return {
        "objectid": data.get("dcterms:identifier", [{}])[0].get("@value").lower()
        + media_id_suffix,
        "parentid": None
        if index is None
        else data.get("dcterms:identifier", [{}])[0].get("@value").lower(),
        "title": media_title,
        "creator": data.get("dcterms:creator", [{}])[0].get("@value"),
        "date": data.get("dcterms:date", [{}])[0].get("@value"),
        "era": data.get("dcterms:temporal", [{}])[0].get("@value"),
        "description": data.get("dcterms:description", [{}])[0].get("@value"),
        "subject": "; ".join(
            [item.get("@value", "") for item in data.get("dcterms:subject", [])]
        ),
        "publisher": data.get("dcterms:publisher", [{}])[0].get("@value"),
        "source": data.get("dcterms:source", [{}])[0].get("@value"),
        "relation": data.get("dcterms:relation", [{}])[0].get("@value"),
        "hasVersion": data.get("dcterms:hasVersion", [{}])[0].get("@value"),
        "type": data.get("dcterms:type", [{}])[0].get("@id"),
        "format": data.get("dcterms:format", [{}])[0].get("@value"),
        "extent": data.get("dcterms:extent", [{}])[0].get("@value"),
        "language": data.get("dcterms:language", [{}])[0].get("o:label"),
        "rights": "; ".join(
            [item.get("@value", "") for item in data.get("dcterms:rights", [])]
        ),
        "license": data.get("dcterms:license", [{}])[0].get("@value"),
        "isPartOf": data.get("dcterms:isPartOf", [{}])[0].get("@value"),
        "isReferencedBy": data.get("dcterms:isReferencedBy", [{}])[0].get("@value"),
        "display_template": type,
        "object_location": media_original_url,
        "image_small": media_url,
        "image_thumb": media_url,
        "image_alt_text": None,
        "object_transcript": None,
    }


item_set = get_items_from_collection(ITEM_SET_ID)
item_set_mapped = map_columns(item_set)
item_set_mapped_df = pd.DataFrame(item_set_mapped)
item_set_mapped_df.to_csv("_data/sgb-metadata.csv", index=False)
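
To see how the script assigns display templates, media_type() can be exercised with stubbed media records. This is an illustrative sketch, not part of the commit; it assumes the definitions above are in scope (for example, pasted into the same interpreter session), and the media dicts are trimmed to the single key the function inspects.

# Hypothetical media lists, reduced to the "o:media_type" key that media_type() checks.
images_only = [{"o:media_type": "image/jpeg"}, {"o:media_type": "image/png"}]
mixed = [{"o:media_type": "image/jpeg"}, {"o:media_type": "application/pdf"}]

print(media_type(images_only))     # multiple        -> parent row for an all-image item
print(media_type(mixed))           # compound_object -> parent row for mixed attachments
print(media_type(images_only, 1))  # image           -> template for the child row at index 1
print(media_type([]))              # record          -> item without any attached media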
21 changes: 10 additions & 11 deletions _data/config-metadata.csv
@@ -1,21 +1,20 @@
field,display_name,browse_link,external_link,dc_map,schema_map
objectid,Identifikator,,,DCTERMS.identifier,identifier
title,Titel,,,DCTERMS.title,headline
subject,Thema,true,,DCTERMS.subject,keywords
description,Beschreibung,,,DCTERMS.description,description
creator,Ersteller:in,,,DCTERMS.creator,creator
publisher,Rechteinhaber:in,true,,DCTERMS.publisher,publisher
creator,Ersteller*in,,,DCTERMS.creator,creator
date,Datum,,,DCTERMS.date,dateCreated
era,Epoche,true,,DCTERMS.temporal,
description,Beschreibung,,,DCTERMS.description,description
subject,Thema,true,,DCTERMS.subject,keywords
publisher,Rechteinhaber*in,true,,DCTERMS.publisher,publisher
source,Quelle,,,DCTERMS.source,
relation,Link,,true,DCTERMS.relation,
hasVersion,IIIF Manifest,,true,DCTERMS.hasVersion,version
type,Typ,,,DCTERMS.type,
format,Format,,,DCTERMS.format,encodingFormat
extent,Umfang,,,DCTERMS.extent,
source,Quelle,,,DCTERMS.source,
extent,Auflösung,,,DCTERMS.extent,
language,Sprache,true,,DCTERMS.language,inLanguage
relation,Zugeordnete Ressource,,,,
rights,Rechte,,,DCTERMS.rights,usageInfo
license,Lizenz,,true,DCTERMS.license,license
iiif,IIIF-Manifest,,,,
hasVersion,Version,,,DCTERMS.hasVersion,version
isPartOf,Teil von,true,,DCTERMS.isPartOf,isPartOf
isReferencedBy,Referenziert durch,,,DCTERMS.isReferencedBy,
isReferencedBy,Abbildung,,,DCTERMS.isReferencedBy,
isPartOf,Band,,,DCTERMS.isPartOf,isPartOf
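
config-metadata.csv lists the fields CollectionBuilder shows on item pages together with their Dublin Core and schema.org mappings, so each field named here is expected to match a column in the generated sgb-metadata.csv. A small consistency check, illustrative only and not part of the commit, with paths taken from the file headers above:

# Illustrative check: every configured field should be a column in the CSV
# written by process_data.py. Paths follow the repository layout shown above.
import pandas as pd

config = pd.read_csv("_data/config-metadata.csv")
metadata = pd.read_csv("_data/sgb-metadata.csv")

missing = [field for field in config["field"] if field not in metadata.columns]
print("fields missing from sgb-metadata.csv:", missing or "none")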
