Skip to content

Commit

Permalink
Merge branch 'dataset' of github.com:EMMC-ASBL/tripper into dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jesper-friis committed Nov 18, 2024
2 parents d20295b + 6373e73 commit b4a205b
Show file tree
Hide file tree
Showing 17 changed files with 926 additions and 242 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cd_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
jobs:
build:
name: External
uses: SINTEF/ci-cd/.github/workflows/cd_release.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/cd_release.yml@v2.8.3
if: github.repository == 'EMMC-ASBL/tripper' && startsWith(github.ref, 'refs/tags/v')
with:
# General
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_automerge_dependency_prs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
jobs:
update-dependencies-branch:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_automerge_prs.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_automerge_prs.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL' && ( ( startsWith(github.event.pull_request.head.ref, 'dependabot/') && github.actor == 'dependabot[bot]' ) || ( github.event.pull_request.head.ref == 'ci/update-pyproject' && github.actor == 'TEAM4-0' ) )
secrets:
PAT: ${{ secrets.RELEASE_PAT }}
2 changes: 1 addition & 1 deletion .github/workflows/ci_cd_updated_main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
jobs:
update-deps-branch-and-docs:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_cd_updated_default_branch.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_cd_updated_default_branch.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL'
with:
# General
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_check_dependencies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
check-dependencies:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_check_pyproject_dependencies.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_check_pyproject_dependencies.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL'
with:
git_username: "TEAM 4.0[bot]"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:
jobs:
basic-tests:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_tests.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_tests.yml@v2.8.3
with:
## General settings:
install_extras: "[dev]"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_update_dependencies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
create-collected-pr:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_update_dependencies.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_update_dependencies.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL'
with:
git_username: "TEAM 4.0[bot]"
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ repos:
files: ^tripper/.*$

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.2
rev: v1.13.0
hooks:
- id: mypy
exclude: ^tests/.*$
Expand All @@ -63,7 +63,7 @@ repos:
- "pydantic"

- repo: https://github.com/SINTEF/ci-cd
rev: v2.8.2
rev: v2.8.3
hooks:
- id: docs-api-reference
args:
Expand Down
488 changes: 487 additions & 1 deletion docs/figs/dataset.drawio

Large diffs are not rendered by default.

175 changes: 130 additions & 45 deletions tests/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def test_get_context():
assert "@version" in context
assert len(context) > 20

# Check for consistency between context online and on disk
online_context = get_context(fromfile=False)
assert online_context == context


def test_get_prefixes():
"""Test get_prefixes()."""
Expand All @@ -46,6 +50,7 @@ def test_get_shortnames():
"prefixes",
"configuration",
"statements",
"mappings",
"@type",
)

Expand All @@ -57,29 +62,6 @@ def test_get_shortnames():
assert k.rsplit("#", 1)[-1].rsplit("/", 1)[-1] == v


# def test_expand_prefixes():
# """Test expand_prefixes()."""
# from tripper import DCTERMS, EMMO, OTEIO
# from tripper.dataset.dataset import expand_prefixes, get_prefixes
#
# prefixes = get_prefixes()
# d = {
# "a": "oteio:Parser",
# "b": [
# "emmo:Atom",
# {
# "Z": "emmo:AtomicNumber",
# "v": "dcterms:a/b",
# },
# ],
# }
# expand_prefixes(d, prefixes)
# assert d["a"] == OTEIO.Parser
# assert d["b"][0] == EMMO.Atom
# assert d["b"][1]["Z"] == EMMO.AtomicNumber
# assert d["b"][1]["v"] == DCTERMS["a/b"]


def test_add():
"""Test help-function add()."""
from tripper.dataset.dataset import add
Expand Down Expand Up @@ -125,8 +107,9 @@ def test_expand_iri():
# if True:
def test_save_and_load():
"""Test save_datadoc() and load()."""
# pylint: disable=too-many-statements

from tripper import CHAMEO, DCAT, OTEIO, Triplestore
from tripper import CHAMEO, DCAT, DCTERMS, OTEIO, Triplestore
from tripper.dataset import (
list_dataset_iris,
load,
Expand Down Expand Up @@ -155,6 +138,9 @@ def test_save_and_load():
assert d.inSeries == SEMDATA["SEM_cement_batch2/77600-23-001"]
assert d.distribution.mediaType == "image/tiff"

assert not load_dict(ts, "non-existing")
assert not load_dict(ts, "non-existing", use_sparql=True)

# Test load using SPARQL - this should give the same result as above
d2 = load_dict(ts, iri, use_sparql=True)
assert d2 == d
Expand All @@ -168,53 +154,124 @@ def test_save_and_load():
assert parser.parserType == "application/vnd.dlite-parse"
assert parser == d.distribution.parser

# Add generator to distribution (in KB)
GEN = ts.namespaces["gen"]
ts.add((d.distribution["@id"], OTEIO.generator, GEN.sem_hitachi))

# Test saving a generator and add it to the distribution
GEN = ts.bind("gen", "http://sintef.no/dlite/generator#")
generator = {
"@id": GEN.sem_hitachi,
"generatorType": "application/vnd.dlite-generate",
"configuration": {"driver": "hitachi"},
}
save_dict(ts, "generator", generator)
ts.add((d.distribution["@id"], OTEIO.generator, generator["@id"]))
dist = load_dict(ts, d.distribution["@id"])
assert dist.generator["@id"] == GEN.sem_hitachi
assert dist.generator["@type"] == OTEIO.Generator
assert dist.generator.generatorType == "application/vnd.dlite-generate"

# Test save dict
save_dict(
ts,
"distribution",
{"@id": SEMDATA.newdistr, "format": "txt"},
prefixes={"echem": "https://w3id.org/emmo/domain/electrochemistry"},
)
newdistr = load_dict(ts, SEMDATA.newdistr)
assert newdistr["@type"] == DCAT.Distribution
assert newdistr.format == "txt"

# Test load dataset (this downloads an actual image from github)
data = load(ts, iri)
assert len(data) == 53502

# Test load updated distribution
dd = load_dict(ts, iri)
assert dd != d # we have added a generator
assert dd.distribution.generator == load_dict(ts, generator["@id"])
assert dd.distribution.generator == load_dict(ts, GEN.sem_hitachi)
del dd.distribution["generator"]
assert dd == d

# Test save dataset
# Test save dataset with anonymous distribution
newfile = outputdir / "newimage.tiff"
newfile.unlink(missing_ok=True)
distribution = {
"@id": SEMDATA.newimage,
"@type": SEM.SimImage,
"downloadURL": f"file:{newfile}",
}
buf = b"some bytes..."
save(ts, buf, distribution=distribution)
save(
ts,
buf,
dataset={
"@id": SEMDATA.newimage,
"@type": SEM.SEMImage,
DCTERMS.title: "New SEM image",
},
distribution={"downloadURL": f"file:{newfile}"},
)
assert newfile.exists()
assert newfile.stat().st_size == len(buf)

# Test load new distribution
# dnew = load_dict(ts, SEMDATA.newimage)
newimage = load_dict(ts, SEMDATA.newimage)
assert newimage["@id"] == SEMDATA.newimage
assert DCAT.Dataset in newimage["@type"]
assert SEM.SEMImage in newimage["@type"]
assert newimage.distribution["@id"].startswith("_:")
assert newimage.distribution["@type"] == DCAT.Distribution
assert newimage.distribution.downloadURL == f"file:{newfile}"

# Test save dataset with named distribution
newfile2 = outputdir / "newimage.png"
newfile2.unlink(missing_ok=True)
save(
ts,
buf,
dataset=SEMDATA.newimage2,
distribution={
"@id": SEMDATA.newdistr2,
"downloadURL": f"file:{newfile2}",
"mediaType": "image/png",
"generator": GEN.sem_hitachi,
"parser": PARSER.sem_hitachi,
},
)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)
newimage2 = load_dict(ts, SEMDATA.newimage2)
assert newimage2["@id"] == SEMDATA.newimage2
assert newimage2["@type"] == DCAT.Dataset
assert newimage2.distribution["@id"] == SEMDATA.newdistr2
assert newimage2.distribution["@type"] == DCAT.Distribution
assert newimage2.distribution.downloadURL == f"file:{newfile2}"

# Test save anonymous dataset with existing distribution
newfile2.unlink(missing_ok=True)
save(ts, buf, distribution=SEMDATA.newdistr2)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)

# Test save existing dataset with anonymous distribution
newfile2.unlink(missing_ok=True)
save(ts, buf, dataset=SEMDATA.newimage2)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)

# Test save new dataset with reference to existing distribution
newfile2.unlink(missing_ok=True)
save(
ts,
buf,
dataset={
"@id": SEMDATA.newimage3,
"title": "A dataset with no default distribution",
"distribution": SEMDATA.newdistr2,
},
generator=GEN.sem_hitachi,
)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)

# Test searching the triplestore
SAMPLE = ts.namespaces["sample"]
assert set(list_dataset_iris(ts)) == {
datasets = list_dataset_iris(ts)
named_datasets = {
SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
SEMDATA["SEM_cement_batch2/77600-23-001"],
SEMDATA["SEM_cement_batch2"],
SAMPLE["SEM_cement_batch2/77600-23-001"],
SEMDATA.newimage,
SEMDATA.newimage2,
}
assert not named_datasets.difference(datasets)
assert set(list_dataset_iris(ts, creator="Sigurd Wenner")) == {
SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
SEMDATA["SEM_cement_batch2/77600-23-001"],
Expand All @@ -225,6 +282,34 @@ def test_save_and_load():
}


# if True:
def test_pipeline():
"""Test creating OTEAPI pipeline."""
from tripper import Triplestore
from tripper.dataset import get_partial_pipeline, save_datadoc

otelib = pytest.importorskip("otelib")

# Prepare triplestore
ts = Triplestore("rdflib")
save_datadoc(ts, inputdir / "semdata.yaml")

SEMDATA = ts.namespaces["semdata"]
GEN = ts.namespaces["gen"]

client = otelib.OTEClient("python")
iri = SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"]
parse = get_partial_pipeline(ts, client, iri, parser=True)
generate = get_partial_pipeline(ts, client, iri, generator=GEN.sem_hitachi)

# Entity-service doesn't work, so we skip the generate part for now...
# pipeline = parse >> generate
assert generate
pipeline = parse

pipeline.get()


def test_fuseki():
"""Test save and load dataset with Fuseki."""
import os
Expand Down
48 changes: 0 additions & 48 deletions tests/input/datasets.yaml

This file was deleted.

10 changes: 10 additions & 0 deletions tests/input/semdata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ prefixes:
mat: https://he-matchmaker.eu/material/
dm: http://onto-ns.com/meta/characterisation/0.1/SEMImage#
parser: http://sintef.no/dlite/parser#
gen: http://sintef.no/dlite/generator#


# List of documented datasets
Expand All @@ -21,6 +22,7 @@ datasets:

datamodel: http://onto-ns.com/meta/matchmaker/0.1/SEMImage
#datamodel: http://onto-ns.com/meta/characterisation/0.1/SEMImage
datamodelStorage: https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml
#mappingURL: https://raw.githubusercontent.com/HEU-MatCHMaker/DataDocumentation/refs/heads/master/SEM/datamodels/SEMImage.ttl

# Contextual documentation of the dataset
Expand Down Expand Up @@ -68,3 +70,11 @@ parsers:
parserType: application/vnd.dlite-parse
configuration:
driver: hitachi


generators:
- "@id": gen:sem_hitachi
"@type": oteio:Generator
generatorType: application/vnd.dlite-generate
configuration:
driver: hitachi
Loading

0 comments on commit b4a205b

Please sign in to comment.