Skip to content

Commit

Permalink
Merge branch 'dataset' of github.com:EMMC-ASBL/tripper into dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jesper-friis committed Nov 18, 2024
2 parents d20295b + 6373e73 commit b4a205b
Show file tree
Hide file tree
Showing 17 changed files with 926 additions and 242 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cd_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
jobs:
build:
name: External
uses: SINTEF/ci-cd/.github/workflows/cd_release.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/cd_release.yml@v2.8.3
if: github.repository == 'EMMC-ASBL/tripper' && startsWith(github.ref, 'refs/tags/v')
with:
# General
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_automerge_dependency_prs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
jobs:
update-dependencies-branch:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_automerge_prs.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_automerge_prs.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL' && ( ( startsWith(github.event.pull_request.head.ref, 'dependabot/') && github.actor == 'dependabot[bot]' ) || ( github.event.pull_request.head.ref == 'ci/update-pyproject' && github.actor == 'TEAM4-0' ) )
secrets:
PAT: ${{ secrets.RELEASE_PAT }}
2 changes: 1 addition & 1 deletion .github/workflows/ci_cd_updated_main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
jobs:
update-deps-branch-and-docs:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_cd_updated_default_branch.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_cd_updated_default_branch.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL'
with:
# General
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_check_dependencies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
check-dependencies:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_check_pyproject_dependencies.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_check_pyproject_dependencies.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL'
with:
git_username: "TEAM 4.0[bot]"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:
jobs:
basic-tests:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_tests.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_tests.yml@v2.8.3
with:
## General settings:
install_extras: "[dev]"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_update_dependencies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
create-collected-pr:
name: External
uses: SINTEF/ci-cd/.github/workflows/ci_update_dependencies.yml@v2.8.2
uses: SINTEF/ci-cd/.github/workflows/ci_update_dependencies.yml@v2.8.3
if: github.repository_owner == 'EMMC-ASBL'
with:
git_username: "TEAM 4.0[bot]"
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ repos:
files: ^tripper/.*$

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.2
rev: v1.13.0
hooks:
- id: mypy
exclude: ^tests/.*$
Expand All @@ -63,7 +63,7 @@ repos:
- "pydantic"

- repo: https://github.com/SINTEF/ci-cd
rev: v2.8.2
rev: v2.8.3
hooks:
- id: docs-api-reference
args:
Expand Down
488 changes: 487 additions & 1 deletion docs/figs/dataset.drawio

Large diffs are not rendered by default.

175 changes: 130 additions & 45 deletions tests/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def test_get_context():
assert "@version" in context
assert len(context) > 20

# Check for consistency between context online and on disk
online_context = get_context(fromfile=False)
assert online_context == context


def test_get_prefixes():
"""Test get_prefixes()."""
Expand All @@ -46,6 +50,7 @@ def test_get_shortnames():
"prefixes",
"configuration",
"statements",
"mappings",
"@type",
)

Expand All @@ -57,29 +62,6 @@ def test_get_shortnames():
assert k.rsplit("#", 1)[-1].rsplit("/", 1)[-1] == v


# def test_expand_prefixes():
# """Test expand_prefixes()."""
# from tripper import DCTERMS, EMMO, OTEIO
# from tripper.dataset.dataset import expand_prefixes, get_prefixes
#
# prefixes = get_prefixes()
# d = {
# "a": "oteio:Parser",
# "b": [
# "emmo:Atom",
# {
# "Z": "emmo:AtomicNumber",
# "v": "dcterms:a/b",
# },
# ],
# }
# expand_prefixes(d, prefixes)
# assert d["a"] == OTEIO.Parser
# assert d["b"][0] == EMMO.Atom
# assert d["b"][1]["Z"] == EMMO.AtomicNumber
# assert d["b"][1]["v"] == DCTERMS["a/b"]


def test_add():
"""Test help-function add()."""
from tripper.dataset.dataset import add
Expand Down Expand Up @@ -125,8 +107,9 @@ def test_expand_iri():
# if True:
def test_save_and_load():
"""Test save_datadoc() and load()."""
# pylint: disable=too-many-statements

from tripper import CHAMEO, DCAT, OTEIO, Triplestore
from tripper import CHAMEO, DCAT, DCTERMS, OTEIO, Triplestore
from tripper.dataset import (
list_dataset_iris,
load,
Expand Down Expand Up @@ -155,6 +138,9 @@ def test_save_and_load():
assert d.inSeries == SEMDATA["SEM_cement_batch2/77600-23-001"]
assert d.distribution.mediaType == "image/tiff"

assert not load_dict(ts, "non-existing")
assert not load_dict(ts, "non-existing", use_sparql=True)

# Test load using SPARQL - this should give the same result as above
d2 = load_dict(ts, iri, use_sparql=True)
assert d2 == d
Expand All @@ -168,53 +154,124 @@ def test_save_and_load():
assert parser.parserType == "application/vnd.dlite-parse"
assert parser == d.distribution.parser

# Add generator to distribution (in KB)
GEN = ts.namespaces["gen"]
ts.add((d.distribution["@id"], OTEIO.generator, GEN.sem_hitachi))

# Test saving a generator and add it to the distribution
GEN = ts.bind("gen", "http://sintef.no/dlite/generator#")
generator = {
"@id": GEN.sem_hitachi,
"generatorType": "application/vnd.dlite-generate",
"configuration": {"driver": "hitachi"},
}
save_dict(ts, "generator", generator)
ts.add((d.distribution["@id"], OTEIO.generator, generator["@id"]))
dist = load_dict(ts, d.distribution["@id"])
assert dist.generator["@id"] == GEN.sem_hitachi
assert dist.generator["@type"] == OTEIO.Generator
assert dist.generator.generatorType == "application/vnd.dlite-generate"

# Test save dict
save_dict(
ts,
"distribution",
{"@id": SEMDATA.newdistr, "format": "txt"},
prefixes={"echem": "https://w3id.org/emmo/domain/electrochemistry"},
)
newdistr = load_dict(ts, SEMDATA.newdistr)
assert newdistr["@type"] == DCAT.Distribution
assert newdistr.format == "txt"

# Test load dataset (this downloads an actual image from github)
data = load(ts, iri)
assert len(data) == 53502

# Test load updated distribution
dd = load_dict(ts, iri)
assert dd != d # we have added a generator
assert dd.distribution.generator == load_dict(ts, generator["@id"])
assert dd.distribution.generator == load_dict(ts, GEN.sem_hitachi)
del dd.distribution["generator"]
assert dd == d

# Test save dataset
# Test save dataset with anonymous distribution
newfile = outputdir / "newimage.tiff"
newfile.unlink(missing_ok=True)
distribution = {
"@id": SEMDATA.newimage,
"@type": SEM.SimImage,
"downloadURL": f"file:{newfile}",
}
buf = b"some bytes..."
save(ts, buf, distribution=distribution)
save(
ts,
buf,
dataset={
"@id": SEMDATA.newimage,
"@type": SEM.SEMImage,
DCTERMS.title: "New SEM image",
},
distribution={"downloadURL": f"file:{newfile}"},
)
assert newfile.exists()
assert newfile.stat().st_size == len(buf)

# Test load new distribution
# dnew = load_dict(ts, SEMDATA.newimage)
newimage = load_dict(ts, SEMDATA.newimage)
assert newimage["@id"] == SEMDATA.newimage
assert DCAT.Dataset in newimage["@type"]
assert SEM.SEMImage in newimage["@type"]
assert newimage.distribution["@id"].startswith("_:")
assert newimage.distribution["@type"] == DCAT.Distribution
assert newimage.distribution.downloadURL == f"file:{newfile}"

# Test save dataset with named distribution
newfile2 = outputdir / "newimage.png"
newfile2.unlink(missing_ok=True)
save(
ts,
buf,
dataset=SEMDATA.newimage2,
distribution={
"@id": SEMDATA.newdistr2,
"downloadURL": f"file:{newfile2}",
"mediaType": "image/png",
"generator": GEN.sem_hitachi,
"parser": PARSER.sem_hitachi,
},
)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)
newimage2 = load_dict(ts, SEMDATA.newimage2)
assert newimage2["@id"] == SEMDATA.newimage2
assert newimage2["@type"] == DCAT.Dataset
assert newimage2.distribution["@id"] == SEMDATA.newdistr2
assert newimage2.distribution["@type"] == DCAT.Distribution
assert newimage2.distribution.downloadURL == f"file:{newfile2}"

# Test save anonymous dataset with existing distribution
newfile2.unlink(missing_ok=True)
save(ts, buf, distribution=SEMDATA.newdistr2)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)

# Test save existing dataset with anonymous distribution
newfile2.unlink(missing_ok=True)
save(ts, buf, dataset=SEMDATA.newimage2)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)

# Test save new dataset with reference to existing distribution
newfile2.unlink(missing_ok=True)
save(
ts,
buf,
dataset={
"@id": SEMDATA.newimage3,
"title": "A dataset with no default distribution",
"distribution": SEMDATA.newdistr2,
},
generator=GEN.sem_hitachi,
)
assert newfile2.exists()
assert newfile2.stat().st_size == len(buf)

# Test searching the triplestore
SAMPLE = ts.namespaces["sample"]
assert set(list_dataset_iris(ts)) == {
datasets = list_dataset_iris(ts)
named_datasets = {
SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
SEMDATA["SEM_cement_batch2/77600-23-001"],
SEMDATA["SEM_cement_batch2"],
SAMPLE["SEM_cement_batch2/77600-23-001"],
SEMDATA.newimage,
SEMDATA.newimage2,
}
assert not named_datasets.difference(datasets)
assert set(list_dataset_iris(ts, creator="Sigurd Wenner")) == {
SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
SEMDATA["SEM_cement_batch2/77600-23-001"],
Expand All @@ -225,6 +282,34 @@ def test_save_and_load():
}


# if True:
def test_pipeline():
"""Test creating OTEAPI pipeline."""
from tripper import Triplestore
from tripper.dataset import get_partial_pipeline, save_datadoc

otelib = pytest.importorskip("otelib")

# Prepare triplestore
ts = Triplestore("rdflib")
save_datadoc(ts, inputdir / "semdata.yaml")

SEMDATA = ts.namespaces["semdata"]
GEN = ts.namespaces["gen"]

client = otelib.OTEClient("python")
iri = SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"]
parse = get_partial_pipeline(ts, client, iri, parser=True)
generate = get_partial_pipeline(ts, client, iri, generator=GEN.sem_hitachi)

# Entity-service doesn't work, so we skip the generate part for now...
# pipeline = parse >> generate
assert generate
pipeline = parse

pipeline.get()


def test_fuseki():
"""Test save and load dataset with Fuseki."""
import os
Expand Down
48 changes: 0 additions & 48 deletions tests/input/datasets.yaml

This file was deleted.

10 changes: 10 additions & 0 deletions tests/input/semdata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ prefixes:
mat: https://he-matchmaker.eu/material/
dm: http://onto-ns.com/meta/characterisation/0.1/SEMImage#
parser: http://sintef.no/dlite/parser#
gen: http://sintef.no/dlite/generator#


# List of documented datasets
Expand All @@ -21,6 +22,7 @@ datasets:

datamodel: http://onto-ns.com/meta/matchmaker/0.1/SEMImage
#datamodel: http://onto-ns.com/meta/characterisation/0.1/SEMImage
datamodelStorage: https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml
#mappingURL: https://raw.githubusercontent.com/HEU-MatCHMaker/DataDocumentation/refs/heads/master/SEM/datamodels/SEMImage.ttl

# Contextual documentation of the dataset
Expand Down Expand Up @@ -68,3 +70,11 @@ parsers:
parserType: application/vnd.dlite-parse
configuration:
driver: hitachi


generators:
- "@id": gen:sem_hitachi
"@type": oteio:Generator
generatorType: application/vnd.dlite-generate
configuration:
driver: hitachi
Loading

0 comments on commit b4a205b

Please sign in to comment.