diff --git a/docs/introduction.rst b/docs/introduction.rst index 06b93f4..c8fe220 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -27,10 +27,11 @@ Importing from alternative modeling frameworks See :ref:`importers` * OWL (but this only works for schema-style OWL) +* SHACL (in progress) * JSON-Schema * SQL DDL -In future other frameworks will be supported +In future other frameworks will be supported. Annotating schemas ------------------ diff --git a/docs/packages/importers.rst b/docs/packages/importers.rst index 0aa4546..bdeb1fa 100644 --- a/docs/packages/importers.rst +++ b/docs/packages/importers.rst @@ -42,6 +42,16 @@ Use robot to convert ahead of time: robot convert -i schemaorg.ttl -o schemaorg.ofn schemauto import-owl schemaorg.ofn +Importing from SHACL +-------------------- + +You can import from a SHACL shapes file. + +.. code-block:: + + schemauto import-shacl tests/resources/test_shacl_simple.ttl + + Importing from SQL ------------------ diff --git a/poetry.lock b/poetry.lock index dd714bc..3666b03 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "airium" @@ -3292,9 +3292,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3792,8 +3792,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4116,7 +4116,6 @@ description = "A pure Python implementation of the trie data structure." optional = false python-versions = "*" files = [ - {file = "PyTrie-0.4.0-py3-none-any.whl", hash = "sha256:f687c224ee8c66cda8e8628a903011b692635ffbb08d4b39c5f92b18eb78c950"}, {file = "PyTrie-0.4.0.tar.gz", hash = "sha256:8f4488f402d3465993fb6b6efa09866849ed8cda7903b50647b7d0342b805379"}, ] @@ -5095,7 +5094,7 @@ sphinx = ">=4.0" name = "sphinx-pdj-theme" version = "0.4.0" description = "A cool theme for sphinx documentation" -optional = false +optional = true python-versions = "*" files = [ {file = "sphinx-pdj-theme-0.4.0.tar.gz", hash = "sha256:4b86bfd8b8e20344db56aba13473f634286149fa0203d18e0437157f48c7e0fa"}, @@ -5167,7 +5166,7 @@ test = ["flake8", "mypy", "pytest"] name = "sphinxcontrib-mermaid" version = "0.9.2" description = "Mermaid diagrams in yours Sphinx powered docs" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "sphinxcontrib-mermaid-0.9.2.tar.gz", hash = "sha256:252ef13dd23164b28f16d8b0205cf184b9d8e2b714a302274d9f59eb708e77af"}, @@ -5959,10 +5958,9 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", 
"jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -docs = [] -mariadb = [] +docs = ["Sphinx", "sphinx-pdj-theme", "sphinxcontrib-mermaid"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "036cba73b6fd660157c70cb76be27a501017e8904b35c8d2ccb00d412bbba870" +content-hash = "9c29a704add4aaf15c228f9d6a81164390f060582bee85a89d266e2232c4b0ed" diff --git a/pyproject.toml b/pyproject.toml index 684e019..1f31453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,16 +54,19 @@ linkml-runtime = "^1.7.2" duckdb = "^0.10.1" numpy = "<2.0" +Sphinx = { version = ">=4.4.0", optional = true } +sphinx-pdj-theme = { version = ">=0.2.1", optional = true } +sphinx-click = ">=3.1.0" +sphinxcontrib-mermaid = { version = ">=0.9.2", optional = true } + [tool.poetry.dev-dependencies] pytest = ">=7.1.1" -Sphinx = ">=4.4.0" -sphinx-pdj-theme = ">=0.2.1" -sphinx-click = ">=3.1.0" -sphinxcontrib-mermaid = ">=0.9.2" myst-parser = "*" jupyter = ">=1.0.0" lxml = ">=4.9.1" +#mariadb = { version = "^1.3", optional = true } + [tool.poetry.group.llm.dependencies] llm = ">=0.12" @@ -82,7 +85,7 @@ extract-schema = "schema_automator.utils.schema_extractor:cli" [tool.poetry.extras] docs = ["Sphinx", "sphinx-pdj-theme", "sphinxcontrib-mermaid"] -mariadb = ["mariadb"] +#mariadb = ["mariadb"] [tool.codespell] # Ref: https://github.com/codespell-project/codespell#using-a-config-file diff --git a/schema_automator/cli.py b/schema_automator/cli.py index bd76320..8e29b40 100644 --- a/schema_automator/cli.py +++ b/schema_automator/cli.py @@ -497,6 +497,34 @@ def import_rdfs(rdfsfile, output, metamodel_mappings, **args): schema = sie.convert(rdfsfile, **args) write_schema(schema, output) +@main.command() +@click.argument('shaclfile') +@output_option +@schema_name_option +@click.option('--input-type', '-I', + default='turtle', + help="Input 
format, eg. turtle") +@click.option('--identifier', '-I', help="Slot to use as identifier") +@click.option('--model-uri', help="Model URI prefix") +@click.option('--metamodel-mappings', + help="Path to metamodel mappings YAML dictionary") +@click.option('--output', '-o', help="Path to saved yaml schema") +def import_shacl(shaclfile, output, metamodel_mappings, **args): + """ + Import an SHACL profile to LinkML + + Example: + + schemauto import-shacl mymodel.shacl.ttl -o mymodel.yaml + """ + mappings_obj = None + if metamodel_mappings: + with open(metamodel_mappings) as f: + mappings_obj = yaml.safe_load(f) + sie = ShaclImportEngine(initial_metamodel_mappings=mappings_obj) + schema = sie.convert(shaclfile, **args) + write_schema(schema, output) + @main.command() @click.argument('rdffile') @output_option diff --git a/schema_automator/importers/__init__.py b/schema_automator/importers/__init__.py index 2011d25..fa187ac 100644 --- a/schema_automator/importers/__init__.py +++ b/schema_automator/importers/__init__.py @@ -3,3 +3,4 @@ from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine from schema_automator.importers.frictionless_import_engine import FrictionlessImportEngine from schema_automator.importers.cadsr_import_engine import CADSRImportEngine +from schema_automator.importers.shacl_import_engine import ShaclImportEngine diff --git a/schema_automator/importers/shacl_import_engine.py b/schema_automator/importers/shacl_import_engine.py new file mode 100644 index 0000000..352c9fa --- /dev/null +++ b/schema_automator/importers/shacl_import_engine.py @@ -0,0 +1,241 @@ +from collections import defaultdict +import logging + +from dataclasses import dataclass +from typing import Dict, List, Any + +from rdflib import Graph, RDF, OWL, URIRef, RDFS, SKOS, SDO, Namespace + +from funowl import Literal + +from linkml.utils.schema_builder import SchemaBuilder +from linkml_runtime import SchemaView +from linkml_runtime.utils.formatutils import underscore 
+from linkml_runtime.utils.introspection import package_schemaview +from linkml_runtime.linkml_model import ( + SchemaDefinition, + SlotDefinition, + ClassDefinition, +) +from schema_automator.importers.import_engine import ImportEngine + +logger = logging.getLogger(__name__) + +HTTP_SDO = Namespace("http://schema.org/") + +DEFAULT_METAMODEL_MAPPINGS = { + "is_a": [RDFS.subClassOf, SKOS.broader], + "domain_of": [HTTP_SDO.domainIncludes, SDO.domainIncludes], + "rangeIncludes": [HTTP_SDO.rangeIncludes, SDO.rangeIncludes], + "exact_mappings": [OWL.sameAs, HTTP_SDO.sameAs], + ClassDefinition.__name__: [RDFS.Class, OWL.Class, SKOS.Concept], + SlotDefinition.__name__: [ + RDF.Property, + OWL.ObjectProperty, + OWL.DatatypeProperty, + OWL.AnnotationProperty, + ], +} + + +@dataclass +class ShaclImportEngine(ImportEngine): + """ + An ImportEngine that takes SHACL and converts it to a LinkML schema + """ + + mappings: dict = None + initial_metamodel_mappings: Dict[str, List[URIRef]] = None + metamodel_mappings: Dict[str, List[URIRef]] = None + reverse_metamodel_mappings: Dict[URIRef, List[str]] = None + include_unmapped_annotations = False + metamodel = None + metamodel_schemaview: SchemaView = None + classdef_slots: List[str] = None + + def __post_init__(self): + sv = package_schemaview("linkml_runtime.linkml_model.meta") + self.metamodel_schemaview = sv + self.metamodel = sv + self.metamodel_mappings = defaultdict(list) + self.reverse_metamodel_mappings = defaultdict(list) + for k, vs in DEFAULT_METAMODEL_MAPPINGS.items(): + self.metamodel_mappings[k].extend(vs) + for v in vs: + self.reverse_metamodel_mappings[v].append(k) + if self.initial_metamodel_mappings: + for k, vs in self.initial_metamodel_mappings.items(): + if not isinstance(vs, list): + vs = [vs] + self.metamodel_mappings[k].extend(vs) + for v in vs: + self.reverse_metamodel_mappings[URIRef(v)].append(k) + logging.info(f"Adding mapping {k} -> {v}") + for e in sv.all_elements().values(): + mappings = [] + for ms 
in sv.get_mappings(e.name, expand=True).values(): + for m in ms: + uri = URIRef(m) + mappings.append(uri) + self.reverse_metamodel_mappings[uri].append(e.name) + self.metamodel_mappings[e.name] = mappings + self.defclass_slots = [s.name for s in sv.class_induced_slots(ClassDefinition.class_name)] + + def convert( + self, + file: str, + name: str = None, + format="turtle", + default_prefix: str = None, + model_uri: str = None, + identifier: str = None, + **kwargs, + ) -> SchemaDefinition: + """ + Converts an shacl shapes file + + :param file: + :param name: + :param model_uri: + :param identifier: + :param kwargs: + :return: + """ + self.mappings = {} + g = Graph() + g.parse(file, format=format) + if name is not None and default_prefix is None: + default_prefix = name + if name is None: + name = default_prefix + if name is None: + name = "example" + sb = SchemaBuilder(name=name) + sb.add_defaults() + schema = sb.schema + for k, v in g.namespaces(): + if k == "schema" and v != "http://schema.org/": + continue + sb.add_prefix(k, v, replace_if_present=True) + if default_prefix is not None: + schema.default_prefix = default_prefix + if default_prefix not in schema.prefixes: + sb.add_prefix(default_prefix, model_uri, replace_if_present=True) + schema.id = schema.prefixes[default_prefix].prefix_reference + + cls_slots = defaultdict(list) + props = [] + for rdfs_property_metaclass in self._rdfs_metamodel_iri( + SlotDefinition.__name__ + ): + for p in g.subjects(RDF.type, rdfs_property_metaclass): + props.append(p) + # implicit properties + for metap in ( + self.reverse_metamodel_mappings["domain_of"] + + self.reverse_metamodel_mappings["rangeIncludes"] + ): + for p, _, _o in g.triples((None, metap, None)): + props.append(p) + for p in set(props): + sn = self.iri_to_name(p) + init_dict = self._dict_for_subject(g, p) + if "domain_of" in init_dict: + for x in init_dict["domain_of"]: + cls_slots[x].append(sn) + del init_dict["domain_of"] + if "rangeIncludes" in init_dict: + 
init_dict["any_of"] = [{"range": x} for x in init_dict["rangeIncludes"]] + del init_dict["rangeIncludes"] + slot = SlotDefinition(sn, **init_dict) + slot.slot_uri = str(p.n3(g.namespace_manager)) + sb.add_slot(slot) + + rdfs_classes = [] + for rdfs_class_metaclass in self._rdfs_metamodel_iri(ClassDefinition.__name__): + for s in g.subjects(RDF.type, rdfs_class_metaclass): + rdfs_classes.append(s) + # implicit classes + for metap in [RDFS.subClassOf]: + for s, _, o in g.triples((None, metap, None)): + rdfs_classes.append(s) + rdfs_classes.append(o) + for s in set(rdfs_classes): + cn = self.iri_to_name(s) + init_dict = self._dict_for_subject(g, s) + c = ClassDefinition(cn, **init_dict) + c.slots = cls_slots.get(cn, []) + c.class_uri = str(s.n3(g.namespace_manager)) + sb.add_class(c) + if identifier is not None: + id_slot = SlotDefinition(identifier, identifier=True, range="uriorcurie") + schema.slots[identifier] = id_slot + for c in schema.classes.values(): + if not c.is_a and not c.mixins: + if identifier not in c.slots: + c.slots.append(identifier) + return schema + + def _dict_for_subject(self, g: Graph, s: URIRef) -> Dict[str, Any]: + """ + Looks up triples for a subject and converts to dict using linkml keys. 
+ + :param g: + :param p: + :return: + """ + init_dict = {} + for pp, obj in g.predicate_objects(s): + if pp == RDF.type: + continue + metaslot_name = self._element_from_iri(pp) + logging.debug(f"Mapping {pp} -> {metaslot_name}") + if metaslot_name not in self.defclass_slots: + continue + if metaslot_name is None: + logging.warning(f"Not mapping {pp}") + continue + if metaslot_name == "name": + metaslot_name = "title" + metaslot = self.metamodel.get_slot(metaslot_name) + v = self._object_to_value(obj, metaslot=metaslot) + metaslot_name_safe = underscore(metaslot_name) + if not metaslot or metaslot.multivalued: + if metaslot_name_safe not in init_dict: + init_dict[metaslot_name_safe] = [] + init_dict[metaslot_name_safe].append(v) + else: + init_dict[metaslot_name_safe] = v + return init_dict + + def _rdfs_metamodel_iri(self, name: str) -> List[URIRef]: + return self.metamodel_mappings.get(name, []) + + def _element_from_iri(self, iri: URIRef) -> str: + r = self.reverse_metamodel_mappings.get(iri, []) + if len(r) > 0: + if len(r) > 1: + logger.debug(f"Multiple mappings for {iri}: {r}") + return r[0] + + def _object_to_value(self, obj: Any, metaslot: SlotDefinition = None) -> Any: + if isinstance(obj, URIRef): + if metaslot.range == "uriorcurie" or metaslot.range == "uri": + return str(obj) + return self.iri_to_name(obj) + if isinstance(obj, Literal): + return obj.value + return obj + + def iri_to_name(self, v: URIRef) -> str: + n = self._as_name(v) + if n != v: + self.mappings[n] = v + return n + + def _as_name(self, v: URIRef): + v = str(v) + for sep in ["#", "/", ":"]: + if sep in v: + return v.split(sep)[-1] + return v diff --git a/tests/__init__.py b/tests/__init__.py index b092ca8..ad4619d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,9 +1,6 @@ import os -import pprint ROOT = os.path.abspath(os.path.dirname(__file__)) INPUT_DIR = os.path.join(ROOT, 'resources') OUTPUT_DIR = os.path.join(ROOT, 'outputs') MODEL_DIR = os.path.join(ROOT, 
'test_models') - - diff --git a/tests/resources/shacl_simple.ttl b/tests/resources/shacl_simple.ttl new file mode 100644 index 0000000..9c3b08b --- /dev/null +++ b/tests/resources/shacl_simple.ttl @@ -0,0 +1,34 @@ +# example from http://book.validatingrdf.com/bookHtml011.html#ch050SHACLExample + +@prefix schema: <http://schema.org/> . +@prefix sh: <http://www.w3.org/ns/shacl#> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . +@prefix ex: <http://example.org/> . + +ex:UserShape a sh:NodeShape; + sh:targetClass ex:User ; + sh:property [ # Blank node 1 + sh:path schema:name ; + sh:minCount 1; + sh:maxCount 1; + sh:datatype xsd:string ; + ] ; + sh:property [ # Blank node 2 + sh:path schema:gender ; + sh:minCount 1; + sh:maxCount 1; + sh:or ( + [ sh:in (schema:Male schema:Female) ] + [ sh:datatype xsd:string] + ) + ] ; + sh:property [ # Blank node 3 + sh:path schema:birthDate ; + sh:maxCount 1; + sh:datatype xsd:date ; + ] ; + sh:property [ # Blank node 4 + sh:path schema:knows ; + sh:nodeKind sh:IRI ; + sh:class ex:User ; + ] . diff --git a/tests/resources/test_shacl_simple.ttl b/tests/resources/test_shacl_simple.ttl new file mode 100644 index 0000000..9b869e0 --- /dev/null +++ b/tests/resources/test_shacl_simple.ttl @@ -0,0 +1 @@ +# tbw diff --git a/tests/test_importers/test_shacl_importer.py b/tests/test_importers/test_shacl_importer.py new file mode 100644 index 0000000..556c58f --- /dev/null +++ b/tests/test_importers/test_shacl_importer.py @@ -0,0 +1,36 @@ +import os +import pytest + +from linkml_runtime import SchemaView + +from schema_automator.importers.shacl_import_engine import ShaclImportEngine +from linkml.generators.yamlgen import YAMLGenerator + +from schema_automator.utils.schemautils import write_schema +from tests import INPUT_DIR, OUTPUT_DIR + +# TODO - Write tests (this is a copy of test_rdfs_importer) + +REPRO = os.path.join(INPUT_DIR, 'shacl_simple.ttl') +OUTSCHEMA = os.path.join(OUTPUT_DIR, 'user_from_shacl_simple2.yaml') + + +def test_from_shacl(): + """Test Shacl conversion.""" + sie = ShaclImportEngine() + + schema = sie.convert(REPRO, 
default_prefix='usr', identifier='id') + write_schema(schema, OUTSCHEMA) + return + # roundtrip + s = YAMLGenerator(OUTSCHEMA).serialize() + print(s[0:100]) + sv = SchemaView(OUTSCHEMA) + activity = sv.get_class("Activity") + assert activity + assert activity.name == "Activity" + assert activity.is_a == "CreativeWork" + slots = sv.class_induced_slots(activity.name) + assert len(slots) == 1 + slot = slots[0] + assert slot.name == "id"