Skip to content

Commit

Permalink
ENH: updating reproschema commands to the new pydantic model (#36)
Browse files Browse the repository at this point in the history
* add print for testing

* update clear_header

* remove print

* fix order and other errors

* change ui yesno to radio

* fix typo

* update context, field->item, fix isVis

* remove useless due to failed validation

* remove visibility at the item level & remove matrixInfo

* fix choice

* remove identifier

* updating validate command to the new pydantic model

* updating/fixing the tests; updating the model to use CreativeWork; changes in formating

* fix conversion tests

* remove test output

* change test output directory

* final improvements on tests

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* model version after adding Thing class

* updating model after removing CreativeWork and ImageUrl

* adding tests to initialize the model classes

* fixing load_file; adding write_obj_jsonld function and expanding test_schema

* changing redcap2reproschema to use new pydantic classes; some small changes to the pydantic model

* changing name from string to langstring with en as language

* fixing jsonld files

* Adding option to return compact schema to load_file

* fixing the protocol jsonld file

* changing reproschema2redcap to use the new model

* adding contectfile to write_obj_jsonld function and improving test; improving compact option for load_file

* fixing reproschema2redcap and tests

* removing file with the context and fixing references to the context_url (for now the link from the ref/linkm branch)

* updating the reproschema2redcap to work for activity/items from urls

* improving error message for file_load and validate; checking the suffix of the file before treating it as jsonld

* fixing identify_model_class, so Item and Field are treated the same

* fixing reproschema2redcap so it reads responseOptions from another file

* rewriting parts of redcap2reproschema, fixing some bugs[wip]

* fixing compute: removing isvis condition

* fixing process_csv so it doesn't go multiple times through the same condition

* changes to input and value mapping (mapping explicitly or raising errors if not found); fixing choices and adding slider; adding sql to compute types (this does not work properly right now); adding many comments

* adding output for redcap2reproschema command; removing argparse

* model without decimal; revert changes to valueType in the model

* adding migrate command

* fixing multiple issues with redcap2rp and rp2redcap: adding compute, fixing preamble (can be either activity level or item level)

* WIP: adding test to test rp2redcap and redcap2repo using NIMH example

* fixing paths in the tests, should run now

* ignore .DS_Store files in validation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: yibeichan <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Jun 15, 2024
1 parent 89c6337 commit 0988c20
Show file tree
Hide file tree
Showing 52 changed files with 2,280 additions and 1,238 deletions.
39 changes: 33 additions & 6 deletions reproschema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from . import get_logger, set_logger_level
from . import __version__
from .redcap2reproschema import redcap2reproschema as redcap2rs
from .reproschema2redcap import main as rs2redcap
from .reproschema2redcap import reproschema2redcap as rs2redcap
from .migrate import migrate2newschema

lgr = get_logger()

Expand Down Expand Up @@ -42,14 +43,33 @@ def main(log_level):


@main.command()
@click.option("--shapefile", default=None, type=click.Path(exists=True, dir_okay=False))
@click.argument("path", nargs=1, type=str)
def validate(shapefile, path):
def validate(path):
if not (path.startswith("http") or os.path.exists(path)):
raise ValueError(f"{path} must be a URL or an existing file or directory")
from .validate import validate

validate(shapefile, path)
result = validate(path)
if result:
click.echo("Validation successful")


@main.command()
@click.argument("path", nargs=1, type=click.Path(exists=True, dir_okay=True))
@click.option("--inplace", is_flag=True, help="Changing file in place")
@click.option(
    "--fixed-path",
    type=click.Path(dir_okay=True, writable=True, resolve_path=True),
    help="Path to the fixed file/directory; if not provided, the suffix 'after_migration' is used",
)
def migrate(path, inplace, fixed_path):
    """Migrate a reproschema file or directory to the new schema version.

    PATH must be an existing file or directory (URLs are rejected by the
    click.Path check before this body runs, but the guard below mirrors the
    other commands' validation).
    """
    if not (path.startswith("http") or os.path.exists(path)):
        raise ValueError(f"{path} must be a URL or an existing file or directory")
    if fixed_path and inplace:
        # The two options are mutually exclusive: --inplace overwrites the
        # original, --fixed-path writes the result elsewhere.
        # BUG FIX: the original message said "Either inplace or fixed_path has
        # to be provided", which is the opposite of the condition being raised.
        raise Exception("Only one of --inplace or --fixed-path may be provided.")
    new_path = migrate2newschema(path, inplace=inplace, fixed_path=fixed_path)
    if new_path:
        click.echo(f"File/Directory after migration {new_path}")


@main.command()
Expand Down Expand Up @@ -103,12 +123,19 @@ def serve(port):
@main.command()
@click.argument("csv_path", type=click.Path(exists=True, dir_okay=False))
@click.argument("yaml_path", type=click.Path(exists=True, dir_okay=False))
def redcap2reproschema(csv_path, yaml_path):
@click.option(
"--output-path",
type=click.Path(dir_okay=True, writable=True, resolve_path=True),
default=".",
show_default=True,
help="Path to the output directory, defaults to the current directory.",
)
def redcap2reproschema(csv_path, yaml_path, output_path):
"""
Convert REDCap CSV files to Reproschema format.
"""
try:
redcap2rs(csv_path, yaml_path)
redcap2rs(csv_path, yaml_path, output_path)
click.echo("Converted REDCap data dictionary to Reproschema format.")
except Exception as e:
raise click.ClickException(f"Error during conversion: {e}")
Expand Down
163 changes: 116 additions & 47 deletions reproschema/jsonldutils.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,146 @@
from pyld import jsonld
from pyshacl import validate as shacl_validate
import json
import os
from .utils import start_server, stop_server, lgr
from pathlib import Path
from copy import deepcopy
import requests
from urllib.parse import urlparse
from .utils import start_server, stop_server, lgr, fixing_old_schema, CONTEXTFILE_URL
from .models import (
Item,
Activity,
Protocol,
ResponseOption,
ResponseActivity,
Response,
identify_model_class,
)


def load_file(path_or_url, started=False, http_kwargs={}):
try:
def _is_url(path):
"""
Determine whether the given path is a URL.
"""
parsed = urlparse(str(path))
return parsed.scheme in ("http", "https", "ftp", "ftps")


def _is_file(path):
"""
Determine whether the given path is a valid file path.
"""
return os.path.isfile(path)


def _fetch_jsonld_context(url):
    # Download a remote JSON-LD context document and return it parsed
    # (requests.Response.json() result, typically a dict).
    # NOTE(review): no timeout and no raise_for_status here — a hung server
    # will block indefinitely and an HTTP error page will raise a JSON decode
    # error instead of an HTTP error; consider requests.get(url, timeout=...)
    # followed by response.raise_for_status().
    response = requests.get(url)
    return response.json()


def load_file(
path_or_url,
started=False,
http_kwargs=None,
compact=False,
compact_context=None,
fixoldschema=False,
):
"""Load a file or URL and return the expanded JSON-LD data."""
path_or_url = str(path_or_url)
if http_kwargs is None:
http_kwargs = {}
if _is_url(path_or_url):
data = jsonld.expand(path_or_url)
if len(data) == 1:
if "@id" not in data[0]:
if "@id" not in data[0] and "id" not in data[0]:
data[0]["@id"] = path_or_url
except jsonld.JsonLdError as e:
if 'only "http" and "https"' in str(e):
lgr.debug("Reloading with local server")
root = os.path.dirname(path_or_url)
elif _is_file(path_or_url):
lgr.debug("Reloading with local server")
root = os.path.dirname(path_or_url)
if not started:
stop, port = start_server(**http_kwargs)
else:
if "port" not in http_kwargs:
raise KeyError("port key missing in http_kwargs")
port = http_kwargs["port"]
base_url = f"http://localhost:{port}/"
if root:
base_url += f"{root}/"
with open(path_or_url) as json_file:
try:
data = json.load(json_file)
except json.JSONDecodeError as e:
raise json.JSONDecodeError(
f"Error parsing JSON file {json_file}: {e.msg}", e.doc, e.pos
) from e
try:
data = jsonld.expand(data, options={"base": base_url})
except:
raise
finally:
if not started:
stop, port = start_server(**http_kwargs)
stop_server(stop)
if len(data) == 1:
if "@id" not in data[0] and "id" not in data[0]:
data[0]["@id"] = base_url + os.path.basename(path_or_url)
else:
raise Exception(f"{path_or_url} is not a valid URL or file path")

if isinstance(data, list) and len(data) == 1:
data = data[0]

if fixoldschema:
data = fixing_old_schema(data, copy_data=True)
if compact:
if compact_context:
if _is_file(compact_context):
with open(compact_context) as fp:
context = json.load(fp)
elif _is_url(compact_context):
context = _fetch_jsonld_context(compact_context)
else:
if "port" not in http_kwargs:
raise KeyError("port key missing in http_kwargs")
port = http_kwargs["port"]
base_url = f"http://localhost:{port}/"
if root:
base_url += f"{root}/"
with open(path_or_url) as json_file:
data = json.load(json_file)
try:
data = jsonld.expand(data, options={"base": base_url})
except:
raise
finally:
if not started:
stop_server(stop)
if len(data) == 1:
if "@id" not in data[0]:
data[0]["@id"] = base_url + os.path.basename(path_or_url)
raise Exception(
f"compact_context has tobe a file or url, but {compact_context} provided"
)
if _is_file(path_or_url):
data = jsonld.compact(data, ctx=context, options={"base": base_url})
else:
raise
data = jsonld.compact(data, ctx=context)

return data


def validate_data(data, shape_file_path):
"""Validate an expanded jsonld document against a shape.
def validate_data(data):
"""Validate an expanded jsonld document against the pydantic model.
Parameters
----------
data : dict
Python dictionary containing JSONLD object
shape_file_path : str
SHACL file for the document
Returns
-------
conforms: bool
Whether the document is conformant with the shape
v_text: str
Validation information returned by PySHACL
Validation errors if any returned by pydantic
"""
kwargs = {"algorithm": "URDNA2015", "format": "application/n-quads"}
normalized = jsonld.normalize(data, kwargs)
data_file_format = "nquads"
shape_file_format = "turtle"
conforms, v_graph, v_text = shacl_validate(
normalized,
shacl_graph=shape_file_path,
data_graph_format=data_file_format,
shacl_graph_format=shape_file_format,
inference="rdfs",
debug=False,
serialize_report_graph=True,
)
# do we need it?
# kwargs = {"algorithm": "URDNA2015", "format": "application/n-quads"}
# normalized = jsonld.normalize(data, kwargs)
obj_type = identify_model_class(data["@type"][0])
data_fixed = [fixing_old_schema(data, copy_data=True)]
context = _fetch_jsonld_context(CONTEXTFILE_URL)
data_fixed_comp = jsonld.compact(data_fixed, context)
del data_fixed_comp["@context"]
conforms = False
v_text = ""
try:
obj_type(**data_fixed_comp)
conforms = True
except Exception as e:
v_text = str(e)
return conforms, v_text


Expand Down
44 changes: 44 additions & 0 deletions reproschema/migrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json, os
import shutil
from pathlib import Path

from .jsonldutils import load_file
from .utils import fixing_old_schema


def migrate2newschema(path, inplace=False, fixed_path=None):
    """Migrate a jsonld file, or every file under a directory, to the new schema.

    Parameters
    ----------
    path : str or Path
        Existing file or directory to migrate.
    inplace : bool, optional
        If True, overwrite the original file(s) in place.
    fixed_path : str or Path, optional
        Destination for the migrated copy (ignored when ``inplace`` is True;
        for a directory it must not already exist, since it is created by
        ``shutil.copytree``).

    Returns
    -------
    Path or str
        Location of the migrated file or directory.  If neither ``inplace``
        nor ``fixed_path`` is given, a sibling path suffixed with
        ``_after_migration`` is used.
    """
    path = Path(path).resolve()
    if path.is_file():
        print(f"migration of file: {path}")
        new_path = migrate2newschema_file(path, inplace, fixed_path)
    else:  # path is a directory: copy it (unless inplace), then fix each file
        if inplace:
            new_path = path
        elif fixed_path:
            new_path = Path(fixed_path).resolve()
            shutil.copytree(path, new_path)
        else:
            new_path = path.parent / f"{path.name}_after_migration"
            shutil.copytree(path, new_path)
        # fixing all files in new_path; the copies are edited in place so the
        # originals are untouched (unless inplace was requested)
        all_files = Path(new_path).rglob("*")
        for file in all_files:
            if file.is_file():
                migrate2newschema_file(jsonld_path=file, inplace=True)
    return new_path


def migrate2newschema_file(jsonld_path, inplace=False, fixed_path=None):
    """Migrate a single jsonld file to the new schema and write the result.

    Parameters
    ----------
    jsonld_path : str or Path
        Path of the jsonld file to fix.
    inplace : bool, optional
        If True, overwrite ``jsonld_path`` itself.
    fixed_path : str or Path, optional
        Explicit output path; used only when ``inplace`` is False.

    Returns
    -------
    str or Path
        The path the migrated document was written to.  Defaults to the input
        path with an ``_after_migration`` suffix before the extension.
    """
    print(f"Fixing {jsonld_path}")
    data = load_file(jsonld_path, started=False)
    # fixing_old_schema returns a single object; the output file keeps the
    # original list-wrapped layout
    data_fixed = [fixing_old_schema(data, copy_data=True)]
    if inplace:
        new_filename = jsonld_path
    elif fixed_path:
        # BUG FIX: original read `elif fixedjsonld_path:` — an undefined name
        # that raised NameError whenever inplace was False.
        new_filename = fixed_path
    else:
        root, ext = os.path.splitext(jsonld_path)
        new_filename = f"{root}_after_migration{ext}"
    with open(new_filename, "w") as f:
        json.dump(data_fixed, f, indent=4)
    return new_filename
5 changes: 2 additions & 3 deletions reproschema/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
from .protocol import Protocol
from .activity import Activity
from .item import Item
from .model import Activity, Item, Protocol, ResponseOption, ResponseActivity, Response
from .utils import write_obj_jsonld, identify_model_class
66 changes: 0 additions & 66 deletions reproschema/models/activity.py

This file was deleted.

Loading

0 comments on commit 0988c20

Please sign in to comment.