Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Complete id validation #369

Merged
merged 17 commits into from
Jan 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bioimageio/spec/VERSION
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"version": "0.4.2"
"version": "0.4.2patch1"
}
13 changes: 10 additions & 3 deletions bioimageio/spec/collection/v0_2/converters.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import copy
from typing import Any, Dict

from bioimageio.spec.rdf.v0_2.converters import maybe_convert as maybe_convert_rdf


def maybe_convert(data: Dict[str, Any]) -> Dict[str, Any]:
data = copy.deepcopy(data)
if data.get("format_version") in ("0.2.0", "0.2.1"):
# move all type groups to the 'collection' field
if "collection" not in data:
data["collection"] = []

for group in ["application", "model", "dataset", "notebook"]:
if group in data:
data["collection"] += data[group]

config = data.get("config")
if config and isinstance(config, dict):
id_ = config.pop("id", data.get("id"))
if id_ is not None:
data["id"] = id_

data["format_version"] = "0.2.2"

return maybe_convert_rdf(data)
return data
15 changes: 10 additions & 5 deletions bioimageio/spec/collection/v0_2/raw_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
RDF <--schema--> raw nodes
"""
import distutils.version
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, NewType, Union

Expand All @@ -30,9 +30,12 @@ class CollectionEntry(RawNode):
rdf_source: Union[_Missing, URI] = missing
rdf_update: Dict[str, Any] = missing

def __init__(self, rdf_source=missing, **rdf_update):
def __init__(
self, rdf_source: Union[_Missing, URI] = missing, rdf_update: Dict[str, Any] = missing, **implicit_rdf_update
):
self.rdf_source = rdf_source
self.rdf_update = rdf_update
self.rdf_update = rdf_update or {}
self.rdf_update.update(implicit_rdf_update)
super().__init__()


Expand Down Expand Up @@ -65,7 +68,8 @@ def __init__(
tags: List[str],
# collection RDF
collection: List[CollectionEntry],
**unknown,
unknown: Dict[str, Any] = missing,
**implicitly_unknown,
):
self.collection = collection
super().__init__(
Expand All @@ -87,4 +91,5 @@ def __init__(
type=type,
version=version,
)
self.unknown = unknown
self.unknown = unknown or {}
self.unknown.update(implicitly_unknown)
61 changes: 60 additions & 1 deletion bioimageio/spec/collection/v0_2/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,64 @@
from . import raw_nodes
from typing import List, Optional, Tuple, Union

from marshmallow import missing
from marshmallow.utils import _Missing

from . import raw_nodes, schema


def filter_resource_description(raw_rd: raw_nodes.RDF) -> raw_nodes.RDF:
    """Return the given raw resource description unchanged.

    No filtering is applied here; the very same object that was passed in is
    handed back to the caller.
    """
    return raw_rd


def resolve_collection_entries(
    collection: raw_nodes.Collection, collection_id: Optional[str] = None
) -> List[Tuple[dict, Optional[str]]]:
    """Expand each entry of a collection into a plain RDF dict.

    Every entry starts from the serialized collection RDF (without its
    ``collection`` field), is then updated with the content of the entry's
    ``rdf_source`` (if given) and finally with the fields specified directly
    in the entry.

    Args:
        collection: raw collection node whose ``collection`` entries are resolved.
        collection_id: id to prefix entry ids with; if None, the ``id`` of the
            serialized collection RDF is used (if it has one).

    Returns:
        One ``(rdf_data, entry_error)`` tuple per collection entry, in order.
        ``entry_error`` is None if the entry could be resolved; otherwise it is
        a message describing the first problem encountered for that entry
        (invalid ``rdf_source``, missing ``id`` or duplicate ``id``).
        ``rdf_data["id"]`` is ``"<root id>/<entry id>"`` (or just the entry id
        if there is no root id); it is absent if the entry has no id.
    """
    from bioimageio.spec import serialize_raw_resource_description_to_dict
    from bioimageio.spec.shared.utils import resolve_rdf_source

    ret = []
    seen_ids = set()
    for idx, entry in enumerate(collection.collection):  # type: ignore
        entry_error: Optional[str] = None
        id_info = f"(id={entry.rdf_update['id']}) " if "id" in entry.rdf_update else ""

        # rdf entries are based on collection RDF...
        rdf_data = serialize_raw_resource_description_to_dict(collection)
        assert missing not in rdf_data.values()
        rdf_data.pop("collection")  # ... without the collection field to avoid recursion

        root_id = rdf_data.pop("id", None) if collection_id is None else collection_id

        # update rdf entry with entry's rdf_source
        sub_id: Union[str, _Missing] = missing
        if entry.rdf_source is not missing:
            try:
                remote_rdf_update, _, _ = resolve_rdf_source(entry.rdf_source)
            except Exception as e:
                # best effort: report the problem for this entry and keep going with the other entries
                entry_error = f"collection[{idx}]: {id_info}Invalid rdf_source: {e}"
            else:
                sub_id = remote_rdf_update.pop("id", missing)
                assert missing not in remote_rdf_update.values()
                rdf_data.update(remote_rdf_update)

        # update rdf entry with fields specified directly in the entry
        rdf_update = schema.CollectionEntry().dump(entry)
        assert missing not in rdf_update.values()
        sub_id = rdf_update.pop("id", sub_id)

        id_error: Optional[str] = None
        if sub_id is missing:
            id_error = f"collection[{idx}]: Missing `id` field"
        elif sub_id in seen_ids:
            id_error = f"collection[{idx}]: Duplicate `id` value {sub_id}"
        else:
            seen_ids.add(sub_id)

        # keep the first error: an unresolvable rdf_source is the root cause and
        # must not be hidden behind a follow-up id problem
        if entry_error is None:
            entry_error = id_error

        rdf_data.update(rdf_update)
        if sub_id is missing:
            # never write the `missing` sentinel (or its string representation) into the
            # returned dict and do not let the entry inherit the id of the collection itself
            rdf_data.pop("id", None)
        elif root_id is None:
            rdf_data["id"] = sub_id
        else:
            rdf_data["id"] = f"{root_id}/{sub_id}"

        rdf_data.pop("rdf_source", None)  # remove rdf_source as we return a plain dict that has no simple source file
        assert missing not in rdf_data.values()
        ret.append((rdf_data, entry_error))

    return ret
38 changes: 5 additions & 33 deletions bioimageio/spec/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@
import traceback
import warnings
from pathlib import Path
from typing import Any, Dict, IO, Optional, Union
from typing import Any, Dict, IO, Union

from marshmallow import ValidationError, missing
from marshmallow.utils import _Missing
from marshmallow import ValidationError

from .collection.v0_2.utils import resolve_collection_entries
from .io_ import (
load_raw_resource_description,
resolve_rdf_source,
save_raw_resource_description,
serialize_raw_resource_description_to_dict,
)
from .shared.common import ValidationWarning, nested_default_dict_as_nested_dict

Expand Down Expand Up @@ -80,35 +79,7 @@ def validate(

if raw_rd is not None and raw_rd.type == "collection":
assert hasattr(raw_rd, "collection")
for idx, entry in enumerate(raw_rd.collection): # type: ignore
entry_error: Optional[str] = None
rdf_update = entry.rdf_update
id_info = f"(id={rdf_update['id']}) " if "id" in rdf_update else ""

# rdf entries are based on collection RDF...
rdf_data = serialize_raw_resource_description_to_dict(raw_rd)
rdf_data.pop("collection") # ... without the collection field to avoid recursion

root_id = rdf_data.pop("id", missing)
# update rdf entry with entrie's rdf_source
sub_id: Union[str, _Missing] = missing
if entry.rdf_source is not missing:
try:
rdf_update, _, _ = resolve_rdf_source(entry.rdf_source)
except Exception as e:
entry_error = f"collection[{idx}]: {id_info}Invalid rdf_source: {e}"
else:
sub_id = rdf_update.pop("id", missing)
rdf_data.update(rdf_update)

# update rdf entry with fields specified directly in the entry
rdf_update = dict(entry.rdf_update)
sub_id = rdf_update.pop("id", sub_id)
if sub_id is missing:
entry_error = f"collection[{idx}]: Missing `id` field for collection entry"

rdf_data.update(rdf_update)

for idx, (rdf_data, entry_error) in enumerate(resolve_collection_entries(raw_rd)): # type: ignore
if entry_error:
entry_summary = {"error": entry_error}
else:
Expand All @@ -118,6 +89,7 @@ def validate(

wrns: Union[str, dict] = entry_summary.get("warnings", {})
assert isinstance(wrns, dict)
id_info = f"(id={rdf_data['id']}) " if "id" in rdf_data else ""
for k, v in wrns.items():
warnings.warn(f"collection[{idx}]:{k}: {id_info}{v}", category=ValidationWarning)

Expand Down
3 changes: 3 additions & 0 deletions bioimageio/spec/io_.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from types import ModuleType
from typing import Dict, IO, Optional, Sequence, Tuple, Union

from marshmallow import missing

from bioimageio.spec.shared import raw_nodes
from bioimageio.spec.shared.common import (
BIOIMAGEIO_CACHE_PATH,
Expand Down Expand Up @@ -206,6 +208,7 @@ def serialize_raw_resource_description_to_dict(raw_rd: RawResourceDescription) -
schema: SharedBioImageIOSchema = getattr(sub_spec.schema, class_name)()
serialized = schema.dump(raw_rd)
assert isinstance(serialized, dict)
assert missing not in serialized.values()

return serialized

Expand Down
4 changes: 4 additions & 0 deletions bioimageio/spec/model/v0_3/converters.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import pathlib
from typing import Any, Dict

Expand Down Expand Up @@ -91,6 +92,9 @@ def convert_model_v0_3_2_to_v0_3_3(data: Dict[str, Any]) -> Dict[str, Any]:

def maybe_convert(data: Dict[str, Any]) -> Dict[str, Any]:
"""auto converts model 'data' to newest format"""

data = copy.deepcopy(data)

if data.get("format_version", "0.3.0") == "0.3.0":
# no breaking change, bump to 0.3.1
data["format_version"] = "0.3.1"
Expand Down
2 changes: 0 additions & 2 deletions bioimageio/spec/model/v0_4/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

from marshmallow import missing

from bioimageio.spec.exceptions import UnconvertibleError


def convert_model_from_v0_3_to_0_4_0(data: Dict[str, Any]) -> Dict[str, Any]:
from bioimageio.spec.model import v0_3
Expand Down
3 changes: 3 additions & 0 deletions bioimageio/spec/rdf/v0_2/converters.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import copy
from typing import Any, Dict


def maybe_convert(data: Dict[str, Any]) -> Dict[str, Any]:
data = copy.deepcopy(data)

# we unofficially accept strings as author entries...
authors = data.get("authors")
if isinstance(authors, list):
Expand Down
16 changes: 13 additions & 3 deletions bioimageio/spec/rdf/v0_2/raw_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,15 @@ class Attachments(RawNode):
files: Union[_Missing, List[Union[Path, URI]]] = missing
unknown: Dict[str, Any] = missing

def __init__(self, files: Union[_Missing, List[Union[Path, URI]]] = missing, **unknown):
def __init__(
self,
files: Union[_Missing, List[Union[Path, URI]]] = missing,
unknown: Dict[str, Any] = missing,
**implicitly_unknown,
):
self.files = files
self.unknown = unknown
self.unknown = unknown or {}
self.unknown.update(implicitly_unknown)
super().__init__()


Expand Down Expand Up @@ -84,10 +90,12 @@ class RDF(ResourceDescription):
documentation: Path = missing
format_version: FormatVersion = missing
git_repo: Union[_Missing, str] = missing
id: Union[_Missing, str] = missing
icon: Union[_Missing, str] = missing
license: Union[_Missing, str] = missing
links: Union[_Missing, List[str]] = missing
maintainers: Union[_Missing, List[Maintainer]] = missing
rdf_source: Union[_Missing, URI] = missing
tags: List[str] = missing

# manual __init__ to allow for unknown kwargs
Expand All @@ -110,6 +118,7 @@ def __init__(
documentation: Path,
git_repo: Union[_Missing, str] = missing,
id: Union[_Missing, str] = missing,
icon: Union[_Missing, str] = missing,
license: Union[_Missing, str] = missing,
links: Union[_Missing, List[str]] = missing,
maintainers: Union[_Missing, List[Maintainer]] = missing,
Expand All @@ -127,6 +136,7 @@ def __init__(
self.documentation = documentation
self.git_repo = git_repo
self.id = id
self.icon = icon
self.license = license
self.links = links
self.maintainers = maintainers
Expand All @@ -138,7 +148,7 @@ def __init__(
# make sure we didn't forget a defined field
field_names = set(f.name for f in dataclasses.fields(self))
for uk in unknown_kwargs:
assert uk not in field_names
assert uk not in field_names, uk

warnings.warn(f"discarding unknown RDF fields: {unknown_kwargs}")

Expand Down
16 changes: 13 additions & 3 deletions bioimageio/spec/shared/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import marshmallow_union
import numpy
from marshmallow import ValidationError, fields as marshmallow_fields, Schema
from marshmallow import Schema, ValidationError, fields as marshmallow_fields, missing

from . import field_validators, raw_nodes

Expand Down Expand Up @@ -119,13 +119,23 @@ class YamlDict(Dict):
@staticmethod
def _make_yaml_friendly(obj):
if isinstance(obj, (list, tuple)):
return [YamlDict._make_yaml_friendly(ob) for ob in obj]
return [YamlDict._make_yaml_friendly(ob) for ob in obj if ob is not missing]
elif isinstance(obj, dict):
return {YamlDict._make_yaml_friendly(k): YamlDict._make_yaml_friendly(v) for k, v in obj.items()}
return {
YamlDict._make_yaml_friendly(k): YamlDict._make_yaml_friendly(v)
for k, v in obj.items()
if v is not missing
}
elif obj is None or isinstance(obj, (float, int, str, bool)):
return obj
elif isinstance(obj, pathlib.PurePath):
return obj.as_posix()
elif isinstance(obj, raw_nodes.URI):
return str(obj)
elif isinstance(obj, (datetime.datetime, datetime.time)):
return obj.isoformat()
elif obj is missing:
return missing
else:
raise TypeError(f"Encountered YAML unfriendly type: {type(obj)}")

Expand Down
4 changes: 3 additions & 1 deletion bioimageio/spec/shared/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ def make_object(self, data, **kwargs):
@post_dump(pass_original=True)
def keep_unknowns(self, output, orig, **kwargs):
if orig and hasattr(orig, self.field_name_unknown_dict):
out_w_unknown = dict(getattr(orig, self.field_name_unknown_dict))
out_w_unknown = fields.YamlDict()._serialize(
getattr(orig, self.field_name_unknown_dict), self.field_name_unknown_dict, self
)
out_w_unknown.update(output)
return out_w_unknown
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# TODO physical scale of the data
format_version: 0.3.4
format_version: 0.3.6

name: UNet 2D Nuclei Broad
description: A 2d U-Net trained on the nuclei broad dataset.
Expand Down
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def get_unet2d_nuclei_broad(unet2d_nuclei_broad_base_path, request) -> dict:
return yaml.load(path)


@pytest.fixture(params=["v0_3_0", "v0_3_1", "v0_3_2", "v0_3_3", "v0_3_4", "v0_4_0", "v0_4_2"])
@pytest.fixture(params=["v0_3_0", "v0_3_1", "v0_3_2", "v0_3_3", "v0_3_6", "v0_4_0", "v0_4_2"])
def unet2d_nuclei_broad_any(unet2d_nuclei_broad_base_path, request):
yield get_unet2d_nuclei_broad(unet2d_nuclei_broad_base_path, request)


@pytest.fixture(params=["v0_3_0", "v0_3_1", "v0_3_2", "v0_3_3", "v0_3_4", "v0_4_0"])
@pytest.fixture(params=["v0_3_0", "v0_3_1", "v0_3_2", "v0_3_3", "v0_3_6", "v0_4_0"])
def unet2d_nuclei_broad_before_latest(unet2d_nuclei_broad_base_path, request):
yield get_unet2d_nuclei_broad(unet2d_nuclei_broad_base_path, request)

Expand All @@ -36,7 +36,7 @@ def unet2d_nuclei_broad_latest(unet2d_nuclei_broad_base_path, request):
yield get_unet2d_nuclei_broad(unet2d_nuclei_broad_base_path, request)


@pytest.fixture(params=["v0_3_4", "v0_4_2"])
@pytest.fixture(params=["v0_3_6", "v0_4_2"])
def unet2d_nuclei_broad_any_minor(unet2d_nuclei_broad_base_path, request):
yield get_unet2d_nuclei_broad(unet2d_nuclei_broad_base_path, request)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_dump_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_dataset_rdf_round_trip():
covers=["https://raw.githubusercontent.com/ilastik/bioimage-io-models/main/dataset_src/platy-cover0.png"],
description="Training data for EM segmentation of cellular membranes, nuclei, cuticle and cilia in Platynereis.",
documentation="https://raw.githubusercontent.com/ilastik/bioimage-io-models/main/dataset_src/platy.md",
format_version="0.2.1",
format_version="0.2.2",
license="CC-BY-4.0",
name="Platynereis EM Traning Data",
source="https://doi.org/10.5281/zenodo.3675220",
Expand Down