Skip to content

Commit

Permalink
Additional options: use-attributes and table-config-path (#86)
Browse files Browse the repository at this point in the history
* Adding a --use-attributes option.

* Adding --table-config-path option to sheets2linkml

* add-missing
  • Loading branch information
cmungall authored Dec 6, 2022
1 parent ca07e05 commit 5d973d1
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 16 deletions.
61 changes: 46 additions & 15 deletions schemasheets/schemamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from linkml_runtime.linkml_model import Annotation, Example
from linkml_runtime.linkml_model.meta import SchemaDefinition, ClassDefinition, Prefix, \
SlotDefinition, EnumDefinition, PermissibleValue, SubsetDefinition, TypeDefinition, Element
from linkml_runtime.utils.schema_as_dict import schema_as_dict
from linkml_runtime.utils.schemaview import SchemaView, re

from schemasheets.schemasheet_datamodel import ColumnConfig, TableConfig, get_configmodel, get_metamodel, COL_NAME, \
Expand All @@ -39,9 +40,11 @@ class SchemaMaker:
element_map: Dict[Tuple[str, str], Element] = None
metamodel: SchemaView = None
cardinality_vocabulary: str = None
use_attributes: bool = None
default_name: str = None
unique_slots: bool = None
gsheet_id: str = None
table_config_path: str = None

def create_schema(self, csv_files: Union[str, List[str]], **kwargs) -> SchemaDefinition:
"""
Expand Down Expand Up @@ -94,6 +97,8 @@ def merge_sheet(self, file_name: str, delimiter='\t') -> None:
# reader = csv.DictReader(tsv_file, delimiter=delimiter)
with self.ensure_csvreader(file_name, delimiter=delimiter) as reader:
schemasheet = SchemaSheet.from_dictreader(reader)
if self.table_config_path:
schemasheet.load_table_config(self.table_config_path)
line_num = schemasheet.start_line_number
# TODO: check why this doesn't work
#while rows and all(x for x in rows[-1] if not x):
Expand Down Expand Up @@ -140,7 +145,7 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
ann = Annotation(cc.settings.inner_key, v)
actual_element.annotations[ann.tag] = ann
else:
anns = yaml.load(v[0])
anns = yaml.safe_load(v[0])
for ann_key, ann_val in anns.items():
actual_element.annotations[ann_key] = ann_val
elif isinstance(v, list):
Expand Down Expand Up @@ -314,15 +319,22 @@ def check_excess(descriptors):
# TODO: add option to allow to instead represent these as attributes
c: ClassDefinition
for c in vmap[T_CLASS]:
#c: ClassDefinition = vmap[T_CLASS]
if main_elt.name not in c.slots:
c.slots.append(main_elt.name)
if self.unique_slots:
yield main_elt
if self.use_attributes:
# slots always belong to a class;
# no separate top level slots
a = SlotDefinition(main_elt.name)
c.attributes[main_elt.name] = a
yield a
else:
c.slot_usage[main_elt.name] = SlotDefinition(main_elt.name)
main_elt = c.slot_usage[main_elt.name]
yield main_elt
# add top level slot if not present
if main_elt.name not in c.slots:
c.slots.append(main_elt.name)
if self.unique_slots:
yield main_elt
else:
c.slot_usage[main_elt.name] = SlotDefinition(main_elt.name)
main_elt = c.slot_usage[main_elt.name]
yield main_elt
else:
yield main_elt
elif T_CLASS in vmap:
Expand Down Expand Up @@ -515,6 +527,15 @@ def set_cardinality(self, element: SlotDefinition, card: str) -> None:
raise ValueError(f'Cannot parse cardinality: {card} // {pvs.keys()}')

def repair_schema(self, schema: SchemaDefinition) -> SchemaDefinition:
"""
Performs repair on schema in place
- adds default prefixes
- repairs subsets
:param schema:
:return:
"""
sv = SchemaView(schema)
#pfx = schema.default_prefix
#if pfx not in schema.prefixes:
Expand Down Expand Up @@ -579,10 +600,17 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
help="output file")
@click.option("-n", "--name",
help="name of the schema")
@click.option("-C", "--table-config-path",
help="YAML file with header mappings")
@click.option("--unique-slots/--no-unique-slots",
default=False,
show_default=True,
help="All slots are treated as unique and top level and do not belong to the specified class")
@click.option("--use-attributes/--no-use-attributes",
"-A", "--no-A",
default=False,
show_default=True,
help="All slots specified in conjunction with a class are attributes of that class")
@click.option("--repair/--no-repair",
default=True,
show_default=True,
Expand All @@ -591,7 +619,7 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
help="Google sheets ID. If this is specified then the arguments MUST be sheet names")
@click.option("-v", "--verbose", count=True)
@click.argument('tsv_files', nargs=-1)
def convert(tsv_files, gsheet_id, output: TextIO, name, repair, unique_slots: bool, verbose: int):
def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_path: str, use_attributes: bool, unique_slots: bool, verbose: int):
"""
Convert schemasheets to a LinkML schema
Expand All @@ -612,14 +640,17 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, unique_slots: bo
logging.basicConfig(level=logging.INFO)
else:
logging.basicConfig(level=logging.WARNING)
sm = SchemaMaker()
sm.gsheet_id = gsheet_id
sm.default_name = name
sm.unique_slots = unique_slots
sm = SchemaMaker(use_attributes=use_attributes,
unique_slots=unique_slots,
gsheet_id=gsheet_id,
default_name=name,
table_config_path=table_config_path)
schema = sm.create_schema(list(tsv_files))
if repair:
schema = sm.repair_schema(schema)
output.write(yaml_dumper.dumps(schema))
schema_dict = schema_as_dict(schema)
output.write(yaml.dump(schema_dict))
#output.write(yaml_dumper.dumps(schema))


if __name__ == '__main__':
Expand Down
20 changes: 19 additions & 1 deletion schemasheets/schemasheet_datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import dataclass
from typing import Union, Dict, List, Any
import pkgutil
from pathlib import PurePath
from pathlib import PurePath, Path
from functools import lru_cache
import logging
import yaml
Expand Down Expand Up @@ -222,6 +222,24 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
rows=rows,
start_line_number=line_num)

def load_table_config(self, config: Union[dict, str, Path]) -> None:
    """
    Loads a table configuration from a file or dict

    Every ``key: value`` entry of the configuration is passed to
    ``self.table_config.add_info(key, value)``. When the value is a list,
    ``add_info`` is called once per item, in list order.

    :param config: a configuration dict, or the path of a YAML file to read one from
    :return: None
    :raises ValueError: if the YAML file does not hold a mapping at its top level
    """
    if not isinstance(config, dict):
        config_path = config
        # YAML streams are Unicode; do not depend on the platform's locale encoding
        with open(config_path, encoding="utf-8") as f:
            config = yaml.safe_load(f)
        if not isinstance(config, dict):
            # an empty file parses to None and a top-level sequence to a list;
            # neither can be iterated as key/value pairs below
            raise ValueError(
                f"Table config {config_path} must contain a YAML mapping at the top level, "
                f"got {type(config).__name__}"
            )
    for k, v in config.items():
        # a scalar value is handled as a one-item list so both shapes share one path
        items = v if isinstance(v, list) else [v]
        for item in items:
            self.table_config.add_info(k, item)

@lru_cache()
def get_metamodel() -> SchemaView:
"""
Expand Down
23 changes: 23 additions & 0 deletions tests/input/personinfo-descriptors.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
record: class
field: slot
key: identifier
multiplicity: cardinality
range: range
parents: is_a
desc: description
schema.org:
exact_mappings: {curie_prefix: sdo}
wikidata:
- {exact_mappings: {curie_prefix: wikidata}}
- {curie_prefix: wikidata}
belongs: in_subset
status:
- status
- {vmap: {T: testing, R: release}}
special:
- annotations
- {inner_key: special}
special2:
- annotations
- {inner_key: special2}
notes: ignore
13 changes: 13 additions & 0 deletions tests/input/personinfo-no-descriptors.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
record field key multiplicity range parents desc schema.org wikidata belongs status special special2 notes
id yes 1 string any identifier identifier
description no 0..1 string a textual description description my_val my_val2
Person n/a n/a n/a a person,living or dead Person Q215627 R
Person id yes 1 string identifier for a person identifier
Person|Organization name no 1 string full name name my_val
Person age no 0..1 decimal age in years
Person gender no 0..1 decimal age in years
Person has medical history no 0..* MedicalEvent medical history T
Event grouping class for events Q1656682 a R
MedicalEvent n/a n/a n/a Event a medical encounter b T
ForProfit Organization
NonProfit Organization Q163740 foo
35 changes: 35 additions & 0 deletions tests/test_schemamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,38 @@ def test_problem_cases():
raised = True
assert raised

def test_load_table_config():
    """
    Checks that column descriptors can come from a separate table config file.

    Mirrors the personinfo test: the TSV carries no descriptor rows, and the
    header-to-descriptor mappings are read from a YAML file instead.
    :return:
    """
    config_path = os.path.join(INPUT_DIR, 'personinfo-descriptors.yaml')
    sheet_path = os.path.join(INPUT_DIR, 'personinfo-no-descriptors.tsv')
    maker = SchemaMaker(table_config_path=config_path)
    schema = maker.create_schema(sheet_path)

    # keep a copy of the generated schema on disk and in the log for inspection
    yaml_dumper.dump(schema, to_file=os.path.join(OUTPUT_DIR, 'personinfo.yaml'))
    schema_text = yaml_dumper.dumps(schema)
    logging.info(schema_text)
    print(schema_text)

    person = schema.classes['Person']
    organization = schema.classes['Organization']

    # every slot is declared at the top level and referenced by Person
    expected_slots = ('id', 'name', 'age', 'gender', 'has medical history')
    for slot_name in expected_slots:
        assert slot_name in schema.slots
        assert slot_name in person.slots

    id_slot = schema.slots['id']
    assert id_slot.identifier
    assert id_slot.exact_mappings == ['sdo:identifier']
    assert person.slot_usage['id'].identifier

    history_usage = person.slot_usage['has medical history']
    assert history_usage.multivalued
    assert person.status == 'release'

    # annotation columns are keyed by their configured inner_key
    description_annotations = schema.slots['description'].annotations
    assert description_annotations
    assert description_annotations['special']
    assert description_annotations['special'].value == 'my_val'
    assert description_annotations['special2'].value == 'my_val2'

    assert not history_usage.required
    assert history_usage.status == 'testing'
    assert 'name' in organization.slots

    person_mappings = person.exact_mappings
    assert len(person_mappings) == 2
    assert 'wikidata:Q215627' in person_mappings
    assert 'sdo:Person' in person_mappings

0 comments on commit 5d973d1

Please sign in to comment.