From 5d973d19c2739e6351e130e9c8575353550b0d2a Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Mon, 5 Dec 2022 18:21:18 -0800 Subject: [PATCH] Additional options: use-attributes and table-config-path (#86) * Adding a --use--attributes option. * Adding -table-config-path option to sheets2linkml * add-missing --- schemasheets/schemamaker.py | 61 +++++++++++++++++------ schemasheets/schemasheet_datamodel.py | 20 +++++++- tests/input/personinfo-descriptors.yaml | 23 +++++++++ tests/input/personinfo-no-descriptors.tsv | 13 +++++ tests/test_schemamaker.py | 35 +++++++++++++ 5 files changed, 136 insertions(+), 16 deletions(-) create mode 100644 tests/input/personinfo-descriptors.yaml create mode 100644 tests/input/personinfo-no-descriptors.tsv diff --git a/schemasheets/schemamaker.py b/schemasheets/schemamaker.py index ce1aa56..13cbcdf 100644 --- a/schemasheets/schemamaker.py +++ b/schemasheets/schemamaker.py @@ -16,6 +16,7 @@ from linkml_runtime.linkml_model import Annotation, Example from linkml_runtime.linkml_model.meta import SchemaDefinition, ClassDefinition, Prefix, \ SlotDefinition, EnumDefinition, PermissibleValue, SubsetDefinition, TypeDefinition, Element +from linkml_runtime.utils.schema_as_dict import schema_as_dict from linkml_runtime.utils.schemaview import SchemaView, re from schemasheets.schemasheet_datamodel import ColumnConfig, TableConfig, get_configmodel, get_metamodel, COL_NAME, \ @@ -39,9 +40,11 @@ class SchemaMaker: element_map: Dict[Tuple[str, str], Element] = None metamodel: SchemaView = None cardinality_vocabulary: str = None + use_attributes: bool = None default_name: str = None unique_slots: bool = None gsheet_id: str = None + table_config_path: str = None def create_schema(self, csv_files: Union[str, List[str]], **kwargs) -> SchemaDefinition: """ @@ -94,6 +97,8 @@ def merge_sheet(self, file_name: str, delimiter='\t') -> None: # reader = csv.DictReader(tsv_file, delimiter=delimiter) with self.ensure_csvreader(file_name, delimiter=delimiter) as reader: schemasheet = SchemaSheet.from_dictreader(reader) + if self.table_config_path: + schemasheet.load_table_config(self.table_config_path) line_num = schemasheet.start_line_number # TODO: check why this doesn't work #while rows and all(x for x in rows[-1] if not x): @@ -140,7 +145,7 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig): ann = Annotation(cc.settings.inner_key, v) actual_element.annotations[ann.tag] = ann else: - anns = yaml.load(v[0]) + anns = yaml.safe_load(v[0]) for ann_key, ann_val in anns.items(): actual_element.annotations[ann_key] = ann_val elif isinstance(v, list): @@ -314,15 +319,22 @@ def check_excess(descriptors): # TODO: add option to allow to instead represent these as attributes c: ClassDefinition for c in vmap[T_CLASS]: - #c: ClassDefinition = vmap[T_CLASS] - if main_elt.name not in c.slots: - c.slots.append(main_elt.name) - if self.unique_slots: - yield main_elt + if self.use_attributes: + # slots always belong to a class; + # no seperate top level slots + a = SlotDefinition(main_elt.name) + c.attributes[main_elt.name] = a + yield a else: - c.slot_usage[main_elt.name] = SlotDefinition(main_elt.name) - main_elt = c.slot_usage[main_elt.name] - yield main_elt + # add top level slot if not present + if main_elt.name not in c.slots: + c.slots.append(main_elt.name) + if self.unique_slots: + yield main_elt + else: + c.slot_usage[main_elt.name] = SlotDefinition(main_elt.name) + main_elt = c.slot_usage[main_elt.name] + yield main_elt else: yield main_elt elif T_CLASS in vmap: @@ -515,6 +527,15 @@ def set_cardinality(self, element: SlotDefinition, card: str) -> None: raise ValueError(f'Cannot parse cardinality: {card} // {pvs.keys()}') def repair_schema(self, schema: SchemaDefinition) -> SchemaDefinition: + """ + Performs repair on schema in place + + - adds default prefixes + - repairs subsets + + :param schema: + :return: + """ sv = SchemaView(schema) #pfx = schema.default_prefix #if pfx not in schema.prefixes: @@ -579,10 +600,17 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str: help="output file") @click.option("-n", "--name", help="name of the schema") +@click.option("-C", "--table-config-path", + help="YAML file with header mappings") @click.option("--unique-slots/--no-unique-slots", default=False, show_default=True, help="All slots are treated as unique and top level and do not belong to the specified class") +@click.option("--use-attributes/--no-use-attributes", + "-A", "--no-A", + default=False, + show_default=True, + help="All slots specified in conjunction with a class are attributes of that class") @click.option("--repair/--no-repair", default=True, show_default=True, @@ -591,7 +619,7 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str: help="Google sheets ID. If this is specified then the arguments MUST be sheet names") @click.option("-v", "--verbose", count=True) @click.argument('tsv_files', nargs=-1) -def convert(tsv_files, gsheet_id, output: TextIO, name, repair, unique_slots: bool, verbose: int): +def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_path: str, use_attributes: bool, unique_slots: bool, verbose: int): """ Convert schemasheets to a LinkML schema @@ -612,14 +640,17 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, unique_slots: bo logging.basicConfig(level=logging.INFO) else: logging.basicConfig(level=logging.WARNING) - sm = SchemaMaker() - sm.gsheet_id = gsheet_id - sm.default_name = name - sm.unique_slots = unique_slots + sm = SchemaMaker(use_attributes=use_attributes, + unique_slots=unique_slots, + gsheet_id=gsheet_id, + default_name=name, + table_config_path=table_config_path) schema = sm.create_schema(list(tsv_files)) if repair: schema = sm.repair_schema(schema) - output.write(yaml_dumper.dumps(schema)) + schema_dict = schema_as_dict(schema) + output.write(yaml.dump(schema_dict)) + #output.write(yaml_dumper.dumps(schema)) if __name__ == '__main__': diff --git a/schemasheets/schemasheet_datamodel.py b/schemasheets/schemasheet_datamodel.py index 83d7354..4d3a5b1 100644 --- a/schemasheets/schemasheet_datamodel.py +++ b/schemasheets/schemasheet_datamodel.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from typing import Union, Dict, List, Any import pkgutil -from pathlib import PurePath +from pathlib import PurePath, Path from functools import lru_cache import logging import yaml @@ -222,6 +222,24 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet": rows=rows, start_line_number=line_num) + def load_table_config(self, config: Union[dict, str, Path]) -> None: + """ + Loads a table configuration from a file or dict + + :param config: + :return: + """ + if not isinstance(config, dict): + with open(config) as f: + config = yaml.safe_load(f) + return self.load_table_config(config) + for k, v in config.items(): + if isinstance(v, list): + for v1 in v: + self.table_config.add_info(k, v1) + else: + self.table_config.add_info(k, v) + @lru_cache() def get_metamodel() -> SchemaView: """ diff --git a/tests/input/personinfo-descriptors.yaml b/tests/input/personinfo-descriptors.yaml new file mode 100644 index 0000000..d8b2a0e --- /dev/null +++ b/tests/input/personinfo-descriptors.yaml @@ -0,0 +1,23 @@ +record: class +field: slot +key: identifier +multiplicity: cardinality +range: range +parents: is_a +desc: description +schema.org: + exact_mappings: {curie_prefix: sdo} +wikidata: + - {exact_mappings: {curie_prefix: wikidata}} + - {curie_prefix: wikidata} +belongs: in_subset +status: + - status + - {vmap: {T: testing, R: release}} +special: + - annotations + - {inner_key: special} +special2: + - annotations + - {inner_key: special2} +notes: ignore \ No newline at end of file diff --git a/tests/input/personinfo-no-descriptors.tsv b/tests/input/personinfo-no-descriptors.tsv new file mode 100644 index 0000000..d84bb89 --- /dev/null +++ b/tests/input/personinfo-no-descriptors.tsv @@ -0,0 +1,13 @@ +record field key multiplicity range parents desc schema.org wikidata belongs status special special2 notes + id yes 1 string any identifier identifier + description no 0..1 string a textual description description my_val my_val2 +Person n/a n/a n/a a person,living or dead Person Q215627 R +Person id yes 1 string identifier for a person identifier +Person|Organization name no 1 string full name name my_val +Person age no 0..1 decimal age in years +Person gender no 0..1 decimal age in years +Person has medical history no 0..* MedicalEvent medical history T +Event grouping class for events Q1656682 a R +MedicalEvent n/a n/a n/a Event a medical encounter b T +ForProfit Organization +NonProfit Organization Q163740 foo diff --git a/tests/test_schemamaker.py b/tests/test_schemamaker.py index 3ab677b..7c20c21 100644 --- a/tests/test_schemamaker.py +++ b/tests/test_schemamaker.py @@ -247,3 +247,38 @@ def test_problem_cases(): raised = True assert raised +def test_load_table_config(): + """ + tests loading of table configuration + + Same as personinfo test, but we provide a separate config + :return: + """ + sm = SchemaMaker(table_config_path=os.path.join(INPUT_DIR, 'personinfo-descriptors.yaml')) + schema = sm.create_schema(os.path.join(INPUT_DIR, 'personinfo-no-descriptors.tsv')) + yaml_dumper.dump(schema, to_file=os.path.join(OUTPUT_DIR, 'personinfo.yaml')) + yaml = yaml_dumper.dumps(schema) + logging.info(yaml) + print(yaml) + person_cls = schema.classes['Person'] + organization_cls = schema.classes['Organization'] + for s in ['id', 'name', 'age', 'gender', 'has medical history']: + assert s in schema.slots + assert s in person_cls.slots + assert schema.slots['id'].identifier + assert schema.slots['id'].exact_mappings == ['sdo:identifier'] + assert person_cls.slot_usage['id'].identifier + assert person_cls.slot_usage['has medical history'].multivalued + assert person_cls.status == 'release' + anns = schema.slots['description'].annotations + assert anns + assert anns['special'] + assert anns['special'].value == 'my_val' + assert anns['special2'].value == 'my_val2' + assert not person_cls.slot_usage['has medical history'].required + assert person_cls.slot_usage['has medical history'].status == 'testing' + assert 'name' in organization_cls.slots + assert len(person_cls.exact_mappings) == 2 + assert 'wikidata:Q215627' in person_cls.exact_mappings + assert 'sdo:Person' in person_cls.exact_mappings +