From 4c2570f4c8e3126f170e5b8cce1535b773e1e1c5 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Tue, 29 Oct 2024 22:31:39 -0400 Subject: [PATCH 1/5] prototype templated schema-as-yaml modifier --- qy2-config.tsv | 49 +++ src/nmdc_submission_schema/scripts/qy2.py | 361 ++++++++++++++++++++++ 2 files changed, 410 insertions(+) create mode 100644 qy2-config.tsv create mode 100644 src/nmdc_submission_schema/scripts/qy2.py diff --git a/qy2-config.tsv b/qy2-config.tsv new file mode 100644 index 00000000..b896fb19 --- /dev/null +++ b/qy2-config.tsv @@ -0,0 +1,49 @@ +rank active scope criterion_field criterion_value criterion_in_set element_key operation target_field target_value success +1 true slot_or_usage range TimestampValue set range string true +2 true slot_or_usage range TextValue set range string true +3 true slot_or_usage range QuantityValue set range string true +4 true slot_or_usage range OntologyClass set range string true +5 true slot_or_usage range GeolocationValue set range string true +6 true slot_or_usage range ControlledTermValue set range string true +7 true slot_or_usage range ControlledIdentifiedTermValue set range string true +8 true slot_or_usage range enum_keys delete_field multivalued true except for oxy_stat_samp +9 true slot_or_usage range type_keys delete_field multivalued true +11 true slot_or_usage dna_dnase set range YesNoEnum not in usages +12 true slot_or_usage dnase_rna set range YesNoEnum not in usages +13 true slot_or_usage oxy_stat_samp set range rel_to_oxygen_enum not in usages +14 true slot_or_usage oxy_stat_samp delete_field multivalued +15 true slot_or_usage sample_link set range string true +16 true slot_or_usage delete_element latitude true +17 true slot_or_usage delete_element longitude true +18 true slot_or_usage delete_element has_maximum_numeric_value true +19 true slot_or_usage delete_element has_minimum_numeric_value true +20 true slot_or_usage delete_element has_numeric_value true except for some examples +21 true 
slot_or_usage delete_element has_raw_value true except for some examples and comments +22 true slot_or_usage delete_element has_unit true except for some examples +23 true slot_or_usage delete_element term true +24 true class delete_element AttributeValue true +25 true class delete_element ControlledIdentifiedTermValue true +26 true class delete_element ControlledTermValue true +27 true class delete_element GeolocationValue true +28 true class delete_element OntologyClass true +29 true class delete_element QuantityValue true +30 true class delete_element TextValue true +31 true class delete_element TimestampValue true +32 true all_elements delete_field domain_of true +33 true all_elements delete_field from_schema true +34 true all_elements delete_field domain true +36 slot_or_usage range class_keys set range string +37 slot_or_usage range string delete_field multivalued +38 slot_or_usage range float delete_field multivalued +39 slot_or_usage range double delete_field multivalued +40 slot_or_usage range integer delete_field multivalued +41 slot_or_usage range uriorcurie delete_field multivalued +42 true all_elements delete_field name +34.2 true all_elements delete_field owner true +33 true all_elements delete_field from_schema "" +33 true all_elements delete_field from_schema "" +33 true all_elements delete_field from_schema "" +24 true class delete_element NamedThing true +24 true slot_or_usage delete_element name true +24 true slot_or_usage range string delete_field inlined true +24 true slot_or_usage range string delete_field inlined_as_list true \ No newline at end of file diff --git a/src/nmdc_submission_schema/scripts/qy2.py b/src/nmdc_submission_schema/scripts/qy2.py new file mode 100644 index 00000000..e5d9adb1 --- /dev/null +++ b/src/nmdc_submission_schema/scripts/qy2.py @@ -0,0 +1,361 @@ +import csv + +import yaml +from glom import glom, assign +import click +from typing import Dict, Any, Optional, List +import re +from typing import Dict, Any +import 
click + + +def load_yaml(file_path: str) -> Dict[str, Any]: + """Load a YAML file and return its content as a dictionary.""" + with open(file_path, 'r') as file: + return yaml.safe_load(file) + + +def load_tsv(file_path: str) -> List[Dict[str | Any, str | Any]]: + """Load a TSV file and return its content as a list of dictionaries.""" + with open(file_path, mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter='\t') + return [row for row in reader] # Convert reader to a list of dictionaries + + +def save_yaml(data: Dict[str, Any], file_path: str) -> None: + """Save a dictionary to a YAML file.""" + with open(file_path, 'w') as file: + yaml.safe_dump(data, file) + + +@click.command() +@click.option("--schema", required=True, type=click.Path(exists=True), help="Path to the input YAML file.") +@click.option("--config", required=True, type=click.Path(exists=True), + help="Path to the TSV configuration file specifying modifications.") +@click.option("--output", required=True, type=click.Path(), help="Path to save the modified YAML file.") +def main(schema: str, config: str, output: str) -> None: + """ + Modify the input YAML based on rules in the config file and save the result to the output file. + + Args: + schema (str): Path to the input YAML file. + config (str): Path to the configuration file specifying modifications. + output (str): Path to save the modified YAML file. 
+ """ + # Load input data and configuration + schema = load_yaml(schema) + + enums = schema.get("enums", {}) + enum_keys = set(enums.keys()) # Collect the keys for quick lookup + types = schema.get("types", {}) + type_keys = set(types.keys()) + classes = schema.get("classes", {}) + class_keys = set(classes.keys()) + sets = dict() + sets['enum_keys'] = enum_keys + sets['type_keys'] = type_keys + sets['class_keys'] = class_keys + sets_keys = set(sets.keys()) + + raw_config = load_tsv(config) + + for rc in raw_config: + scope = rc.get('scope') + criterion_field = rc.get('criterion_field') + criterion_value = rc.get('criterion_value') + criterion_in_set = rc.get('criterion_in_set') + operation = rc.get('operation') + target_field = rc.get('target_field') + target_value = rc.get('target_value') + element_key = rc.get('element_key') + active = rc.get('active') + # make active uppercase + if active.upper() != "TRUE": + continue + + if operation == 'set' \ + and criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key in schema['slots'] \ + and scope in ['slot', 'slot_or_usage'] \ + and target_field in schema['slots'][element_key]: + schema['slots'][element_key][target_field] = target_value + + if operation == 'delete_element' \ + and criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and scope in ['slot', 'slot_or_usage'] \ + and target_field in schema['slots']: + del schema['slots'][target_field] + + for ck, cv in schema['classes'].items(): + if 'slots' in cv: + slots_list = cv['slots'] + if operation == 'delete_element' \ + and criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and scope in ['usage', 'slot_or_usage'] \ + and target_field in slots_list: + slots_list.remove(target_field) + if 'slot_usage' in cv: + if operation == 'delete_element' \ + and criterion_field == "" \ + and criterion_in_set == 
"" \ + and criterion_value == "" \ + and element_key == "" \ + and scope in ['usage', 'slot_or_usage'] \ + and target_field in cv['slot_usage']: + del cv['slot_usage'][target_field] + if operation == 'set' \ + and criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key in cv['slot_usage'] \ + and scope in ['usage', 'slot_or_usage'] \ + and target_field in cv['slot_usage'][element_key]: + cv['slot_usage'][element_key][target_field] = target_value + + if operation == 'delete_element' \ + and criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and scope in ['class'] \ + and target_field in schema['classes']: + del schema['classes'][target_field] + + for ok, ov in schema.items(): # o for outer + # print(ok) + # + # types + # + # classes + # enums + # slots + # subsets + if ok == 'slots' \ + and scope in ['slot', 'slot_or_usage', 'all_elements']: + for sk, sv in ov.items(): # s for slot + if criterion_field in sv \ + and criterion_in_set == "" \ + and element_key == "" \ + and operation == 'set' \ + and sv[criterion_field] == criterion_value: + # click.echo( + # f"setting {target_field} to {target_value} in slot {sk} because {criterion_field} == {criterion_value}") + sv[target_field] = target_value + if criterion_field in sv \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and sv[criterion_field] in sets[criterion_in_set] \ + and target_field in sv \ + and target_value == "": + click.echo( + f"deleting {target_field} in slot {sk} because {sv[criterion_field]} is in {criterion_in_set}") + del sv[target_field] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in sv: + # click.echo(f"globally deleting {target_field} in slot {sk}") + del sv[target_field] + if sk == element_key \ + and criterion_field == 
"" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and operation == 'set': + # click.echo(f"setting {target_field} to {target_value} in slot {sk}") + sv[target_field] = target_value + if sk == element_key \ + and criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and operation == 'delete_field' \ + and target_field in sv: + click.echo(f"deleting {target_field} in slot {sk}") + del sv[target_field] + if criterion_field in sv \ + and element_key == "" \ + and operation == 'delete_field' \ + and sv[criterion_field] == criterion_value \ + and target_field in sv \ + and target_value == "": + click.echo( + f"deleting {target_field} in slot {sk} because {sv[criterion_field]} == {criterion_value}") + del sv[target_field] + if ok == 'classes' \ + and scope in ['class', 'all_elements']: + for ck, cv in ov.items(): # c for class + if criterion_field in cv \ + and criterion_in_set == "" \ + and element_key == "" \ + and operation == 'set' \ + and cv[criterion_field] == criterion_value: + click.echo( + f"setting {target_field} to {target_value} in class {ck} because {criterion_field} == {criterion_value}") + cv[target_field] = target_value + if criterion_field in cv \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and cv[criterion_field] in sets[criterion_in_set] \ + and target_field in cv \ + and target_value == "": + click.echo( + f"deleting {target_field} in class {ck} because {cv[criterion_field]} is in {criterion_in_set}") + del cv[target_field] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in cv: + # click.echo(f"globally deleting {target_field} in class {ck}") + del cv[target_field] + if ok == 'classes' \ + and scope in ['usage', 'slot_or_usage', 'all_elements']: + for ck, cv in ov.items(): # c for class + if 'slot_usage' in cv: + for sk, sv in 
cv['slot_usage'].items(): + # click.echo(f"checking slot_usage in class {ck} slot {sk}") + if criterion_field in sv \ + and criterion_in_set == "" \ + and element_key == "" \ + and operation == 'set' \ + and sv[criterion_field] == criterion_value: + # click.echo( + # f"setting {target_field} to {target_value} in class {ck} usage of slot {sk} because {criterion_field} == {criterion_value}") + sv[target_field] = target_value + if criterion_field in sv \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and sv[criterion_field] in sets[criterion_in_set] \ + and target_field in sv \ + and target_value == "": + # click.echo( + # f"deleting {target_field} in class {ck} usage {sk} because {sv[criterion_field]} is in {criterion_in_set}") + del sv[target_field] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in sv: + # click.echo(f"globally deleting {target_field} in class {ck} usage {sk}") + del sv[target_field] + if criterion_field in sv \ + and element_key == "" \ + and operation == 'delete_field' \ + and sv[criterion_field] == criterion_value \ + and target_field in sv \ + and target_value == "": + # click.echo( + # f"deleting {target_field} in {ck} usage {sk} because {sv[criterion_field]} == {criterion_value}") + del sv[target_field] + if ok == 'enums' \ + and scope in ['enum', 'all_elements']: + for ek, ev in ov.items(): + # click.echo(f"checking enum {ek}") + if criterion_field in ev \ + and criterion_in_set == "" \ + and element_key == "" \ + and operation == 'set' \ + and ev[criterion_field] == criterion_value: + click.echo( + f"setting {target_field} to {target_value} in enum {ek} because {criterion_field} == {criterion_value}") + ev[target_field] = target_value + if criterion_field in ev \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and 
ev[criterion_field] in sets[criterion_in_set] \ + and target_field in ev \ + and target_value == "": + click.echo( + f"deleting {target_field} in enum {ek} because {ek[criterion_field]} is in {criterion_in_set}") + del ev[target_field] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in ev: + # click.echo(f"globally deleting {target_field} in enum {ek}") + del ev[target_field] + if ok == 'subsets' \ + and scope in ['all_elements']: + for uk, uv in ov.items(): + # click.echo(f"checking subset {uk}") + if criterion_field in uv \ + and criterion_in_set == "" \ + and element_key == "" \ + and operation == 'set' \ + and uv[criterion_field] == criterion_value: + click.echo( + f"setting {target_field} to {target_value} in subset {uk} because {criterion_field} == {criterion_value}") + uv[target_field] = target_value + if criterion_field in uv \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and uv[criterion_field] in sets[criterion_in_set] \ + and target_field in uv \ + and target_value == "": + click.echo( + f"deleting {target_field} in subset {uk} because {uk[criterion_field]} is in {criterion_in_set}") + del uv[target_field] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in uv: + # click.echo(f"globally deleting {target_field} in subset {uk}") + del uv[target_field] + if ok == 'types' \ + and scope in ['all_elements']: # or might want to remove all linkml types and add an import + for tk, tv in ov.items(): + # click.echo(f"checking type {tk}") + if criterion_field in tv \ + and criterion_in_set == "" \ + and element_key == "" \ + and operation == 'set' \ + and tv[criterion_field] == criterion_value: + click.echo( + f"setting {target_field} to {target_value} in type {tk} because 
{criterion_field} == {criterion_value}") + tv[target_field] = target_value + if criterion_field in tv \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and tv[criterion_field] in sets[criterion_in_set] \ + and target_field in tv \ + and target_value == "": + click.echo( + f"deleting {target_field} in type {tk} because {tk[criterion_field]} is in {criterion_in_set}") + del tv[target_field] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in tv: + # click.echo(f"globally deleting {target_field} in type {tk}") + del tv[target_field] + + # Save the modified data to the output file + save_yaml(schema, output) + # click.echo(f"Modifications applied to {input_yaml} and saved to {output_file} as per {config_file}.") + + +if __name__ == "__main__": + main() From 43ce3567f59883bf7c9a5be4717c029ee8c3bd81 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Tue, 29 Oct 2024 23:13:03 -0400 Subject: [PATCH 2/5] remove titles from aliases --- qy2-config.tsv | 3 +- src/nmdc_submission_schema/scripts/qy2.py | 96 +++++++++++++++++++---- 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/qy2-config.tsv b/qy2-config.tsv index b896fb19..eb9d6b4e 100644 --- a/qy2-config.tsv +++ b/qy2-config.tsv @@ -46,4 +46,5 @@ rank active scope criterion_field criterion_value criterion_in_set element_key o 24 true class delete_element NamedThing true 24 true slot_or_usage delete_element name true 24 true slot_or_usage range string delete_field inlined true -24 true slot_or_usage range string delete_field inlined_as_list true \ No newline at end of file +24 true slot_or_usage range string delete_field inlined_as_list true +24 true all_elements "" "" delete_field text true \ No newline at end of file diff --git a/src/nmdc_submission_schema/scripts/qy2.py b/src/nmdc_submission_schema/scripts/qy2.py index e5d9adb1..b323585e 100644 --- a/src/nmdc_submission_schema/scripts/qy2.py +++ b/src/nmdc_submission_schema/scripts/qy2.py @@ -33,7 +33,11 @@ def save_yaml(data: Dict[str, Any], file_path: str) -> None: @click.option("--config", required=True, type=click.Path(exists=True), help="Path to the TSV configuration file specifying modifications.") @click.option("--output", required=True, type=click.Path(), help="Path to save the modified YAML file.") -def main(schema: str, config: str, output: str) -> None: +@click.option('--collapse-annotations/--no-collapse-annotations', default=True, + help="convert annotations to simple dict form.") +@click.option('--drop-redundant-aliases/--no-drop-redundant-aliases', default=True, + help="drop aliases that are the same as the title.") +def main(schema: str, config: str, output: str, collapse_annotations: bool, drop_redundant_aliases: bool) -> None: """ Modify the input YAML based on rules in the config file and save the result to the output file. 
@@ -156,8 +160,8 @@ def main(schema: str, config: str, output: str) -> None: and sv[criterion_field] in sets[criterion_in_set] \ and target_field in sv \ and target_value == "": - click.echo( - f"deleting {target_field} in slot {sk} because {sv[criterion_field]} is in {criterion_in_set}") + # click.echo( + # f"deleting {target_field} in slot {sk} because {sv[criterion_field]} is in {criterion_in_set}") del sv[target_field] if criterion_field == "" \ and criterion_in_set == "" \ @@ -188,9 +192,25 @@ def main(schema: str, config: str, output: str) -> None: and sv[criterion_field] == criterion_value \ and target_field in sv \ and target_value == "": - click.echo( - f"deleting {target_field} in slot {sk} because {sv[criterion_field]} == {criterion_value}") + # click.echo( + # f"deleting {target_field} in slot {sk} because {criterion_field} == {criterion_value}") del sv[target_field] + if collapse_annotations and 'annotations' in sv: + # click.echo(f"collapsing annotations for slot {sk}") + for ak, av in sv['annotations'].items(): + # click.echo(f"setting {ak} to {av}") + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk}") + sv['annotations'][ak] = av['tag'] + if drop_redundant_aliases and 'aliases' in sv: + current_aliases = sv['aliases'] + for alias in sv['aliases']: + if 'title' in sv and alias == sv['title']: + current_aliases.remove(alias) + if len(current_aliases) == 0: + del sv['aliases'] + else: + sv['aliases'] = current_aliases if ok == 'classes' \ and scope in ['class', 'all_elements']: for ck, cv in ov.items(): # c for class @@ -209,8 +229,8 @@ def main(schema: str, config: str, output: str) -> None: and cv[criterion_field] in sets[criterion_in_set] \ and target_field in cv \ and target_value == "": - click.echo( - f"deleting {target_field} in class {ck} because {cv[criterion_field]} is in {criterion_in_set}") + # click.echo( + # f"deleting {target_field} in class {ck} because {cv[criterion_field]} is in 
{criterion_in_set}") del cv[target_field] if criterion_field == "" \ and criterion_in_set == "" \ @@ -220,6 +240,13 @@ def main(schema: str, config: str, output: str) -> None: and target_field in cv: # click.echo(f"globally deleting {target_field} in class {ck}") del cv[target_field] + if collapse_annotations and 'annotations' in cv: + # click.echo(f"collapsing annotations for slot {sk}") + for ak, av in cv['annotations'].items(): + # click.echo(f"setting {ak} to {av}") + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk}") + cv['annotations'][ak] = av['tag'] if ok == 'classes' \ and scope in ['usage', 'slot_or_usage', 'all_elements']: for ck, cv in ov.items(): # c for class @@ -259,8 +286,24 @@ def main(schema: str, config: str, output: str) -> None: and target_field in sv \ and target_value == "": # click.echo( - # f"deleting {target_field} in {ck} usage {sk} because {sv[criterion_field]} == {criterion_value}") + # f"deleting {target_field} in {ck} usage {sk} because {criterion_field} == {criterion_value}") del sv[target_field] + if collapse_annotations and 'annotations' in sv: + # click.echo(f"collapsing annotations for slot {sk}") + for ak, av in sv['annotations'].items(): + # click.echo(f"setting {ak} to {av}") + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk}") + sv['annotations'][ak] = av['tag'] + if drop_redundant_aliases and 'aliases' in sv: + current_aliases = sv['aliases'] + for alias in sv['aliases']: + if 'title' in sv and alias == sv['title']: + current_aliases.remove(alias) + if len(current_aliases) == 0: + del sv['aliases'] + else: + sv['aliases'] = current_aliases if ok == 'enums' \ and scope in ['enum', 'all_elements']: for ek, ev in ov.items(): @@ -280,8 +323,8 @@ def main(schema: str, config: str, output: str) -> None: and ev[criterion_field] in sets[criterion_in_set] \ and target_field in ev \ and target_value == "": - click.echo( - f"deleting {target_field} in 
enum {ek} because {ek[criterion_field]} is in {criterion_in_set}") + # click.echo( + # f"deleting {target_field} in enum {ek} because {ek[criterion_field]} is in {criterion_in_set}") del ev[target_field] if criterion_field == "" \ and criterion_in_set == "" \ @@ -291,6 +334,31 @@ def main(schema: str, config: str, output: str) -> None: and target_field in ev: # click.echo(f"globally deleting {target_field} in enum {ek}") del ev[target_field] + if collapse_annotations and 'annotations' in ev: + # click.echo(f"collapsing annotations for enum {ev}") + for ak, av in ev['annotations'].items(): + # click.echo(f"setting {ak} to {av}") + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk}") + ev['annotations'][ak] = av['tag'] + if 'permissible_values' in ev: + for vk, vv in ev['permissible_values'].items(): + # click.echo(f"checking permissible value {vk} in enum {ek}") + if collapse_annotations and 'annotations' in vv: + # click.echo(f"collapsing annotations for permissible value {vk} in enum {ek}") + for ak, av in vv['annotations'].items(): + # click.echo(f"setting {ak} to {av}") + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk}") + vv['annotations'][ak] = av['tag'] + if criterion_field == "" \ + and criterion_in_set == "" \ + and criterion_value == "" \ + and element_key == "" \ + and operation == 'delete_field' \ + and target_field in vv: + # click.echo(f"globally deleting {target_field} in permissible value {vk} of enum {ek}") + del vv[target_field] if ok == 'subsets' \ and scope in ['all_elements']: for uk, uv in ov.items(): @@ -310,8 +378,8 @@ def main(schema: str, config: str, output: str) -> None: and uv[criterion_field] in sets[criterion_in_set] \ and target_field in uv \ and target_value == "": - click.echo( - f"deleting {target_field} in subset {uk} because {uk[criterion_field]} is in {criterion_in_set}") + # click.echo( + # f"deleting {target_field} in subset {uk} because 
{uk[criterion_field]} is in {criterion_in_set}") del uv[target_field] if criterion_field == "" \ and criterion_in_set == "" \ @@ -340,8 +408,8 @@ def main(schema: str, config: str, output: str) -> None: and tv[criterion_field] in sets[criterion_in_set] \ and target_field in tv \ and target_value == "": - click.echo( - f"deleting {target_field} in type {tk} because {tk[criterion_field]} is in {criterion_in_set}") + # click.echo( + # f"deleting {target_field} in type {tk} because {tk[criterion_field]} is in {criterion_in_set}") del tv[target_field] if criterion_field == "" \ and criterion_in_set == "" \ From b1658fe5bdc0ca30369e53687f6e4b7ad94a819b Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Tue, 29 Oct 2024 23:16:53 -0400 Subject: [PATCH 3/5] set tag to value --- src/nmdc_submission_schema/scripts/qy2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nmdc_submission_schema/scripts/qy2.py b/src/nmdc_submission_schema/scripts/qy2.py index b323585e..55ce1ab2 100644 --- a/src/nmdc_submission_schema/scripts/qy2.py +++ b/src/nmdc_submission_schema/scripts/qy2.py @@ -201,7 +201,7 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop # click.echo(f"setting {ak} to {av}") if 'tag' in av and 'value' in av: # click.echo(f"setting {ak} to {av['value']} for {sk}") - sv['annotations'][ak] = av['tag'] + sv['annotations'][ak] = av['value'] if drop_redundant_aliases and 'aliases' in sv: current_aliases = sv['aliases'] for alias in sv['aliases']: @@ -246,7 +246,7 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop # click.echo(f"setting {ak} to {av}") if 'tag' in av and 'value' in av: # click.echo(f"setting {ak} to {av['value']} for {sk}") - cv['annotations'][ak] = av['tag'] + cv['annotations'][ak] = av['value'] if ok == 'classes' \ and scope in ['usage', 'slot_or_usage', 'all_elements']: for ck, cv in ov.items(): # c for class @@ -294,7 +294,7 @@ def main(schema: str, config: 
str, output: str, collapse_annotations: bool, drop # click.echo(f"setting {ak} to {av}") if 'tag' in av and 'value' in av: # click.echo(f"setting {ak} to {av['value']} for {sk}") - sv['annotations'][ak] = av['tag'] + sv['annotations'][ak] = av['value'] if drop_redundant_aliases and 'aliases' in sv: current_aliases = sv['aliases'] for alias in sv['aliases']: @@ -340,7 +340,7 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop # click.echo(f"setting {ak} to {av}") if 'tag' in av and 'value' in av: # click.echo(f"setting {ak} to {av['value']} for {sk}") - ev['annotations'][ak] = av['tag'] + ev['annotations'][ak] = av['value'] if 'permissible_values' in ev: for vk, vv in ev['permissible_values'].items(): # click.echo(f"checking permissible value {vk} in enum {ek}") @@ -350,7 +350,7 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop # click.echo(f"setting {ak} to {av}") if 'tag' in av and 'value' in av: # click.echo(f"setting {ak} to {av['value']} for {sk}") - vv['annotations'][ak] = av['tag'] + vv['annotations'][ak] = av['value'] if criterion_field == "" \ and criterion_in_set == "" \ and criterion_value == "" \ From 5c4b5dc82828899bc232b966eb13c9139d0036af Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Wed, 30 Oct 2024 01:17:56 -0400 Subject: [PATCH 4/5] needs any_of handling --- qy2-config.tsv | 94 ++-- src/nmdc_submission_schema/scripts/qy2.py | 656 +++++++++++++++------- 2 files changed, 507 insertions(+), 243 deletions(-) diff --git a/qy2-config.tsv b/qy2-config.tsv index eb9d6b4e..b6f57e50 100644 --- a/qy2-config.tsv +++ b/qy2-config.tsv @@ -1,50 +1,44 @@ -rank active scope criterion_field criterion_value criterion_in_set element_key operation target_field target_value success -1 true slot_or_usage range TimestampValue set range string true -2 true slot_or_usage range TextValue set range string true -3 true slot_or_usage range QuantityValue set range string true -4 true slot_or_usage range OntologyClass set range string true -5 true slot_or_usage range GeolocationValue set range string true -6 true slot_or_usage range ControlledTermValue set range string true -7 true slot_or_usage range ControlledIdentifiedTermValue set range string true -8 true slot_or_usage range enum_keys delete_field multivalued true except for oxy_stat_samp -9 true slot_or_usage range type_keys delete_field multivalued true -11 true slot_or_usage dna_dnase set range YesNoEnum not in usages -12 true slot_or_usage dnase_rna set range YesNoEnum not in usages -13 true slot_or_usage oxy_stat_samp set range rel_to_oxygen_enum not in usages -14 true slot_or_usage oxy_stat_samp delete_field multivalued -15 true slot_or_usage sample_link set range string true -16 true slot_or_usage delete_element latitude true -17 true slot_or_usage delete_element longitude true -18 true slot_or_usage delete_element has_maximum_numeric_value true -19 true slot_or_usage delete_element has_minimum_numeric_value true -20 true slot_or_usage delete_element has_numeric_value true except for some examples -21 true slot_or_usage delete_element has_raw_value true except for some examples and comments -22 true slot_or_usage delete_element has_unit true except for some examples -23 true slot_or_usage 
delete_element term true -24 true class delete_element AttributeValue true -25 true class delete_element ControlledIdentifiedTermValue true -26 true class delete_element ControlledTermValue true -27 true class delete_element GeolocationValue true -28 true class delete_element OntologyClass true -29 true class delete_element QuantityValue true -30 true class delete_element TextValue true -31 true class delete_element TimestampValue true -32 true all_elements delete_field domain_of true -33 true all_elements delete_field from_schema true -34 true all_elements delete_field domain true -36 slot_or_usage range class_keys set range string -37 slot_or_usage range string delete_field multivalued -38 slot_or_usage range float delete_field multivalued -39 slot_or_usage range double delete_field multivalued -40 slot_or_usage range integer delete_field multivalued -41 slot_or_usage range uriorcurie delete_field multivalued -42 true all_elements delete_field name -34.2 true all_elements delete_field owner true -33 true all_elements delete_field from_schema "" -33 true all_elements delete_field from_schema "" -33 true all_elements delete_field from_schema "" -24 true class delete_element NamedThing true -24 true slot_or_usage delete_element name true -24 true slot_or_usage range string delete_field inlined true -24 true slot_or_usage range string delete_field inlined_as_list true -24 true all_elements "" "" delete_field text true \ No newline at end of file +active scope criterion_field criterion_in_set criterion_value element_key operation target_field target_value +true slot_or_usage range TimestampValue set range string +true slot_or_usage range TextValue set range string +true slot_or_usage range QuantityValue set range string +true slot_or_usage range OntologyClass set range string +true slot_or_usage range GeolocationValue set range string +true slot_or_usage range ControlledTermValue set range string +true slot_or_usage range ControlledIdentifiedTermValue set range string 
+true slot_or_usage range enum_keys delete_field multivalued NULL +true slot_or_usage range type_keys delete_field multivalued NULL +true slot_or_usage dna_dnase set range YesNoEnum +true slot_or_usage dnase_rna set range YesNoEnum +true slot_or_usage oxy_stat_samp set range rel_to_oxygen_enum +true slot_or_usage oxy_stat_samp delete_field multivalued "" +true slot_or_usage sample_link set range string +true slot_or_usage delete_element latitude NULL +true slot_or_usage delete_element longitude NULL +true slot_or_usage delete_element has_maximum_numeric_value NULL +true slot_or_usage delete_element has_minimum_numeric_value NULL +true slot_or_usage delete_element has_numeric_value NULL +true slot_or_usage delete_element has_raw_value NULL +true slot_or_usage delete_element has_unit NULL +true slot_or_usage delete_element term NULL +true class delete_element AttributeValue NULL +true class delete_element ControlledIdentifiedTermValue NULL +true class delete_element ControlledTermValue NULL +true class delete_element GeolocationValue NULL +true class delete_element OntologyClass NULL +true class delete_element QuantityValue NULL +true class delete_element TextValue NULL +true class delete_element TimestampValue NULL +true all_elements delete_field domain_of NULL +true all_elements delete_field from_schema NULL +true all_elements delete_field domain NULL +true all_elements delete_field name "" +true all_elements delete_field owner NULL +true all_elements delete_field from_schema NULL +true all_elements delete_field from_schema NULL +true all_elements delete_field from_schema NULL +true class delete_element NamedThing NULL +true slot_or_usage delete_element name NULL +true slot_or_usage range string delete_field inlined NULL +true slot_or_usage range string delete_field inlined_as_list NULL +true all_elements "" "" delete_field text NULL diff --git a/src/nmdc_submission_schema/scripts/qy2.py b/src/nmdc_submission_schema/scripts/qy2.py index 55ce1ab2..43bbe0f2 100644 
--- a/src/nmdc_submission_schema/scripts/qy2.py +++ b/src/nmdc_submission_schema/scripts/qy2.py @@ -1,4 +1,5 @@ import csv +import pprint import yaml from glom import glom, assign @@ -8,6 +9,8 @@ from typing import Dict, Any import click +emptyish = ["", "NULL", None] # standardize the config values to just one of these + def load_yaml(file_path: str) -> Dict[str, Any]: """Load a YAML file and return its content as a dictionary.""" @@ -59,10 +62,96 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop sets['enum_keys'] = enum_keys sets['type_keys'] = type_keys sets['class_keys'] = class_keys - sets_keys = set(sets.keys()) + # sets_keys = set(sets.keys()) raw_config = load_tsv(config) + # pprint.pprint(raw_config) + + for sk, sv in schema['slots'].items(): + if collapse_annotations and 'annotations' in sv: # pull out of rc loop + for ak, av in sv['annotations'].items(): + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk}") + sv['annotations'][ak] = av['value'] + if drop_redundant_aliases and 'aliases' in sv: + current_aliases = sv['aliases'] + for alias in sv['aliases']: + if 'title' in sv and alias == sv['title']: + # click.echo(f"removing redundant alias {alias} from slot {sk}") + current_aliases.remove(alias) + if len(current_aliases) == 0: + del sv['aliases'] + else: + sv['aliases'] = current_aliases + for ck, cv in schema['classes'].items(): + if collapse_annotations and 'annotations' in cv: + for ak, av in cv['annotations'].items(): + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {ck}") + cv['annotations'][ak] = av['value'] + if drop_redundant_aliases and 'aliases' in cv: + current_aliases = cv['aliases'] + for alias in cv['aliases']: + if 'title' in cv and alias == cv['title']: + click.echo(f"removing redundant alias {alias} from {ck}") + current_aliases.remove(alias) + if len(current_aliases) == 0: + del cv['aliases'] + else: + cv['aliases'] = 
current_aliases + if 'slot_usage' in cv: + for sk, sv in cv['slot_usage'].items(): + if collapse_annotations and 'annotations' in sv: + for ak, av in sv['annotations'].items(): + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {sk} in {ck}") + sv['annotations'][ak] = av['value'] + if drop_redundant_aliases and 'aliases' in sv: + current_aliases = sv['aliases'] + for alias in sv['aliases']: + if 'title' in sv and alias == sv['title']: + # click.echo(f"removing redundant alias {alias} from slot {sk} in {ck}") + current_aliases.remove(alias) + if len(current_aliases) == 0: + del sv['aliases'] + else: + sv['aliases'] = current_aliases + + for ek, ev in schema['enums'].items(): + if collapse_annotations and 'annotations' in ev: + for ak, av in ev['annotations'].items(): + if 'tag' in av and 'value' in av: + click.echo(f"setting {ak} to {av['value']} for {ek}") + ev['annotations'][ak] = av['value'] + if drop_redundant_aliases and 'aliases' in ev: + current_aliases = ev['aliases'] + for alias in ev['aliases']: + if 'title' in ev and alias == ev['title']: + click.echo(f"removing redundant alias {alias} from {ek}") + current_aliases.remove(alias) + if len(current_aliases) == 0: + del ev['aliases'] + else: + ev['aliases'] = current_aliases + if 'permissible_values' in ev: + for vk, vv in ev['permissible_values'].items(): + if collapse_annotations and 'annotations' in vv: + for ak, av in vv['annotations'].items(): + if 'tag' in av and 'value' in av: + # click.echo(f"setting {ak} to {av['value']} for {vk} in {ek}") + vv['annotations'][ak] = av['value'] + if drop_redundant_aliases and 'aliases' in vv: + current_aliases = vv['aliases'] + for alias in vv['aliases']: + if 'title' in vv and alias == vv['title']: + click.echo(f"removing redundant alias {alias} from {vk} in {ek}") + current_aliases.remove(alias) + if len(current_aliases) == 0: + del vv['aliases'] + else: + vv['aliases'] = current_aliases + for rc in raw_config: scope = 
rc.get('scope') criterion_field = rc.get('criterion_field') @@ -78,19 +167,19 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop continue if operation == 'set' \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ and element_key in schema['slots'] \ and scope in ['slot', 'slot_or_usage'] \ and target_field in schema['slots'][element_key]: schema['slots'][element_key][target_field] = target_value if operation == 'delete_element' \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and scope in ['slot', 'slot_or_usage'] \ and target_field in schema['slots']: del schema['slots'][target_field] @@ -99,36 +188,36 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop if 'slots' in cv: slots_list = cv['slots'] if operation == 'delete_element' \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and scope in ['usage', 'slot_or_usage'] \ and target_field in slots_list: slots_list.remove(target_field) if 'slot_usage' in cv: if operation == 'delete_element' \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and scope in ['usage', 'slot_or_usage'] \ and target_field in cv['slot_usage']: del cv['slot_usage'][target_field] if operation == 'set' \ - and 
criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ and element_key in cv['slot_usage'] \ and scope in ['usage', 'slot_or_usage'] \ and target_field in cv['slot_usage'][element_key]: cv['slot_usage'][element_key][target_field] = target_value if operation == 'delete_element' \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and scope in ['class'] \ and target_field in schema['classes']: del schema['classes'][target_field] @@ -146,279 +235,460 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop and scope in ['slot', 'slot_or_usage', 'all_elements']: for sk, sv in ov.items(): # s for slot if criterion_field in sv \ - and criterion_in_set == "" \ - and element_key == "" \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'set' \ - and sv[criterion_field] == criterion_value: + and sv[criterion_field] == criterion_value \ + and target_field not in emptyish: # click.echo( # f"setting {target_field} to {target_value} in slot {sk} because {criterion_field} == {criterion_value}") sv[target_field] = target_value + if sk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + # click.echo(f"setting {target_field} to {target_value} in slot {sk}") + sv[target_field] = target_value if criterion_field in sv \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and 
criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and sv[criterion_field] in sets[criterion_in_set] \ and target_field in sv \ - and target_value == "": + and target_value in emptyish: # click.echo( # f"deleting {target_field} in slot {sk} because {sv[criterion_field]} is in {criterion_in_set}") del sv[target_field] - if criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + if criterion_field in sv \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ - and target_field in sv: - # click.echo(f"globally deleting {target_field} in slot {sk}") + and sv[criterion_field] == criterion_value \ + and target_field in sv \ + and target_value in emptyish: + # click.echo( + # f"deleting {target_field} in slot {sk} because {criterion_field} == {criterion_value}") del sv[target_field] - if sk == element_key \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and operation == 'set': - # click.echo(f"setting {target_field} to {target_value} in slot {sk}") - sv[target_field] = target_value - if sk == element_key \ - and criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ - and target_field in sv: - click.echo(f"deleting {target_field} in slot {sk}") + and target_field in sv \ + and target_value in emptyish: + # click.echo(f"globally deleting {target_field} in slot {sk}") del sv[target_field] - if criterion_field in sv \ - and element_key == "" \ + if sk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in 
emptyish \ and operation == 'delete_field' \ - and sv[criterion_field] == criterion_value \ and target_field in sv \ - and target_value == "": - # click.echo( - # f"deleting {target_field} in slot {sk} because {criterion_field} == {criterion_value}") + and target_value in emptyish: + # click.echo(f"deleting {target_field} in slot {sk}") del sv[target_field] - if collapse_annotations and 'annotations' in sv: - # click.echo(f"collapsing annotations for slot {sk}") - for ak, av in sv['annotations'].items(): - # click.echo(f"setting {ak} to {av}") - if 'tag' in av and 'value' in av: - # click.echo(f"setting {ak} to {av['value']} for {sk}") - sv['annotations'][ak] = av['value'] - if drop_redundant_aliases and 'aliases' in sv: - current_aliases = sv['aliases'] - for alias in sv['aliases']: - if 'title' in sv and alias == sv['title']: - current_aliases.remove(alias) - if len(current_aliases) == 0: - del sv['aliases'] - else: - sv['aliases'] = current_aliases + if ok == 'classes' \ and scope in ['class', 'all_elements']: for ck, cv in ov.items(): # c for class if criterion_field in cv \ - and criterion_in_set == "" \ - and element_key == "" \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'set' \ - and cv[criterion_field] == criterion_value: + and cv[criterion_field] == criterion_value \ + and target_field not in emptyish: click.echo( f"setting {target_field} to {target_value} in class {ck} because {criterion_field} == {criterion_value}") cv[target_field] = target_value + if ck == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + click.echo(f"setting {target_field} to {target_value} in class {ck}") + cv[target_field] = target_value if criterion_field in cv \ - and criterion_value == "" \ - and element_key == "" \ + and 
criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and cv[criterion_field] in sets[criterion_in_set] \ and target_field in cv \ - and target_value == "": - # click.echo( - # f"deleting {target_field} in class {ck} because {cv[criterion_field]} is in {criterion_in_set}") + and target_value in emptyish: + click.echo( + f"deleting {target_field} in class {ck} because {cv[criterion_field]} is in {criterion_in_set}") + del cv[target_field] + if criterion_field in cv \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and cv[criterion_field] == criterion_value \ + and target_field in cv \ + and target_value in emptyish: + click.echo( + f"deleting {target_field} in class {ck} because {criterion_field} == {criterion_value}") del cv[target_field] - if criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ - and target_field in cv: + and target_field in cv \ + and target_value in emptyish: # click.echo(f"globally deleting {target_field} in class {ck}") del cv[target_field] - if collapse_annotations and 'annotations' in cv: - # click.echo(f"collapsing annotations for slot {sk}") - for ak, av in cv['annotations'].items(): - # click.echo(f"setting {ak} to {av}") - if 'tag' in av and 'value' in av: - # click.echo(f"setting {ak} to {av['value']} for {sk}") - cv['annotations'][ak] = av['value'] + if ck == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in emptyish \ + and operation == 'delete_field' \ 
+ and target_field in cv \ + and target_value in emptyish: + click.echo(f"deleting {target_field} in class {ck}") + del cv[target_field] + if ok == 'classes' \ and scope in ['usage', 'slot_or_usage', 'all_elements']: for ck, cv in ov.items(): # c for class if 'slot_usage' in cv: for sk, sv in cv['slot_usage'].items(): - # click.echo(f"checking slot_usage in class {ck} slot {sk}") if criterion_field in sv \ - and criterion_in_set == "" \ - and element_key == "" \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'set' \ - and sv[criterion_field] == criterion_value: - # click.echo( - # f"setting {target_field} to {target_value} in class {ck} usage of slot {sk} because {criterion_field} == {criterion_value}") + and sv[criterion_field] == criterion_value \ + and target_field not in emptyish: + click.echo( + f"setting {target_field} to {target_value} in class {ck} usage {sk} because {criterion_field} == {criterion_value}") + sv[target_field] = target_value + if sk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + # click.echo(f"setting {target_field} to {target_value} in class {ck} usage {sk}") sv[target_field] = target_value if criterion_field in sv \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and sv[criterion_field] in sets[criterion_in_set] \ and target_field in sv \ - and target_value == "": + and target_value in emptyish: # click.echo( # f"deleting {target_field} in class {ck} usage {sk} because {sv[criterion_field]} is in {criterion_in_set}") del sv[target_field] - if criterion_field == "" \ - and criterion_in_set == "" \ 
- and criterion_value == "" \ - and element_key == "" \ - and operation == 'delete_field' \ - and target_field in sv: - # click.echo(f"globally deleting {target_field} in class {ck} usage {sk}") - del sv[target_field] if criterion_field in sv \ - and element_key == "" \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and sv[criterion_field] == criterion_value \ and target_field in sv \ - and target_value == "": + and target_value in emptyish: # click.echo( - # f"deleting {target_field} in {ck} usage {sk} because {criterion_field} == {criterion_value}") + # f"deleting {target_field} in class {ck} usage {sk} because {criterion_field} == {criterion_value}") + del sv[target_field] + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and target_field in sv \ + and target_value in emptyish: + # click.echo(f"globally deleting {target_field} in class {ck} usage {sk}") + del sv[target_field] + if sk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in emptyish \ + and operation == 'delete_field' \ + and target_field in sv \ + and target_value in emptyish: + click.echo(f"deleting {target_field} class {ck} usage {sk}") del sv[target_field] - if collapse_annotations and 'annotations' in sv: - # click.echo(f"collapsing annotations for slot {sk}") - for ak, av in sv['annotations'].items(): - # click.echo(f"setting {ak} to {av}") - if 'tag' in av and 'value' in av: - # click.echo(f"setting {ak} to {av['value']} for {sk}") - sv['annotations'][ak] = av['value'] - if drop_redundant_aliases and 'aliases' in sv: - current_aliases = sv['aliases'] - for alias in sv['aliases']: - if 'title' in sv and alias == sv['title']: - 
current_aliases.remove(alias) - if len(current_aliases) == 0: - del sv['aliases'] - else: - sv['aliases'] = current_aliases if ok == 'enums' \ and scope in ['enum', 'all_elements']: for ek, ev in ov.items(): - # click.echo(f"checking enum {ek}") if criterion_field in ev \ - and criterion_in_set == "" \ - and element_key == "" \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'set' \ - and ev[criterion_field] == criterion_value: + and ev[criterion_field] == criterion_value \ + and target_field not in emptyish: click.echo( f"setting {target_field} to {target_value} in enum {ek} because {criterion_field} == {criterion_value}") ev[target_field] = target_value + if ek == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + click.echo(f"setting {target_field} to {target_value} in enum {ek}") + ev[target_field] = target_value if criterion_field in ev \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and ev[criterion_field] in sets[criterion_in_set] \ and target_field in ev \ - and target_value == "": - # click.echo( - # f"deleting {target_field} in enum {ek} because {ek[criterion_field]} is in {criterion_in_set}") + and target_value in emptyish: + click.echo( + f"deleting {target_field} in enum {ek} because {ev[criterion_field]} is in {criterion_in_set}") + del ev[target_field] + if criterion_field in ev \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and ev[criterion_field] == criterion_value \ + and 
target_field in ev \ + and target_value in emptyish: + click.echo( + f"deleting {target_field} in enum {ek} because {criterion_field} == {criterion_value}") del ev[target_field] - if criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ - and target_field in ev: + and target_field in ev \ + and target_value in emptyish: # click.echo(f"globally deleting {target_field} in enum {ek}") del ev[target_field] - if collapse_annotations and 'annotations' in ev: - # click.echo(f"collapsing annotations for enum {ev}") - for ak, av in ev['annotations'].items(): - # click.echo(f"setting {ak} to {av}") - if 'tag' in av and 'value' in av: - # click.echo(f"setting {ak} to {av['value']} for {sk}") - ev['annotations'][ak] = av['value'] - if 'permissible_values' in ev: - for vk, vv in ev['permissible_values'].items(): - # click.echo(f"checking permissible value {vk} in enum {ek}") - if collapse_annotations and 'annotations' in vv: - # click.echo(f"collapsing annotations for permissible value {vk} in enum {ek}") - for ak, av in vv['annotations'].items(): - # click.echo(f"setting {ak} to {av}") - if 'tag' in av and 'value' in av: - # click.echo(f"setting {ak} to {av['value']} for {sk}") - vv['annotations'][ak] = av['value'] - if criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ - and operation == 'delete_field' \ - and target_field in vv: - # click.echo(f"globally deleting {target_field} in permissible value {vk} of enum {ek}") - del vv[target_field] + if ek == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in emptyish \ + and operation == 'delete_field' \ + and target_field in ev \ + and target_value 
in emptyish: + click.echo(f"deleting {target_field} in enum {ek}") + del ev[target_field] + for vk, vv in ev.get('permissible_values', {}).items(): + if criterion_field in vv \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ + and operation == 'set' \ + and vv[criterion_field] == criterion_value \ + and target_field not in emptyish: + click.echo( + f"setting {target_field} to {target_value} in PV {vk} from enum {ek} because {criterion_field} == {criterion_value}") + vv[target_field] = target_value + if vk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + click.echo(f"setting {target_field} to {target_value} in PV {vk} from enum {ek}") + vv[target_field] = target_value + if criterion_field in vv \ + and criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and vv[criterion_field] in sets[criterion_in_set] \ + and target_field in vv \ + and target_value in emptyish: + click.echo( + f"deleting {target_field} in PV {vk} from enum {ek} because {vv[criterion_field]} is in {criterion_in_set}") + del vv[target_field] + if criterion_field in vv \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and vv[criterion_field] == criterion_value \ + and target_field in vv \ + and target_value in emptyish: + click.echo( + f"deleting {target_field} in PV {vk} from enum {ek} because {criterion_field} == {criterion_value}") + del vv[target_field] + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ + and 
operation == 'delete_field' \ + and target_field in vv \ + and target_value in emptyish: + # click.echo(f"globally deleting {target_field} in PV {vk} from enum {ek}") + del vv[target_field] + if vk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in emptyish \ + and operation == 'delete_field' \ + and target_field in vv \ + and target_value in emptyish: + click.echo(f"deleting {target_field} in PV {vk} from enum {ek}") + del vv[target_field] + if ok == 'subsets' \ and scope in ['all_elements']: for uk, uv in ov.items(): - # click.echo(f"checking subset {uk}") if criterion_field in uv \ - and criterion_in_set == "" \ - and element_key == "" \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'set' \ - and uv[criterion_field] == criterion_value: + and uv[criterion_field] == criterion_value \ + and target_field not in emptyish: click.echo( f"setting {target_field} to {target_value} in subset {uk} because {criterion_field} == {criterion_value}") uv[target_field] = target_value + if uk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + click.echo(f"setting {target_field} to {target_value} in subset {uk}") + uv[target_field] = target_value if criterion_field in uv \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and uv[criterion_field] in sets[criterion_in_set] \ and target_field in uv \ - and target_value == "": - # click.echo( - # f"deleting {target_field} in subset {uk} because {uk[criterion_field]} is in {criterion_in_set}") + and 
target_value in emptyish: + click.echo( + f"deleting {target_field} in subset {uk} because {uv[criterion_field]} is in {criterion_in_set}") + del uv[target_field] + if criterion_field in uv \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and uv[criterion_field] == criterion_value \ + and target_field in uv \ + and target_value in emptyish: + click.echo( + f"deleting {target_field} in subset {uk} because {criterion_field} == {criterion_value}") del uv[target_field] - if criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ - and target_field in uv: + and target_field in uv \ + and target_value in emptyish: # click.echo(f"globally deleting {target_field} in subset {uk}") del uv[target_field] + if uk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in emptyish \ + and operation == 'delete_field' \ + and target_field in uv \ + and target_value in emptyish: + click.echo(f"deleting {target_field} in subset {uk}") + del uv[target_field] if ok == 'types' \ and scope in ['all_elements']: # or might want to remove all linkml types and add an import for tk, tv in ov.items(): - # click.echo(f"checking type {tk}") if criterion_field in tv \ - and criterion_in_set == "" \ - and element_key == "" \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'set' \ - and tv[criterion_field] == criterion_value: + and tv[criterion_field] == criterion_value \ + and target_field not in emptyish: click.echo( f"setting {target_field} to {target_value} in type 
{tk} because {criterion_field} == {criterion_value}") tv[target_field] = target_value + if tk == element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and operation == 'set' \ + and target_field not in emptyish \ + and target_value not in emptyish: + click.echo(f"setting {target_field} to {target_value} in type {tk}") + tv[target_field] = target_value if criterion_field in tv \ - and criterion_value == "" \ - and element_key == "" \ + and criterion_field not in emptyish \ + and criterion_in_set not in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ and tv[criterion_field] in sets[criterion_in_set] \ and target_field in tv \ - and target_value == "": - # click.echo( - # f"deleting {target_field} in type {tk} because {tk[criterion_field]} is in {criterion_in_set}") + and target_value in emptyish: + click.echo( + f"deleting {target_field} in type {tk} because {tv[criterion_field]} is in {criterion_in_set}") del tv[target_field] - if criterion_field == "" \ - and criterion_in_set == "" \ - and criterion_value == "" \ - and element_key == "" \ + if criterion_field in tv \ + and criterion_field not in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value not in emptyish \ + and element_key in emptyish \ and operation == 'delete_field' \ - and target_field in tv: + and tv[criterion_field] == criterion_value \ + and target_field in tv \ + and target_value in emptyish: + click.echo( + f"deleting {target_field} in type {tk} because {criterion_field} == {criterion_value}") + del tv[target_field] + if criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key in emptyish \ + and operation == 'delete_field' \ + and target_field in tv \ + and target_value in emptyish: # click.echo(f"globally deleting {target_field} in type {tk}") del tv[target_field] + if tk == 
element_key \ + and criterion_field in emptyish \ + and criterion_in_set in emptyish \ + and criterion_value in emptyish \ + and element_key not in emptyish \ + and operation == 'delete_field' \ + and target_field in tv \ + and target_value in emptyish: + click.echo(f"deleting {target_field} in type {tk}") + del tv[target_field] # Save the modified data to the output file save_yaml(schema, output) From 8f7169e34dd442c20b3eba9857577b549ac4db5d Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Wed, 30 Oct 2024 10:39:26 -0400 Subject: [PATCH 5/5] more rows in config --- qy2-config.tsv | 3 +++ src/nmdc_submission_schema/scripts/qy2.py | 27 ++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/qy2-config.tsv b/qy2-config.tsv index b6f57e50..e929d8b2 100644 --- a/qy2-config.tsv +++ b/qy2-config.tsv @@ -42,3 +42,6 @@ true slot_or_usage delete_element name NULL true slot_or_usage range string delete_field inlined NULL true slot_or_usage range string delete_field inlined_as_list NULL true all_elements "" "" delete_field text NULL +true slot_or_usage "" "" lat_lon set pattern ^[-+]?([1-8]?\d(\.\d{1,8})?|90(\.0{1,8})?)\s[-+]?(180(\.0{1,8})?|((1[0-7]\d)|([1-9]?\d))(\.\d{1,8})?)$ +true slot_or_usage range QuantityValue pattern ^([-+]?[0-9]*\.?[0-9]+ +\S.*\|)*([-+]?[0-9]*\.?[0-9]+ +\S.*)$ +true slot_or_usage string_serialization {text};{float} {unit} pattern ^[^;\t\r\x0A\|]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A\|]+$ \ No newline at end of file diff --git a/src/nmdc_submission_schema/scripts/qy2.py b/src/nmdc_submission_schema/scripts/qy2.py index 43bbe0f2..603bc2c1 100644 --- a/src/nmdc_submission_schema/scripts/qy2.py +++ b/src/nmdc_submission_schema/scripts/qy2.py @@ -1,3 +1,6 @@ +# todo: handle any_of (optionally?) 
+# deduplicate lists like comments or slots + import csv import pprint @@ -77,7 +80,7 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop if drop_redundant_aliases and 'aliases' in sv: current_aliases = sv['aliases'] for alias in sv['aliases']: - if 'title' in sv and alias == sv['title']: + if ('title' in sv and alias == sv['title']) or alias == sk: # click.echo(f"removing redundant alias {alias} from slot {sk}") current_aliases.remove(alias) if len(current_aliases) == 0: @@ -93,9 +96,11 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop if drop_redundant_aliases and 'aliases' in cv: current_aliases = cv['aliases'] for alias in cv['aliases']: - if 'title' in cv and alias == cv['title']: + if ('title' in cv and alias == cv['title']) or alias == ck: click.echo(f"removing redundant alias {alias} from {ck}") current_aliases.remove(alias) + # if alias == ck: + # current_aliases.remove(alias) if len(current_aliases) == 0: del cv['aliases'] else: @@ -110,9 +115,11 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop if drop_redundant_aliases and 'aliases' in sv: current_aliases = sv['aliases'] for alias in sv['aliases']: - if 'title' in sv and alias == sv['title']: + if ('title' in sv and alias == sv['title']) or alias == sk: # click.echo(f"removing redundant alias {alias} from slot {sk} in {ck}") current_aliases.remove(alias) + # if alias == sk: + # current_aliases.remove(alias) if len(current_aliases) == 0: del sv['aliases'] else: @@ -127,9 +134,11 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop if drop_redundant_aliases and 'aliases' in ev: current_aliases = ev['aliases'] for alias in ev['aliases']: - if 'title' in ev and alias == ev['title']: + if ('title' in ev and alias == ev['title']) or alias == ek: click.echo(f"removing redundant alias {alias} from {ek}") current_aliases.remove(alias) + # if alias == ek: + # current_aliases.remove(alias)
if len(current_aliases) == 0: del ev['aliases'] else: @@ -144,9 +153,11 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop if drop_redundant_aliases and 'aliases' in vv: current_aliases = vv['aliases'] for alias in vv['aliases']: - if 'title' in vv and alias == vv['title']: + if ('title' in vv and alias == vv['title']) or alias == vk: click.echo(f"removing redundant alias {alias} from {vk} in {ek}") current_aliases.remove(alias) + # if alias == vk: + # current_aliases.remove(alias) if len(current_aliases) == 0: del vv['aliases'] else: @@ -223,7 +234,7 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop del schema['classes'][target_field] for ok, ov in schema.items(): # o for outer - # print(ok) + # click.echo(ok) # # types # @@ -375,8 +386,8 @@ def main(schema: str, config: str, output: str, collapse_annotations: bool, drop and operation == 'set' \ and sv[criterion_field] == criterion_value \ and target_field not in emptyish: - click.echo( - f"setting {target_field} to {target_value} in class {ck} usage {sk} because {criterion_field} == {criterion_value}") + # click.echo( + # f"setting {target_field} to {target_value} in class {ck} usage {sk} because {criterion_field} == {criterion_value}") sv[target_field] = target_value if sk == element_key \ and criterion_field in emptyish \