diff --git a/assets/yq-final.txt b/assets/yq-final.txt new file mode 100644 index 00000000..7d1948f6 --- /dev/null +++ b/assets/yq-final.txt @@ -0,0 +1,29 @@ +# set structured ranges to string and set multivalued to false on string-ranged slots and slot_usages, for the sake of linkml-convert +# especially for conversion to TSV? and for dumping to SQLite? + +# follow the .string_serialization=="{text};{float} {unit}" and .multivalued == true pattern? + +'(.slots.[] | select(.range == "ControlledIdentifiedTermValue") | .range) = "string"' +'(.slots.[] | select(.range == "ControlledTermValue") | .range) = "string"' +'(.slots.[] | select(.range == "GeolocationValue") | .range) = "string"' +'(.slots.[] | select(.range == "OntologyClass") | .range) = "string"' +'(.slots.[] | select(.range == "QuantityValue") | .range) = "string"' +'(.slots.[] | select(.range == "TextValue") | .range) = "string"' +'(.slots.[] | select(.range == "TimestampValue") | .range) = "string"' + +# yq -i '(.slots.[] | select(has("range") | not ) | .range ) = "string"' +# yq -i '(.classes.[].slot_usage.[] | select(has("range") | not ) | .range ) = "string"' + +'(.slots.[] | select(.name == "sample_link") | .range ) = "string"' +'(.slots.[] | select(.range == "string") | .multivalued ) = false' +'(.classes.[].slot_usage.[] | select(.range=="string") | .multivalued) = false' + +# yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "boolean"' +# yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "boolean"' + +'(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' +'(.classes.[].slot_usage.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' +'(.slots.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"' +'(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"' +'(.slots.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"' +'(.classes.[].slot_usage.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"' \ No newline at end of file diff --git 
a/assets/yq-for-shuttles.txt b/assets/yq-for-shuttles.txt new file mode 100644 index 00000000..0cfa4b57 --- /dev/null +++ b/assets/yq-for-shuttles.txt @@ -0,0 +1,96 @@ +# using \x0A to represent a line feed +# these expressions are read from this file at run time and are not expanded by make, so the regex end anchor is a single $ (not doubled) + +# .string_serialization=="{text};{float} {unit}": what about multivalueds? don't see any at this time +# ControlledTermValue: experiential factor has string_serialization: '{termLabel} {[termID]}|{text}' +# ControlledTermValue: what about multivalued CTVs? don't see any besides chem_administration above at this time +# for water, can depth be a point, a range, or both? + + +# globally replace structured ranges with strings. +# undoes some of the range alterations that nmdc-schema makes when importing MIxS terms +# future versions of the nmdc-schema might just use strings, too + +# there's still more to do. see schemasheets/populated_tsv/slot_usage.tsv +# to some degree this should be handled globally by sheets_and_friends/tsv_in/validation_converter.tsv +# and on a slot-by-slot basis by sheets_and_friends/tsv_in/modifications_long.tsv + +# we should be consistent about the following things in patterns +# single or multiple whitespace? +# [0-9] or \d? +# include scientific notation? (eg quantity value) +# what whitespace to exclude? + +# be careful about strings that look like numbers with quotes in YAML +# impact on other serializations? + +# escape pipes that are going to be used literally as future delimiters ? + +# should add a remove attribute option to sheets and friends' modify and validate +# currently have nan string serializations + +# scrutinize the slots that currently accept xyz or 100 units. how could they be better constrained? + +# synchronize between guidance, examples and validation +# cross-reference MIxS' values for those aspects + +# use yq to add examples when the examples themselves include the packed value separator | +# good reason for using ! 
instead of | + +'(.classes.[].slot_usage.[] | select(.name=="chem_administration") | .examples) = [{"value": "agar [CHEBI:2509];2018-05-11|agar [CHEBI:2509];2018-05-22"}, {"value": "agar [CHEBI:2509];2018-05"}]' + +# use yq to add patterns with a secondary condition like multivalued +'(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .pattern) = "^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$"' +'(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .range) = "string"' + +'(.classes.[].slot_usage.[] | select(.range == "QuantityValue") | .pattern) = "^[-+]?[0-9]*\.?[0-9]+ +\S.*$"' +'(.classes.[].slot_usage.[] | select(.range == "QuantityValue" and .multivalued == true) | .pattern) = "^([-+]?[0-9]*\.?[0-9]+ +\S.*\|)*([-+]?[0-9]*\.?[0-9]+ +\S.*)$"' + +# add a pattern for {termLabel} {[termID]} in the validation configuration +# need more invalid examples +#yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{termLabel} {[termID]}") | .range) = "string"' $@ + +'(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}") | .pattern) = "^[^;\t\r\x0A\|]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A\|]+$"' +'(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}" and .multivalued == true ) | .pattern) = "^([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A]+\|)*([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? 
[^;\t\r\x0A]+)$"' + +'(.slots.[] | select(.domain == "Activity") | .domain ) = "NamedThing"' +'(.slots.[] | select(.domain == "Agent") | .domain ) = "NamedThing"' +'(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"' +'(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"' +'(.slots.[] | select(.domain == "ControlledTermValue") | .domain ) = "NamedThing"' +'(.slots.[] | select(.domain == "GeolocationValue") | .domain ) = "NamedThing"' + +'del(.classes.Activity)' +'del(.classes.Agent)' +'del(.classes.AttributeValue)' +'del(.classes.ControlledIdentifiedTermValue)' +'del(.classes.ControlledTermValue)' +'del(.classes.GeolocationValue)' +'del(.classes.OntologyClass)' +'del(.classes.QuantityValue)' +'del(.classes.TextValue)' +'del(.classes.TimestampValue)' + +# use yq for global modifications +# rel_to_oxygen / oxy_stat_samp +'(.slots.[] | select(.name == "rel_to_oxygen") | .range) = "rel_to_oxygen_enum"' +'(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "rel_to_oxygen_enum"' + +# remove slots that are no longer necessary due to removal of classes above +'del(.slots.[] | select(.name == "acted_on_behalf_of"))' +'del(.slots.[] | select(.name == "ended_at_time"))' +'del(.slots.[] | select(.name == "has_maximum_numeric_value"))' +'del(.slots.[] | select(.name == "has_minimum_numeric_value"))' +'del(.slots.[] | select(.name == "has_numeric_value"))' +'del(.slots.[] | select(.name == "has_raw_value"))' +'del(.slots.[] | select(.name == "has_unit"))' +'del(.slots.[] | select(.name == "latitude"))' +'del(.slots.[] | select(.name == "longitude"))' +'del(.slots.[] | select(.name == "started_at_time"))' +'del(.slots.[] | select(.name == "term"))' +'del(.slots.[] | select(.name == "used"))' +'del(.slots.[] | select(.name == "was_associated_with"))' +'del(.slots.[] | select(.name == "was_generated_by"))' +'del(.slots.[] | select(.name == "was_informed_by"))' + +'del(.slots.[] | select(.name == "was_informed_by"))' \ No 
newline at end of file diff --git a/project.Makefile b/project.Makefile index 59fd278e..867df576 100644 --- a/project.Makefile +++ b/project.Makefile @@ -69,104 +69,9 @@ sheets_and_friends/tsv_in/import_slots_regardless.tsv --format yaml $@.raw > $@ - $(RUN) linkml-lint $@ > local/with_shuttles.lint_report.txt -local/with_shuttles_yq.yaml: local/with_shuttles.yaml +local/with_shuttles_yq.yaml: local/with_shuttles.yaml assets/yq-for-shuttles.txt cp $< $@ - # using \x0A to represent a line feed - # double $ gets reduced to one by make - - # .string_serialization=="{text};{float} {unit}": what about multivalueds? don't see any at this time - # ControlledTermValue: experiential factor has string_serialization: '{termLabel} {[termID]}|{text}' - # ControlledTermValue: what about multivalued CTVs? don't see any besides chem_administration above at this time - # for water, can depth be a point, a range, or both? - - -# globally replace structured ranges with strings. -# undoes some of the range alterations that nmdc-schema makes when importing MIxS terms -# future versions of the nmdc-schema might just use strings, too - -# there's still more to do. see schemasheets/populated_tsv/slot_usage.tsv -# to some degree this should be handled globally by sheets_and_friends/tsv_in/validation_converter.tsv -# and on a slot-by-slot basic by sheets_and_friends/tsv_in/modifications_long.tsv - -# we should be consistent about the following things in patterns -# single or multiple whitespace? -# [0-9] or \d? -# include scientific notation? (eg quantity value) -# what whitespace to exclude? - -# be careful about strings that look like numbers with quotes in YAML -# impact on other serializations? - -# escape pipes that are going to be used literally as future delimiters ? - -# should add a remove attribute option to sheets and friend's modify and validate -# currently have nan string serializations - -# scrutininze the slots that currerntly accept xyz or 100 units. 
how could they be better constrained? - -# synchronize between guidance, examples and validation -# cross reference MIxS' values for those aspects - -# use yq to add examples when the examples themselves include the packed value separator | -# good reason for using ! instead of | - yq -i '(.classes.[].slot_usage.[] | select(.name=="chem_administration") | .examples) = [{"value": "agar [CHEBI:2509];2018-05-11|agar [CHEBI:2509];2018-05-22"}, {"value": "agar [CHEBI:2509];2018-05"}]' $@ - -# use yq to add patterns with a secondary condition like mutivalued - yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .pattern) = "^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$$"' $@ - yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .range) = "string"' $@ - - yq -i '(.classes.[].slot_usage.[] | select(.range == "QuantityValue") | .pattern) = "^[-+]?[0-9]*\.?[0-9]+ +\S.*$$"' $@ - yq -i '(.classes.[].slot_usage.[] | select(.range == "QuantityValue" and .multivalued == true) | .pattern) = "^([-+]?[0-9]*\.?[0-9]+ +\S.*\|)*([-+]?[0-9]*\.?[0-9]+ +\S.*)$$"' $@ - -# add a pattern for {termLabel} {[termID]} in teh validation configuration -# need more invalid examples - #yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{termLabel} {[termID]}") | .range) = "string"' $@ - - yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}") | .pattern) = "^[^;\t\r\x0A\|]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A\|]+$$"' $@ - yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}" and .multivalued == true ) | .pattern) = "^([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A]+\|)*([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? 
[^;\t\r\x0A]+)$$"' $@ - - yq -i '(.slots.[] | select(.domain == "Activity") | .domain ) = "NamedThing"' $@ - yq -i '(.slots.[] | select(.domain == "Agent") | .domain ) = "NamedThing"' $@ - yq -i '(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"' $@ - yq -i '(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"' $@ - yq -i '(.slots.[] | select(.domain == "ControlledTermValue") | .domain ) = "NamedThing"' $@ - yq -i '(.slots.[] | select(.domain == "GeolocationValue") | .domain ) = "NamedThing"' $@ - - yq -i 'del(.classes.Activity)' $@ - yq -i 'del(.classes.Agent)' $@ - yq -i 'del(.classes.AttributeValue)' $@ - yq -i 'del(.classes.ControlledIdentifiedTermValue)' $@ - yq -i 'del(.classes.ControlledTermValue)' $@ - yq -i 'del(.classes.GeolocationValue)' $@ - yq -i 'del(.classes.OntologyClass)' $@ - yq -i 'del(.classes.QuantityValue)' $@ - yq -i 'del(.classes.TextValue)' $@ - yq -i 'del(.classes.TimestampValue)' $@ - -# use yq for global modifications -# rel_to_oxygen / oxy_stat_samp - yq -i '(.slots.[] | select(.name == "rel_to_oxygen") | .range) = "rel_to_oxygen_enum"' $@ - yq -i '(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "rel_to_oxygen_enum"' $@ - -# remove slots that are no longer necessary due to removal of classes above - yq -i 'del(.slots.[] | select(.name == "acted_on_behalf_of"))' $@ - yq -i 'del(.slots.[] | select(.name == "ended_at_time"))' $@ - yq -i 'del(.slots.[] | select(.name == "has_maximum_numeric_value"))' $@ - yq -i 'del(.slots.[] | select(.name == "has_minimum_numeric_value"))' $@ - yq -i 'del(.slots.[] | select(.name == "has_numeric_value"))' $@ - yq -i 'del(.slots.[] | select(.name == "has_raw_value"))' $@ - yq -i 'del(.slots.[] | select(.name == "has_unit"))' $@ - yq -i 'del(.slots.[] | select(.name == "latitude"))' $@ - yq -i 'del(.slots.[] | select(.name == "longitude"))' $@ - yq -i 'del(.slots.[] | select(.name == "started_at_time"))' $@ - yq -i 'del(.slots.[] | select(.name == 
"term"))' $@ - yq -i 'del(.slots.[] | select(.name == "used"))' $@ - yq -i 'del(.slots.[] | select(.name == "was_associated_with"))' $@ - yq -i 'del(.slots.[] | select(.name == "was_generated_by"))' $@ - yq -i 'del(.slots.[] | select(.name == "was_informed_by"))' $@ - - yq -i 'del(.slots.[] | select(.name == "was_informed_by"))' $@ - + grep "^'" $(word 2,$^) | while IFS= read -r line ; do printf '%s\n' "$$line" ; eval "yq -i $$line $@" || exit 1 ; done modifications-clean: rm -rf sheets_and_friends/yaml_out/with_modifications.yaml @@ -174,7 +79,6 @@ modifications-clean: local/nmdc.yaml: wget -O $@ https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v10.3.0/nmdc_schema/nmdc_materialized_patterns.yaml -# sheets-for-nmdc-submission-schema_validation_converter_empty.tsv local/with_modifications.yaml: local/with_shuttles_yq.yaml \ sheets_and_friends/tsv_in/modifications_long.tsv \ sheets_and_friends/tsv_in/validation_converter.tsv \ @@ -185,61 +89,28 @@ local/nmdc.yaml --validation_config_tsv $(word 3,$^) \ --yaml_output $@.raw - # # having trouble selectively injecting rules based on title pattern -# yq eval-all \ -# 'select(fileIndex==1).classes.JgiMtInterface.rules = (select(fileIndex==0).classes.Biosample.rules.[] | select(.title == "rna*")) | select(fileIndex==1)' \ -# local/nmdc.yaml src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml | cat > ruleswap.yaml - # # so just inject all rules... - # # using | cat > because yq fails to write to STDOUT (permissions error?!) 
+ # copy all rules from the Biosample class in local/nmdc.yaml onto JgiMgInterface and JgiMtInterface (piped through cat because yq failed when writing to STDOUT directly) yq eval-all \ 'select(fileIndex==1).classes.JgiMgInterface.rules = select(fileIndex==0).classes.Biosample.rules | select(fileIndex==1)' \ local/nmdc.yaml $@.raw | cat > $@.raw2 yq eval-all \ 'select(fileIndex==1).classes.JgiMtInterface.rules = select(fileIndex==0).classes.Biosample.rules | select(fileIndex==1)' \ local/nmdc.yaml $@.raw2 | cat > $@.raw - # # ...then removing rules that aren't relevant to a class - # # requires some prior knowledge + + # remove the rules that are not relevant to each class: titles matching "rna*" from JgiMgInterface, "dna*" from JgiMtInterface yq -i 'del(.classes.JgiMgInterface.rules.[] | select(.title == "rna*"))' $@.raw yq -i 'del(.classes.JgiMtInterface.rules.[] | select(.title == "dna*"))' $@.raw $(RUN) gen-linkml \ --no-materialize-attributes \ --format yaml $@.raw > $@ + - $(RUN) linkml-lint $@ > local/with_modifications.lint_report.txt -src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml project/thirdparty/GoldEcosystemTree.json +src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml \ +project/thirdparty/GoldEcosystemTree.json assets/yq-final.txt $(RUN) inject-gold-pathway-terms -g $(word 2,$^) -i $< -o $@ -# remove the multivalued true annotation from these gloabl slot definitions for the sake of linkml-convert -# esp to tsv? and dumping to SQLite? -# follow the .string_serialization=="{text};{float} {unit}" and .multivalued == true pattern? 
- - yq -i '(.slots.[] | select(.range == "ControlledIdentifiedTermValue") | .range) = "string"' $@ - yq -i '(.slots.[] | select(.range == "ControlledTermValue") | .range) = "string"' $@ - yq -i '(.slots.[] | select(.range == "GeolocationValue") | .range) = "string"' $@ - yq -i '(.slots.[] | select(.range == "OntologyClass") | .range) = "string"' $@ - yq -i '(.slots.[] | select(.range == "QuantityValue") | .range) = "string"' $@ - yq -i '(.slots.[] | select(.range == "TextValue") | .range) = "string"' $@ - yq -i '(.slots.[] | select(.range == "TimestampValue") | .range) = "string"' $@ - -# yq -i '(.slots.[] | select(has("range") | not ) | .range ) = "string"' $@ -# yq -i '(.classes.[].slot_usage.[] | select(has("range") | not ) | .range ) = "string"' $@ - - yq -i '(.slots.[] | select(.name == "sample_link") | .range ) = "string"' $@ - - yq -i '(.slots.[] | select(.range == "string") | .multivalued ) = false' $@ - yq -i '(.classes.[].slot_usage.[] | select(.range=="string") | .multivalued) = false' $@ - -# yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "boolean"' $@ -# yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "boolean"' $@ - - yq -i '(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' $@ - yq -i '(.classes.[].slot_usage.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' $@ - - yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"' $@ - yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"' $@ - - yq -i '(.slots.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"' $@ - yq -i '(.classes.[].slot_usage.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"' $@ + grep "^'" $(word 3,$^) | while IFS= read -r line ; do printf '%s\n' "$$line" ; eval "yq -i $$line $@" || exit 1 ; done run-examples: examples-clean examples/output/README.md @@ -273,6 +144,7 @@ src/data/invalid src/data/valid local/SampleData-water-data-exhaustive.tsv: 
src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml \ src/data/valid/SampleData-water-data-exhaustive.yaml + mkdir -p local $(RUN) linkml-convert \ --output $@ \ --target-class SampleData \ @@ -281,6 +153,7 @@ src/data/valid/SampleData-water-data-exhaustive.yaml examples/output/SampleData-water-data-exhaustive.regen.yaml: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml \ local/SampleData-water-data-exhaustive.tsv + mkdir -p examples/output $(RUN) linkml-convert \ --output $@ \ --target-class SampleData \