Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moved most yq commands and comments #198

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions assets/yq-final.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# remove the multivalued true annotation from these global slot definitions for the sake of linkml-convert
# esp to tsv? and dumping to SQLite?

# follow the .string_serialization=="{text};{float} {unit}" and .multivalued == true pattern?

'(.slots.[] | select(.range == "ControlledIdentifiedTermValue") | .range) = "string"'
'(.slots.[] | select(.range == "ControlledTermValue") | .range) = "string"'
'(.slots.[] | select(.range == "GeolocationValue") | .range) = "string"'
'(.slots.[] | select(.range == "OntologyClass") | .range) = "string"'
'(.slots.[] | select(.range == "QuantityValue") | .range) = "string"'
'(.slots.[] | select(.range == "TextValue") | .range) = "string"'
'(.slots.[] | select(.range == "TimestampValue") | .range) = "string"'

# yq -i '(.slots.[] | select(has("range") | not ) | .range ) = "string"'
# yq -i '(.classes.[].slot_usage.[] | select(has("range") | not ) | .range ) = "string"'

'(.slots.[] | select(.name == "sample_link") | .range ) = "string"'
'(.slots.[] | select(.range == "string") | .multivalued ) = false'
'(.classes.[].slot_usage.[] | select(.range=="string") | .multivalued) = false'

# yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "boolean"'
# yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "boolean"'

'(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"'
'(.classes.[].slot_usage.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"'
'(.slots.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"'
'(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"'
'(.slots.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"'
'(.classes.[].slot_usage.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"'
96 changes: 96 additions & 0 deletions assets/yq-for-shuttles.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# using \x0A to represent a line feed
# note: make reduces a doubled $ to a single $ only inside a Makefile recipe; the lines in this file are read by the shell, not by make

# .string_serialization=="{text};{float} {unit}": what about multivalueds? don't see any at this time
# ControlledTermValue: experiential factor has string_serialization: '{termLabel} {[termID]}|{text}'
# ControlledTermValue: what about multivalued CTVs? don't see any besides chem_administration above at this time
# for water, can depth be a point, a range, or both?


# globally replace structured ranges with strings.
# undoes some of the range alterations that nmdc-schema makes when importing MIxS terms
# future versions of the nmdc-schema might just use strings, too

# there's still more to do. see schemasheets/populated_tsv/slot_usage.tsv
# to some degree this should be handled globally by sheets_and_friends/tsv_in/validation_converter.tsv
# and on a slot-by-slot basis by sheets_and_friends/tsv_in/modifications_long.tsv

# we should be consistent about the following things in patterns
# single or multiple whitespace?
# [0-9] or \d?
# include scientific notation? (eg quantity value)
# what whitespace to exclude?

# be careful about strings that look like numbers with quotes in YAML
# impact on other serializations?

# escape pipes that are going to be used literally as future delimiters ?

# should add a remove attribute option to sheets and friend's modify and validate
# currently have nan string serializations

# scrutinize the slots that currently accept xyz or 100 units. how could they be better constrained?

# synchronize between guidance, examples and validation
# cross-reference MIxS' values for those aspects

# use yq to add examples when the examples themselves include the packed value separator |
# good reason for using ! instead of |

'(.classes.[].slot_usage.[] | select(.name=="chem_administration") | .examples) = [{"value": "agar [CHEBI:2509];2018-05-11|agar [CHEBI:2509];2018-05-22"}, {"value": "agar [CHEBI:2509];2018-05"}]'

# use yq to add patterns with a secondary condition like multivalued
# NOTE: the lines in this file are extracted with grep and handed to yq by a shell loop in project.Makefile;
# make never parses them, so the end-of-pattern anchor is written as a single $ here
# (a doubled $ would reach yq verbatim and end up in the generated pattern)
'(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .pattern) = "^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$"'
'(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .range) = "string"'

'(.classes.[].slot_usage.[] | select(.range == "QuantityValue") | .pattern) = "^[-+]?[0-9]*\.?[0-9]+ +\S.*$"'
'(.classes.[].slot_usage.[] | select(.range == "QuantityValue" and .multivalued == true) | .pattern) = "^([-+]?[0-9]*\.?[0-9]+ +\S.*\|)*([-+]?[0-9]*\.?[0-9]+ +\S.*)$"'

# add a pattern for {termLabel} {[termID]} in the validation configuration
# need more invalid examples
#yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{termLabel} {[termID]}") | .range) = "string"' $@

'(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}") | .pattern) = "^[^;\t\r\x0A\|]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A\|]+$"'
'(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}" and .multivalued == true ) | .pattern) = "^([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A]+\|)*([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A]+)$"'

# re-home slots whose domain class is deleted further down onto NamedThing
# (each expression is applied as a separate yq pass, so an exact duplicate line only costs an extra pass)
'(.slots.[] | select(.domain == "Activity") | .domain ) = "NamedThing"'
'(.slots.[] | select(.domain == "Agent") | .domain ) = "NamedThing"'
'(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"'
'(.slots.[] | select(.domain == "ControlledTermValue") | .domain ) = "NamedThing"'
'(.slots.[] | select(.domain == "GeolocationValue") | .domain ) = "NamedThing"'

'del(.classes.Activity)'
'del(.classes.Agent)'
'del(.classes.AttributeValue)'
'del(.classes.ControlledIdentifiedTermValue)'
'del(.classes.ControlledTermValue)'
'del(.classes.GeolocationValue)'
'del(.classes.OntologyClass)'
'del(.classes.QuantityValue)'
'del(.classes.TextValue)'
'del(.classes.TimestampValue)'

# use yq for global modifications
# rel_to_oxygen / oxy_stat_samp
'(.slots.[] | select(.name == "rel_to_oxygen") | .range) = "rel_to_oxygen_enum"'
'(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "rel_to_oxygen_enum"'

# remove slots that are no longer necessary due to removal of classes above
# (one deletion per slot name; each line is applied as its own yq pass)
'del(.slots.[] | select(.name == "acted_on_behalf_of"))'
'del(.slots.[] | select(.name == "ended_at_time"))'
'del(.slots.[] | select(.name == "has_maximum_numeric_value"))'
'del(.slots.[] | select(.name == "has_minimum_numeric_value"))'
'del(.slots.[] | select(.name == "has_numeric_value"))'
'del(.slots.[] | select(.name == "has_raw_value"))'
'del(.slots.[] | select(.name == "has_unit"))'
'del(.slots.[] | select(.name == "latitude"))'
'del(.slots.[] | select(.name == "longitude"))'
'del(.slots.[] | select(.name == "started_at_time"))'
'del(.slots.[] | select(.name == "term"))'
'del(.slots.[] | select(.name == "used"))'
'del(.slots.[] | select(.name == "was_associated_with"))'
'del(.slots.[] | select(.name == "was_generated_by"))'
'del(.slots.[] | select(.name == "was_informed_by"))'
149 changes: 11 additions & 138 deletions project.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,112 +69,16 @@ sheets_and_friends/tsv_in/import_slots_regardless.tsv
--format yaml [email protected] > $@
- $(RUN) linkml-lint $@ > local/with_shuttles.lint_report.txt

local/with_shuttles_yq.yaml: local/with_shuttles.yaml
local/with_shuttles_yq.yaml: local/with_shuttles.yaml assets/yq-for-shuttles.txt
cp $< $@
# using \x0A to represent a line feed
# double $ gets reduced to one by make

# .string_serialization=="{text};{float} {unit}": what about multivalueds? don't see any at this time
# ControlledTermValue: experiential factor has string_serialization: '{termLabel} {[termID]}|{text}'
# ControlledTermValue: what about multivalued CTVs? don't see any besides chem_administration above at this time
# for water, can depth be a point, a range, or both?


# globally replace structured ranges with strings.
# undoes some of the range alterations that nmdc-schema makes when importing MIxS terms
# future versions of the nmdc-schema might just use strings, too

# there's still more to do. see schemasheets/populated_tsv/slot_usage.tsv
# to some degree this should be handled globally by sheets_and_friends/tsv_in/validation_converter.tsv
# and on a slot-by-slot basis by sheets_and_friends/tsv_in/modifications_long.tsv

# we should be consistent about the following things in patterns
# single or multiple whitespace?
# [0-9] or \d?
# include scientific notation? (eg quantity value)
# what whitespace to exclude?

# be careful about strings that look like numbers with quotes in YAML
# impact on other serializations?

# escape pipes that are going to be used literally as future delimiters ?

# should add a remove attribute option to sheets and friend's modify and validate
# currently have nan string serializations

# scrutinize the slots that currently accept xyz or 100 units. how could they be better constrained?

# synchronize between guidance, examples and validation
# cross reference MIxS' values for those aspects

# use yq to add examples when the examples themselves include the packed value separator |
# good reason for using ! instead of |
yq -i '(.classes.[].slot_usage.[] | select(.name=="chem_administration") | .examples) = [{"value": "agar [CHEBI:2509];2018-05-11|agar [CHEBI:2509];2018-05-22"}, {"value": "agar [CHEBI:2509];2018-05"}]' $@

# use yq to add patterns with a secondary condition like multivalued
yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .pattern) = "^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$$"' $@
yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .range) = "string"' $@

yq -i '(.classes.[].slot_usage.[] | select(.range == "QuantityValue") | .pattern) = "^[-+]?[0-9]*\.?[0-9]+ +\S.*$$"' $@
yq -i '(.classes.[].slot_usage.[] | select(.range == "QuantityValue" and .multivalued == true) | .pattern) = "^([-+]?[0-9]*\.?[0-9]+ +\S.*\|)*([-+]?[0-9]*\.?[0-9]+ +\S.*)$$"' $@

# add a pattern for {termLabel} {[termID]} in the validation configuration
# need more invalid examples
#yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{termLabel} {[termID]}") | .range) = "string"' $@

yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}") | .pattern) = "^[^;\t\r\x0A\|]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A\|]+$$"' $@
yq -i '(.classes.[].slot_usage.[] | select(.string_serialization=="{text};{float} {unit}" and .multivalued == true ) | .pattern) = "^([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A]+\|)*([^;\t\r\x0A]+;[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? [^;\t\r\x0A]+)$$"' $@

yq -i '(.slots.[] | select(.domain == "Activity") | .domain ) = "NamedThing"' $@
yq -i '(.slots.[] | select(.domain == "Agent") | .domain ) = "NamedThing"' $@
yq -i '(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"' $@
yq -i '(.slots.[] | select(.domain == "AttributeValue") | .domain ) = "NamedThing"' $@
yq -i '(.slots.[] | select(.domain == "ControlledTermValue") | .domain ) = "NamedThing"' $@
yq -i '(.slots.[] | select(.domain == "GeolocationValue") | .domain ) = "NamedThing"' $@

yq -i 'del(.classes.Activity)' $@
yq -i 'del(.classes.Agent)' $@
yq -i 'del(.classes.AttributeValue)' $@
yq -i 'del(.classes.ControlledIdentifiedTermValue)' $@
yq -i 'del(.classes.ControlledTermValue)' $@
yq -i 'del(.classes.GeolocationValue)' $@
yq -i 'del(.classes.OntologyClass)' $@
yq -i 'del(.classes.QuantityValue)' $@
yq -i 'del(.classes.TextValue)' $@
yq -i 'del(.classes.TimestampValue)' $@

# use yq for global modifications
# rel_to_oxygen / oxy_stat_samp
yq -i '(.slots.[] | select(.name == "rel_to_oxygen") | .range) = "rel_to_oxygen_enum"' $@
yq -i '(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "rel_to_oxygen_enum"' $@

# remove slots that are no longer necessary due to removal of classes above
yq -i 'del(.slots.[] | select(.name == "acted_on_behalf_of"))' $@
yq -i 'del(.slots.[] | select(.name == "ended_at_time"))' $@
yq -i 'del(.slots.[] | select(.name == "has_maximum_numeric_value"))' $@
yq -i 'del(.slots.[] | select(.name == "has_minimum_numeric_value"))' $@
yq -i 'del(.slots.[] | select(.name == "has_numeric_value"))' $@
yq -i 'del(.slots.[] | select(.name == "has_raw_value"))' $@
yq -i 'del(.slots.[] | select(.name == "has_unit"))' $@
yq -i 'del(.slots.[] | select(.name == "latitude"))' $@
yq -i 'del(.slots.[] | select(.name == "longitude"))' $@
yq -i 'del(.slots.[] | select(.name == "started_at_time"))' $@
yq -i 'del(.slots.[] | select(.name == "term"))' $@
yq -i 'del(.slots.[] | select(.name == "used"))' $@
yq -i 'del(.slots.[] | select(.name == "was_associated_with"))' $@
yq -i 'del(.slots.[] | select(.name == "was_generated_by"))' $@
yq -i 'del(.slots.[] | select(.name == "was_informed_by"))' $@

yq -i 'del(.slots.[] | select(.name == "was_informed_by"))' $@

# apply every single-quoted yq expression from the commands file (2nd prerequisite) to the target, in file order
# - "$$line" is quoted so the shell does not glob-expand or word-split the expression before eval re-parses it
# - eval is needed to strip the surrounding single quotes that are stored in the commands file
# - "|| exit 1" aborts the loop so that a failing yq call fails the recipe instead of being silently skipped
grep "^'" $(word 2, $^) | while IFS= read -r line ; do printf '%s\n' "$$line" ; eval "yq -i $$line $@" || exit 1 ; done

# delete the with_modifications schema output under sheets_and_friends/yaml_out
# NOTE(review): the rule visible in this file writes local/with_modifications.yaml, not
# sheets_and_friends/yaml_out/with_modifications.yaml — confirm this path is still the intended one
modifications-clean:
rm -rf sheets_and_friends/yaml_out/with_modifications.yaml

# download nmdc_materialized_patterns.yaml from the v10.3.0 tag of microbiomedata/nmdc-schema
# the file is fetched to a temporary name and renamed only on success, because wget -O creates/truncates
# its output file even when the transfer fails, which would leave a broken target that make considers up to date
local/nmdc.yaml:
	mkdir -p $(@D)
	wget -O $@.tmp https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v10.3.0/nmdc_schema/nmdc_materialized_patterns.yaml && mv -f $@.tmp $@

# sheets-for-nmdc-submission-schema_validation_converter_empty.tsv
local/with_modifications.yaml: local/with_shuttles_yq.yaml \
sheets_and_friends/tsv_in/modifications_long.tsv \
sheets_and_friends/tsv_in/validation_converter.tsv \
Expand All @@ -185,61 +89,28 @@ local/nmdc.yaml
--validation_config_tsv $(word 3,$^) \
--yaml_output [email protected]

# # having trouble selectively injecting rules based on title pattern
# yq eval-all \
# 'select(fileIndex==1).classes.JgiMtInterface.rules = (select(fileIndex==0).classes.Biosample.rules.[] | select(.title == "rna*")) | select(fileIndex==1)' \
# local/nmdc.yaml src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml | cat > ruleswap.yaml
# # so just inject all rules...
# # using | cat > because yq fails to write to STDOUT (permissions error?!)
# apply all rules from Biosample to JgiMgInterface and JgiMtInterface
yq eval-all \
'select(fileIndex==1).classes.JgiMgInterface.rules = select(fileIndex==0).classes.Biosample.rules | select(fileIndex==1)' \
local/nmdc.yaml [email protected] | cat > [email protected]
yq eval-all \
'select(fileIndex==1).classes.JgiMtInterface.rules = select(fileIndex==0).classes.Biosample.rules | select(fileIndex==1)' \
local/nmdc.yaml [email protected] | cat > [email protected]
# # ...then removing rules that aren't relevant to a class
# # requires some prior knowledge

# remove some rules
yq -i 'del(.classes.JgiMgInterface.rules.[] | select(.title == "rna*"))' [email protected]
yq -i 'del(.classes.JgiMtInterface.rules.[] | select(.title == "dna*"))' [email protected]

$(RUN) gen-linkml \
--no-materialize-attributes \
--format yaml [email protected] > $@

- $(RUN) linkml-lint $@ > local/with_modifications.lint_report.txt

src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml project/thirdparty/GoldEcosystemTree.json
src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml \
project/thirdparty/GoldEcosystemTree.json assets/yq-final.txt
$(RUN) inject-gold-pathway-terms -g $(word 2,$^) -i $< -o $@
# remove the multivalued true annotation from these global slot definitions for the sake of linkml-convert
# esp to tsv? and dumping to SQLite?
# follow the .string_serialization=="{text};{float} {unit}" and .multivalued == true pattern?

yq -i '(.slots.[] | select(.range == "ControlledIdentifiedTermValue") | .range) = "string"' $@
yq -i '(.slots.[] | select(.range == "ControlledTermValue") | .range) = "string"' $@
yq -i '(.slots.[] | select(.range == "GeolocationValue") | .range) = "string"' $@
yq -i '(.slots.[] | select(.range == "OntologyClass") | .range) = "string"' $@
yq -i '(.slots.[] | select(.range == "QuantityValue") | .range) = "string"' $@
yq -i '(.slots.[] | select(.range == "TextValue") | .range) = "string"' $@
yq -i '(.slots.[] | select(.range == "TimestampValue") | .range) = "string"' $@

# yq -i '(.slots.[] | select(has("range") | not ) | .range ) = "string"' $@
# yq -i '(.classes.[].slot_usage.[] | select(has("range") | not ) | .range ) = "string"' $@

yq -i '(.slots.[] | select(.name == "sample_link") | .range ) = "string"' $@

yq -i '(.slots.[] | select(.range == "string") | .multivalued ) = false' $@
yq -i '(.classes.[].slot_usage.[] | select(.range=="string") | .multivalued) = false' $@

# yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "boolean"' $@
# yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "boolean"' $@

yq -i '(.slots.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' $@
yq -i '(.classes.[].slot_usage.[] | select(.name == "oxy_stat_samp") | .range) = "OxyStatSampEnum"' $@

yq -i '(.slots.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"' $@
yq -i '(.classes.[].slot_usage.[] | select(.name == "dna_dnase") | .range) = "YesNoEnum"' $@

yq -i '(.slots.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"' $@
yq -i '(.classes.[].slot_usage.[] | select(.name == "dnase_rna") | .range) = "YesNoEnum"' $@
# apply every single-quoted yq expression from the commands file (3rd prerequisite) to the target, in file order
# - "$$line" is quoted so the shell does not glob-expand or word-split the expression before eval re-parses it
# - eval is needed to strip the surrounding single quotes that are stored in the commands file
# - "|| exit 1" aborts the loop so that a failing yq call fails the recipe instead of being silently skipped
grep "^'" $(word 3, $^) | while IFS= read -r line ; do printf '%s\n' "$$line" ; eval "yq -i $$line $@" || exit 1 ; done

run-examples: examples-clean examples/output/README.md

Expand Down Expand Up @@ -273,6 +144,7 @@ src/data/invalid src/data/valid

local/SampleData-water-data-exhaustive.tsv: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml \
src/data/valid/SampleData-water-data-exhaustive.yaml
mkdir -p local
$(RUN) linkml-convert \
--output $@ \
--target-class SampleData \
Expand All @@ -281,6 +153,7 @@ src/data/valid/SampleData-water-data-exhaustive.yaml

examples/output/SampleData-water-data-exhaustive.regen.yaml: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml \
local/SampleData-water-data-exhaustive.tsv
mkdir -p examples/output
$(RUN) linkml-convert \
--output $@ \
--target-class SampleData \
Expand Down
Loading