Skip to content

Commit

Permalink
Merge pull request #180 from microbiomedata/176-review-srcdata-paths-…
Browse files Browse the repository at this point in the history
…and-remove-everything-besides-valid-and-invalid-if-possible

remove everything from src/data besides valid and invalid
  • Loading branch information
turbomam authored Oct 28, 2024
2 parents 93e452b + f8d8562 commit 7f0b83d
Show file tree
Hide file tree
Showing 24 changed files with 26 additions and 250 deletions.
2 changes: 1 addition & 1 deletion examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

This folder contains example data conforming to nmdc_submission_schema

The source for these is in [src/data](../src/data/examples)
The source for these is in [src/data](../src/data/)
25 changes: 14 additions & 11 deletions project.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ local/with_shuttles_yq.yaml: local/with_shuttles.yaml
yq -i '(.classes.[].slot_usage.[] | select(.name=="chem_administration") | .examples) = [{"value": "agar [CHEBI:2509];2018-05-11|agar [CHEBI:2509];2018-05-22"}, {"value": "agar [CHEBI:2509];2018-05"}]' $@

# use yq to add patterns with a secondary condition like multivalued
yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .pattern) = "^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$$"' $@
yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .pattern) = "^[-+]?([1-8]?\d(\.\d{1,8})?|90(\.0{1,8})?)\s[-+]?(180(\.0{1,8})?|((1[0-7]\d)|([1-9]?\d))(\.\d{1,8})?)$$"' $@
yq -i '(.classes.[].slot_usage.[] | select(.range == "GeolocationValue") | .range) = "string"' $@

yq -i '(.classes.[].slot_usage.[] | select(.range == "QuantityValue") | .pattern) = "^[-+]?[0-9]*\.?[0-9]+ +\S.*$$"' $@
Expand Down Expand Up @@ -181,6 +181,8 @@ local/nmdc.yaml

src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml project/thirdparty/GoldEcosystemTree.json
$(RUN) inject-gold-pathway-terms -g $(word 2,$^) -i $< -o $@
#cp $< $@

# remove the multivalued true annotation from these global slot definitions for the sake of linkml-convert
# follow the .string_serialization=="{text};{float} {unit}" and .multivalued == true pattern?

Expand Down Expand Up @@ -356,17 +358,18 @@ local/abp_pvs.txt: local/abp_tree_down.txt
# human construction
# astronomical body part

src/data/data_harmonizer_io/soil_for_linkml.json: src/data/data_harmonizer_io/soil_from_dh.json
$(RUN) dh-json2linkml \
--input-file $< \
--output-file $@ \
--key soil_data

#src/data/data_harmonizer_io/soil_for_linkml.json: src/data/data_harmonizer_io/soil_from_dh.json
# $(RUN) dh-json2linkml \
# --input-file $< \
# --output-file $@ \
# --key soil_data

src/data/data_harmonizer_io/soil_data.json: src/data/data_harmonizer_io/soil_for_linkml.json
$(RUN) linkml-json2dh \
--input-file $< \
--output-dir $(dir $@)
## todo frozen content in src/data/data_harmonizer_io has been removed
## todo find a better home for these scripts if they are still of any use
#src/data/data_harmonizer_io/soil_data.json: src/data/data_harmonizer_io/soil_for_linkml.json
# $(RUN) linkml-json2dh \
# --input-file $< \
# --output-dir $(dir $@)

local/usage_template.tsv: src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml
mkdir -p $(@D)
Expand Down
2 changes: 2 additions & 0 deletions sheets_and_friends/tsv_in/modifications_long.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ JgiMgInterface|JgiMgLrInterface dna_sample_format replace_attribute required t
JgiMgInterface|JgiMgLrInterface dna_sample_format replace_attribute recommended false
JgiMgInterface|JgiMgLrInterface dna_sample_name replace_attribute required true
JgiMgInterface|JgiMgLrInterface dna_sample_name replace_attribute recommended false
JgiMgInterface|JgiMgLrInterface dna_sample_name replace_attribute pattern ^[_a-zA-Z0-9-]*$
JgiMgInterface|JgiMgLrInterface dna_seq_project replace_attribute required true
JgiMgInterface|JgiMgLrInterface dna_seq_project replace_attribute recommended false
JgiMgInterface|JgiMgLrInterface dna_seq_project_name replace_attribute required true
Expand Down Expand Up @@ -191,6 +192,7 @@ JgiMtInterface rna_sample_format replace_attribute required true
JgiMtInterface rna_sample_format replace_attribute recommended false
JgiMtInterface rna_sample_name replace_attribute required true
JgiMtInterface rna_sample_name replace_attribute recommended false
JgiMtInterface rna_sample_name replace_attribute pattern ^[_a-zA-Z0-9-]*$
JgiMtInterface rna_seq_project replace_attribute required true
JgiMtInterface rna_seq_project replace_attribute recommended false
JgiMtInterface rna_seq_project_name replace_attribute required true
Expand Down
4 changes: 2 additions & 2 deletions sheets_and_friends/tsv_in/validation_converter.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ from_val from_type to_type to_val len
{float}-{float} MIxS string serialization DH pattern regex ^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*-\s*[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$
{float}|{float}-{float} MIxS string serialization DH pattern regex ^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?(\s*-\s*[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)?$
{integer} MIxS string serialization DH datatype integer 7
{lat lon} linkml string_serialization DH pattern regex ^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$ 86
{lat lon} linkml string_serialization DH pattern regex ^[-+]?([1-8]?\d(\.\d{1,8})?|90(\.0{1,8})?)\s[-+]?(180(\.0{1,8})?|((1[0-7]\d)|([1-9]?\d))(\.\d{1,8})?)$ 86
{termLabel} {[termID]} MIxS string serialization DH pattern regex ^\S+.*\S+ \[[a-zA-Z]{2,}:\d+\]$
{termLabel} {[termID]}; {timestamp} MIxS string serialization DH pattern regex ^\S+.*\S+ \[[a-zA-Z]{2,}:\d+\]; ([\+-]?\d{4}(?!\d{2}\b))((-?)((0[1-9]|1[0-2])(\3([12]\d|0[1-9]|3[01]))?|W([0-4]\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\d|[12]\d{2}|3([0-5]\d|6[1-6])))([T\s]((([01]\d|2[0-3])((:?)[0-5]\d)?|24\:?00)([\.,]\d+(?!:))?)?(\17[0-5]\d([\.,]\d+)?)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?)?)?$ 314
{termLabel} {[termID]};{timestamp} MIxS string serialization DH pattern regex ^\S+.*\S+ \[[a-zA-Z]{2,}:\d+\];([\+-]?\d{4}(?!\d{2}\b))((-?)((0[1-9]|1[0-2])(\3([12]\d|0[1-9]|3[01]))?|W([0-4]\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\d|[12]\d{2}|3([0-5]\d|6[1-6])))([T\s]((([01]\d|2[0-3])((:?)[0-5]\d)?|24\:?00)([\.,]\d+(?!:))?)?(\17[0-5]\d([\.,]\d+)?)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?)?)?$ 313
Expand All @@ -24,7 +24,7 @@ ControlledIdentifiedTermValue linkml range DH datatype string
ControlledTermValue linkml range DH datatype string
ControlledTermValue linkml range DH pattern regex ^\S+.*\S+ \[[a-zA-Z]{2,}:\d+\]$
GeolocationValue linkml range DH datatype string
GeolocationValue linkml range DH pattern regex ^[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?)\s[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)$
GeolocationValue linkml range DH pattern regex ^[-+]?([1-8]?\d(\.\d{1,8})?|90(\.0{1,8})?)\s[-+]?(180(\.0{1,8})?|((1[0-7]\d)|([1-9]?\d))(\.\d{1,8})?)$
OntologyClass linkml range DH datatype string
QuantityValue linkml range DH datatype string
TextValue linkml range DH datatype string
Expand Down
1 change: 0 additions & 1 deletion src/data/data_harmonizer_io/soil_from_dh.json

This file was deleted.

5 changes: 0 additions & 5 deletions src/data/data_harmonizer_io/soil_from_dh.tsv

This file was deleted.

Binary file removed src/data/data_harmonizer_io/soil_from_dh.xlsx
Binary file not shown.
43 changes: 0 additions & 43 deletions src/data/data_harmonizer_io/soil_validatable.json

This file was deleted.

104 changes: 0 additions & 104 deletions src/data/examples/SampleData-water-data-exhaustive.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jgi_mg_data:
dna_project_contact: xxx
dna_samp_id: xxx
dna_sample_format: DNAStable
dna_sample_name: DNA:0546789 # contains colon and should not be allowed 'a-z, A-Z, 0-9, - and _ only'
dna_sample_name: DNA:0546789 # description says "Sample names must ... contain a-z, A-Z, 0-9, - and _ only." but there's no pattern constraint
dna_seq_project: xxx
dna_seq_project_name: xxx
dna_seq_project_pi: xxx
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Loading

0 comments on commit 7f0b83d

Please sign in to comment.