Skip to content

Commit

Permalink
Merge pull request #173 from microbiomedata/issue-154
Browse files Browse the repository at this point in the history
Update GOLD ecosystem classification path terms
  • Loading branch information
pkalita-lbl authored Jan 25, 2024
2 parents a43f414 + a6dd2cb commit 6cfd782
Show file tree
Hide file tree
Showing 7 changed files with 8,050 additions and 73 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,5 @@ dmypy.json

# Pyre type checker
.pyre/

/examples/output
8 changes: 6 additions & 2 deletions project.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ local/nmdc.yaml
--format yaml [email protected] > $@
- $(RUN) linkml-lint $@ > local/with_modifications.lint_report.txt

src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml
cp $< $@
src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml: local/with_modifications.yaml project/thirdparty/GoldEcosystemTree.json
$(RUN) inject-gold-pathway-terms -g $(word 2,$^) -i $< -o $@
# remove the multivalued true annotation from these global slot definitions for the sake of linkml-convert
# esp to tsv? and dumping to SQLite?
# follow the .string_serialization=="{text};{float} {unit}" and .multivalued == true pattern?
Expand Down Expand Up @@ -310,6 +310,10 @@ project/json/nmdc_submission_schema.json: src/nmdc_submission_schema/schema/nmdc
mkdir -p $(@D)
$(RUN) gen-linkml $< --format json --materialize-patterns --materialize-attributes > $@

# Download the GOLD biosample ecosystem tree (JSON) that is injected into the
# submission schema.
#  - The URL is single-quoted because it contains `?`, which the shell would
#    otherwise treat as a glob character.
#  - wget -O creates/truncates its output file before the transfer starts, so
#    the download goes to a temporary name and is only moved onto the real
#    target after wget succeeds. A failed or interrupted download therefore
#    never leaves an empty target that make would consider up to date.
project/thirdparty/GoldEcosystemTree.json:
	mkdir -p $(@D)
	wget 'https://gold.jgi.doe.gov/download?mode=biosampleEcosystemsJson' -O $@.tmp
	mv -f $@.tmp $@

# Run the `dev` npm script inside data_harmonizer/ once the generated JSON
# schema is up to date. The target names a command, not a file, so it is
# declared phony here; repeating a .PHONY declaration made elsewhere is harmless.
.PHONY: dh-dev
dh-dev: project/json/nmdc_submission_schema.json
	cd data_harmonizer && npm run dev

Expand Down
7,952 changes: 7,952 additions & 0 deletions project/thirdparty/GoldEcosystemTree.json

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ build-backend = "poetry_dynamic_versioning.backend"
docs = ["linkml", "mkdocs-material"]

[tool.poetry.scripts]
oak-tree-to-pv-list = "src.nmdc_submission_schema.datamodel.oak_tree_to_pv_list:main"
dh-json2linkml = 'src.nmdc_submission_schema.datamodel.dh_json2linkml:update_json'
linkml-json2dh = 'src.nmdc_submission_schema.datamodel.linkml_json2dh:extract_lists'
oak-tree-to-pv-list = "nmdc_submission_schema.datamodel.oak_tree_to_pv_list:main"
dh-json2linkml = 'nmdc_submission_schema.datamodel.dh_json2linkml:update_json'
linkml-json2dh = 'nmdc_submission_schema.datamodel.linkml_json2dh:extract_lists'
inject-gold-pathway-terms = 'nmdc_submission_schema.datamodel.gold:main'
53 changes: 0 additions & 53 deletions schemasheets/tsv_in/enums.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -27,34 +27,6 @@ DnaSampleFormatEnum RNAStable placeholder PV descr
DnaSampleFormatEnum TE placeholder PV descr
DnaSampleFormatEnum Water placeholder PV descr
DnaSampleFormatEnum placeholder enum descr
EcosystemCategoryEnum Terrestrial placeholder PV descr https://github.com/microbiomedata/sheets_and_friends/issues/22
EcosystemCategoryEnum placeholder enum descr
EcosystemEnum Environmental placeholder PV descr
EcosystemEnum placeholder enum descr
EcosystemSubtypeEnum Biocrust placeholder PV descr
EcosystemSubtypeEnum Biofilm placeholder PV descr
EcosystemSubtypeEnum Bulk soil placeholder PV descr
EcosystemSubtypeEnum Clay placeholder PV descr
EcosystemSubtypeEnum Floodplain placeholder PV descr
EcosystemSubtypeEnum Fossil placeholder PV descr
EcosystemSubtypeEnum Glacier placeholder PV descr
EcosystemSubtypeEnum Loam placeholder PV descr
EcosystemSubtypeEnum Mineral horizon placeholder PV descr
EcosystemSubtypeEnum Nature reserve placeholder PV descr
EcosystemSubtypeEnum Organic layer placeholder PV descr
EcosystemSubtypeEnum Paddy field/soil placeholder PV descr
EcosystemSubtypeEnum Pasture placeholder PV descr
EcosystemSubtypeEnum Peat placeholder PV descr
EcosystemSubtypeEnum Ranch placeholder PV descr
EcosystemSubtypeEnum Sand placeholder PV descr
EcosystemSubtypeEnum Silt placeholder PV descr
EcosystemSubtypeEnum Soil crust placeholder PV descr
EcosystemSubtypeEnum Unclassified placeholder PV descr
EcosystemSubtypeEnum Watershed placeholder PV descr
EcosystemSubtypeEnum Wetlands placeholder PV descr
EcosystemSubtypeEnum placeholder enum descr
EcosystemTypeEnum Soil placeholder PV descr
EcosystemTypeEnum placeholder enum descr
EnvPackageEnum soil placeholder PV descr I don't think this is a MIxS term anymore, so it make sense to define it here
EnvPackageEnum placeholder enum descr
GrowthFacilEnum experimental_garden placeholder PV descr growth_facility https://genomicsstandardsconsortium.github.io/mixs/growth_facil/ string range according to MIxS
Expand Down Expand Up @@ -89,31 +61,6 @@ RnaSampleFormatEnum placeholder enum descr
SampleTypeEnum soil placeholder PV descr
SampleTypeEnum water_extract_soil placeholder PV descr
SampleTypeEnum placeholder enum descr
SpecificEcosystemEnum Agricultural placeholder PV descr
SpecificEcosystemEnum Agricultural land placeholder PV descr
SpecificEcosystemEnum Agricultural soil placeholder PV descr
SpecificEcosystemEnum Alpine placeholder PV descr
SpecificEcosystemEnum Bog placeholder PV descr
SpecificEcosystemEnum Boreal forest placeholder PV descr
SpecificEcosystemEnum Contaminated placeholder PV descr
SpecificEcosystemEnum Desert placeholder PV descr
SpecificEcosystemEnum Farm placeholder PV descr
SpecificEcosystemEnum Forest soil placeholder PV descr
SpecificEcosystemEnum Forest Soil placeholder PV descr
SpecificEcosystemEnum Grasslands placeholder PV descr
SpecificEcosystemEnum Meadow placeholder PV descr
SpecificEcosystemEnum Mine placeholder PV descr
SpecificEcosystemEnum Mine drainage placeholder PV descr
SpecificEcosystemEnum Oil-contaminated placeholder PV descr
SpecificEcosystemEnum Orchard soil placeholder PV descr
SpecificEcosystemEnum Permafrost placeholder PV descr
SpecificEcosystemEnum Riparian soil placeholder PV descr
SpecificEcosystemEnum River placeholder PV descr
SpecificEcosystemEnum Shrubland placeholder PV descr
SpecificEcosystemEnum Tropical rainforest placeholder PV descr
SpecificEcosystemEnum Unclassified placeholder PV descr
SpecificEcosystemEnum Uranium contaminated placeholder PV descr
SpecificEcosystemEnum placeholder enum descr
StoreCondEnum fresh placeholder PV descr storage_condt see samp_store_dur, samp_store_loc, samp_store_temp and store_cond (which takes a string range according to mixs-source).
StoreCondEnum frozen placeholder PV descr
StoreCondEnum lyophilized placeholder PV descr
Expand Down
25 changes: 10 additions & 15 deletions sheets_and_friends/tsv_in/modifications_long.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,17 @@ JgiMgInterface dna_seq_project_name replace_attribute required true
JgiMgInterface dna_seq_project_pi replace_attribute required true
JgiMgInterface dna_volume replace_attribute required true
JgiMtInterface dnase_rna replace_attribute required true
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface ecosystem add_attribute notes range: string or EcosystemEnum? The enum could be dynamic or might be soil biased.
SoilInterface ecosystem replace_attribute range EcosystemEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem replace_attribute range string
SoilInterface ecosystem replace_attribute range EcosystemForSoilEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem replace_attribute range EcosystemEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface|WaterInterface ecosystem replace_attribute recommended true
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface ecosystem_category add_attribute notes string or EcosystemCategoryEnum range?
SoilInterface ecosystem_category replace_attribute range EcosystemCategoryEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem_category replace_attribute range string
SoilInterface ecosystem_category replace_attribute range EcosystemCategoryForSoilEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem_category replace_attribute range EcosystemCategoryEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface|WaterInterface ecosystem_category replace_attribute recommended true
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface ecosystem_subtype add_attribute notes string or EcosystemSubtypeEnum range?
SoilInterface ecosystem_subtype replace_attribute range EcosystemSubtypeEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem_subtype replace_attribute range string
SoilInterface ecosystem_subtype replace_attribute range EcosystemSubtypeForSoilEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem_subtype replace_attribute range EcosystemSubtypeEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface|WaterInterface ecosystem_subtype replace_attribute recommended true
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface ecosystem_type add_attribute notes string or EcosystemTypeEnum range?
SoilInterface ecosystem_type replace_attribute range EcosystemTypeEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem_type replace_attribute range string
SoilInterface ecosystem_type replace_attribute range EcosystemTypeForSoilEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface ecosystem_type replace_attribute range EcosystemTypeEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface|WaterInterface ecosystem_type replace_attribute recommended true
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface elev overwrite_examples examples 225| 0|1250
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface elev replace_attribute range float
Expand Down Expand Up @@ -203,9 +199,8 @@ DhMultiviewCommonColumnsMixin source_mat_id overwrite_examples examples IGSN:A
DhMultiviewCommonColumnsMixin source_mat_id add_attribute notes The source material IS the Globally Unique ID
DhMultiviewCommonColumnsMixin source_mat_id replace_attribute string_serialization {text}:{text}
DhMultiviewCommonColumnsMixin source_mat_id add_attribute todos Currently, the comments say to use UUIDs. However, if we implement assigning NMDC identifiers with the minter we dont need to require a GUID. It can be an optional field to fill out only if they already have a resolvable ID.
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface specific_ecosystem add_attribute notes string or SpecificEcosystemEnum range?
SoilInterface specific_ecosystem replace_attribute range SpecificEcosystemEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface specific_ecosystem replace_attribute range string
SoilInterface specific_ecosystem replace_attribute range SpecificEcosystemForSoilEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|WastewaterSludgeInterface specific_ecosystem replace_attribute range SpecificEcosystemEnum
AirInterface|BiofilmInterface|BuiltEnvInterface|HcrCoresInterface|HcrFluidsSwabsInterface|HostAssociatedInterface|MiscEnvsInterface|PlantAssociatedInterface|SedimentInterface|SoilInterface|WastewaterSludgeInterface|WaterInterface specific_ecosystem replace_attribute recommended true
SoilInterface store_cond replace_attribute description Explain how the soil sample is stored (fresh/frozen/other).
SoilInterface store_cond replace_attribute range StoreCondEnum
Expand Down
76 changes: 76 additions & 0 deletions src/nmdc_submission_schema/datamodel/gold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import json
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import Dict, Set, Iterable, Optional

import click
from linkml_runtime import SchemaView
from linkml_runtime.dumpers import yaml_dumper
from linkml_runtime.linkml_model import EnumDefinition, PermissibleValue, SchemaDefinition
from linkml_runtime.loaders import yaml_loader


class PathwayLevel(Enum):
    """Levels of the GOLD ecosystem classification path, ordered root to leaf.

    Definition order matters: the tree walker indexes ``list(PathwayLevel)`` by
    nesting depth. Each value is the CamelCase stem from which the generated
    schema enum names are built (``<value><suffix>Enum``).
    """

    ecosystem = "Ecosystem"
    ecosystem_category = "EcosystemCategory"
    ecosystem_type = "EcosystemType"
    ecosystem_subtype = "EcosystemSubtype"
    specific_ecosystem = "SpecificEcosystem"


class GoldTermSet:
    """Term names collected from the GOLD tree, grouped by pathway level.

    A term set may carry a per-level allow-list (``include``) and a suffix that
    distinguishes the schema enums generated from it.
    """

    def __init__(self, *, schema_enum_suffix: str = '', include: Optional[Dict[PathwayLevel, Iterable[str]]] = None):
        """
        :param schema_enum_suffix: text placed between the level name and
            ``Enum`` when naming the generated schema enums.
        :param include: optional allow-list of term names per level. A level
            that is absent, or whose allow-list is empty, is not filtered.
        """
        self.schema_enum_suffix: str = schema_enum_suffix
        # Names gathered so far; unseen levels default to an empty set.
        self.terms: Dict[PathwayLevel, Set[str]] = defaultdict(set)
        # Each allow-list is membership-tested once per visited tree node, so
        # materialize it up front. This keeps one-shot iterables (generators)
        # usable for more than a single lookup, and makes a bare string mean
        # "exactly this one name" rather than a substring match.
        self.include: Dict[PathwayLevel, Iterable[str]] = {
            level: frozenset([names] if isinstance(names, str) else names)
            for level, names in (include or {}).items()
        }


def inject_gold_pathway_terms(gold_ecosystem_tree_path: Path, input_schema_path: SchemaDefinition) -> SchemaDefinition:
    """Add enums of GOLD ecosystem pathway terms to a schema.

    Two families of enums are added, one enum per pathway level in each:

    * ``<Level>Enum`` holds every term name found at that depth of the tree.
    * ``<Level>ForSoilEnum`` is restricted to the ``Environmental`` >
      ``Terrestrial`` > ``Soil`` branch and everything nested beneath it.

    Permissible values are emitted in sorted order.

    :param gold_ecosystem_tree_path: JSON file whose root object has a
        ``children`` list; every node has a ``name`` and, optionally, its own
        ``children`` list.
    :param input_schema_path: despite its name, this value is handed straight
        to ``SchemaView``; the command-line entry point in this module passes
        an already-loaded ``SchemaDefinition``.
    :return: the schema held by the ``SchemaView`` after the enums were added.
    :raises IndexError: if the tree nests deeper than the known pathway levels.
    """
    # JSON is UTF-8; do not depend on the platform's default text encoding.
    with open(gold_ecosystem_tree_path, encoding='utf-8') as f:
        gold_ecosystem_tree = json.load(f)

    levels = list(PathwayLevel)
    all_terms = GoldTermSet()
    soil_terms = GoldTermSet(schema_enum_suffix='ForSoil', include={
        PathwayLevel.ecosystem: ['Environmental'],
        PathwayLevel.ecosystem_category: ['Terrestrial'],
        PathwayLevel.ecosystem_type: ['Soil'],
    })
    term_sets = [all_terms, soil_terms]

    def _add_term(term_set, term, level_index):
        """Record ``term`` at its depth, then recurse into its children."""
        if level_index >= len(levels):
            # Same exception type as the bare list lookup would raise, but
            # with a message that says which node is the problem.
            raise IndexError(
                f"GOLD term {term['name']!r} is nested deeper than the "
                f"{len(levels)} known pathway levels"
            )
        level = levels[level_index]
        allowed = term_set.include.get(level)
        # A rejected term prunes its whole subtree.
        if allowed and term['name'] not in allowed:
            return
        term_set.terms[level].add(term['name'])
        # `or []` also tolerates an explicit `"children": null`.
        for child in term.get('children') or []:
            _add_term(term_set, child, level_index + 1)

    # The root object itself is not a term; only its children are ecosystems.
    for term_set in term_sets:
        for ecosystem in gold_ecosystem_tree['children']:
            _add_term(term_set, ecosystem, level_index=0)

    schemaview = SchemaView(input_schema_path)

    for term_set in term_sets:
        for level in levels:
            pvs = [PermissibleValue(text=term) for term in sorted(term_set.terms[level])]
            schemaview.add_enum(EnumDefinition(
                name=level.value + term_set.schema_enum_suffix + 'Enum',
                permissible_values=pvs
            ))

    return schemaview.schema

# `-g` is required and must be a file: without it the command would reach
# Path(None) and die with a bare TypeError instead of a usage message.
@click.command()
@click.option('--gold-ecosystem-tree-path', '-g', type=click.Path(exists=True, dir_okay=False), required=True,
              help='Path to the gold ecosystem tree.')
@click.option('--input-file', '-i', type=click.File('r'), default="-",
              help='Schema YAML to read; defaults to standard input.')
@click.option('--output-file', '-o', type=click.File('w'), default="-",
              help='Where to write the resulting schema YAML; defaults to standard output.')
def main(gold_ecosystem_tree_path, input_file, output_file):
    """Read a schema, inject the GOLD pathway term enums, and write it back out as YAML."""
    schema = yaml_loader.loads(input_file.read(), SchemaDefinition)
    output = inject_gold_pathway_terms(Path(gold_ecosystem_tree_path), schema)
    print(yaml_dumper.dumps(output), file=output_file)

0 comments on commit 6cfd782

Please sign in to comment.