diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 0000000..4fd9e94 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,40 @@ +# This is a basic workflow to help you get started with Actions + +name: Run tests + +# Controls when the action will run. +on: + # Triggers the workflow on pull request events (opened, synchronize, reopened) on any branch + pull_request: + types: [opened, synchronize, reopened] + + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + strategy: + matrix: + python: [ "3.9", "3.10", "3.11" ] + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + name: setup python environment + with: + python-version: ${{ matrix.python }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install poetry + poetry install + + - name: Run tests + run: | + poetry run pytest tests/* diff --git a/Makefile b/Makefile index 444c572..e179fa7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ WGET=wget RUN=poetry run +include Makefiles/env_broad_scale.Makefile #include Makefiles/env_broad_scale.Makefile #include Makefiles/soil-env_medium.Makefile include Makefiles/envo.Makefile @@ -23,6 +24,200 @@ include Makefiles/soil-env_medium.Makefile local/microbiomedata-repos.csv: .
./report-microbiomedata-repos.sh > $@ +# NMDC SCHEMA STUFF +downloads/nmdc_submission_schema.yaml: + wget -O $@ $(SUBMISSION_SCHEMA_URL) + +local/established-value-sets-from-submission-schema.json: downloads/nmdc_submission_schema.yaml + yq -o=json e '{"enums": {"EnvBroadScaleSoilEnum": .enums.EnvBroadScaleSoilEnum, "EnvLocalScaleSoilEnum": .enums.EnvLocalScaleSoilEnum, "EnvMediumSoilEnum": .enums.EnvMediumSoilEnum}}' $< | cat > $@ # ~ 48 + +local/nmdc-submission-schema-enums-keys.txt: downloads/nmdc_submission_schema.yaml + yq eval '.enums | keys | .[]' $< | sort > $@ + +local/EnvBroadScaleSoilEnum-pvs-keys.txt: downloads/nmdc_submission_schema.yaml + yq eval '.enums.EnvBroadScaleSoilEnum.permissible_values | keys | .[]' $< | cat > $@ + +local/EnvBroadScaleSoilEnum-pvs-keys-parsed.csv: local/EnvBroadScaleSoilEnum-pvs-keys.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv: local/EnvBroadScaleSoilEnum-pvs-keys-parsed.csv + cut -f3,4 -d, $< | head -n 1 > $<.header.csv + tail -n +2 $< | cut -f3,4 -d, | sort | uniq > $@.tmp + cat $<.header.csv $@.tmp > $@ + rm -rf $<.header.csv $@.tmp + +local/EnvBroadScaleSoilEnum.png: local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv + cat $< | tail -n +2 | cut -f1 -d, > $@.ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill [ .idfile $@.ids.txt ] .or biome + rm -rf $@.ids.txt + +local/EnvMediumSoilEnum-pvs-keys.txt: downloads/nmdc_submission_schema.yaml + yq eval '.enums.EnvMediumSoilEnum.permissible_values | keys | .[]' $< | cat > $@ + +local/EnvMediumSoilEnum-pvs-keys-parsed.csv: local/EnvMediumSoilEnum-pvs-keys.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv: local/EnvMediumSoilEnum-pvs-keys-parsed.csv + cut -f3,4 -d, $< | head -n 1 > $<.header.csv + tail -n +2 $< | cut -f3,4 -d, 
| sort | uniq > $@.tmp + cat $<.header.csv $@.tmp > $@ + rm -rf $<.header.csv $@.tmp + +local/EnvMediumSoilEnum.png: local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv + cat $< | tail -n +2 | cut -f1 -d, > $@.ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --gap-fill --no-view --output $@ .idfile $@.ids.txt + rm -rf $@.ids.txt + +# NMDC METADATA STUFF +downloads/nmdc-production-studies.json: + wget -O $@.bak https://api.microbiomedata.org/nmdcschema/study_set?max_page_size=999999 + yq '.resources' -o=json $@.bak | cat > $@ + rm -rf $@.bak + +downloads/nmdc-production-biosamples.json: + wget -O $@.bak https://api.microbiomedata.org/nmdcschema/biosample_set?max_page_size=999999 + yq '.resources' -o=json $@.bak | cat > $@ + rm -rf $@.bak + +local/nmdc-production-biosamples-5pct.json: downloads/nmdc-production-biosamples.json + $(RUN) random-sample-resources \ + --input-file $< \ + --output-file $@ \ + --sample-percentage 5 + +local/nmdc-production-biosamples-json-to-context.tsv: downloads/nmdc-production-biosamples.json + $(RUN) biosample-json-to-context-tsv \ + --input-file $< \ + --output-file $@ + +local/nmdc-production-biosamples-env-package.json: + curl -X 'GET' \ + 'https://api.microbiomedata.org/nmdcschema/biosample_set?max_page_size=999999&projection=env_package' \ + -H 'accept: application/json' > $@.bak + yq '.resources' -o=json $@.bak | cat > $@ # ENVO:00001998 is also soil + rm -rf $@.bak + +local/nmdc-production-studies-images.csv: downloads/nmdc-production-studies.json + $(RUN) python external_metadata_awareness/study-image-table.py \ + --input-file $< \ + --output-file $@ + +#### + +# biosamples that are part of a particular study +downloads/sty-11-ev70y104_biosamples.json: + wget -O $@.bak 'https://api.microbiomedata.org/nmdcschema/biosample_set?filter=%7B%22part_of%22%3A%20%22nmdc%3Asty-11-ev70y104%22%7D&max_page_size=999999' + yq -o=json e '.resources' $@.bak | cat > $@ + rm -rf $@.bak + +# metadata about a particular study 
+downloads/sty-11-ev70y104_study.json: + wget -O $@.bak 'https://api.microbiomedata.org/nmdcschema/ids/nmdc%3Asty-11-ev70y104' + yq -o=json e '.' $@.bak | cat > $@ + rm -rf $@.bak + +#### + +valid-env_broad_scale-biosample-all: valid-env_broad_scale-biosample-clean \ +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv \ +local/ncbi-biosamples-context-value-counts-failures.csv + +valid-env_broad_scale-biosample-clean: + rm -rf local/biome-info.txt \ + local/envo-info.csv \ + local/envo-info.txt \ + local/ncbi-biosamples-context-value-counts.csv \ + local/ncbi-biosamples-context-value-counts-normalized.csv \ + local/ncbi-biosamples-context-value-counts-real-labels.csv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated.tsv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-1.tsv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.tsv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv + +local/ncbi-biosamples-context-value-counts.csv: + $(RUN) count-biosample-context-vals-from-postgres \ + --output-file $@ \ + --min-count 2 + +local/ncbi-biosamples-context-value-counts-normalized.csv: local/ncbi-biosamples-context-value-counts.csv + $(RUN) normalize-envo-data \ + --count-col-name total_count \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ \ + --val-col-name value + +local/ncbi-biosamples-context-value-counts-failures.csv: local/ncbi-biosamples-context-value-counts-normalized.csv + $(RUN) find-envo-present-no-curie-extracted \ + --input-file $< \ + --output-file $@ + +local/envo-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i continuant > $@ # or .ALL + +local/envo-info.csv: local/envo-info.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels.csv: 
local/ncbi-biosamples-context-value-counts-normalized.csv local/envo-info.csv + $(RUN) merge-in-reference-data \ + --keep-file $(word 1,$^) \ + --keep-key normalized_curie \ + --reference-file $(word 2,$^) \ + --reference-key normalized_curie \ + --reference-addition normalized_label \ + --addition-rename real_label \ + --merged-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated.tsv: local/ncbi-biosamples-context-value-counts-real-labels.csv + date ; $(RUN) runoak \ + --input sqlite:obo:envo annotate \ + --matches-whole-text \ + --output-type tsv \ + --output $@ \ + --text-file $< \ + --match-column normalized_label ; date + +local/biome-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00000428 > $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-1.tsv: local/biome-info.txt \ +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated.tsv + $(RUN) detect-curies-in-subset \ + --tsv-file $(word 2,$^) \ + --class-info-file $(word 1,$^) \ + --tsv-column-name normalized_curie \ + --subset-label biome \ + --output-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.tsv: local/biome-info.txt \ +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-1.tsv + $(RUN) detect-curies-in-subset \ + --tsv-file $(word 2,$^) \ + --class-info-file $(word 1,$^) \ + --tsv-column-name matched_id \ + --subset-label biome \ + --output-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv: local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.tsv + $(RUN) or-boolean-columns \ + --input-file $< \ + --output-file $@ \ + --column1 "normalized_curie_biome" \ + --column2 "matched_id_biome" + +detected-annotations-to-postgres: local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv + $(RUN) load-tsv-into-postgres \ + --tsv-file $< \ + --table-name detected_annotations 
local/envo_goldterms.db: $(RUN) runoak --input sqlite:obo:envo ontology-metadata --all > /dev/null # ensure semsql file is cached $(RUN) runoak --input sqlite:obo:goldterms ontology-metadata --all > /dev/null # ensure semsql file is cached @@ -31,5 +226,3 @@ local/envo_goldterms.db: --primary-db local/envo.db \ --secondary-db ~/.data/oaklib/goldterms.db mv local/envo.db $@ - - diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml new file mode 100755 index 0000000..c7fe387 --- /dev/null +++ b/config/env-local-scale-extraction-config.yaml @@ -0,0 +1,167 @@ +# env-local-scale-extraction-config.yaml +output: "local/env-local-scale-candidates.txt" +entity: "material entity" +text_exclusions: + - "gaseous" + - "marine" + - "undersea" + - "saline" + - "brackish" + - "undersea" +post_process_inclusion_single_terms: + - "bridge" + - "road" + - "wildlife management area" +term_and_descendant_exclusions: + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "RO:0001025" + - "RO:0001025" + - "RO:0002473" + - "NCBITaxon:1" + - "administrative region" + - "aeroform" + - "anatomical entity" + - "anatomical entity environment" + - "area protected according to IUCN guidelines" + - "astronomical body" + - "astronomical object" + - "biome" + - "channel of a watercourse" + - "chemical entity" + - "cloud" + - "collection of organisms" + - "cryospheric layer" + - "ecozone" + - "ecosystem" + - "environmental material" + - "environmental monitoring area" + - "environmental system" + - "environmental zone" + - "fluid layer" + - "healthcare facility" + - "ice" + - "interface layer" + - "island" + - "lake layer" + - "manufactured product" + - "marine environmental zone" + - "marine littoral zone" + - "mass of environmental material" + - "mass of liquid" + - "material isosurface" + - "meteor" + - "observing system" + - "organic material" + - "organism" + - "particle" + - "planetary 
structural layer" + - "political entity" + - "protected area" + - "room" + - "saline water" + - "sea floor" + - "subatomic particle" + - "transport feature" + - "water body" + - "water body" + - "water current" +"single_term_exclusions": + - "anthropised terrestrial environmental zone" + - "anthropogenic contamination feature" + - "anthropogenic geographic feature" + - "area of attached faunal communities" + - "area of attached mussel assemblages" + - "area of developed space" + - "astronomical body part" + - "biosphere" + - "body of liquid" + - "carbonate system of ocean water" + - "cellular organisms" + - "child care facility" + - "cloud part" + - "compound astronomical body part" + - "construction" + - "conveyor system" + - "cryoform" + - "educational facility" + - "environmental zone" + - "environmental zone of processual equilibrium" + - "facility" + - "fiat object" + - "fiat part of an astronomical object" + - "floating ice mass" + - "fluid astronomical body part" + - "fresh water body" + - "gaseous astronomical body part" + - "gaseous part of an atmosphere" + - "geographic feature" + - "hail stone" + - "hydroform" + - "hydrographic feature" + - "hydrosphere" + - "ice decumulation zone" + - "landform" + - "layer" + - "liquid astronomical body part" + - "lotic water body" + - "marine hydrothermal vent" + - "marine reef" + - "marine water body" + - "marine water mass" + - "mass of compounded environmental materials" + - "mass of environmental material" + - "mass of solid material" + - "material accumulation zone" + - "material decumulation zone" + - "material entity" + - "object" + - "object aggregate" + - "ocean basin" + - "open cage mariculture facility" + - "organismal entity" + - "pedosphere" + - "planetary photic zone" + - "planetary subsurface zone" + - "pole" + - "polling place" + - "polling station" + - "processed material" + - "processing plant" + - "public infrastructure" + - "public transit system" + - "rapid transit system" + - "rain" + - "range of 
seamounts" + - "rocky reef" + - "root" + - "saline water body" + - "sea ice floe" + - "sea ice hummock" + - "sea ice mass" + - "seamount" + - "sleet pellet" + - "sleet pellet" + - "soil horizon" + - "soil layer" + - "solid astronomical body part" + - "solid layer" + - "subsurface landform" + - "subsurface zone of an astronomical body" + - "surface landform" + - "system" + - "Taylor column" + - "technosphere" + - "underground water body" + - "volcanic feature" + - "water body" + - "watercourse" + - "water mass" + - "water-based rain" + + + diff --git a/config/oak-config.yaml b/config/oak-config.yaml new file mode 100644 index 0000000..7b45869 --- /dev/null +++ b/config/oak-config.yaml @@ -0,0 +1,3 @@ +ontology_resources: + envo: + selector: sqlite:obo:envo \ No newline at end of file diff --git a/env_triad.Makefile b/env_triad.Makefile new file mode 100644 index 0000000..5bbd240 --- /dev/null +++ b/env_triad.Makefile @@ -0,0 +1,197 @@ +WGET=wget +RUN=poetry run + +# preferable to use a tagged release, but theres good stuff in this commit that hasn't been released yet +MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml +SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml + +# ENVO STUFF +# getting fragments of EnvO because the whole thing is too large to feed into an LLM +# our guideline is that env_broad_scale should be answered with an EnvO biome subclass + +# these OAK commands fetch the latest EnvO SQLite file from a BBOP S3 bucket +# it may be a few days behind the envo.owl file form the EnvO GH repo +# use `runoak cache-ls` to see where the SQLite files are cached + +local/biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00000428 > $@ + # !!! pivot? include entailment? 
--include-entailed / --no-include-entailed; --non-redundant-entailed / --no-non-redundant-entailed + # LLM web interfaces might want CSVs + +local/biome-relationships.csv: local/biome-relationships.tsv + sed 's/\t/,/g' $< > $@ + #awk 'BEGIN {FS="\t"; OFS=","} {print $$0}' $< > $@ + rm -rf $< + +local/biome-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00000428 > $@ + # !!! try different formats? or predicate list? + +local/biome-metadata.json: local/biome-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +# our guideline is that env_medium should be answered with an EnvO biome subclass +local/environmental-materials-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-relationships.csv: local/environmental-materials-relationships.tsv + sed 's/\t/,/g' $< > $@ + rm -rf $< + +local/environmental-materials-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-metadata.json: local/environmental-materials-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +local/environmental-material-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00010483 > $@ + +local/aquatic-biome-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00002030 > $@ # --output-type tsv has lots of info but wrapped in square brackets + +local/aquatic-biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships --output-type tsv --output $@ .desc//p=i ENVO:00002030 + +local/aquatic-biome.png: + $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill .desc//p=i ENVO:00002030 + +local/soil-env_broad_scale-algebraic.txt: + $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ .desc//p=i biome .not .desc//p=i 'aquatic biome' ] .not .desc//p=i 'forest biome' ] .not .desc//p=i 'grassland biome' ] .not .desc//p=i 
'desert biome' ] .not biome ] .not 'cropland biome' > $@ + +local/soil-env_broad_scale-algebraic.csv: local/soil-env_broad_scale-algebraic.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + + +## for env medium +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv: local/environmental-material-info.txt \ +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name normalized-curie \ +# --subset-label environmental-material \ +# --output-file $@ +# +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-4.csv: local/environmental-material-info.txt \ +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name matched_id \ +# --subset-label environmental_material \ +# --output-file $@ + + +# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE +# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions +local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql + $(RUN) sql-to-tsv \ + --sql-file $< \ + --output-file $@ + +#### + +local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ +local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ +local/biome-relationships.csv + $(RUN) build-prompt-from-template \ + --spec-file-path $(word 1,$^) \ + --output-file-path $@ + +# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest +# gemini models don't seem to take a temperature parameter +# cborg/claude-sonnet +local/unused-terrestrial-biomes-response.txt: 
local/unused-terrestrial-biomes-prompt.txt + cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ + +#### + +#local/env-local-scale-candidates.txt: +# $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 
'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ +# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE +# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions +local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql + $(RUN) sql-to-tsv \ + --sql-file $< \ + --output-file $@ + +#### + +local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ +local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ +local/biome-relationships.csv + $(RUN) build-prompt-from-template \ + --spec-file-path $(word 1,$^) \ + --output-file-path $@ + +# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest +# gemini models don't seem to take a temperature parameter +# cborg/claude-sonnet +local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-prompt.txt + cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ + +#### + +# remove .desc//p=BFO:0000050 'marine littoral zone' .or .desc//p=BFO:0000050 'saline water' .or .desc//p=RO:0001025 'water body' .or .desc//p=RO:0001025 'water body' .or .desc//p=RO:0002473 ' because of .or 'l~marine' +# remove .or .desc//p=i 'organic material' because of .or .desc//p=i 'environmental material' +# remove .or .desc//p=i 'mass of liquid' because of .or .desc//p=i 'mass of environmental material' +# removed .or .desc//p=i NCBITaxon:1 because of .or .desc//p=i 'organism' +# .or .desc//p=i 'organic material' (.desc//p=i 'environmental material') +# .or .desc//p=i 'gas planet' (.desc//p=i 'environmental material') + 
+local/env-local-scale-candidates-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i 'material entity' .not [ .desc//p=i 'biome' .or .desc//p=i 'environmental material' .or .desc//p=i 'meteorite' .or .desc//p=i 'chemical entity' .or .desc//p=i 'organic material' .or .desc//p=i 'anatomical entity' .or .desc//p=i 'organism' .or .desc//p=i 'plant anatomical entity' .or .desc//p=i 'healthcare facility' .or .desc//p=i 'fluid layer' .or .desc//p=i 'interface layer' .or .desc//p=i 'manufactured product' .or .desc//p=i 'anatomical entity environment' .or .desc//p=i 'ecosystem' .or .desc//p=i 'area protected according to IUCN guidelines' .or .desc//p=i 'astronomical body' .or .desc//p=i 'astronomical object' .or .desc//p=i 'cloud' .or .desc//p=i 'collection of organisms' .or .desc//p=i 'environmental system' .or .desc//p=i 'ecozone' .or .desc//p=i 'environmental zone' .or .desc//p=i 'water current' .or .desc//p=i 'mass of environmental material' .or .desc//p=i 'subatomic particle' .or .desc//p=i 'observing system' .or .desc//p=i 'particle' .or .desc//p=i 'planetary structural layer' .or .desc//p=i 'political entity' .or .desc//p=i 'meteor' .or .desc//p=i 'room' .or .desc//p=i 'transport feature' .or .desc//p=i 'mass of liquid' .or .desc//p=RO:0001025 'water body' .or .desc//p=BFO:0000050 'environmental monitoring area' .or .desc//p=BFO:0000050 'marine littoral zone' .or .desc//p=BFO:0000050 'marine environmental zone' .or .desc//p=RO:0002473 'sea floor' .or .desc//p=BFO:0000050 'saline water' .or .desc//p=BFO:0000050 'ice' .or .desc//p=RO:0001025 'water body' .or .desc//p=i 'administrative region' .or .desc//p=i 'protected area' .or .desc//p=i 'channel of a watercourse' .or .desc//p=i 'cryospheric layer' .or 'l~gaseous' .or 'l~marine' .or .desc//p=i 'material isosurface' .or 'l~undersea' .or .desc//p=i NCBITaxon:1 .or 'l~saline' .or 'l~brackish' .or .desc//p=i 'aeroform' ] > $@ + +local/envo-leaves.txt: + $(RUN) runoak --input 
sqlite:obo:envo leafs > $@ + +local/envo-leaf-ids.txt: local/envo-leaves.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-candidate-ids.txt: local/env-local-scale-candidates.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-non-leaf.txt: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo info .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] > $@ + +local/env-local-scale-non-leaf.csv: local/env-local-scale-non-leaf.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --gap-fill .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] + +local/goldData.xlsx: + wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel" + +local/goldData_biosamples.csv: local/goldData.xlsx # for counting biosamples with a path that corresponds to en env_local_scale from local/goldterms-env_local_scale-of-environmental-terrestrial-soil-counts.txt + $(RUN) python -c "import pandas as pd; import sys; pd.read_excel(sys.argv[1], sheet_name=sys.argv[3]).to_csv(sys.argv[2], index=False)" $< $@ Biosample + +local/goldterms-env_local_scale-of-environmental-terrestrial-soil-counts.txt: # counts by path, not by bold biosamples + $(RUN) runoak --input sqlite:obo:goldterms query --output $@.bak --query "SELECT s.object, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_local' group by s.object order by count(1) desc" + cut -f 1,3 $@.bak > $@ + rm -rf $@.bak + + +###### SIERRA's STUFF ####### + +# local/env-local-scale-candidates.txt: +# $(RUN) runoak --input sqlite:obo:envo info .desc//p=i 'material entity' .not [ .desc//p=i 'biome'.or .desc//p=i 'environmental material' .or .desc//p=i 'anatomical entity' .or 
.desc//p=i 'chemical entity' .or .desc//p=i 'environmental system' .or .desc//p=i 'administrative region' .or .desc//p=i 'aeroform' .or .desc//p=i 'anatomical entity environment' .or .desc//p=i 'area protected according to IUCN guidelines' .or .desc//p=i 'astronomical object' .or .desc//p=i 'building part' .or .desc//p=i 'channel of a watercourse' .or .desc//p=i 'collection of organisms' .or .desc//p=i 'cryospheric layer' .or .desc//p=i 'ecozone' .or .desc//p=i 'environmental monitoring area' .or .desc//p=i 'fluid layer' .or .desc//p=i 'healthcare facility' .or .desc//p=i 'interface layer' .or .desc//p=i 'manufactured product' .or .desc//p=i 'mass of biological material' .or .desc//p=i 'mass of fluid' .or .desc//p=i 'material isosurface' .or .desc//p=i 'meteor' .or .desc//p=i 'meteorite' .or .desc//p=i 'observing system' .or .desc//p=i 'organic object' .or .desc//p=i 'organism' .or .desc//p=i 'particle' .or .desc//p=i 'piece of plastic' .or .desc//p=i 'piece of rock' .or .desc//p=i 'planetary structural layer' .or .desc//p=i 'plant anatomical entity' .or .desc//p=i 'political entity' .or .desc//p=i 'protected area' .or .desc//p=i 'subatomic particle' .or .desc//p=i 'transport feature' .or .desc//p=i 'water current' .or .desc//p=i,p 'l~undersea' ] .or bridge .or road .or 'wildlife management area' .or .desc//p=i 'lake layer' .or .desc//p=i island > $@ + +generate-env-local-scale-candidates: + # Ensure the poetry environment is activated and run the script with the specified config + $(RUN) python external_metadata_awareness/envo_local_scale_extraction.py \ + --oak-config-file config/oak-config.yaml \ + --extraction-config-file config/env-local-scale-extraction-config.yaml + +test: + $(RUN) pytest tests/* +###### END SIERRA's STUFF ####### diff --git a/external_metadata_awareness/envo_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py new file mode 100755 index 0000000..67cd74a --- /dev/null +++ 
b/external_metadata_awareness/envo_local_scale_extraction.py @@ -0,0 +1,142 @@ +import logging +from typing import List +import yaml +import click +from oaklib import get_adapter +from oaklib.query import onto_query, SimpleQueryTerm + +# Configure logging +logging.basicConfig(level=logging.WARN, format='%(asctime)s - %(levelname)s - %(message)s') + + +def load_configs(oak_config_file, extraction_config_file): + with open(oak_config_file, 'r') as file: + oak_config = yaml.safe_load(file) + with open(extraction_config_file, 'r') as file: + extraction_config = yaml.safe_load(file) + return oak_config, extraction_config + + +def create_exclusion_list(term_labels, adapter) -> List[str]: + """ + Creates a combined FunctionQuery to exclude specific terms and their descendants. + + :param term_labels: List of term labels to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified terms and their descendants. + """ + all_ids_to_exclude = [] + for label in term_labels: + # Find the CURIE for the label + term_curies = onto_query(SimpleQueryTerm(term=label), adapter) + if term_curies: + term_curie = term_curies[0] # Assuming one CURIE per label + # Create a descendant exclusion query for the term + list_to_exclude = onto_query([".desc//p=i", term_curie], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) + + +def create_text_exclusion_list(text_exclusions, adapter): + """ + Creates a combined FunctionQuery to exclude specific terms based on text matching. + + :param text_exclusions: List of text patterns to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified text matches. 
+ """ + + all_ids_to_exclude = [] + + for text in text_exclusions: + # Find the CURIE for the label + list_to_exclude = onto_query(["l~"+text], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) + + +def exclude_terms(full_list, exclusion_list): + """ + Returns a list of items from the full list with the items in the exclusion list removed. + + :param full_list: List of items to be filtered. + :param exclusion_list: List of items to exclude from the full list. + :return: A list with items from exclusion_list removed. + """ + return [item for item in full_list if item not in exclusion_list] + + +def retrieve_individual_terms(terms_to_retrieve: List[str], adapter) -> List[str]: + """ + Creates a list of CURIEs based on the provided list of term labels. + + :param terms_to_retrieve: List of term labels. + :param adapter: The ontology adapter. + """ + all_ids = [] + + for term_label in terms_to_retrieve: + # Find the CURIE for the label + list_to_exclude = onto_query([term_label], adapter) + print("term_label", term_label) + print("list_to_exclude", list_to_exclude) + all_ids.extend(list_to_exclude) + return list(set(all_ids)) + + +def extract_terms_to_file(oak_config_file, extraction_config): + # Load the ontology using the get_adapter function + envo = get_adapter(oak_config_file) + + # Get the entity and exclusions from the config + initial_term_label = extraction_config['entity'] + initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) + logging.info(f"Length of initial term list: {len(initial_term_list)}") + + exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), + envo) + + exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), + envo) + + exclude_single_terms = retrieve_individual_terms(extraction_config.get('single_term_exclusions', []), envo) + solo_inclusion_terms = 
extraction_config.get('post_process_inclusion_single_terms', []) + logging.info("solo_inclusion_terms", solo_inclusion_terms) + post_process_inclusion_single_terms = retrieve_individual_terms(extraction_config.get('post_process_inclusion_single_terms', []), envo) + logging.info("post_process_inclusion_terms", post_process_inclusion_single_terms) + + + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exclude_single_terms + logging.info(f"Length of excluded terms and descendants: {len(exclusion_terms_and_children)}") + logging.info(f"Length of excluded terms from text: {len(exclusion_terms_from_text)}") + logging.info(f"Length of excluded terms from solo terms: {len(post_process_inclusion_single_terms)}") + + remaining_items = exclude_terms(initial_term_list, exclusion_list) + logging.info(f"Length of remaining items: {len(remaining_items)}") + + final_list_to_retrieve = post_process_inclusion_single_terms + remaining_items + + results = onto_query(final_list_to_retrieve, envo, labels=True) + + # Write the results to the output file specified in the extraction config + output_file_path = extraction_config['output'] + with open(output_file_path, 'w') as output_file: + for curie, label in results: + output_file.write(f"{curie}: {label}\n") + + logging.info(f"Results written to {output_file_path}") + + +@click.command() +@click.option('--extraction-config-file', required=True, help='Path to the extraction YAML configuration file.') +@click.option('--oak-config-file', required=True, help='Path to the extraction YAML configuration file.') +def cli(extraction_config_file, oak_config_file): + """ + CLI tool to process an ontology based on the provided YAML configuration file. 
+ """ + _, extraction_config = load_configs(oak_config_file, extraction_config_file) + extract_terms_to_file(oak_config_file, extraction_config) + + +if __name__ == "__main__": + cli() diff --git a/ncbi.Makefile b/ncbi.Makefile new file mode 100644 index 0000000..b679ff1 --- /dev/null +++ b/ncbi.Makefile @@ -0,0 +1,160 @@ +WGET=wget +RUN=poetry run + +# preferable to use a tagged release, but theres good stuff in this commit that hasn't been released yet +MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml +SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml + + +## NCBI STUFF +# very complex documents; many are too large to load into a MongoDB document +downloads/bioproject.xml: + $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml" # ~ 3 GB August 2024 + +downloads/biosample_set.xml.gz: + $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz" # ~ 3 GB August 2024 + +local/biosample_set.xml: downloads/biosample_set.xml.gz + gunzip -c $< > $@ + +# for development +downloads/books.xml: + $(WGET) -O $@ "https://www.w3schools.com/xml/books.xml" + +# 8 years old. seems very incomplete. 
# NCBI's public biosample XSD (served through viewvc).
downloads/biosample.xsd:
	$(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co"

# find code for converting to table in other repos
# or convert to duckdb
# NCBI harmonized-attribute definitions, as XML.
downloads/ncbi-biosample-attributes.xml:
	$(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml"

# NCBI biosample package (checklist) definitions, as XML.
downloads/ncbi-biosample-packages.xml:
	$(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml"

# Flatten the packages XML into a CSV report via the project CLI.
local/ncbi-biosample-packages.csv: downloads/ncbi-biosample-packages.xml
	$(RUN) ncbi-packages-csv-report \
		--xml-file $< \
		--output-file $@


# see also https://www.npmjs.com/package/mongodb-schema/v/12.2.0?activeTab=versions

#local/mongodb-paths-10pct.txt: # 450000 -> ~ 4 minutes # 4.5 M -> heavy load, never finishes. Use streaming approach?
#	$(RUN) list-mongodb-paths \
#	--db-name ncbi_metadata \
#	--collection samples \
#	--sample-size 4500000 > $@

#local/ncbi_biosamples_inferred_schema.json: # ~ 2 minutes for 410,000 (1%) # ~ 1 hour for 13 million ~ 30%
#	$(RUN) python external_metadata_awareness/infer_schema_with_batching.py \
#	--host localhost \
#	--port 27017 \
#	--database ncbi_metadata \
#	--collection samples \
#	--total-samples 13000000 \
#	--batch-size 50000 \
#	--output $@

.PHONY: load-biosamples-into-mongo

# Count records in the decompressed biosample XML.
# NOTE(review): the grep pattern is empty ("") — this counts every line, not
# BioSample records; it looks like an XML tag (e.g. "<BioSample") was lost. Confirm.
local/biosample-count-xml.txt: local/biosample_set.xml
	date && grep -c "" $< > $@ && date

# see also https://gitlab.com/wurssb/insdc_metadata
# Stream BioSample elements from the XML dump into a dev MongoDB collection.
load-biosamples-into-mongo: local/biosample_set.xml
	$(RUN) xml-to-mongo \
		--file-path $< \
		--node-type BioSample \
		--id-field id \
		--db-name biosamples_dev \
		--collection-name biosamples_dev \
		--max-elements 100000 \
		--anticipated-last-id 100000

# Document count straight from the production MongoDB collection.
local/biosample-count-mongodb.txt:
	date && mongosh --eval 'db.getSiblingDB("ncbi_metadata").samples.countDocuments()' > $@ && date # 1 minute

# Run a canned SQL query and save the result as TSV.
local/ncbi-biosamples-packages-counts.tsv: sql/packages-counts.sql
	$(RUN) sql-to-tsv \
		--sql-file $< \
		--output-file $@

# Export one top-level row per biosample from MongoDB into DuckDB.
ncbi-biosamples-duckdb-overview:
	$(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \
		--connection-string "mongodb://localhost:27017/" \
		--db-name ncbi_metadata \
		--collection-name samples \
		--limit 41000000 \
		--batch-size 100000 \
		--duckdb-file local/ncbi_biosamples.duckdb \
		--table-name overview # no path # 40462422 biosamples in ~ 50 minutes

# add counts from duckdb; need to compile duckdb or download binary

# Same export, but flattening the repeated Attribute elements.
ncbi-biosamples-duckdb-attributes:
	$(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \
		--connection-string "mongodb://localhost:27017/" \
		--db-name ncbi_metadata \
		--collection-name samples \
		--limit 41000000 \
		--batch-size 100000 \
		--duckdb-file local/ncbi_biosamples.duckdb \
		--table-name attributes \
		--path BioSample.Attributes.Attribute

# Same export for the Link elements.
ncbi-biosamples-duckdb-links:
	$(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \
		--connection-string "mongodb://localhost:27017/" \
		--db-name ncbi_metadata \
		--collection-name samples \
		--limit 41000000 \
		--batch-size 100000 \
		--duckdb-file local/ncbi_biosamples.duckdb \
		--table-name links \
		--path BioSample.Links.Link

	## @click.option('--path', default="BioSample.Links.Link", required=True,
	##               help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').")
	## @click.option('--path', default="BioSample.Ids.Id", required=True,
	##               help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').")
	## @click.option('--path', default="BioSample.Description.Organism", required=True,
	##               help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').")

NCBI_BIOSAMPLES_DUCKDB_PATH = local/ncbi_biosamples.duckdb

# Per-value counts of env_local_scale for MIMS soil packages, straight from DuckDB.
# NOTE(review): plain `echo` may not expand "\n" depending on the shell — confirm
# (printf, or `echo -e` under bash, would be unambiguous).
local/ncbi-mims-soil-biosamples-env_local_scale.csv:
	echo ".mode csv\nSELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_local_scale' AND package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@

# Normalize the raw values against ENVO CURIE conventions.
local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv: local/ncbi-mims-soil-biosamples-env_local_scale.csv
	$(RUN) normalize-envo-data \
		--count-col-name sample_count \
		--input-file $< \
		--ontology-prefix ENVO \
		--output-file $@ \
		--val-col-name content

# Rows that mention ENVO but from which no CURIE could be extracted.
local/ncbi-mims-soil-biosamples-env_local_scale-failures.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv
	$(RUN) find-envo-present-no-curie-extracted \
		--input-file $< \
		--output-file $@

# Join in the authoritative ENVO labels, renamed to real_label.
local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv local/envo-info.csv
	$(RUN) merge-in-reference-data \
		--keep-file $(word 1,$^) \
		--keep-key normalized_curie \
		--reference-file $(word 2,$^) \
		--reference-key normalized_curie \
		--reference-addition normalized_label \
		--addition-rename real_label \
		--merged-file $@

# Annotate the normalized labels against ENVO with OAK (whole-text matches only).
local/ncbi-mims-soil-biosamples-env_local_scale-annotated.tsv: local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv
	date ; $(RUN) runoak \
		--input sqlite:obo:envo annotate \
		--matches-whole-text \
		--output-type tsv \
		--output $@ \
		--text-file $< \
		--match-column normalized_label ; date
tmp_path / "oak_config.yaml"
    with open(config_file, 'w') as file:
        yaml.dump(config_data, file)
    return config_file


@pytest.fixture
def extraction_config_file(tmp_path):
    """Write a realistic extraction config (root entity, exclusion rules, output path)
    to a temp YAML file and return its path."""
    config_data = {
        "entity": "material entity",
        # Terms forced back into the output after all exclusions are applied.
        "post_process_inclusion_single_terms": [
            "bridge",
            "road",
            "wildlife management area"
        ],
        # Each of these terms AND its descendants is removed from the result set.
        # NOTE(review): "RO:0001025 water body" and "material isosurface" appear
        # twice — harmless if the extractor de-duplicates, but confirm it's intended.
        "term_and_descendant_exclusions": [
            "biome"
            , "environmental material"
            , "chemical entity"
            , "organic material"
            , "anatomical entity"
            , "organism"
            , "plant anatomical entity"
            , "healthcare facility"
            , "fluid layer"
            , "interface layer"
            , "manufactured product"
            , "anatomical entity environment"
            , "ecosystem"
            , "area protected according to IUCN guidelines"
            , "astronomical body"
            , "astronomical object"
            , "cloud"
            , "collection of organisms"
            , "environmental system"
            , "ecozone"
            , "material isosurface"
            , "environmental zone"
            , "water current"
            , "mass of environmental material"
            , "subatomic particle"
            , "observing system"
            , "particle"
            , "planetary structural layer"
            , "political entity"
            , "meteor"
            , "room"
            , "transport feature"
            , "mass of liquid"
            , "RO:0001025 water body"
            , "BFO:0000050 environmental monitoring area"
            , "BFO:0000050 marine littoral zone"
            , "BFO:0000050 marine environmental zone"
            , "RO:0002473 sea floor"
            , "BFO:0000050 saline water"
            , "BFO:0000050 ice"
            , "RO:0001025 water body"
            , "administrative region"
            , "protected area"
            , "channel of a watercourse"
            , "cryospheric layer"
            , "material isosurface"
            , "NCBITaxon:1"
            , "aeroform"
        ],
        # Terms whose labels match any of these substrings are excluded.
        "text_exclusions": [
            "gaseous"
            , "marine"
            , "undersea"
            , "saline"
            , "brackish"
        ],
        "output": str(tmp_path / "environmental-materials-relationships.txt")
    }
    config_file = tmp_path / "extraction_config.yaml"
    with open(config_file, 'w') as file:
        yaml.dump(config_data, file)
    return config_file


def test_load_configs(oak_config_file, extraction_config_file):
    """load_configs should round-trip both YAML files into the expected dict shapes."""
    oak_config, extraction_config = load_configs(oak_config_file, extraction_config_file)
    assert "ontology_resources" in oak_config
    assert "envo" in oak_config["ontology_resources"]
    assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo"
    assert extraction_config["entity"] == "material entity"
    assert "post_process_inclusion_single_terms" in extraction_config
    assert "text_exclusions" in extraction_config
    assert extraction_config["output"].endswith("environmental-materials-relationships.txt")


def test_process_ontology(oak_config_file, extraction_config_file):
    """End-to-end extraction: output file exists, is non-empty, and respects the
    exclusion/inclusion rules. NOTE: downloads the ENVO sqlite build on first run."""
    _, extraction_config = load_configs(oak_config_file, extraction_config_file)

    # Run the ontology processing
    extract_terms_to_file(oak_config_file, extraction_config)

    # Check if the output file is created and has content
    output_file_path = extraction_config["output"]
    assert os.path.exists(output_file_path), "Output file was not created"

    with open(output_file_path, 'r') as file:
        content = file.read()
        assert len(content) > 0, "Output file is empty, expected some data."

    # You could also add assertions based on expected content
    # For example, checking that excluded terms are not in the output
    assert "biome" not in content
    assert "brackish" not in content
    assert "saline" not in content
    assert "wildlife management area" in content


def test_cli_runs_successfully(oak_config_file, extraction_config_file):
    """Invoke the click CLI with both config paths and verify it writes the output file."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file',
                                 str(oak_config_file)])
    assert result.exit_code == 0

    # Verify the output file exists and contains the expected results
    output_file = extraction_config_file.parent / "environmental-materials-relationships.txt"
    assert output_file.exists()
    with open(output_file, 'r') as file:
        content = file.read()
        assert len(content) > 0, "Output file is empty, expected some data."

    # Add additional assertions to check that the CLI correctly excluded terms
    assert "biome" not in content
    assert "brackish" not in content
    assert "saline" not in content


def test_onto_query():
    """Smoke test of onto_query label matching against the ENVO sqlite adapter.
    NOTE(review): prints but asserts nothing — consider asserting the result is non-empty."""
    adapter = get_adapter("sqlite:obo:envo")
    # desc = onto_query([".desc//p=i", "material entity"], adapter)
    # print(len(desc))

    # "l~saline" matches terms whose label contains "saline"
    list_to_exclude = onto_query(["l~saline"], adapter, labels=True)
    print(list_to_exclude)