From 9d2f639298698e059045e361259ae2330f519164 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Wed, 28 Aug 2024 18:40:23 -0700 Subject: [PATCH 01/17] extract makefile target into a simple script --- Makefile | 289 +----------------- env_triad.Makefile | 145 +++++++++ external_metadata_awareness/cborg_test.py | 2 +- .../env_local_scale_config.yaml | 57 ++++ .../env_local_scale_extraction.py | 46 +++ ncbi.Makefile | 160 ++++++++++ tests/test_env_local_scale_generator.py | 81 +++++ 7 files changed, 493 insertions(+), 287 deletions(-) create mode 100644 env_triad.Makefile create mode 100755 external_metadata_awareness/env_local_scale_config.yaml create mode 100755 external_metadata_awareness/env_local_scale_extraction.py create mode 100644 ncbi.Makefile create mode 100644 tests/test_env_local_scale_generator.py diff --git a/Makefile b/Makefile index 09143aa..21f0306 100644 --- a/Makefile +++ b/Makefile @@ -5,221 +5,8 @@ RUN=poetry run MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml - -## NCBI STUFF -# very complex documents; many are too large to load into a MongoDB document -downloads/bioproject.xml: - $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml" # ~ 3 GB August 2024 - -downloads/biosample_set.xml.gz: - $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz" # ~ 3 GB August 2024 - -local/biosample_set.xml: downloads/biosample_set.xml.gz - gunzip -c $< > $@ - -# for development -downloads/books.xml: - $(WGET) -O $@ "https://www.w3schools.com/xml/books.xml" - -# 8 years old. seems very incomplete. 
-downloads/biosample.xsd: - $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co" - -# find code for converting to table in other repos -# or convert to duckdb -downloads/ncbi-biosample-attributes.xml: - $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml" - -downloads/ncbi-biosample-packages.xml: - $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml" - -local/ncbi-biosample-packages.csv: downloads/ncbi-biosample-packages.xml - $(RUN) ncbi-packages-csv-report \ - --xml-file $< \ - --output-file $@ - - -# see also https://www.npmjs.com/package/mongodb-schema/v/12.2.0?activeTab=versions - -#local/mongodb-paths-10pct.txt: # 450000 -> ~ 4 minutes # 4.5 M -> heavy load, never finishes. Use streaming approach? -# $(RUN) list-mongodb-paths \ -# --db-name ncbi_metadata \ -# --collection samples \ -# --sample-size 4500000 > $@ - -#local/ncbi_biosamples_inferred_schema.json: # ~ 2 minutes for 410,000 (1%) # ~ 1 hour for 13 million ~ 30% -# $(RUN) python external_metadata_awareness/infer_schema_with_batching.py \ -# --host localhost \ -# --port 27017 \ -# --database ncbi_metadata \ -# --collection samples \ -# --total-samples 13000000 \ -# --batch-size 50000 \ -# --output $@ - -.PHONY: load-biosamples-into-mongo - -local/biosample-count-xml.txt: local/biosample_set.xml - date && grep -c "" $< > $@ && date - -# see also https://gitlab.com/wurssb/insdc_metadata -load-biosamples-into-mongo: local/biosample_set.xml - $(RUN) xml-to-mongo \ - --file-path $< \ - --node-type BioSample \ - --id-field id \ - --db-name biosamples_dev \ - --collection-name biosamples_dev \ - --max-elements 100000 \ - --anticipated-last-id 100000 - -local/biosample-count-mongodb.txt: - date && mongosh --eval 'db.getSiblingDB("ncbi_metadata").samples.countDocuments()' > $@ && date # 1 minute - -local/ncbi-biosamples-packages-counts.tsv: sql/packages-counts.sql - $(RUN) sql-to-tsv \ - 
--sql-file $< \ - --output-file $@ - -ncbi-biosamples-duckdb-overview: - $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ - --connection-string "mongodb://localhost:27017/" \ - --db-name ncbi_metadata \ - --collection-name samples \ - --limit 41000000 \ - --batch-size 100000 \ - --duckdb-file local/ncbi_biosamples.duckdb \ - --table-name overview # no path # 40462422 biosamples in ~ 50 minutes - -# add counts from duckdb; need to compile duckdb or download binary - -ncbi-biosamples-duckdb-attributes: - $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ - --connection-string "mongodb://localhost:27017/" \ - --db-name ncbi_metadata \ - --collection-name samples \ - --limit 41000000 \ - --batch-size 100000 \ - --duckdb-file local/ncbi_biosamples.duckdb \ - --table-name attributes \ - --path BioSample.Attributes.Attribute - -ncbi-biosamples-duckdb-links: - $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ - --connection-string "mongodb://localhost:27017/" \ - --db-name ncbi_metadata \ - --collection-name samples \ - --limit 41000000 \ - --batch-size 100000 \ - --duckdb-file local/ncbi_biosamples.duckdb \ - --table-name links \ - --path BioSample.Links.Link - - ## @click.option('--path', default="BioSample.Links.Link", required=True, - ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") - ## @click.option('--path', default="BioSample.Ids.Id", required=True, - ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") - ## @click.option('--path', default="BioSample.Description.Organism", required=True, - ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") - -NCBI_BIOSAMPLES_DUCKDB_PATH = local/ncbi_biosamples.duckdb - -local/ncbi-mims-soil-biosamples-env_local_scale.csv: - echo ".mode csv\nSELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_local_scale' AND 
package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@ - -local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv: local/ncbi-mims-soil-biosamples-env_local_scale.csv - $(RUN) normalize-envo-data \ - --count-col-name sample_count \ - --input-file $< \ - --ontology-prefix ENVO \ - --output-file $@ \ - --val-col-name content - -local/ncbi-mims-soil-biosamples-env_local_scale-failures.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv - $(RUN) find-envo-present-no-curie-extracted \ - --input-file $< \ - --output-file $@ - -local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv local/envo-info.csv - $(RUN) merge-in-reference-data \ - --keep-file $(word 1,$^) \ - --keep-key normalized_curie \ - --reference-file $(word 2,$^) \ - --reference-key normalized_curie \ - --reference-addition normalized_label \ - --addition-rename real_label \ - --merged-file $@ - -local/ncbi-mims-soil-biosamples-env_local_scale-annotated.tsv: local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv - date ; $(RUN) runoak \ - --input sqlite:obo:envo annotate \ - --matches-whole-text \ - --output-type tsv \ - --output $@ \ - --text-file $< \ - --match-column normalized_label ; date - -# ENVO STUFF -# getting fragments of EnvO because the whole thing is too large to feed into an LLM -# our guideline is that env_broad_scale should be answered with an EnvO biome subclass - -# these OAK commands fetch the latest EnvO SQLite file from a BBOP S3 bucket -# it may be a few days behind the envo.owl file form the EnvO GH repo -# use `runoak cache-ls` to see where the SQLite files are cached - -local/biome-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00000428 > $@ - # !!! pivot? include entailment? 
--include-entailed / --no-include-entailed; --non-redundant-entailed / --no-non-redundant-entailed - # LLM web interfaces might want CSVs - -local/biome-relationships.csv: local/biome-relationships.tsv - sed 's/\t/,/g' $< > $@ - #awk 'BEGIN {FS="\t"; OFS=","} {print $$0}' $< > $@ - rm -rf $< - -local/biome-metadata.yaml: - $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00000428 > $@ - # !!! try different formats? or predicate list? - -local/biome-metadata.json: local/biome-metadata.yaml - yq ea '[.]' $< -o=json | cat > $@ - rm -rf $< - -# our guideline is that env_medium should be answered with an EnvO biome subclass -local/environmental-materials-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00010483 > $@ - -local/environmental-materials-relationships.csv: local/environmental-materials-relationships.tsv - sed 's/\t/,/g' $< > $@ - rm -rf $< - -local/environmental-materials-metadata.yaml: - $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00010483 > $@ - -local/environmental-materials-metadata.json: local/environmental-materials-metadata.yaml - yq ea '[.]' $< -o=json | cat > $@ - rm -rf $< - -local/environmental-material-info.txt: - $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00010483 > $@ - -local/aquatic-biome-info.txt: - $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00002030 > $@ # --output-type tsv has lots of info but wrapped in square brackets - -local/aquatic-biome-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships --output-type tsv --output $@ .desc//p=i ENVO:00002030 - -local/aquatic-biome.png: - $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill .desc//p=i ENVO:00002030 - -local/soil-env_broad_scale-algebraic.txt: - $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ .desc//p=i biome .not .desc//p=i 'aquatic biome' ] .not .desc//p=i 'forest biome' ] .not .desc//p=i 'grassland biome' ] .not .desc//p=i 
'desert biome' ] .not biome ] .not 'cropland biome' > $@ - -local/soil-env_broad_scale-algebraic.csv: local/soil-env_broad_scale-algebraic.txt - $(RUN) normalize-envo-data \ - --input-file $< \ - --ontology-prefix ENVO \ - --output-file $@ +include ncbi.Makefile +include env_triad.Makefile # MIXS STUFF downloads/mixs.yaml: @@ -451,77 +238,7 @@ local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.ts --column1 "normalized_curie_biome" \ --column2 "matched_id_biome" -## for env medium -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv: local/environmental-material-info.txt \ -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.csv -# $(RUN) detect-curies-in-subset \ -# --tsv-file $(word 2,$^) \ -# --class-info-file $(word 1,$^) \ -# --tsv-column-name normalized-curie \ -# --subset-label environmental-material \ -# --output-file $@ -# -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-4.csv: local/environmental-material-info.txt \ -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv -# $(RUN) detect-curies-in-subset \ -# --tsv-file $(word 2,$^) \ -# --class-info-file $(word 1,$^) \ -# --tsv-column-name matched_id \ -# --subset-label environmental_material \ -# --output-file $@ - detected-annotations-to-postgres: local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv $(RUN) load-tsv-into-postgres \ --tsv-file $< \ - --table-name detected_annotations - -# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE -# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions -local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql - $(RUN) sql-to-tsv \ - --sql-file $< \ - --output-file $@ - -#### - -local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ -local/soil-env_broad_scale-algebraic.txt 
local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ -local/biome-relationships.csv - $(RUN) build-prompt-from-template \ - --spec-file-path $(word 1,$^) \ - --output-file-path $@ - -# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest -# gemini models don't seem to take a temperature parameter -# cborg/claude-sonnet -local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-prompt.txt - cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ - -#### - -local/env-local-scale-candidates.txt: - $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not 
.desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ - -local/env-local-scale-candidates-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] 
.not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ - -local/envo-leaves.txt: - $(RUN) runoak --input sqlite:obo:envo leafs > $@ - -local/envo-leaf-ids.txt: local/envo-leaves.txt - cut -f1 -d' ' $< > $@ - -local/env-local-scale-candidate-ids.txt: local/env-local-scale-candidates.txt - cut -f1 -d' ' $< > $@ - -local/env-local-scale-non-leaf.txt: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt - $(RUN) runoak --input sqlite:obo:envo info .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] > $@ - -local/env-local-scale-non-leaf.csv: local/env-local-scale-non-leaf.txt - $(RUN) normalize-envo-data \ - --input-file $< \ - --ontology-prefix ENVO \ - --output-file $@ - -local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt - $(RUN) runoak --input sqlite:obo:envo viz --gap-fill .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] + --table-name detected_annotations \ No newline at end of file diff --git a/env_triad.Makefile b/env_triad.Makefile new file mode 100644 index 0000000..87a4430 --- /dev/null +++ b/env_triad.Makefile @@ -0,0 +1,145 @@ +WGET=wget +RUN=poetry run + +# preferable to use a tagged 
release, but theres good stuff in this commit that hasn't been released yet +MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml +SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml + +# ENVO STUFF +# getting fragments of EnvO because the whole thing is too large to feed into an LLM +# our guideline is that env_broad_scale should be answered with an EnvO biome subclass + +# these OAK commands fetch the latest EnvO SQLite file from a BBOP S3 bucket +# it may be a few days behind the envo.owl file form the EnvO GH repo +# use `runoak cache-ls` to see where the SQLite files are cached + +local/biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00000428 > $@ + # !!! pivot? include entailment? --include-entailed / --no-include-entailed; --non-redundant-entailed / --no-non-redundant-entailed + # LLM web interfaces might want CSVs + +local/biome-relationships.csv: local/biome-relationships.tsv + sed 's/\t/,/g' $< > $@ + #awk 'BEGIN {FS="\t"; OFS=","} {print $$0}' $< > $@ + rm -rf $< + +local/biome-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00000428 > $@ + # !!! try different formats? or predicate list? 
+ +local/biome-metadata.json: local/biome-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +# our guideline is that env_medium should be answered with an EnvO biome subclass +local/environmental-materials-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-relationships.csv: local/environmental-materials-relationships.tsv + sed 's/\t/,/g' $< > $@ + rm -rf $< + +local/environmental-materials-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-metadata.json: local/environmental-materials-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +local/environmental-material-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00010483 > $@ + +local/aquatic-biome-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00002030 > $@ # --output-type tsv has lots of info but wrapped in square brackets + +local/aquatic-biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships --output-type tsv --output $@ .desc//p=i ENVO:00002030 + +local/aquatic-biome.png: + $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill .desc//p=i ENVO:00002030 + +local/soil-env_broad_scale-algebraic.txt: + $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ .desc//p=i biome .not .desc//p=i 'aquatic biome' ] .not .desc//p=i 'forest biome' ] .not .desc//p=i 'grassland biome' ] .not .desc//p=i 'desert biome' ] .not biome ] .not 'cropland biome' > $@ + +local/soil-env_broad_scale-algebraic.csv: local/soil-env_broad_scale-algebraic.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + + +## for env medium +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv: local/environmental-material-info.txt \ 
+#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name normalized-curie \ +# --subset-label environmental-material \ +# --output-file $@ +# +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-4.csv: local/environmental-material-info.txt \ +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name matched_id \ +# --subset-label environmental_material \ +# --output-file $@ + + +# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE +# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions +local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql + $(RUN) sql-to-tsv \ + --sql-file $< \ + --output-file $@ + +#### + +local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ +local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ +local/biome-relationships.csv + $(RUN) build-prompt-from-template \ + --spec-file-path $(word 1,$^) \ + --output-file-path $@ + +# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest +# gemini models don't seem to take a temperature parameter +# cborg/claude-sonnet +local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-prompt.txt + cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ + +#### + +#local/env-local-scale-candidates.txt: +# $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 
'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ + +generate-env-local-scale-candidates: + # Ensure the poetry environment is activated and run the script with the 
specified config + poetry run python external_metadata_awareness/env_local_scale_extraction.py config.yaml + + +local/env-local-scale-candidates-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not 
.desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ + +local/envo-leaves.txt: + $(RUN) runoak --input sqlite:obo:envo leafs > $@ + +local/envo-leaf-ids.txt: local/envo-leaves.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-candidate-ids.txt: local/env-local-scale-candidates.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-non-leaf.txt: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo info .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] > $@ + +local/env-local-scale-non-leaf.csv: local/env-local-scale-non-leaf.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --gap-fill .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] diff --git a/external_metadata_awareness/cborg_test.py b/external_metadata_awareness/cborg_test.py index c2cdf6b..94f4607 100644 --- a/external_metadata_awareness/cborg_test.py +++ b/external_metadata_awareness/cborg_test.py @@ -3,7 +3,7 @@ from dotenv import load_dotenv # Load environment variables from local/.env -load_dotenv(os.path.join('..', 'local', '.env')) +load_dotenv(os.path.join('../..', 'local', '.env')) client = openai.OpenAI( api_key=os.environ.get('CBORG_API_KEY'), # Retrieve API key from environment variables diff --git a/external_metadata_awareness/env_local_scale_config.yaml b/external_metadata_awareness/env_local_scale_config.yaml new file mode 100755 index 0000000..f5a1093 --- /dev/null +++ b/external_metadata_awareness/env_local_scale_config.yaml @@ -0,0 +1,57 @@ +# config.yaml +input: "sqlite:obo:envo" +output: 
"""Extract env_local_scale candidate terms from an ontology.

Reads a YAML config (ontology selector, root entity, exclusion labels,
output path), fetches all descendants of the root entity, drops any term
whose label appears in the exclusion list, and writes ``CURIE: label``
lines to the output file.  Replaces the long hand-written ``runoak``
algebraic query previously embedded in the Makefile.
"""
import yaml
import click
from oaklib import get_adapter


def load_config(config_file):
    """Load and return the YAML configuration as a dict.

    :param config_file: path to a YAML file with keys ``input`` (OAK
        adapter selector, e.g. ``sqlite:obo:envo``), ``entity`` (root
        term), ``exclusions`` (list of labels to drop) and ``output``
        (destination file path).
    :return: parsed configuration mapping.
    """
    with open(config_file, 'r') as file:
        return yaml.safe_load(file)


def process_ontology(config):
    """Write the filtered descendants of ``config['entity']`` to ``config['output']``.

    Each surviving term is written as one ``CURIE: label`` line.

    :param config: mapping as returned by :func:`load_config`.
    """
    # Load the ontology via the OAK adapter named in the config.
    ontology = get_adapter(config['input'])

    initial_term = config['entity']
    # Set membership is O(1) per lookup, vs. re-scanning the ~50-entry
    # list once per descendant as the filter previously did.
    exclusions = set(config['exclusions'])
    # NOTE(review): config entries such as "RO:0001025 water body" or
    # "l~gaseous" are OAK query syntax, not plain labels, so a
    # label-equality test can never match them -- confirm whether
    # predicate-based exclusion is actually required here.

    # NOTE(review): descendants() with no predicate list may traverse
    # all relationship types; the Makefile query used .desc//p=i
    # (is_a only) -- confirm intended predicates.
    descendants = ontology.descendants(initial_term)

    with open(config['output'], 'w') as output_file:
        for term in descendants:
            # Look the label up once and reuse it for both the filter
            # and the output line (previously two adapter calls per term).
            label = ontology.label(term)
            if label not in exclusions:
                output_file.write(f"{term}: {label}\n")


@click.command()
@click.argument('config_file')
def cli(config_file):
    """
    CLI tool to process an ontology based on the provided YAML configuration file.
    """
    config = load_config(config_file)
    process_ontology(config)


if __name__ == "__main__":
    cli()
+downloads/biosample.xsd: + $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co" + +# find code for converting to table in other repos +# or convert to duckdb +downloads/ncbi-biosample-attributes.xml: + $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml" + +downloads/ncbi-biosample-packages.xml: + $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml" + +local/ncbi-biosample-packages.csv: downloads/ncbi-biosample-packages.xml + $(RUN) ncbi-packages-csv-report \ + --xml-file $< \ + --output-file $@ + + +# see also https://www.npmjs.com/package/mongodb-schema/v/12.2.0?activeTab=versions + +#local/mongodb-paths-10pct.txt: # 450000 -> ~ 4 minutes # 4.5 M -> heavy load, never finishes. Use streaming approach? +# $(RUN) list-mongodb-paths \ +# --db-name ncbi_metadata \ +# --collection samples \ +# --sample-size 4500000 > $@ + +#local/ncbi_biosamples_inferred_schema.json: # ~ 2 minutes for 410,000 (1%) # ~ 1 hour for 13 million ~ 30% +# $(RUN) python external_metadata_awareness/infer_schema_with_batching.py \ +# --host localhost \ +# --port 27017 \ +# --database ncbi_metadata \ +# --collection samples \ +# --total-samples 13000000 \ +# --batch-size 50000 \ +# --output $@ + +.PHONY: load-biosamples-into-mongo + +local/biosample-count-xml.txt: local/biosample_set.xml + date && grep -c "" $< > $@ && date + +# see also https://gitlab.com/wurssb/insdc_metadata +load-biosamples-into-mongo: local/biosample_set.xml + $(RUN) xml-to-mongo \ + --file-path $< \ + --node-type BioSample \ + --id-field id \ + --db-name biosamples_dev \ + --collection-name biosamples_dev \ + --max-elements 100000 \ + --anticipated-last-id 100000 + +local/biosample-count-mongodb.txt: + date && mongosh --eval 'db.getSiblingDB("ncbi_metadata").samples.countDocuments()' > $@ && date # 1 minute + +local/ncbi-biosamples-packages-counts.tsv: sql/packages-counts.sql + $(RUN) sql-to-tsv \ + 
--sql-file $< \ + --output-file $@ + +ncbi-biosamples-duckdb-overview: + $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ + --connection-string "mongodb://localhost:27017/" \ + --db-name ncbi_metadata \ + --collection-name samples \ + --limit 41000000 \ + --batch-size 100000 \ + --duckdb-file local/ncbi_biosamples.duckdb \ + --table-name overview # no path # 40462422 biosamples in ~ 50 minutes + +# add counts from duckdb; need to compile duckdb or download binary + +ncbi-biosamples-duckdb-attributes: + $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ + --connection-string "mongodb://localhost:27017/" \ + --db-name ncbi_metadata \ + --collection-name samples \ + --limit 41000000 \ + --batch-size 100000 \ + --duckdb-file local/ncbi_biosamples.duckdb \ + --table-name attributes \ + --path BioSample.Attributes.Attribute + +ncbi-biosamples-duckdb-links: + $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ + --connection-string "mongodb://localhost:27017/" \ + --db-name ncbi_metadata \ + --collection-name samples \ + --limit 41000000 \ + --batch-size 100000 \ + --duckdb-file local/ncbi_biosamples.duckdb \ + --table-name links \ + --path BioSample.Links.Link + + ## @click.option('--path', default="BioSample.Links.Link", required=True, + ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") + ## @click.option('--path', default="BioSample.Ids.Id", required=True, + ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") + ## @click.option('--path', default="BioSample.Description.Organism", required=True, + ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") + +NCBI_BIOSAMPLES_DUCKDB_PATH = local/ncbi_biosamples.duckdb + +local/ncbi-mims-soil-biosamples-env_local_scale.csv: + echo ".mode csv\nSELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_local_scale' AND 
package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@ + +local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv: local/ncbi-mims-soil-biosamples-env_local_scale.csv + $(RUN) normalize-envo-data \ + --count-col-name sample_count \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ \ + --val-col-name content + +local/ncbi-mims-soil-biosamples-env_local_scale-failures.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv + $(RUN) find-envo-present-no-curie-extracted \ + --input-file $< \ + --output-file $@ + +local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv local/envo-info.csv + $(RUN) merge-in-reference-data \ + --keep-file $(word 1,$^) \ + --keep-key normalized_curie \ + --reference-file $(word 2,$^) \ + --reference-key normalized_curie \ + --reference-addition normalized_label \ + --addition-rename real_label \ + --merged-file $@ + +local/ncbi-mims-soil-biosamples-env_local_scale-annotated.tsv: local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv + date ; $(RUN) runoak \ + --input sqlite:obo:envo annotate \ + --matches-whole-text \ + --output-type tsv \ + --output $@ \ + --text-file $< \ + --match-column normalized_label ; date diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py new file mode 100644 index 0000000..e56f572 --- /dev/null +++ b/tests/test_env_local_scale_generator.py @@ -0,0 +1,81 @@ +import pytest +import yaml +from click.testing import CliRunner +from external_metadata_awareness.env_local_scale_extraction import cli + + +@pytest.fixture +def sample_config(tmp_path): + """ + :param tmp_path: + :return: + """ + + # Create a sample config.yaml file for testing + config_data = { + "input": "sqlite:obo:envo", + "output": "local/environmental-materials-relationships.txt", + "entity": "material entity", + "exclusions": [ 
+ "biome", + "environmental material", + "chemical entity" + ] + } + config_file = tmp_path / "config.yaml" + with open(config_file, 'w') as file: + yaml.dump(config_data, file) + return str(config_file) + + +def test_generate_command(sample_config): + """ + Test the generate_oak_command function. + :param sample_config: + :return: + + """ + runner = CliRunner() + result = runner.invoke(cli, [sample_config]) + + expected_command = ( + "$(RUN) runoak --input sqlite:obo:envo info [ .desc//p=i 'material entity' ]" + " .not .desc//p=i 'biome'" + " .not .desc//p=i 'environmental material'" + " .not .desc//p=i 'chemical entity'" + " > local/environmental-materials-relationships.txt" + ) + + assert result.exit_code == 0 + assert expected_command in result.output + + +def test_missing_config(): + """ + Test the CLI tool when the config file is missing. + :return: + + """ + runner = CliRunner() + result = runner.invoke(cli, ["nonexistent.yaml"]) + + assert result.exit_code != 0 + assert "No such file or directory" in result.output + + +def test_invalid_config(tmp_path): + """ + Test the CLI tool when the config file is invalid. 
+ :param tmp_path: + :return: + + """ + invalid_config_file = tmp_path / "invalid_config.yaml" + with open(invalid_config_file, 'w') as file: + file.write("Invalid YAML content") + + runner = CliRunner() + result = runner.invoke(cli, [str(invalid_config_file)]) + + assert result.exit_code != 0 + assert "could not find expected" in result.output # Checking for a YAML syntax error message From 0618aa72f28f42c2a03872ec55e6f14eff6ffcfa Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Wed, 28 Aug 2024 18:51:38 -0700 Subject: [PATCH 02/17] add test stubs --- .../env_local_scale_extraction_config.yaml | 0 env_triad.Makefile | 2 +- .../env_local_scale_extraction.py | 4 +- tests/test_env_local_scale_generator.py | 84 +++++++------------ 4 files changed, 36 insertions(+), 54 deletions(-) rename external_metadata_awareness/env_local_scale_config.yaml => config/env_local_scale_extraction_config.yaml (100%) diff --git a/external_metadata_awareness/env_local_scale_config.yaml b/config/env_local_scale_extraction_config.yaml similarity index 100% rename from external_metadata_awareness/env_local_scale_config.yaml rename to config/env_local_scale_extraction_config.yaml diff --git a/env_triad.Makefile b/env_triad.Makefile index 87a4430..128bb55 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -117,7 +117,7 @@ local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-pr generate-env-local-scale-candidates: # Ensure the poetry environment is activated and run the script with the specified config - poetry run python external_metadata_awareness/env_local_scale_extraction.py config.yaml + poetry run python external_metadata_awareness/env_local_scale_extraction.py --config-file config/env_local_scale_extraction_config.yaml local/env-local-scale-candidates-relationships.tsv: diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index 2f6fbdf..f203733 100755 --- 
a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -17,9 +17,11 @@ def process_ontology(config): initial_term = config['entity'] exclusions = config['exclusions'] + print(initial_term) # Get all descendants of the initial term descendants = ontology.descendants(initial_term) + # Filter out the excluded terms filtered_descendants = [ term for term in descendants @@ -33,7 +35,7 @@ def process_ontology(config): @click.command() -@click.argument('config_file') +@click.option('--config-file', required=True, help='Path to the YAML configuration file.') def cli(config_file): """ CLI tool to process an ontology based on the provided YAML configuration file. diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index e56f572..3ffbd23 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -1,20 +1,15 @@ import pytest +import os import yaml from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli +from external_metadata_awareness.env_local_scale_extraction import cli, load_config, process_ontology @pytest.fixture -def sample_config(tmp_path): - """ - :param tmp_path: - :return: - """ - - # Create a sample config.yaml file for testing +def config_file(tmp_path): config_data = { "input": "sqlite:obo:envo", - "output": "local/environmental-materials-relationships.txt", + "output": str(tmp_path / "output.txt"), "entity": "material entity", "exclusions": [ "biome", @@ -25,57 +20,42 @@ def sample_config(tmp_path): config_file = tmp_path / "config.yaml" with open(config_file, 'w') as file: yaml.dump(config_data, file) - return str(config_file) + return config_file -def test_generate_command(sample_config): - """ - Test the generate_oak_command function. 
- :param sample_config: - :return: +def test_load_config(config_file): + config = load_config(config_file) + assert config['input'] == "sqlite:obo:envo" + assert config['output'].endswith("output.txt") + assert config['entity'] == "material entity" + assert "biome" in config['exclusions'] - """ - runner = CliRunner() - result = runner.invoke(cli, [sample_config]) - expected_command = ( - "$(RUN) runoak --input sqlite:obo:envo info [ .desc//p=i 'material entity' ]" - " .not .desc//p=i 'biome'" - " .not .desc//p=i 'environmental material'" - " .not .desc//p=i 'chemical entity'" - " > local/environmental-materials-relationships.txt" - ) +def test_process_ontology(config_file): + config = load_config(config_file) + process_ontology(config) - assert result.exit_code == 0 - assert expected_command in result.output + # Check if the output file is created and not empty + assert os.path.exists(config['output']) + with open(config['output'], 'r') as file: + content = file.read() + assert len(content) > 0, "Output file is empty, expected some data." -def test_missing_config(): - """ - Test the CLI tool when the config file is missing. - :return: - - """ +def test_cli_runs_successfully(config_file): runner = CliRunner() - result = runner.invoke(cli, ["nonexistent.yaml"]) - - assert result.exit_code != 0 - assert "No such file or directory" in result.output - - -def test_invalid_config(tmp_path): - """ - Test the CLI tool when the config file is invalid. 
- :param tmp_path: - :return: + result = runner.invoke(cli, ['--config-file', str(config_file)]) + assert result.exit_code == 0 + assert os.path.exists(load_config(config_file)['output']) - """ - invalid_config_file = tmp_path / "invalid_config.yaml" - with open(invalid_config_file, 'w') as file: - file.write("Invalid YAML content") - runner = CliRunner() - result = runner.invoke(cli, [str(invalid_config_file)]) +def test_no_exclusions(config_file): + config = load_config(config_file) + config['exclusions'] = [] + process_ontology(config) - assert result.exit_code != 0 - assert "could not find expected" in result.output # Checking for a YAML syntax error message + # Check if the output file is created and has content + assert os.path.exists(config['output']) + with open(config['output'], 'r') as file: + content = file.read() + assert len(content) > 0, "Output file is empty, expected some data even without exclusions." From c09c78c118a4c7e417ab28b9e1b23a48b2ac77f8 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 12:00:18 -0700 Subject: [PATCH 03/17] get tests to pass --- ...=> env-local-scale-extraction-config.yaml} | 3 +- config/oaklib-setup-config.yaml | 3 + .../env_local_scale_extraction.py | 38 ++++----- tests/test_env_local_scale_generator.py | 77 +++++++++++-------- 4 files changed, 69 insertions(+), 52 deletions(-) rename config/{env_local_scale_extraction_config.yaml => env-local-scale-extraction-config.yaml} (97%) create mode 100644 config/oaklib-setup-config.yaml diff --git a/config/env_local_scale_extraction_config.yaml b/config/env-local-scale-extraction-config.yaml similarity index 97% rename from config/env_local_scale_extraction_config.yaml rename to config/env-local-scale-extraction-config.yaml index f5a1093..b676c33 100755 --- a/config/env_local_scale_extraction_config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -1,5 +1,4 @@ -# config.yaml -input: "sqlite:obo:envo" +# env-local-scale-extraction-config.yaml 
output: "local/environmental-materials-relationships.txt" entity: "material entity" exclusions: diff --git a/config/oaklib-setup-config.yaml b/config/oaklib-setup-config.yaml new file mode 100644 index 0000000..7b45869 --- /dev/null +++ b/config/oaklib-setup-config.yaml @@ -0,0 +1,3 @@ +ontology_resources: + envo: + selector: sqlite:obo:envo \ No newline at end of file diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index f203733..924f4eb 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -3,45 +3,47 @@ from oaklib import get_adapter -def load_config(config_file): - with open(config_file, 'r') as file: - config = yaml.safe_load(file) - return config +def load_configs(oak_config_file, extraction_config_file): + with open(oak_config_file, 'r') as file: + oak_config = yaml.safe_load(file) + with open(extraction_config_file, 'r') as file: + extraction_config = yaml.safe_load(file) + return oak_config, extraction_config -def process_ontology(config): +def process_ontology(oak_config_file, extraction_config): # Load the ontology using the get_adapter function - ontology = get_adapter(config['input']) + oak_adapter = get_adapter(oak_config_file) # Get the entity and exclusions from the config - initial_term = config['entity'] - exclusions = config['exclusions'] + initial_term_label = extraction_config['entity'] + initial_term_curie = oak_adapter.curies_by_label(label=initial_term_label) + exclusion_labels = extraction_config['exclusions'] - print(initial_term) # Get all descendants of the initial term - descendants = ontology.descendants(initial_term) - + descendants = oak_adapter.descendants(initial_term_curie) # Filter out the excluded terms filtered_descendants = [ term for term in descendants - if not any(ontology.label(term) == exclusion for exclusion in exclusions) + if not 
any(oak_adapter.label(term) == exclusion for exclusion in exclusion_labels) ] # Write the results to the output file - with open(config['output'], 'w') as output_file: + with open(extraction_config['output'], 'w') as output_file: for term in filtered_descendants: - output_file.write(f"{term}: {ontology.label(term)}\n") + output_file.write(f"{term}: {oak_adapter.label(term)}\n") @click.command() -@click.option('--config-file', required=True, help='Path to the YAML configuration file.') -def cli(config_file): +@click.option('--extraction-config-file', required=True, help='Path to the extraction YAML configuration file.') +@click.option('--oak-config-file', required=True, help='Path to the extraction YAML configuration file.') +def cli(extraction_config_file, oak_config_file): """ CLI tool to process an ontology based on the provided YAML configuration file. """ - config = load_config(config_file) - process_ontology(config) + _, extraction_config = load_configs(oak_config_file, extraction_config_file) + process_ontology(oak_config_file, extraction_config) if __name__ == "__main__": diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 3ffbd23..d489156 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -1,61 +1,74 @@ import pytest -import os import yaml from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli, load_config, process_ontology +from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology @pytest.fixture -def config_file(tmp_path): +def oak_config_file(tmp_path): + config_data = { + "ontology_resources": { + "envo": { + "selector": "sqlite:obo:envo" + } + } + } + config_file = tmp_path / "oak_config.yaml" + with open(config_file, 'w') as file: + yaml.dump(config_data, file) + return config_file + + +@pytest.fixture +def extraction_config_file(tmp_path): config_data = { 
- "input": "sqlite:obo:envo", - "output": str(tmp_path / "output.txt"), "entity": "material entity", "exclusions": [ "biome", "environmental material", "chemical entity" - ] + ], + "output": str(tmp_path / "output.txt") } - config_file = tmp_path / "config.yaml" + config_file = tmp_path / "extraction_config.yaml" with open(config_file, 'w') as file: yaml.dump(config_data, file) return config_file -def test_load_config(config_file): - config = load_config(config_file) - assert config['input'] == "sqlite:obo:envo" - assert config['output'].endswith("output.txt") - assert config['entity'] == "material entity" - assert "biome" in config['exclusions'] +def test_load_configs(oak_config_file, extraction_config_file): + oak_config, extraction_config = load_configs(oak_config_file, extraction_config_file) + assert "ontology_resources" in oak_config + assert "envo" in oak_config["ontology_resources"] + assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" + assert extraction_config["entity"] == "material entity" + assert extraction_config["output"].endswith("output.txt") + +def test_process_ontology(oak_config_file, extraction_config_file): + _, extraction_config = load_configs(oak_config_file, extraction_config_file) -def test_process_ontology(config_file): - config = load_config(config_file) - process_ontology(config) + # Replace with a real test ontology and expected behavior if possible. + process_ontology(oak_config_file, extraction_config) - # Check if the output file is created and not empty - assert os.path.exists(config['output']) - with open(config['output'], 'r') as file: + # Check if the output file is created and has content + assert extraction_config["output"] + with open(extraction_config["output"], 'r') as file: content = file.read() + print(content) assert len(content) > 0, "Output file is empty, expected some data." 
-def test_cli_runs_successfully(config_file): +def test_cli_runs_successfully(oak_config_file, extraction_config_file): runner = CliRunner() - result = runner.invoke(cli, ['--config-file', str(config_file)]) + result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file', + str(oak_config_file)]) assert result.exit_code == 0 - assert os.path.exists(load_config(config_file)['output']) + assert "material entity" in result.output or "ENVO:00000447" in result.output - -def test_no_exclusions(config_file): - config = load_config(config_file) - config['exclusions'] = [] - process_ontology(config) - - # Check if the output file is created and has content - assert os.path.exists(config['output']) - with open(config['output'], 'r') as file: + # Verify the output file exists and contains the expected results + output_file = extraction_config_file.parent / "output.txt" + assert output_file.exists() + with open(output_file, 'r') as file: content = file.read() - assert len(content) > 0, "Output file is empty, expected some data even without exclusions." + assert len(content) > 0, "Output file is empty, expected some data." 
\ No newline at end of file From c86dbcc59dad596ff2497f8ae519aa8bd8ba8f52 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 12:05:35 -0700 Subject: [PATCH 04/17] fixing makefile target --- config/{oaklib-setup-config.yaml => oak-config.yaml} | 0 env_triad.Makefile | 4 +++- tests/test_env_local_scale_generator.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) rename config/{oaklib-setup-config.yaml => oak-config.yaml} (100%) diff --git a/config/oaklib-setup-config.yaml b/config/oak-config.yaml similarity index 100% rename from config/oaklib-setup-config.yaml rename to config/oak-config.yaml diff --git a/env_triad.Makefile b/env_triad.Makefile index 128bb55..09e7305 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -117,7 +117,9 @@ local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-pr generate-env-local-scale-candidates: # Ensure the poetry environment is activated and run the script with the specified config - poetry run python external_metadata_awareness/env_local_scale_extraction.py --config-file config/env_local_scale_extraction_config.yaml + $(RUN) python external_metadata_awareness/env_local_scale_extraction.py \ + --oak-config-file config/oak-config.yaml \ + --extraction-config-file config/env-local-scale-extraction-config.yaml local/env-local-scale-candidates-relationships.tsv: diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index d489156..1271179 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -64,11 +64,11 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file', str(oak_config_file)]) assert result.exit_code == 0 - assert "material entity" in result.output or "ENVO:00000447" in result.output + assert "material entity" in result.output or "ENVO:00000447" in 
result.output # Verify the output file exists and contains the expected results output_file = extraction_config_file.parent / "output.txt" assert output_file.exists() with open(output_file, 'r') as file: content = file.read() - assert len(content) > 0, "Output file is empty, expected some data." \ No newline at end of file + assert len(content) > 0, "Output file is empty, expected some data." From a8d5e18f90d41c70bdb723dafb1a9de9b9d4c01d Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 16:37:22 -0700 Subject: [PATCH 05/17] working but no descendents --- config/env-local-scale-extraction-config.yaml | 16 +++++++++------- .../env_local_scale_extraction.py | 13 ++++++++++++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index b676c33..e796e20 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -1,7 +1,13 @@ # env-local-scale-extraction-config.yaml output: "local/environmental-materials-relationships.txt" entity: "material entity" -exclusions: +text_exclusions: + - "l~gaseous" + - "l~marine" + - "l~undersea" + - "l~saline" + - "l~brackish" +term_exclusions: - "biome" - "environmental material" - "chemical entity" @@ -22,6 +28,7 @@ exclusions: - "collection of organisms" - "environmental system" - "ecozone" + - "material isosurface" - "environmental zone" - "water current" - "mass of environmental material" @@ -46,11 +53,6 @@ exclusions: - "protected area" - "channel of a watercourse" - "cryospheric layer" - - "l~gaseous" - - "l~marine" - "material isosurface" - - "l~undersea" - "NCBITaxon:1" - - "l~saline" - - "l~brackish" - - "aeroform" \ No newline at end of file + - "aeroform" diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index 924f4eb..e5bd2c7 100755 --- 
a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -1,6 +1,7 @@ import yaml import click from oaklib import get_adapter +from oaklib.query import onto_query def load_configs(oak_config_file, extraction_config_file): @@ -18,7 +19,17 @@ def process_ontology(oak_config_file, extraction_config): # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] initial_term_curie = oak_adapter.curies_by_label(label=initial_term_label) - exclusion_labels = extraction_config['exclusions'] + exclusion_labels = extraction_config['term_exclusions'] + exclusion_curies = [] + + for exclusion_label in exclusion_labels: + exclusion_curie = oak_adapter.curies_by_label(label=exclusion_label) + if exclusion_curie: + exclusion_curies.append(exclusion_curie) + + results = onto_query(oak_adapter, initial_term_curie, exclusion_curies) + + excluded_text_matches = extraction_config['text_exclusions'] # Get all descendants of the initial term descendants = oak_adapter.descendants(initial_term_curie) From 8ed8440e589f1080365f1cf82aca918bfdb187d9 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 16:54:24 -0700 Subject: [PATCH 06/17] rewrite with onto_query --- config/env-local-scale-extraction-config.yaml | 10 +- .../env_local_scale_extraction.py | 116 +++++++++++++++--- tests/test_env_local_scale_generator.py | 27 +++- 3 files changed, 123 insertions(+), 30 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index e796e20..987142d 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -2,11 +2,11 @@ output: "local/environmental-materials-relationships.txt" entity: "material entity" text_exclusions: - - "l~gaseous" - - "l~marine" - - "l~undersea" - - "l~saline" - - "l~brackish" + - "gaseous" + - "marine" + - "undersea" + - "saline" + - 
"brackish" term_exclusions: - "biome" - "environmental material" diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index e5bd2c7..ebc6334 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -1,7 +1,7 @@ import yaml import click from oaklib import get_adapter -from oaklib.query import onto_query +from oaklib.query import onto_query, FunctionQuery, FunctionEnum, SimpleQueryTerm def load_configs(oak_config_file, extraction_config_file): @@ -12,38 +12,114 @@ def load_configs(oak_config_file, extraction_config_file): return oak_config, extraction_config +def create_exclusion_query(term_labels, adapter): + """ + Creates a combined FunctionQuery to exclude specific terms and their descendants. + + :param term_labels: List of term labels to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified terms and their descendants. + """ + exclusion_queries = [] + + for label in term_labels: + # Find the CURIE for the label + term_curies = onto_query(SimpleQueryTerm(term=label), adapter) + if term_curies: + term_curie = term_curies[0] # Assuming one CURIE per label + # Create a descendant exclusion query for the term + exclusion_query = FunctionQuery( + function=FunctionEnum.DESCENDANT, + argument=term_curie, + description=f"Descendants of {label}" + ) + exclusion_queries.append(exclusion_query) + + # Combine all exclusion queries into one using the OR (|) operator + if exclusion_queries: + combined_exclusion_query = exclusion_queries[0] + for query in exclusion_queries[1:]: + combined_exclusion_query = combined_exclusion_query | query + return combined_exclusion_query + else: + return None + + +def create_text_exclusion_query(text_exclusions, adapter): + """ + Creates a combined FunctionQuery to exclude specific terms based on text matching. 
+ + :param text_exclusions: List of text patterns to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified text matches. + """ + text_exclusion_queries = [] + + for text in text_exclusions: + exclusion_query = SimpleQueryTerm(term=text) + text_exclusion_queries.append(exclusion_query) + + # Combine all exclusion queries into one using the OR (|) operator + if text_exclusion_queries: + combined_text_exclusion_query = text_exclusion_queries[0] + for query in text_exclusion_queries[1:]: + combined_text_exclusion_query = combined_text_exclusion_query | query + return combined_text_exclusion_query + else: + return None + + def process_ontology(oak_config_file, extraction_config): # Load the ontology using the get_adapter function oak_adapter = get_adapter(oak_config_file) # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] - initial_term_curie = oak_adapter.curies_by_label(label=initial_term_label) - exclusion_labels = extraction_config['term_exclusions'] - exclusion_curies = [] + initial_term_curies = onto_query(SimpleQueryTerm(term=initial_term_label), oak_adapter) + + if not initial_term_curies: + raise ValueError(f"Entity '{initial_term_label}' not found in the ontology.") + + initial_term_curie = initial_term_curies[0] + print("initial_term_curie", initial_term_curie) + + # Create exclusion queries from terms + term_exclusion_query = create_exclusion_query(extraction_config.get('term_exclusions', []), oak_adapter) - for exclusion_label in exclusion_labels: - exclusion_curie = oak_adapter.curies_by_label(label=exclusion_label) - if exclusion_curie: - exclusion_curies.append(exclusion_curie) + # Create exclusion queries from text patterns + text_exclusion_query = create_text_exclusion_query(extraction_config.get('text_exclusions', []), oak_adapter) - results = onto_query(oak_adapter, initial_term_curie, exclusion_curies) + # Combine term and text exclusion queries + 
combined_exclusion_query = None + if term_exclusion_query and text_exclusion_query: + combined_exclusion_query = term_exclusion_query | text_exclusion_query + elif term_exclusion_query: + combined_exclusion_query = term_exclusion_query + elif text_exclusion_query: + combined_exclusion_query = text_exclusion_query - excluded_text_matches = extraction_config['text_exclusions'] + # Main query for descendants of the specified entity + material_entity_query = FunctionQuery( + function=FunctionEnum.DESCENDANT, + argument=initial_term_curie, # Assuming one CURIE for the entity + description=f"Descendants of {initial_term_label}" + ) - # Get all descendants of the initial term - descendants = oak_adapter.descendants(initial_term_curie) + # Combine the main query with the exclusion query + if combined_exclusion_query: + final_query = material_entity_query - combined_exclusion_query + else: + final_query = material_entity_query - # Filter out the excluded terms - filtered_descendants = [ - term for term in descendants - if not any(oak_adapter.label(term) == exclusion for exclusion in exclusion_labels) - ] + # Execute the final query + result = onto_query(final_query, oak_adapter) # Write the results to the output file with open(extraction_config['output'], 'w') as output_file: - for term in filtered_descendants: - output_file.write(f"{term}: {oak_adapter.label(term)}\n") + for curie in result: + label = oak_adapter.label(curie) + output_file.write(f"{curie}: {label}\n") + print(curie, label) @click.command() @@ -58,4 +134,4 @@ def cli(extraction_config_file, oak_config_file): if __name__ == "__main__": - cli() + cli() \ No newline at end of file diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 1271179..ced2226 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -1,5 +1,6 @@ import pytest import yaml +import os from click.testing import CliRunner from 
external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology @@ -23,11 +24,15 @@ def oak_config_file(tmp_path): def extraction_config_file(tmp_path): config_data = { "entity": "material entity", - "exclusions": [ + "term_exclusions": [ "biome", "environmental material", "chemical entity" ], + "text_exclusions": [ + "brackish", + "marine" + ], "output": str(tmp_path / "output.txt") } config_file = tmp_path / "extraction_config.yaml" @@ -42,22 +47,30 @@ def test_load_configs(oak_config_file, extraction_config_file): assert "envo" in oak_config["ontology_resources"] assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" assert extraction_config["entity"] == "material entity" + assert "term_exclusions" in extraction_config + assert "text_exclusions" in extraction_config assert extraction_config["output"].endswith("output.txt") def test_process_ontology(oak_config_file, extraction_config_file): _, extraction_config = load_configs(oak_config_file, extraction_config_file) - # Replace with a real test ontology and expected behavior if possible. + # Run the ontology processing process_ontology(oak_config_file, extraction_config) # Check if the output file is created and has content - assert extraction_config["output"] - with open(extraction_config["output"], 'r') as file: + output_file_path = extraction_config["output"] + assert os.path.exists(output_file_path), "Output file was not created" + + with open(output_file_path, 'r') as file: content = file.read() - print(content) assert len(content) > 0, "Output file is empty, expected some data." 
+ # You could also add assertions based on expected content + # For example, checking that excluded terms are not in the output + assert "biome" not in content + assert "brackish" not in content + def test_cli_runs_successfully(oak_config_file, extraction_config_file): runner = CliRunner() @@ -72,3 +85,7 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): with open(output_file, 'r') as file: content = file.read() assert len(content) > 0, "Output file is empty, expected some data." + + # Add additional assertions to check that the CLI correctly excluded terms + assert "biome" not in content + assert "brackish" not in content \ No newline at end of file From 9681ea1fc0529079e00b89226e9e27fce2de7314 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 18:24:06 -0700 Subject: [PATCH 07/17] use onto_query to get the terms --- .../env_local_scale_extraction.py | 111 ++++++------------ tests/test_env_local_scale_generator.py | 35 +++++- 2 files changed, 67 insertions(+), 79 deletions(-) diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index ebc6334..b096f87 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -1,3 +1,5 @@ +from typing import List + import yaml import click from oaklib import get_adapter @@ -12,7 +14,7 @@ def load_configs(oak_config_file, extraction_config_file): return oak_config, extraction_config -def create_exclusion_query(term_labels, adapter): +def create_exclusion_list(term_labels, adapter) -> List[str]: """ Creates a combined FunctionQuery to exclude specific terms and their descendants. @@ -20,29 +22,16 @@ def create_exclusion_query(term_labels, adapter): :param adapter: The ontology adapter. :return: Combined FunctionQuery to exclude all specified terms and their descendants. 
""" - exclusion_queries = [] - + all_ids_to_exclude = [] for label in term_labels: # Find the CURIE for the label term_curies = onto_query(SimpleQueryTerm(term=label), adapter) if term_curies: term_curie = term_curies[0] # Assuming one CURIE per label # Create a descendant exclusion query for the term - exclusion_query = FunctionQuery( - function=FunctionEnum.DESCENDANT, - argument=term_curie, - description=f"Descendants of {label}" - ) - exclusion_queries.append(exclusion_query) - - # Combine all exclusion queries into one using the OR (|) operator - if exclusion_queries: - combined_exclusion_query = exclusion_queries[0] - for query in exclusion_queries[1:]: - combined_exclusion_query = combined_exclusion_query | query - return combined_exclusion_query - else: - return None + list_to_exclude = onto_query([".desc//p=i", term_curie], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) def create_text_exclusion_query(text_exclusions, adapter): @@ -53,20 +42,25 @@ def create_text_exclusion_query(text_exclusions, adapter): :param adapter: The ontology adapter. :return: Combined FunctionQuery to exclude all specified text matches. 
""" - text_exclusion_queries = [] + + all_ids_to_exclude = [] for text in text_exclusions: - exclusion_query = SimpleQueryTerm(term=text) - text_exclusion_queries.append(exclusion_query) + # Find the CURIE for the label + list_to_exclude = onto_query(["l~"+text], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) - # Combine all exclusion queries into one using the OR (|) operator - if text_exclusion_queries: - combined_text_exclusion_query = text_exclusion_queries[0] - for query in text_exclusion_queries[1:]: - combined_text_exclusion_query = combined_text_exclusion_query | query - return combined_text_exclusion_query - else: - return None + +def exclude_terms(full_list, exclusion_list): + """ + Returns a list of items from the full list with the items in the exclusion list removed. + + :param full_list: List of items to be filtered. + :param exclusion_list: List of items to exclude from the full list. + :return: A list with items from exclusion_list removed. 
+ """ + return [item for item in full_list if item not in exclusion_list] def process_ontology(oak_config_file, extraction_config): @@ -75,51 +69,20 @@ def process_ontology(oak_config_file, extraction_config): # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] - initial_term_curies = onto_query(SimpleQueryTerm(term=initial_term_label), oak_adapter) - - if not initial_term_curies: - raise ValueError(f"Entity '{initial_term_label}' not found in the ontology.") - - initial_term_curie = initial_term_curies[0] - print("initial_term_curie", initial_term_curie) - - # Create exclusion queries from terms - term_exclusion_query = create_exclusion_query(extraction_config.get('term_exclusions', []), oak_adapter) - - # Create exclusion queries from text patterns - text_exclusion_query = create_text_exclusion_query(extraction_config.get('text_exclusions', []), oak_adapter) - - # Combine term and text exclusion queries - combined_exclusion_query = None - if term_exclusion_query and text_exclusion_query: - combined_exclusion_query = term_exclusion_query | text_exclusion_query - elif term_exclusion_query: - combined_exclusion_query = term_exclusion_query - elif text_exclusion_query: - combined_exclusion_query = text_exclusion_query - - # Main query for descendants of the specified entity - material_entity_query = FunctionQuery( - function=FunctionEnum.DESCENDANT, - argument=initial_term_curie, # Assuming one CURIE for the entity - description=f"Descendants of {initial_term_label}" - ) - - # Combine the main query with the exclusion query - if combined_exclusion_query: - final_query = material_entity_query - combined_exclusion_query - else: - final_query = material_entity_query - - # Execute the final query - result = onto_query(final_query, oak_adapter) - - # Write the results to the output file - with open(extraction_config['output'], 'w') as output_file: - for curie in result: - label = oak_adapter.label(curie) - 
output_file.write(f"{curie}: {label}\n") - print(curie, label) + initial_term_list = onto_query([".desc//p=i", initial_term_label], oak_adapter) + print("length of initial term list", len(initial_term_list)) + + exclusion_terms = extraction_config.get('term_exclusions', []) + exclusion_texts = extraction_config.get('text_exclusions', []) + + exclusion_terms_and_children = create_exclusion_list(exclusion_terms, oak_adapter) + exclusion_terms_from_text = create_text_exclusion_query(exclusion_texts, oak_adapter) + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + print("length of excluded terms", len(exclusion_terms_and_children)) + print("length of excluded terms from text", len(exclusion_terms_from_text)) + + remaining_items = exclude_terms(initial_term_list, exclusion_list) + print(len(remaining_items)) @click.command() diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index ced2226..f57235d 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -3,7 +3,8 @@ import os from click.testing import CliRunner from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology - +from oaklib.query import onto_query +from oaklib.selector import get_adapter @pytest.fixture def oak_config_file(tmp_path): @@ -25,9 +26,23 @@ def extraction_config_file(tmp_path): config_data = { "entity": "material entity", "term_exclusions": [ - "biome", - "environmental material", - "chemical entity" + "biome" + , "environmental material" + , "chemical entity" + , "organic material" + , "anatomical entity" + , "organism" + , "plant anatomical entity" + , "healthcare facility" + , "fluid layer" + , "interface layer" + , "manufactured product" + , "anatomical entity environment" + , "ecosystem" + , "area protected according to IUCN guidelines" + , "astronomical body" + , "astronomical object" + , "cloud" ], "text_exclusions": [ "brackish", @@ 
-64,6 +79,7 @@ def test_process_ontology(oak_config_file, extraction_config_file): with open(output_file_path, 'r') as file: content = file.read() + print(content) assert len(content) > 0, "Output file is empty, expected some data." # You could also add assertions based on expected content @@ -88,4 +104,13 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): # Add additional assertions to check that the CLI correctly excluded terms assert "biome" not in content - assert "brackish" not in content \ No newline at end of file + assert "brackish" not in content + + +def test_onto_query(): + adapter = get_adapter("sqlite:obo:envo") + # desc = onto_query([".desc//p=i", "material entity"], adapter) + # print(len(desc)) + + list_to_exclude = onto_query(["l~saline"], adapter, labels=True) + print(list_to_exclude) \ No newline at end of file From 2d6d2fbfa195099ed0783d505345134e9c551efa Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 19:43:37 -0700 Subject: [PATCH 08/17] add plain text to the config file --- config/env-local-scale-extraction-config.yaml | 78 +++++++++++-------- .../env_local_scale_extraction.py | 57 +++++++++++--- tests/test_env_local_scale_generator.py | 51 ++++++++++-- 3 files changed, 133 insertions(+), 53 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 987142d..43b60b0 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -7,52 +7,64 @@ text_exclusions: - "undersea" - "saline" - "brackish" -term_exclusions: - - "biome" - - "environmental material" - - "chemical entity" - - "organic material" +term_exlusions: + - "bridge" + - "road" + - "wildlife management area" +term_and_descendant_exclusions: + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "RO:0001025" + - "RO:0001025" + - "RO:0002473" + - "NCBITaxon:1" + 
- "administrative region" + - "aeroform" - "anatomical entity" - - "organism" - - "plant anatomical entity" - - "healthcare facility" - - "fluid layer" - - "interface layer" - - "manufactured product" - "anatomical entity environment" - - "ecosystem" - "area protected according to IUCN guidelines" - "astronomical body" - "astronomical object" + - "biome" + - "channel of a watercourse" + - "chemical entity" - "cloud" - "collection of organisms" - - "environmental system" + - "cryospheric layer" - "ecozone" - - "material isosurface" + - "ecosystem" + - "environmental material" + - "environmental monitoring area" + - "environmental system" - "environmental zone" - - "water current" + - "fluid layer" + - "healthcare facility" + - "ice" + - "interface layer" + - "manufactured product" + - "marine environmental zone" + - "marine littoral zone" - "mass of environmental material" - - "subatomic particle" + - "mass of liquid" + - "material isosurface" + - "material isosurface" + - "meteor" - "observing system" + - "organic material" + - "organism" - "particle" - "planetary structural layer" - "political entity" - - "meteor" + - "protected area" - "room" + - "saline water" + - "sea floor" + - "subatomic particle" - "transport feature" - - "mass of liquid" - - "RO:0001025 water body" - - "BFO:0000050 environmental monitoring area" - - "BFO:0000050 marine littoral zone" - - "BFO:0000050 marine environmental zone" - - "RO:0002473 sea floor" - - "BFO:0000050 saline water" - - "BFO:0000050 ice" - - "RO:0001025 water body" - - "administrative region" - - "protected area" - - "channel of a watercourse" - - "cryospheric layer" - - "material isosurface" - - "NCBITaxon:1" - - "aeroform" + - "water body" + - "water body" + - "water current" \ No newline at end of file diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index b096f87..7e6bb95 100755 --- 
a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -34,7 +34,7 @@ def create_exclusion_list(term_labels, adapter) -> List[str]: return list(set(all_ids_to_exclude)) -def create_text_exclusion_query(text_exclusions, adapter): +def create_text_exclusion_list(text_exclusions, adapter): """ Creates a combined FunctionQuery to exclude specific terms based on text matching. @@ -63,26 +63,59 @@ def exclude_terms(full_list, exclusion_list): return [item for item in full_list if item not in exclusion_list] -def process_ontology(oak_config_file, extraction_config): +def create_exclude_solo_terms(exlusion_terms: List[str], adapter) -> List[str]: + """ + Creates a list of CURIEs to exclude based on the provided list of terms. + + :param exlusion_terms: List of term labels to exclude. + :param envo: The ontology adapter. + + """ + + all_ids_to_exclude = [] + + for term_label in exlusion_terms: + # Find the CURIE for the label + list_to_exclude = onto_query([term_label], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) + pass + + +def extract_terms_to_file(oak_config_file, extraction_config): # Load the ontology using the get_adapter function - oak_adapter = get_adapter(oak_config_file) + envo = get_adapter(oak_config_file) # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] - initial_term_list = onto_query([".desc//p=i", initial_term_label], oak_adapter) + initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) print("length of initial term list", len(initial_term_list)) - exclusion_terms = extraction_config.get('term_exclusions', []) - exclusion_texts = extraction_config.get('text_exclusions', []) - exclusion_terms_and_children = create_exclusion_list(exclusion_terms, oak_adapter) - exclusion_terms_from_text = create_text_exclusion_query(exclusion_texts, oak_adapter) - exclusion_list = 
exclusion_terms_and_children + exclusion_terms_from_text + exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), + envo) + + exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), + envo) + exluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) + + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exluded_terms print("length of excluded terms", len(exclusion_terms_and_children)) print("length of excluded terms from text", len(exclusion_terms_from_text)) + print("length of excluded terms from solo terms", len(exluded_terms)) remaining_items = exclude_terms(initial_term_list, exclusion_list) - print(len(remaining_items)) + print("length of remaining items", len(remaining_items)) + + results = onto_query(remaining_items, envo, labels=True) + + # Write the results to the output file specified in the extraction config + output_file_path = extraction_config['output'] + with open(output_file_path, 'w') as output_file: + for curie, label in results: + output_file.write(f"{curie}: {label}\n") + + print(f"Results written to {output_file_path}") @click.command() @@ -93,8 +126,8 @@ def cli(extraction_config_file, oak_config_file): CLI tool to process an ontology based on the provided YAML configuration file. 
""" _, extraction_config = load_configs(oak_config_file, extraction_config_file) - process_ontology(oak_config_file, extraction_config) + extract_terms_to_file(oak_config_file, extraction_config) if __name__ == "__main__": - cli() \ No newline at end of file + cli() diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index f57235d..8eefecb 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -2,10 +2,11 @@ import yaml import os from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology +from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, extract_terms_to_file from oaklib.query import onto_query from oaklib.selector import get_adapter + @pytest.fixture def oak_config_file(tmp_path): config_data = { @@ -43,12 +44,46 @@ def extraction_config_file(tmp_path): , "astronomical body" , "astronomical object" , "cloud" + , "collection of organisms" + , "environmental system" + , "ecozone" + , "material isosurface" + , "environmental zone" + , "water current" + , "mass of environmental material" + , "subatomic particle" + , "observing system" + , "particle" + , "planetary structural layer" + , "political entity" + , "meteor" + , "room" + , "transport feature" + , "mass of liquid" + , "RO:0001025 water body" + , "BFO:0000050 environmental monitoring area" + , "BFO:0000050 marine littoral zone" + , "BFO:0000050 marine environmental zone" + , "RO:0002473 sea floor" + , "BFO:0000050 saline water" + , "BFO:0000050 ice" + , "RO:0001025 water body" + , "administrative region" + , "protected area" + , "channel of a watercourse" + , "cryospheric layer" + , "material isosurface" + , "NCBITaxon:1" + , "aeroform" ], "text_exclusions": [ - "brackish", - "marine" + "gaseous" + , "marine" + , "undersea" + , "saline" + , "brackish" ], - "output": str(tmp_path / "output.txt") + "output": 
str(tmp_path / "environmental-materials-relationships.txt") } config_file = tmp_path / "extraction_config.yaml" with open(config_file, 'w') as file: @@ -71,7 +106,7 @@ def test_process_ontology(oak_config_file, extraction_config_file): _, extraction_config = load_configs(oak_config_file, extraction_config_file) # Run the ontology processing - process_ontology(oak_config_file, extraction_config) + extract_terms_to_file(oak_config_file, extraction_config) # Check if the output file is created and has content output_file_path = extraction_config["output"] @@ -79,13 +114,13 @@ def test_process_ontology(oak_config_file, extraction_config_file): with open(output_file_path, 'r') as file: content = file.read() - print(content) assert len(content) > 0, "Output file is empty, expected some data." # You could also add assertions based on expected content # For example, checking that excluded terms are not in the output assert "biome" not in content assert "brackish" not in content + assert "saline" not in content def test_cli_runs_successfully(oak_config_file, extraction_config_file): @@ -93,10 +128,9 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file', str(oak_config_file)]) assert result.exit_code == 0 - assert "material entity" in result.output or "ENVO:00000447" in result.output # Verify the output file exists and contains the expected results - output_file = extraction_config_file.parent / "output.txt" + output_file = extraction_config_file.parent / "environmental-materials-relationships.txt" assert output_file.exists() with open(output_file, 'r') as file: content = file.read() @@ -105,6 +139,7 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): # Add additional assertions to check that the CLI correctly excluded terms assert "biome" not in content assert "brackish" not in content + assert "saline" not in content def 
test_onto_query(): From 7642befb2c1bf9265c508d5fd6c6d7182bda4c5b Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 19:58:30 -0700 Subject: [PATCH 09/17] passing tests --- config/env-local-scale-extraction-config.yaml | 6 ++++-- external_metadata_awareness/env_local_scale_extraction.py | 7 +++---- tests/test_env_local_scale_generator.py | 5 +++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 43b60b0..4c0aa1e 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -7,7 +7,8 @@ text_exclusions: - "undersea" - "saline" - "brackish" -term_exlusions: + - "undersea" +term_exclusions: - "bridge" - "road" - "wildlife management area" @@ -45,13 +46,14 @@ term_and_descendant_exclusions: - "healthcare facility" - "ice" - "interface layer" + - "island" + - "lake layer" - "manufactured product" - "marine environmental zone" - "marine littoral zone" - "mass of environmental material" - "mass of liquid" - "material isosurface" - - "material isosurface" - "meteor" - "observing system" - "organic material" diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index 7e6bb95..93bef46 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -91,18 +91,17 @@ def extract_terms_to_file(oak_config_file, extraction_config): initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) print("length of initial term list", len(initial_term_list)) - exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), envo) exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), envo) - exluded_terms = 
create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) + excluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) - exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exluded_terms + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + excluded_terms print("length of excluded terms", len(exclusion_terms_and_children)) print("length of excluded terms from text", len(exclusion_terms_from_text)) - print("length of excluded terms from solo terms", len(exluded_terms)) + print("length of excluded terms from solo terms", len(excluded_terms)) remaining_items = exclude_terms(initial_term_list, exclusion_list) print("length of remaining items", len(remaining_items)) diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 8eefecb..3a1d482 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -27,6 +27,11 @@ def extraction_config_file(tmp_path): config_data = { "entity": "material entity", "term_exclusions": [ + "bridge", + "road", + "wildlife management area" + ], + "term_and_descendant_exclusions": [ "biome" , "environmental material" , "chemical entity" From ccb459d3a73b0fe2704792e2e70111a3d8127e69 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:14:15 -0700 Subject: [PATCH 10/17] print to logging --- config/env-local-scale-extraction-config.yaml | 2 +- env_triad.Makefile | 2 +- ...tion.py => envo_local_scale_extraction.py} | 28 +++++++++---------- tests/test_env_local_scale_generator.py | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) rename external_metadata_awareness/{env_local_scale_extraction.py => envo_local_scale_extraction.py} (84%) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 4c0aa1e..bb36c80 100755 --- a/config/env-local-scale-extraction-config.yaml +++ 
b/config/env-local-scale-extraction-config.yaml @@ -1,5 +1,5 @@ # env-local-scale-extraction-config.yaml -output: "local/environmental-materials-relationships.txt" +output: "local/env-local-scale-candidates.txt" entity: "material entity" text_exclusions: - "gaseous" diff --git a/env_triad.Makefile b/env_triad.Makefile index b1626be..585558f 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -177,7 +177,7 @@ local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/e generate-env-local-scale-candidates: # Ensure the poetry environment is activated and run the script with the specified config - $(RUN) python external_metadata_awareness/env_local_scale_extraction.py \ + $(RUN) python external_metadata_awareness/envo_local_scale_extraction.py \ --oak-config-file config/oak-config.yaml \ --extraction-config-file config/env-local-scale-extraction-config.yaml diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py similarity index 84% rename from external_metadata_awareness/env_local_scale_extraction.py rename to external_metadata_awareness/envo_local_scale_extraction.py index 93bef46..5ce2f2d 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/envo_local_scale_extraction.py @@ -1,9 +1,12 @@ +import logging from typing import List - import yaml import click from oaklib import get_adapter -from oaklib.query import onto_query, FunctionQuery, FunctionEnum, SimpleQueryTerm +from oaklib.query import onto_query, SimpleQueryTerm + +# Configure logging +logging.basicConfig(level=logging.WARN, format='%(asctime)s - %(levelname)s - %(message)s') def load_configs(oak_config_file, extraction_config_file): @@ -63,23 +66,20 @@ def exclude_terms(full_list, exclusion_list): return [item for item in full_list if item not in exclusion_list] -def create_exclude_solo_terms(exlusion_terms: List[str], adapter) -> List[str]: +def 
create_exclude_solo_terms(exclusion_terms: List[str], adapter) -> List[str]: """ Creates a list of CURIEs to exclude based on the provided list of terms. - :param exlusion_terms: List of term labels to exclude. + :param exclusion_terms: List of term labels to exclude. :param envo: The ontology adapter. - """ - all_ids_to_exclude = [] - for term_label in exlusion_terms: + for term_label in exclusion_terms: # Find the CURIE for the label list_to_exclude = onto_query([term_label], adapter) all_ids_to_exclude.extend(list_to_exclude) return list(set(all_ids_to_exclude)) - pass def extract_terms_to_file(oak_config_file, extraction_config): @@ -89,7 +89,7 @@ def extract_terms_to_file(oak_config_file, extraction_config): # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) - print("length of initial term list", len(initial_term_list)) + logging.info(f"Length of initial term list: {len(initial_term_list)}") exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), envo) @@ -99,12 +99,12 @@ def extract_terms_to_file(oak_config_file, extraction_config): excluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + excluded_terms - print("length of excluded terms", len(exclusion_terms_and_children)) - print("length of excluded terms from text", len(exclusion_terms_from_text)) - print("length of excluded terms from solo terms", len(excluded_terms)) + logging.info(f"Length of excluded terms and descendants: {len(exclusion_terms_and_children)}") + logging.info(f"Length of excluded terms from text: {len(exclusion_terms_from_text)}") + logging.info(f"Length of excluded terms from solo terms: {len(excluded_terms)}") remaining_items = exclude_terms(initial_term_list, exclusion_list) - print("length of 
remaining items", len(remaining_items)) + logging.info(f"Length of remaining items: {len(remaining_items)}") results = onto_query(remaining_items, envo, labels=True) @@ -114,7 +114,7 @@ def extract_terms_to_file(oak_config_file, extraction_config): for curie, label in results: output_file.write(f"{curie}: {label}\n") - print(f"Results written to {output_file_path}") + logging.info(f"Results written to {output_file_path}") @click.command() diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 3a1d482..9511feb 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -2,7 +2,7 @@ import yaml import os from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, extract_terms_to_file +from external_metadata_awareness.envo_local_scale_extraction import cli, load_configs, extract_terms_to_file from oaklib.query import onto_query from oaklib.selector import get_adapter From d3c09d7cc4c5921d0ac731db17ae9066648c2476 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:30:11 -0700 Subject: [PATCH 11/17] add test target from makefile --- env_triad.Makefile | 2 ++ tests/test_env_local_scale_generator.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/env_triad.Makefile b/env_triad.Makefile index 585558f..2b6c04e 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -181,4 +181,6 @@ generate-env-local-scale-candidates: --oak-config-file config/oak-config.yaml \ --extraction-config-file config/env-local-scale-extraction-config.yaml +test: + $(RUN) pytest tests/* ###### END SIERRA's STUFF ####### diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 9511feb..2837553 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -104,7 +104,7 @@ def test_load_configs(oak_config_file, 
extraction_config_file): assert extraction_config["entity"] == "material entity" assert "term_exclusions" in extraction_config assert "text_exclusions" in extraction_config - assert extraction_config["output"].endswith("output.txt") + assert extraction_config["output"].endswith("environmental-materials-relationships.txt") def test_process_ontology(oak_config_file, extraction_config_file): From 6e7b38ef9fb854d5d6ecd2fbf85f5f78f8954749 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:31:41 -0700 Subject: [PATCH 12/17] add GH action to run the tests --- .github/workflows/main.yaml | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/main.yaml diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 0000000..4fd9e94 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,40 @@ +# This is a basic workflow to help you get started with Actions + +name: Run tests + +# Controls when the action will run. 
+on: + # Triggers the workflow on push or pull request events but only for the master branch + pull_request: + types: [opened, synchronize, reopened] + + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + strategy: + matrix: + python: [ "3.9", "3.10", "3.11" ] + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + name: setup python environment + with: + python-version: ${{ matrix.python }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install poetry + poetry install + + - name: Run tests + run: | + poetry run pytest tests/* From a3cab2a7bc76bf176e9a25e0446c1b508d5af5f9 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:42:52 -0700 Subject: [PATCH 13/17] undo cborg_test change --- external_metadata_awareness/cborg_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_metadata_awareness/cborg_test.py b/external_metadata_awareness/cborg_test.py index 94f4607..c2cdf6b 100644 --- a/external_metadata_awareness/cborg_test.py +++ b/external_metadata_awareness/cborg_test.py @@ -3,7 +3,7 @@ from dotenv import load_dotenv # Load environment variables from local/.env -load_dotenv(os.path.join('../..', 'local', '.env')) +load_dotenv(os.path.join('..', 'local', '.env')) client = openai.OpenAI( api_key=os.environ.get('CBORG_API_KEY'), # Retrieve API key from environment variables From ec907d0f6617d32eec337cd295b3695854851db7 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Fri, 30 Aug 2024 16:13:46 -0700 Subject: [PATCH 14/17] add in ability to exclude single terms, as well as include term post-extraction --- 
config/env-local-scale-extraction-config.yaml | 98 ++++++++++++++++++- .../envo_local_scale_extraction.py | 32 +++--- tests/test_env_local_scale_generator.py | 3 +- 3 files changed, 119 insertions(+), 14 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index bb36c80..27111d3 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -8,7 +8,7 @@ text_exclusions: - "saline" - "brackish" - "undersea" -term_exclusions: +post_process_inclusion_single_terms: - "bridge" - "road" - "wildlife management area" @@ -69,4 +69,98 @@ term_and_descendant_exclusions: - "transport feature" - "water body" - "water body" - - "water current" \ No newline at end of file + - "water current" +"single_term_exclusions": + - anthropised terrestrial environmental zone + - anthropogenic contamination feature + - anthropogenic geographic feature + - area of attached faunal communities + - area of attached mussel assemblages + - area of developed space + - astronomical body part + - biosphere + - body of liquid + - carbonate system of ocean water + - cellular organisms + - child care facility + - cloud part + - compound astronomical body part + - construction + - conveyor system + - cryoform + - educational facility + - environmental zone + - environmental zone of processual equilibrium + - facility + - fiat object + - fiat part of an astronomical object + - floating ice mass + - fluid astronomical body part + - fresh water body + - gaseous astronomical body part + - gaseous part of an atmosphere + - geographic feature + - hail stone + - hydroform + - hydrographic feature + - hydrosphere + - ice decumulation zone + - landform + - layer + - liquid astronomical body part + - lotic water body + - marine hydrothermal vent + - marine reef + - marine water body + - marine water mass + - mass of compounded environmental materials + - mass of environmental material + - mass of 
solid material + - material accumulation zone + - material decumulation zone + - material entity + - object + - object aggregate + - ocean basin + - open cage mariculture facility + - organismal entity + - pedosphere + - planetary photic zone + - planetary subsurface zone + - pole + - polling place + - polling station + - processed material + - processing plant + - public infrastructure + - public transit system + - rapid transit system + - rain + - range of seamounts + - rocky reef + - root + - saline water body + - sea ice floe + - sea ice hummock + - sea ice mass + - seamount + - sleet pellet + - sleet pellet + - soil horizon + - soil layer + - solid astronomical body part + - solid layer + - subsurface landform + - subsurface zone of an astronomical body + - surface landform + - system + - Taylor column + - technosphere + - underground water body + - volcanic feature + - water body + - watercourse + - water mass + - water-based rain + + diff --git a/external_metadata_awareness/envo_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py index 5ce2f2d..7366547 100755 --- a/external_metadata_awareness/envo_local_scale_extraction.py +++ b/external_metadata_awareness/envo_local_scale_extraction.py @@ -66,20 +66,22 @@ def exclude_terms(full_list, exclusion_list): return [item for item in full_list if item not in exclusion_list] -def create_exclude_solo_terms(exclusion_terms: List[str], adapter) -> List[str]: +def retrieve_individual_terms(terms_to_retrieve: List[str], adapter) -> List[str]: """ - Creates a list of CURIEs to exclude based on the provided list of terms. + Creates a list of CURIEs based on the provided list of term labels. - :param exclusion_terms: List of term labels to exclude. + :param terms_to_retrieve: List of term labels. :param envo: The ontology adapter. 
""" - all_ids_to_exclude = [] + all_ids = [] - for term_label in exclusion_terms: + for term_label in terms_to_retrieve: # Find the CURIE for the label list_to_exclude = onto_query([term_label], adapter) - all_ids_to_exclude.extend(list_to_exclude) - return list(set(all_ids_to_exclude)) + print("term_label", term_label) + print("list_to_exclude", list_to_exclude) + all_ids.extend(list_to_exclude) + return list(set(all_ids)) def extract_terms_to_file(oak_config_file, extraction_config): @@ -96,17 +98,25 @@ def extract_terms_to_file(oak_config_file, extraction_config): exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), envo) - excluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) - exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + excluded_terms + exclude_single_terms = retrieve_individual_terms(extraction_config.get('exclude_single_terms', []), envo) + solo_inclusion_terms = extraction_config.get('post_process_inclusion_single_terms', []) + logging.info("solo_inclusion_terms", solo_inclusion_terms) + post_process_inclusion_single_terms = retrieve_individual_terms(extraction_config.get('post_process_inclusion_single_terms', []), envo) + logging.info("post_process_inclusion_terms", post_process_inclusion_single_terms) + + + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exclude_single_terms logging.info(f"Length of excluded terms and descendants: {len(exclusion_terms_and_children)}") logging.info(f"Length of excluded terms from text: {len(exclusion_terms_from_text)}") - logging.info(f"Length of excluded terms from solo terms: {len(excluded_terms)}") + logging.info(f"Length of excluded terms from solo terms: {len(post_process_inclusion_single_terms)}") remaining_items = exclude_terms(initial_term_list, exclusion_list) logging.info(f"Length of remaining items: {len(remaining_items)}") - results = onto_query(remaining_items, 
envo, labels=True) + final_list_to_retrieve = post_process_inclusion_single_terms + remaining_items + + results = onto_query(final_list_to_retrieve, envo, labels=True) # Write the results to the output file specified in the extraction config output_file_path = extraction_config['output'] diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 2837553..21a6e7f 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -26,7 +26,7 @@ def oak_config_file(tmp_path): def extraction_config_file(tmp_path): config_data = { "entity": "material entity", - "term_exclusions": [ + "post_process_inclusion_single_terms": [ "bridge", "road", "wildlife management area" @@ -126,6 +126,7 @@ def test_process_ontology(oak_config_file, extraction_config_file): assert "biome" not in content assert "brackish" not in content assert "saline" not in content + assert "wildlife management area" in content def test_cli_runs_successfully(oak_config_file, extraction_config_file): From 33fb5fc4de38905dc68e8b999594b84cdefb822a Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Fri, 30 Aug 2024 16:17:49 -0700 Subject: [PATCH 15/17] fixing test data --- config/env-local-scale-extraction-config.yaml | 183 +++++++++--------- tests/test_env_local_scale_generator.py | 3 +- 2 files changed, 94 insertions(+), 92 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 27111d3..c7fe387 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -71,96 +71,97 @@ term_and_descendant_exclusions: - "water body" - "water current" "single_term_exclusions": - - anthropised terrestrial environmental zone - - anthropogenic contamination feature - - anthropogenic geographic feature - - area of attached faunal communities - - area of attached mussel assemblages - - area of developed space - - astronomical 
body part - - biosphere - - body of liquid - - carbonate system of ocean water - - cellular organisms - - child care facility - - cloud part - - compound astronomical body part - - construction - - conveyor system - - cryoform - - educational facility - - environmental zone - - environmental zone of processual equilibrium - - facility - - fiat object - - fiat part of an astronomical object - - floating ice mass - - fluid astronomical body part - - fresh water body - - gaseous astronomical body part - - gaseous part of an atmosphere - - geographic feature - - hail stone - - hydroform - - hydrographic feature - - hydrosphere - - ice decumulation zone - - landform - - layer - - liquid astronomical body part - - lotic water body - - marine hydrothermal vent - - marine reef - - marine water body - - marine water mass - - mass of compounded environmental materials - - mass of environmental material - - mass of solid material - - material accumulation zone - - material decumulation zone - - material entity - - object - - object aggregate - - ocean basin - - open cage mariculture facility - - organismal entity - - pedosphere - - planetary photic zone - - planetary subsurface zone - - pole - - polling place - - polling station - - processed material - - processing plant - - public infrastructure - - public transit system - - rapid transit system - - rain - - range of seamounts - - rocky reef - - root - - saline water body - - sea ice floe - - sea ice hummock - - sea ice mass - - seamount - - sleet pellet - - sleet pellet - - soil horizon - - soil layer - - solid astronomical body part - - solid layer - - subsurface landform - - subsurface zone of an astronomical body - - surface landform - - system - - Taylor column - - technosphere - - underground water body - - volcanic feature - - water body - - watercourse - - water mass - - water-based rain + - "anthropised terrestrial environmental zone" + - "anthropogenic contamination feature" + - "anthropogenic geographic feature" 
+ - "area of attached faunal communities" + - "area of attached mussel assemblages" + - "area of developed space" + - "astronomical body part" + - "biosphere" + - "body of liquid" + - "carbonate system of ocean water" + - "cellular organisms" + - "child care facility" + - "cloud part" + - "compound astronomical body part" + - "construction" + - "conveyor system" + - "cryoform" + - "educational facility" + - "environmental zone" + - "environmental zone of processual equilibrium" + - "facility" + - "fiat object" + - "fiat part of an astronomical object" + - "floating ice mass" + - "fluid astronomical body part" + - "fresh water body" + - "gaseous astronomical body part" + - "gaseous part of an atmosphere" + - "geographic feature" + - "hail stone" + - "hydroform" + - "hydrographic feature" + - "hydrosphere" + - "ice decumulation zone" + - "landform" + - "layer" + - "liquid astronomical body part" + - "lotic water body" + - "marine hydrothermal vent" + - "marine reef" + - "marine water body" + - "marine water mass" + - "mass of compounded environmental materials" + - "mass of environmental material" + - "mass of solid material" + - "material accumulation zone" + - "material decumulation zone" + - "material entity" + - "object" + - "object aggregate" + - "ocean basin" + - "open cage mariculture facility" + - "organismal entity" + - "pedosphere" + - "planetary photic zone" + - "planetary subsurface zone" + - "pole" + - "polling place" + - "polling station" + - "processed material" + - "processing plant" + - "public infrastructure" + - "public transit system" + - "rapid transit system" + - "rain" + - "range of seamounts" + - "rocky reef" + - "root" + - "saline water body" + - "sea ice floe" + - "sea ice hummock" + - "sea ice mass" + - "seamount" + - "sleet pellet" + - "sleet pellet" + - "soil horizon" + - "soil layer" + - "solid astronomical body part" + - "solid layer" + - "subsurface landform" + - "subsurface zone of an astronomical body" + - "surface landform" + - 
"system" + - "Taylor column" + - "technosphere" + - "underground water body" + - "volcanic feature" + - "water body" + - "watercourse" + - "water mass" + - "water-based rain" + diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 21a6e7f..b02bc20 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -102,7 +102,8 @@ def test_load_configs(oak_config_file, extraction_config_file): assert "envo" in oak_config["ontology_resources"] assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" assert extraction_config["entity"] == "material entity" - assert "term_exclusions" in extraction_config + assert "post_process_inclusion_single_terms" in extraction_config + assert "single_term_exclusions" in extraction_config assert "text_exclusions" in extraction_config assert extraction_config["output"].endswith("environmental-materials-relationships.txt") From a389de1b997b5fca2acc05bcc7245c6083f0c4da Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Fri, 30 Aug 2024 16:18:12 -0700 Subject: [PATCH 16/17] fixing test data --- tests/test_env_local_scale_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index b02bc20..d5c8b4e 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -103,7 +103,6 @@ def test_load_configs(oak_config_file, extraction_config_file): assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" assert extraction_config["entity"] == "material entity" assert "post_process_inclusion_single_terms" in extraction_config - assert "single_term_exclusions" in extraction_config assert "text_exclusions" in extraction_config assert extraction_config["output"].endswith("environmental-materials-relationships.txt") From 24526e878cf980957af0be9ab1c87e365ad21e4c Mon Sep 17 00:00:00 2001 From: 
Sierra Taylor Moxon
Date: Thu, 14 Nov 2024 10:24:21 -0800
Subject: [PATCH 17/17] commit latest work

---
 external_metadata_awareness/envo_local_scale_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external_metadata_awareness/envo_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py
index 7366547..67cd74a 100755
--- a/external_metadata_awareness/envo_local_scale_extraction.py
+++ b/external_metadata_awareness/envo_local_scale_extraction.py
@@ -84,7 +84,7 @@ def retrieve_individual_terms(terms_to_retrieve: List[str], adapter) -> List[str
     return list(set(all_ids))
 
 
-def extract_terms_to_file(oak_config_file, extraction_config):
+def extract_terms_to_file(oak_config_file, extraction_config):
     # Load the ontology using the get_adapter function
     envo = get_adapter(oak_config_file)