From 9d2f639298698e059045e361259ae2330f519164 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Wed, 28 Aug 2024 18:40:23 -0700 Subject: [PATCH 01/17] extract makefile target into a simple script --- Makefile | 289 +----------------- env_triad.Makefile | 145 +++++++++ external_metadata_awareness/cborg_test.py | 2 +- .../env_local_scale_config.yaml | 57 ++++ .../env_local_scale_extraction.py | 46 +++ ncbi.Makefile | 160 ++++++++++ tests/test_env_local_scale_generator.py | 81 +++++ 7 files changed, 493 insertions(+), 287 deletions(-) create mode 100644 env_triad.Makefile create mode 100755 external_metadata_awareness/env_local_scale_config.yaml create mode 100755 external_metadata_awareness/env_local_scale_extraction.py create mode 100644 ncbi.Makefile create mode 100644 tests/test_env_local_scale_generator.py diff --git a/Makefile b/Makefile index 09143aa..21f0306 100644 --- a/Makefile +++ b/Makefile @@ -5,221 +5,8 @@ RUN=poetry run MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml - -## NCBI STUFF -# very complex documents; many are too large to load into a MongoDB document -downloads/bioproject.xml: - $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml" # ~ 3 GB August 2024 - -downloads/biosample_set.xml.gz: - $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz" # ~ 3 GB August 2024 - -local/biosample_set.xml: downloads/biosample_set.xml.gz - gunzip -c $< > $@ - -# for development -downloads/books.xml: - $(WGET) -O $@ "https://www.w3schools.com/xml/books.xml" - -# 8 years old. seems very incomplete. 
-downloads/biosample.xsd: - $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co" - -# find code for converting to table in other repos -# or convert to duckdb -downloads/ncbi-biosample-attributes.xml: - $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml" - -downloads/ncbi-biosample-packages.xml: - $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml" - -local/ncbi-biosample-packages.csv: downloads/ncbi-biosample-packages.xml - $(RUN) ncbi-packages-csv-report \ - --xml-file $< \ - --output-file $@ - - -# see also https://www.npmjs.com/package/mongodb-schema/v/12.2.0?activeTab=versions - -#local/mongodb-paths-10pct.txt: # 450000 -> ~ 4 minutes # 4.5 M -> heavy load, never finishes. Use streaming approach? -# $(RUN) list-mongodb-paths \ -# --db-name ncbi_metadata \ -# --collection samples \ -# --sample-size 4500000 > $@ - -#local/ncbi_biosamples_inferred_schema.json: # ~ 2 minutes for 410,000 (1%) # ~ 1 hour for 13 million ~ 30% -# $(RUN) python external_metadata_awareness/infer_schema_with_batching.py \ -# --host localhost \ -# --port 27017 \ -# --database ncbi_metadata \ -# --collection samples \ -# --total-samples 13000000 \ -# --batch-size 50000 \ -# --output $@ - -.PHONY: load-biosamples-into-mongo - -local/biosample-count-xml.txt: local/biosample_set.xml - date && grep -c "" $< > $@ && date - -# see also https://gitlab.com/wurssb/insdc_metadata -load-biosamples-into-mongo: local/biosample_set.xml - $(RUN) xml-to-mongo \ - --file-path $< \ - --node-type BioSample \ - --id-field id \ - --db-name biosamples_dev \ - --collection-name biosamples_dev \ - --max-elements 100000 \ - --anticipated-last-id 100000 - -local/biosample-count-mongodb.txt: - date && mongosh --eval 'db.getSiblingDB("ncbi_metadata").samples.countDocuments()' > $@ && date # 1 minute - -local/ncbi-biosamples-packages-counts.tsv: sql/packages-counts.sql - $(RUN) sql-to-tsv \ - 
--sql-file $< \ - --output-file $@ - -ncbi-biosamples-duckdb-overview: - $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ - --connection-string "mongodb://localhost:27017/" \ - --db-name ncbi_metadata \ - --collection-name samples \ - --limit 41000000 \ - --batch-size 100000 \ - --duckdb-file local/ncbi_biosamples.duckdb \ - --table-name overview # no path # 40462422 biosamples in ~ 50 minutes - -# add counts from duckdb; need to compile duckdb or download binary - -ncbi-biosamples-duckdb-attributes: - $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ - --connection-string "mongodb://localhost:27017/" \ - --db-name ncbi_metadata \ - --collection-name samples \ - --limit 41000000 \ - --batch-size 100000 \ - --duckdb-file local/ncbi_biosamples.duckdb \ - --table-name attributes \ - --path BioSample.Attributes.Attribute - -ncbi-biosamples-duckdb-links: - $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ - --connection-string "mongodb://localhost:27017/" \ - --db-name ncbi_metadata \ - --collection-name samples \ - --limit 41000000 \ - --batch-size 100000 \ - --duckdb-file local/ncbi_biosamples.duckdb \ - --table-name links \ - --path BioSample.Links.Link - - ## @click.option('--path', default="BioSample.Links.Link", required=True, - ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") - ## @click.option('--path', default="BioSample.Ids.Id", required=True, - ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") - ## @click.option('--path', default="BioSample.Description.Organism", required=True, - ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") - -NCBI_BIOSAMPLES_DUCKDB_PATH = local/ncbi_biosamples.duckdb - -local/ncbi-mims-soil-biosamples-env_local_scale.csv: - echo ".mode csv\nSELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_local_scale' AND 
package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@ - -local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv: local/ncbi-mims-soil-biosamples-env_local_scale.csv - $(RUN) normalize-envo-data \ - --count-col-name sample_count \ - --input-file $< \ - --ontology-prefix ENVO \ - --output-file $@ \ - --val-col-name content - -local/ncbi-mims-soil-biosamples-env_local_scale-failures.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv - $(RUN) find-envo-present-no-curie-extracted \ - --input-file $< \ - --output-file $@ - -local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv local/envo-info.csv - $(RUN) merge-in-reference-data \ - --keep-file $(word 1,$^) \ - --keep-key normalized_curie \ - --reference-file $(word 2,$^) \ - --reference-key normalized_curie \ - --reference-addition normalized_label \ - --addition-rename real_label \ - --merged-file $@ - -local/ncbi-mims-soil-biosamples-env_local_scale-annotated.tsv: local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv - date ; $(RUN) runoak \ - --input sqlite:obo:envo annotate \ - --matches-whole-text \ - --output-type tsv \ - --output $@ \ - --text-file $< \ - --match-column normalized_label ; date - -# ENVO STUFF -# getting fragments of EnvO because the whole thing is too large to feed into an LLM -# our guideline is that env_broad_scale should be answered with an EnvO biome subclass - -# these OAK commands fetch the latest EnvO SQLite file from a BBOP S3 bucket -# it may be a few days behind the envo.owl file form the EnvO GH repo -# use `runoak cache-ls` to see where the SQLite files are cached - -local/biome-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00000428 > $@ - # !!! pivot? include entailment? 
--include-entailed / --no-include-entailed; --non-redundant-entailed / --no-non-redundant-entailed - # LLM web interfaces might want CSVs - -local/biome-relationships.csv: local/biome-relationships.tsv - sed 's/\t/,/g' $< > $@ - #awk 'BEGIN {FS="\t"; OFS=","} {print $$0}' $< > $@ - rm -rf $< - -local/biome-metadata.yaml: - $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00000428 > $@ - # !!! try different formats? or predicate list? - -local/biome-metadata.json: local/biome-metadata.yaml - yq ea '[.]' $< -o=json | cat > $@ - rm -rf $< - -# our guideline is that env_medium should be answered with an EnvO biome subclass -local/environmental-materials-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00010483 > $@ - -local/environmental-materials-relationships.csv: local/environmental-materials-relationships.tsv - sed 's/\t/,/g' $< > $@ - rm -rf $< - -local/environmental-materials-metadata.yaml: - $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00010483 > $@ - -local/environmental-materials-metadata.json: local/environmental-materials-metadata.yaml - yq ea '[.]' $< -o=json | cat > $@ - rm -rf $< - -local/environmental-material-info.txt: - $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00010483 > $@ - -local/aquatic-biome-info.txt: - $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00002030 > $@ # --output-type tsv has lots of info but wrapped in square brackets - -local/aquatic-biome-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships --output-type tsv --output $@ .desc//p=i ENVO:00002030 - -local/aquatic-biome.png: - $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill .desc//p=i ENVO:00002030 - -local/soil-env_broad_scale-algebraic.txt: - $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ .desc//p=i biome .not .desc//p=i 'aquatic biome' ] .not .desc//p=i 'forest biome' ] .not .desc//p=i 'grassland biome' ] .not .desc//p=i 
'desert biome' ] .not biome ] .not 'cropland biome' > $@ - -local/soil-env_broad_scale-algebraic.csv: local/soil-env_broad_scale-algebraic.txt - $(RUN) normalize-envo-data \ - --input-file $< \ - --ontology-prefix ENVO \ - --output-file $@ +include ncbi.Makefile +include env_triad.Makefile # MIXS STUFF downloads/mixs.yaml: @@ -451,77 +238,7 @@ local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.ts --column1 "normalized_curie_biome" \ --column2 "matched_id_biome" -## for env medium -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv: local/environmental-material-info.txt \ -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.csv -# $(RUN) detect-curies-in-subset \ -# --tsv-file $(word 2,$^) \ -# --class-info-file $(word 1,$^) \ -# --tsv-column-name normalized-curie \ -# --subset-label environmental-material \ -# --output-file $@ -# -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-4.csv: local/environmental-material-info.txt \ -#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv -# $(RUN) detect-curies-in-subset \ -# --tsv-file $(word 2,$^) \ -# --class-info-file $(word 1,$^) \ -# --tsv-column-name matched_id \ -# --subset-label environmental_material \ -# --output-file $@ - detected-annotations-to-postgres: local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv $(RUN) load-tsv-into-postgres \ --tsv-file $< \ - --table-name detected_annotations - -# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE -# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions -local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql - $(RUN) sql-to-tsv \ - --sql-file $< \ - --output-file $@ - -#### - -local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ -local/soil-env_broad_scale-algebraic.txt 
local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ -local/biome-relationships.csv - $(RUN) build-prompt-from-template \ - --spec-file-path $(word 1,$^) \ - --output-file-path $@ - -# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest -# gemini models don't seem to take a temperature parameter -# cborg/claude-sonnet -local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-prompt.txt - cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ - -#### - -local/env-local-scale-candidates.txt: - $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not 
.desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ - -local/env-local-scale-candidates-relationships.tsv: - $(RUN) runoak --input sqlite:obo:envo relationships [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] 
.not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ - -local/envo-leaves.txt: - $(RUN) runoak --input sqlite:obo:envo leafs > $@ - -local/envo-leaf-ids.txt: local/envo-leaves.txt - cut -f1 -d' ' $< > $@ - -local/env-local-scale-candidate-ids.txt: local/env-local-scale-candidates.txt - cut -f1 -d' ' $< > $@ - -local/env-local-scale-non-leaf.txt: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt - $(RUN) runoak --input sqlite:obo:envo info .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] > $@ - -local/env-local-scale-non-leaf.csv: local/env-local-scale-non-leaf.txt - $(RUN) normalize-envo-data \ - --input-file $< \ - --ontology-prefix ENVO \ - --output-file $@ - -local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt - $(RUN) runoak --input sqlite:obo:envo viz --gap-fill .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] + --table-name detected_annotations \ No newline at end of file diff --git a/env_triad.Makefile b/env_triad.Makefile new file mode 100644 index 0000000..87a4430 --- /dev/null +++ b/env_triad.Makefile @@ -0,0 +1,145 @@ +WGET=wget +RUN=poetry run + +# preferable to use a tagged 
release, but theres good stuff in this commit that hasn't been released yet +MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml +SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml + +# ENVO STUFF +# getting fragments of EnvO because the whole thing is too large to feed into an LLM +# our guideline is that env_broad_scale should be answered with an EnvO biome subclass + +# these OAK commands fetch the latest EnvO SQLite file from a BBOP S3 bucket +# it may be a few days behind the envo.owl file form the EnvO GH repo +# use `runoak cache-ls` to see where the SQLite files are cached + +local/biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00000428 > $@ + # !!! pivot? include entailment? --include-entailed / --no-include-entailed; --non-redundant-entailed / --no-non-redundant-entailed + # LLM web interfaces might want CSVs + +local/biome-relationships.csv: local/biome-relationships.tsv + sed 's/\t/,/g' $< > $@ + #awk 'BEGIN {FS="\t"; OFS=","} {print $$0}' $< > $@ + rm -rf $< + +local/biome-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00000428 > $@ + # !!! try different formats? or predicate list? 
+ +local/biome-metadata.json: local/biome-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +# our guideline is that env_medium should be answered with an EnvO biome subclass +local/environmental-materials-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-relationships.csv: local/environmental-materials-relationships.tsv + sed 's/\t/,/g' $< > $@ + rm -rf $< + +local/environmental-materials-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-metadata.json: local/environmental-materials-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +local/environmental-material-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00010483 > $@ + +local/aquatic-biome-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00002030 > $@ # --output-type tsv has lots of info but wrapped in square brackets + +local/aquatic-biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships --output-type tsv --output $@ .desc//p=i ENVO:00002030 + +local/aquatic-biome.png: + $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill .desc//p=i ENVO:00002030 + +local/soil-env_broad_scale-algebraic.txt: + $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ .desc//p=i biome .not .desc//p=i 'aquatic biome' ] .not .desc//p=i 'forest biome' ] .not .desc//p=i 'grassland biome' ] .not .desc//p=i 'desert biome' ] .not biome ] .not 'cropland biome' > $@ + +local/soil-env_broad_scale-algebraic.csv: local/soil-env_broad_scale-algebraic.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + + +## for env medium +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv: local/environmental-material-info.txt \ 
+#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name normalized-curie \ +# --subset-label environmental-material \ +# --output-file $@ +# +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-4.csv: local/environmental-material-info.txt \ +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name matched_id \ +# --subset-label environmental_material \ +# --output-file $@ + + +# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE +# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions +local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql + $(RUN) sql-to-tsv \ + --sql-file $< \ + --output-file $@ + +#### + +local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ +local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ +local/biome-relationships.csv + $(RUN) build-prompt-from-template \ + --spec-file-path $(word 1,$^) \ + --output-file-path $@ + +# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest +# gemini models don't seem to take a temperature parameter +# cborg/claude-sonnet +local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-prompt.txt + cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ + +#### + +#local/env-local-scale-candidates.txt: +# $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 
'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ + +generate-env-local-scale-candidates: + # Ensure the poetry environment is activated and run the script with the 
specified config + poetry run python external_metadata_awareness/env_local_scale_extraction.py config.yaml + + +local/env-local-scale-candidates-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 'protected area' ] .not 
.desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ + +local/envo-leaves.txt: + $(RUN) runoak --input sqlite:obo:envo leafs > $@ + +local/envo-leaf-ids.txt: local/envo-leaves.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-candidate-ids.txt: local/env-local-scale-candidates.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-non-leaf.txt: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo info .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] > $@ + +local/env-local-scale-non-leaf.csv: local/env-local-scale-non-leaf.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --gap-fill .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] diff --git a/external_metadata_awareness/cborg_test.py b/external_metadata_awareness/cborg_test.py index c2cdf6b..94f4607 100644 --- a/external_metadata_awareness/cborg_test.py +++ b/external_metadata_awareness/cborg_test.py @@ -3,7 +3,7 @@ from dotenv import load_dotenv # Load environment variables from local/.env -load_dotenv(os.path.join('..', 'local', '.env')) +load_dotenv(os.path.join('../..', 'local', '.env')) client = openai.OpenAI( api_key=os.environ.get('CBORG_API_KEY'), # Retrieve API key from environment variables diff --git a/external_metadata_awareness/env_local_scale_config.yaml b/external_metadata_awareness/env_local_scale_config.yaml new file mode 100755 index 0000000..f5a1093 --- /dev/null +++ b/external_metadata_awareness/env_local_scale_config.yaml @@ -0,0 +1,57 @@ +# config.yaml +input: "sqlite:obo:envo" +output: 
"""Extract env_local_scale candidate terms from an ontology.

Reads a YAML config (ontology selector, root entity, exclusion labels,
output path), fetches all descendants of the root entity, drops any term
whose label appears in the exclusion list, and writes ``CURIE: label``
lines to the output file.  Replaces the long hand-written ``runoak``
algebraic query previously embedded in the Makefile.
"""
import yaml
import click
from oaklib import get_adapter


def load_config(config_file):
    """Load and return the YAML configuration as a dict.

    :param config_file: path to a YAML file with keys ``input`` (OAK
        adapter selector, e.g. ``sqlite:obo:envo``), ``entity`` (root
        term), ``exclusions`` (list of labels to drop) and ``output``
        (destination file path).
    :return: parsed configuration mapping.
    """
    with open(config_file, 'r') as file:
        return yaml.safe_load(file)


def process_ontology(config):
    """Write the filtered descendants of ``config['entity']`` to ``config['output']``.

    Each surviving term is written as one ``CURIE: label`` line.

    :param config: mapping as returned by :func:`load_config`.
    """
    # Load the ontology via the OAK adapter named in the config.
    ontology = get_adapter(config['input'])

    initial_term = config['entity']
    # Set membership is O(1) per lookup, vs. re-scanning the ~50-entry
    # list once per descendant as the filter previously did.
    exclusions = set(config['exclusions'])
    # NOTE(review): config entries such as "RO:0001025 water body" or
    # "l~gaseous" are OAK query syntax, not plain labels, so a
    # label-equality test can never match them -- confirm whether
    # predicate-based exclusion is actually required here.

    # NOTE(review): descendants() with no predicate list may traverse
    # all relationship types; the Makefile query used .desc//p=i
    # (is_a only) -- confirm intended predicates.
    descendants = ontology.descendants(initial_term)

    with open(config['output'], 'w') as output_file:
        for term in descendants:
            # Look the label up once and reuse it for both the filter
            # and the output line (previously two adapter calls per term).
            label = ontology.label(term)
            if label not in exclusions:
                output_file.write(f"{term}: {label}\n")


@click.command()
@click.argument('config_file')
def cli(config_file):
    """
    CLI tool to process an ontology based on the provided YAML configuration file.
    """
    config = load_config(config_file)
    process_ontology(config)


if __name__ == "__main__":
    cli()
+downloads/biosample.xsd: + $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co" + +# find code for converting to table in other repos +# or convert to duckdb +downloads/ncbi-biosample-attributes.xml: + $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml" + +downloads/ncbi-biosample-packages.xml: + $(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml" + +local/ncbi-biosample-packages.csv: downloads/ncbi-biosample-packages.xml + $(RUN) ncbi-packages-csv-report \ + --xml-file $< \ + --output-file $@ + + +# see also https://www.npmjs.com/package/mongodb-schema/v/12.2.0?activeTab=versions + +#local/mongodb-paths-10pct.txt: # 450000 -> ~ 4 minutes # 4.5 M -> heavy load, never finishes. Use streaming approach? +# $(RUN) list-mongodb-paths \ +# --db-name ncbi_metadata \ +# --collection samples \ +# --sample-size 4500000 > $@ + +#local/ncbi_biosamples_inferred_schema.json: # ~ 2 minutes for 410,000 (1%) # ~ 1 hour for 13 million ~ 30% +# $(RUN) python external_metadata_awareness/infer_schema_with_batching.py \ +# --host localhost \ +# --port 27017 \ +# --database ncbi_metadata \ +# --collection samples \ +# --total-samples 13000000 \ +# --batch-size 50000 \ +# --output $@ + +.PHONY: load-biosamples-into-mongo + +local/biosample-count-xml.txt: local/biosample_set.xml + date && grep -c "" $< > $@ && date + +# see also https://gitlab.com/wurssb/insdc_metadata +load-biosamples-into-mongo: local/biosample_set.xml + $(RUN) xml-to-mongo \ + --file-path $< \ + --node-type BioSample \ + --id-field id \ + --db-name biosamples_dev \ + --collection-name biosamples_dev \ + --max-elements 100000 \ + --anticipated-last-id 100000 + +local/biosample-count-mongodb.txt: + date && mongosh --eval 'db.getSiblingDB("ncbi_metadata").samples.countDocuments()' > $@ && date # 1 minute + +local/ncbi-biosamples-packages-counts.tsv: sql/packages-counts.sql + $(RUN) sql-to-tsv \ + 
--sql-file $< \ + --output-file $@ + +ncbi-biosamples-duckdb-overview: + $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ + --connection-string "mongodb://localhost:27017/" \ + --db-name ncbi_metadata \ + --collection-name samples \ + --limit 41000000 \ + --batch-size 100000 \ + --duckdb-file local/ncbi_biosamples.duckdb \ + --table-name overview # no path # 40462422 biosamples in ~ 50 minutes + +# add counts from duckdb; need to compile duckdb or download binary + +ncbi-biosamples-duckdb-attributes: + $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ + --connection-string "mongodb://localhost:27017/" \ + --db-name ncbi_metadata \ + --collection-name samples \ + --limit 41000000 \ + --batch-size 100000 \ + --duckdb-file local/ncbi_biosamples.duckdb \ + --table-name attributes \ + --path BioSample.Attributes.Attribute + +ncbi-biosamples-duckdb-links: + $(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \ + --connection-string "mongodb://localhost:27017/" \ + --db-name ncbi_metadata \ + --collection-name samples \ + --limit 41000000 \ + --batch-size 100000 \ + --duckdb-file local/ncbi_biosamples.duckdb \ + --table-name links \ + --path BioSample.Links.Link + + ## @click.option('--path', default="BioSample.Links.Link", required=True, + ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") + ## @click.option('--path', default="BioSample.Ids.Id", required=True, + ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") + ## @click.option('--path', default="BioSample.Description.Organism", required=True, + ## help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').") + +NCBI_BIOSAMPLES_DUCKDB_PATH = local/ncbi_biosamples.duckdb + +local/ncbi-mims-soil-biosamples-env_local_scale.csv: + echo ".mode csv\nSELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_local_scale' AND 
package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@ + +local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv: local/ncbi-mims-soil-biosamples-env_local_scale.csv + $(RUN) normalize-envo-data \ + --count-col-name sample_count \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ \ + --val-col-name content + +local/ncbi-mims-soil-biosamples-env_local_scale-failures.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv + $(RUN) find-envo-present-no-curie-extracted \ + --input-file $< \ + --output-file $@ + +local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv local/envo-info.csv + $(RUN) merge-in-reference-data \ + --keep-file $(word 1,$^) \ + --keep-key normalized_curie \ + --reference-file $(word 2,$^) \ + --reference-key normalized_curie \ + --reference-addition normalized_label \ + --addition-rename real_label \ + --merged-file $@ + +local/ncbi-mims-soil-biosamples-env_local_scale-annotated.tsv: local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv + date ; $(RUN) runoak \ + --input sqlite:obo:envo annotate \ + --matches-whole-text \ + --output-type tsv \ + --output $@ \ + --text-file $< \ + --match-column normalized_label ; date diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py new file mode 100644 index 0000000..e56f572 --- /dev/null +++ b/tests/test_env_local_scale_generator.py @@ -0,0 +1,81 @@ +import pytest +import yaml +from click.testing import CliRunner +from external_metadata_awareness.env_local_scale_extraction import cli + + +@pytest.fixture +def sample_config(tmp_path): + """ + :param tmp_path: + :return: + """ + + # Create a sample config.yaml file for testing + config_data = { + "input": "sqlite:obo:envo", + "output": "local/environmental-materials-relationships.txt", + "entity": "material entity", + "exclusions": [ 
+ "biome", + "environmental material", + "chemical entity" + ] + } + config_file = tmp_path / "config.yaml" + with open(config_file, 'w') as file: + yaml.dump(config_data, file) + return str(config_file) + + +def test_generate_command(sample_config): + """ + Test the generate_oak_command function. + :param sample_config: + :return: + + """ + runner = CliRunner() + result = runner.invoke(cli, [sample_config]) + + expected_command = ( + "$(RUN) runoak --input sqlite:obo:envo info [ .desc//p=i 'material entity' ]" + " .not .desc//p=i 'biome'" + " .not .desc//p=i 'environmental material'" + " .not .desc//p=i 'chemical entity'" + " > local/environmental-materials-relationships.txt" + ) + + assert result.exit_code == 0 + assert expected_command in result.output + + +def test_missing_config(): + """ + Test the CLI tool when the config file is missing. + :return: + + """ + runner = CliRunner() + result = runner.invoke(cli, ["nonexistent.yaml"]) + + assert result.exit_code != 0 + assert "No such file or directory" in result.output + + +def test_invalid_config(tmp_path): + """ + Test the CLI tool when the config file is invalid. 
+ :param tmp_path: + :return: + + """ + invalid_config_file = tmp_path / "invalid_config.yaml" + with open(invalid_config_file, 'w') as file: + file.write("Invalid YAML content") + + runner = CliRunner() + result = runner.invoke(cli, [str(invalid_config_file)]) + + assert result.exit_code != 0 + assert "could not find expected" in result.output # Checking for a YAML syntax error message From 0618aa72f28f42c2a03872ec55e6f14eff6ffcfa Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Wed, 28 Aug 2024 18:51:38 -0700 Subject: [PATCH 02/17] add test stubs --- .../env_local_scale_extraction_config.yaml | 0 env_triad.Makefile | 2 +- .../env_local_scale_extraction.py | 4 +- tests/test_env_local_scale_generator.py | 84 +++++++------------ 4 files changed, 36 insertions(+), 54 deletions(-) rename external_metadata_awareness/env_local_scale_config.yaml => config/env_local_scale_extraction_config.yaml (100%) diff --git a/external_metadata_awareness/env_local_scale_config.yaml b/config/env_local_scale_extraction_config.yaml similarity index 100% rename from external_metadata_awareness/env_local_scale_config.yaml rename to config/env_local_scale_extraction_config.yaml diff --git a/env_triad.Makefile b/env_triad.Makefile index 87a4430..128bb55 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -117,7 +117,7 @@ local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-pr generate-env-local-scale-candidates: # Ensure the poetry environment is activated and run the script with the specified config - poetry run python external_metadata_awareness/env_local_scale_extraction.py config.yaml + poetry run python external_metadata_awareness/env_local_scale_extraction.py --config-file config/env_local_scale_extraction_config.yaml local/env-local-scale-candidates-relationships.tsv: diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index 2f6fbdf..f203733 100755 --- 
a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -17,9 +17,11 @@ def process_ontology(config): initial_term = config['entity'] exclusions = config['exclusions'] + print(initial_term) # Get all descendants of the initial term descendants = ontology.descendants(initial_term) + # Filter out the excluded terms filtered_descendants = [ term for term in descendants @@ -33,7 +35,7 @@ def process_ontology(config): @click.command() -@click.argument('config_file') +@click.option('--config-file', required=True, help='Path to the YAML configuration file.') def cli(config_file): """ CLI tool to process an ontology based on the provided YAML configuration file. diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index e56f572..3ffbd23 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -1,20 +1,15 @@ import pytest +import os import yaml from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli +from external_metadata_awareness.env_local_scale_extraction import cli, load_config, process_ontology @pytest.fixture -def sample_config(tmp_path): - """ - :param tmp_path: - :return: - """ - - # Create a sample config.yaml file for testing +def config_file(tmp_path): config_data = { "input": "sqlite:obo:envo", - "output": "local/environmental-materials-relationships.txt", + "output": str(tmp_path / "output.txt"), "entity": "material entity", "exclusions": [ "biome", @@ -25,57 +20,42 @@ def sample_config(tmp_path): config_file = tmp_path / "config.yaml" with open(config_file, 'w') as file: yaml.dump(config_data, file) - return str(config_file) + return config_file -def test_generate_command(sample_config): - """ - Test the generate_oak_command function. 
- :param sample_config: - :return: +def test_load_config(config_file): + config = load_config(config_file) + assert config['input'] == "sqlite:obo:envo" + assert config['output'].endswith("output.txt") + assert config['entity'] == "material entity" + assert "biome" in config['exclusions'] - """ - runner = CliRunner() - result = runner.invoke(cli, [sample_config]) - expected_command = ( - "$(RUN) runoak --input sqlite:obo:envo info [ .desc//p=i 'material entity' ]" - " .not .desc//p=i 'biome'" - " .not .desc//p=i 'environmental material'" - " .not .desc//p=i 'chemical entity'" - " > local/environmental-materials-relationships.txt" - ) +def test_process_ontology(config_file): + config = load_config(config_file) + process_ontology(config) - assert result.exit_code == 0 - assert expected_command in result.output + # Check if the output file is created and not empty + assert os.path.exists(config['output']) + with open(config['output'], 'r') as file: + content = file.read() + assert len(content) > 0, "Output file is empty, expected some data." -def test_missing_config(): - """ - Test the CLI tool when the config file is missing. - :return: - - """ +def test_cli_runs_successfully(config_file): runner = CliRunner() - result = runner.invoke(cli, ["nonexistent.yaml"]) - - assert result.exit_code != 0 - assert "No such file or directory" in result.output - - -def test_invalid_config(tmp_path): - """ - Test the CLI tool when the config file is invalid. 
- :param tmp_path: - :return: + result = runner.invoke(cli, ['--config-file', str(config_file)]) + assert result.exit_code == 0 + assert os.path.exists(load_config(config_file)['output']) - """ - invalid_config_file = tmp_path / "invalid_config.yaml" - with open(invalid_config_file, 'w') as file: - file.write("Invalid YAML content") - runner = CliRunner() - result = runner.invoke(cli, [str(invalid_config_file)]) +def test_no_exclusions(config_file): + config = load_config(config_file) + config['exclusions'] = [] + process_ontology(config) - assert result.exit_code != 0 - assert "could not find expected" in result.output # Checking for a YAML syntax error message + # Check if the output file is created and has content + assert os.path.exists(config['output']) + with open(config['output'], 'r') as file: + content = file.read() + assert len(content) > 0, "Output file is empty, expected some data even without exclusions." From c09c78c118a4c7e417ab28b9e1b23a48b2ac77f8 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 12:00:18 -0700 Subject: [PATCH 03/17] get tests to pass --- ...=> env-local-scale-extraction-config.yaml} | 3 +- config/oaklib-setup-config.yaml | 3 + .../env_local_scale_extraction.py | 38 ++++----- tests/test_env_local_scale_generator.py | 77 +++++++++++-------- 4 files changed, 69 insertions(+), 52 deletions(-) rename config/{env_local_scale_extraction_config.yaml => env-local-scale-extraction-config.yaml} (97%) create mode 100644 config/oaklib-setup-config.yaml diff --git a/config/env_local_scale_extraction_config.yaml b/config/env-local-scale-extraction-config.yaml similarity index 97% rename from config/env_local_scale_extraction_config.yaml rename to config/env-local-scale-extraction-config.yaml index f5a1093..b676c33 100755 --- a/config/env_local_scale_extraction_config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -1,5 +1,4 @@ -# config.yaml -input: "sqlite:obo:envo" +# env-local-scale-extraction-config.yaml 
output: "local/environmental-materials-relationships.txt" entity: "material entity" exclusions: diff --git a/config/oaklib-setup-config.yaml b/config/oaklib-setup-config.yaml new file mode 100644 index 0000000..7b45869 --- /dev/null +++ b/config/oaklib-setup-config.yaml @@ -0,0 +1,3 @@ +ontology_resources: + envo: + selector: sqlite:obo:envo \ No newline at end of file diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index f203733..924f4eb 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -3,45 +3,47 @@ from oaklib import get_adapter -def load_config(config_file): - with open(config_file, 'r') as file: - config = yaml.safe_load(file) - return config +def load_configs(oak_config_file, extraction_config_file): + with open(oak_config_file, 'r') as file: + oak_config = yaml.safe_load(file) + with open(extraction_config_file, 'r') as file: + extraction_config = yaml.safe_load(file) + return oak_config, extraction_config -def process_ontology(config): +def process_ontology(oak_config_file, extraction_config): # Load the ontology using the get_adapter function - ontology = get_adapter(config['input']) + oak_adapter = get_adapter(oak_config_file) # Get the entity and exclusions from the config - initial_term = config['entity'] - exclusions = config['exclusions'] + initial_term_label = extraction_config['entity'] + initial_term_curie = oak_adapter.curies_by_label(label=initial_term_label) + exclusion_labels = extraction_config['exclusions'] - print(initial_term) # Get all descendants of the initial term - descendants = ontology.descendants(initial_term) - + descendants = oak_adapter.descendants(initial_term_curie) # Filter out the excluded terms filtered_descendants = [ term for term in descendants - if not any(ontology.label(term) == exclusion for exclusion in exclusions) + if not 
any(oak_adapter.label(term) == exclusion for exclusion in exclusion_labels) ] # Write the results to the output file - with open(config['output'], 'w') as output_file: + with open(extraction_config['output'], 'w') as output_file: for term in filtered_descendants: - output_file.write(f"{term}: {ontology.label(term)}\n") + output_file.write(f"{term}: {oak_adapter.label(term)}\n") @click.command() -@click.option('--config-file', required=True, help='Path to the YAML configuration file.') -def cli(config_file): +@click.option('--extraction-config-file', required=True, help='Path to the extraction YAML configuration file.') +@click.option('--oak-config-file', required=True, help='Path to the extraction YAML configuration file.') +def cli(extraction_config_file, oak_config_file): """ CLI tool to process an ontology based on the provided YAML configuration file. """ - config = load_config(config_file) - process_ontology(config) + _, extraction_config = load_configs(oak_config_file, extraction_config_file) + process_ontology(oak_config_file, extraction_config) if __name__ == "__main__": diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 3ffbd23..d489156 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -1,61 +1,74 @@ import pytest -import os import yaml from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli, load_config, process_ontology +from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology @pytest.fixture -def config_file(tmp_path): +def oak_config_file(tmp_path): + config_data = { + "ontology_resources": { + "envo": { + "selector": "sqlite:obo:envo" + } + } + } + config_file = tmp_path / "oak_config.yaml" + with open(config_file, 'w') as file: + yaml.dump(config_data, file) + return config_file + + +@pytest.fixture +def extraction_config_file(tmp_path): config_data = { 
- "input": "sqlite:obo:envo", - "output": str(tmp_path / "output.txt"), "entity": "material entity", "exclusions": [ "biome", "environmental material", "chemical entity" - ] + ], + "output": str(tmp_path / "output.txt") } - config_file = tmp_path / "config.yaml" + config_file = tmp_path / "extraction_config.yaml" with open(config_file, 'w') as file: yaml.dump(config_data, file) return config_file -def test_load_config(config_file): - config = load_config(config_file) - assert config['input'] == "sqlite:obo:envo" - assert config['output'].endswith("output.txt") - assert config['entity'] == "material entity" - assert "biome" in config['exclusions'] +def test_load_configs(oak_config_file, extraction_config_file): + oak_config, extraction_config = load_configs(oak_config_file, extraction_config_file) + assert "ontology_resources" in oak_config + assert "envo" in oak_config["ontology_resources"] + assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" + assert extraction_config["entity"] == "material entity" + assert extraction_config["output"].endswith("output.txt") + +def test_process_ontology(oak_config_file, extraction_config_file): + _, extraction_config = load_configs(oak_config_file, extraction_config_file) -def test_process_ontology(config_file): - config = load_config(config_file) - process_ontology(config) + # Replace with a real test ontology and expected behavior if possible. + process_ontology(oak_config_file, extraction_config) - # Check if the output file is created and not empty - assert os.path.exists(config['output']) - with open(config['output'], 'r') as file: + # Check if the output file is created and has content + assert extraction_config["output"] + with open(extraction_config["output"], 'r') as file: content = file.read() + print(content) assert len(content) > 0, "Output file is empty, expected some data." 
-def test_cli_runs_successfully(config_file): +def test_cli_runs_successfully(oak_config_file, extraction_config_file): runner = CliRunner() - result = runner.invoke(cli, ['--config-file', str(config_file)]) + result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file', + str(oak_config_file)]) assert result.exit_code == 0 - assert os.path.exists(load_config(config_file)['output']) + assert "material entity" in result.output or "ENVO:00000447" in result.output - -def test_no_exclusions(config_file): - config = load_config(config_file) - config['exclusions'] = [] - process_ontology(config) - - # Check if the output file is created and has content - assert os.path.exists(config['output']) - with open(config['output'], 'r') as file: + # Verify the output file exists and contains the expected results + output_file = extraction_config_file.parent / "output.txt" + assert output_file.exists() + with open(output_file, 'r') as file: content = file.read() - assert len(content) > 0, "Output file is empty, expected some data even without exclusions." + assert len(content) > 0, "Output file is empty, expected some data." 
\ No newline at end of file From c86dbcc59dad596ff2497f8ae519aa8bd8ba8f52 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 12:05:35 -0700 Subject: [PATCH 04/17] fixing makefile target --- config/{oaklib-setup-config.yaml => oak-config.yaml} | 0 env_triad.Makefile | 4 +++- tests/test_env_local_scale_generator.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) rename config/{oaklib-setup-config.yaml => oak-config.yaml} (100%) diff --git a/config/oaklib-setup-config.yaml b/config/oak-config.yaml similarity index 100% rename from config/oaklib-setup-config.yaml rename to config/oak-config.yaml diff --git a/env_triad.Makefile b/env_triad.Makefile index 128bb55..09e7305 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -117,7 +117,9 @@ local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-pr generate-env-local-scale-candidates: # Ensure the poetry environment is activated and run the script with the specified config - poetry run python external_metadata_awareness/env_local_scale_extraction.py --config-file config/env_local_scale_extraction_config.yaml + $(RUN) python external_metadata_awareness/env_local_scale_extraction.py \ + --oak-config-file config/oak-config.yaml \ + --extraction-config-file config/env-local-scale-extraction-config.yaml local/env-local-scale-candidates-relationships.tsv: diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index d489156..1271179 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -64,11 +64,11 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file', str(oak_config_file)]) assert result.exit_code == 0 - assert "material entity" in result.output or "ENVO:00000447" in result.output + assert "material entity" in result.output or "ENVO:00000447" in 
result.output # Verify the output file exists and contains the expected results output_file = extraction_config_file.parent / "output.txt" assert output_file.exists() with open(output_file, 'r') as file: content = file.read() - assert len(content) > 0, "Output file is empty, expected some data." \ No newline at end of file + assert len(content) > 0, "Output file is empty, expected some data." From a8d5e18f90d41c70bdb723dafb1a9de9b9d4c01d Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 16:37:22 -0700 Subject: [PATCH 05/17] working but no descendents --- config/env-local-scale-extraction-config.yaml | 16 +++++++++------- .../env_local_scale_extraction.py | 13 ++++++++++++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index b676c33..e796e20 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -1,7 +1,13 @@ # env-local-scale-extraction-config.yaml output: "local/environmental-materials-relationships.txt" entity: "material entity" -exclusions: +text_exclusions: + - "l~gaseous" + - "l~marine" + - "l~undersea" + - "l~saline" + - "l~brackish" +term_exclusions: - "biome" - "environmental material" - "chemical entity" @@ -22,6 +28,7 @@ exclusions: - "collection of organisms" - "environmental system" - "ecozone" + - "material isosurface" - "environmental zone" - "water current" - "mass of environmental material" @@ -46,11 +53,6 @@ exclusions: - "protected area" - "channel of a watercourse" - "cryospheric layer" - - "l~gaseous" - - "l~marine" - "material isosurface" - - "l~undersea" - "NCBITaxon:1" - - "l~saline" - - "l~brackish" - - "aeroform" \ No newline at end of file + - "aeroform" diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index 924f4eb..e5bd2c7 100755 --- 
a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -1,6 +1,7 @@ import yaml import click from oaklib import get_adapter +from oaklib.query import onto_query def load_configs(oak_config_file, extraction_config_file): @@ -18,7 +19,17 @@ def process_ontology(oak_config_file, extraction_config): # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] initial_term_curie = oak_adapter.curies_by_label(label=initial_term_label) - exclusion_labels = extraction_config['exclusions'] + exclusion_labels = extraction_config['term_exclusions'] + exclusion_curies = [] + + for exclusion_label in exclusion_labels: + exclusion_curie = oak_adapter.curies_by_label(label=exclusion_label) + if exclusion_curie: + exclusion_curies.append(exclusion_curie) + + results = onto_query(oak_adapter, initial_term_curie, exclusion_curies) + + excluded_text_matches = extraction_config['text_exclusions'] # Get all descendants of the initial term descendants = oak_adapter.descendants(initial_term_curie) From 8ed8440e589f1080365f1cf82aca918bfdb187d9 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 16:54:24 -0700 Subject: [PATCH 06/17] rewrite with onto_query --- config/env-local-scale-extraction-config.yaml | 10 +- .../env_local_scale_extraction.py | 116 +++++++++++++++--- tests/test_env_local_scale_generator.py | 27 +++- 3 files changed, 123 insertions(+), 30 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index e796e20..987142d 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -2,11 +2,11 @@ output: "local/environmental-materials-relationships.txt" entity: "material entity" text_exclusions: - - "l~gaseous" - - "l~marine" - - "l~undersea" - - "l~saline" - - "l~brackish" + - "gaseous" + - "marine" + - "undersea" + - "saline" + - 
"brackish" term_exclusions: - "biome" - "environmental material" diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index e5bd2c7..ebc6334 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -1,7 +1,7 @@ import yaml import click from oaklib import get_adapter -from oaklib.query import onto_query +from oaklib.query import onto_query, FunctionQuery, FunctionEnum, SimpleQueryTerm def load_configs(oak_config_file, extraction_config_file): @@ -12,38 +12,114 @@ def load_configs(oak_config_file, extraction_config_file): return oak_config, extraction_config +def create_exclusion_query(term_labels, adapter): + """ + Creates a combined FunctionQuery to exclude specific terms and their descendants. + + :param term_labels: List of term labels to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified terms and their descendants. + """ + exclusion_queries = [] + + for label in term_labels: + # Find the CURIE for the label + term_curies = onto_query(SimpleQueryTerm(term=label), adapter) + if term_curies: + term_curie = term_curies[0] # Assuming one CURIE per label + # Create a descendant exclusion query for the term + exclusion_query = FunctionQuery( + function=FunctionEnum.DESCENDANT, + argument=term_curie, + description=f"Descendants of {label}" + ) + exclusion_queries.append(exclusion_query) + + # Combine all exclusion queries into one using the OR (|) operator + if exclusion_queries: + combined_exclusion_query = exclusion_queries[0] + for query in exclusion_queries[1:]: + combined_exclusion_query = combined_exclusion_query | query + return combined_exclusion_query + else: + return None + + +def create_text_exclusion_query(text_exclusions, adapter): + """ + Creates a combined FunctionQuery to exclude specific terms based on text matching. 
+ + :param text_exclusions: List of text patterns to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified text matches. + """ + text_exclusion_queries = [] + + for text in text_exclusions: + exclusion_query = SimpleQueryTerm(term=text) + text_exclusion_queries.append(exclusion_query) + + # Combine all exclusion queries into one using the OR (|) operator + if text_exclusion_queries: + combined_text_exclusion_query = text_exclusion_queries[0] + for query in text_exclusion_queries[1:]: + combined_text_exclusion_query = combined_text_exclusion_query | query + return combined_text_exclusion_query + else: + return None + + def process_ontology(oak_config_file, extraction_config): # Load the ontology using the get_adapter function oak_adapter = get_adapter(oak_config_file) # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] - initial_term_curie = oak_adapter.curies_by_label(label=initial_term_label) - exclusion_labels = extraction_config['term_exclusions'] - exclusion_curies = [] + initial_term_curies = onto_query(SimpleQueryTerm(term=initial_term_label), oak_adapter) + + if not initial_term_curies: + raise ValueError(f"Entity '{initial_term_label}' not found in the ontology.") + + initial_term_curie = initial_term_curies[0] + print("initial_term_curie", initial_term_curie) + + # Create exclusion queries from terms + term_exclusion_query = create_exclusion_query(extraction_config.get('term_exclusions', []), oak_adapter) - for exclusion_label in exclusion_labels: - exclusion_curie = oak_adapter.curies_by_label(label=exclusion_label) - if exclusion_curie: - exclusion_curies.append(exclusion_curie) + # Create exclusion queries from text patterns + text_exclusion_query = create_text_exclusion_query(extraction_config.get('text_exclusions', []), oak_adapter) - results = onto_query(oak_adapter, initial_term_curie, exclusion_curies) + # Combine term and text exclusion queries + 
combined_exclusion_query = None + if term_exclusion_query and text_exclusion_query: + combined_exclusion_query = term_exclusion_query | text_exclusion_query + elif term_exclusion_query: + combined_exclusion_query = term_exclusion_query + elif text_exclusion_query: + combined_exclusion_query = text_exclusion_query - excluded_text_matches = extraction_config['text_exclusions'] + # Main query for descendants of the specified entity + material_entity_query = FunctionQuery( + function=FunctionEnum.DESCENDANT, + argument=initial_term_curie, # Assuming one CURIE for the entity + description=f"Descendants of {initial_term_label}" + ) - # Get all descendants of the initial term - descendants = oak_adapter.descendants(initial_term_curie) + # Combine the main query with the exclusion query + if combined_exclusion_query: + final_query = material_entity_query - combined_exclusion_query + else: + final_query = material_entity_query - # Filter out the excluded terms - filtered_descendants = [ - term for term in descendants - if not any(oak_adapter.label(term) == exclusion for exclusion in exclusion_labels) - ] + # Execute the final query + result = onto_query(final_query, oak_adapter) # Write the results to the output file with open(extraction_config['output'], 'w') as output_file: - for term in filtered_descendants: - output_file.write(f"{term}: {oak_adapter.label(term)}\n") + for curie in result: + label = oak_adapter.label(curie) + output_file.write(f"{curie}: {label}\n") + print(curie, label) @click.command() @@ -58,4 +134,4 @@ def cli(extraction_config_file, oak_config_file): if __name__ == "__main__": - cli() + cli() \ No newline at end of file diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 1271179..ced2226 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -1,5 +1,6 @@ import pytest import yaml +import os from click.testing import CliRunner from 
external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology @@ -23,11 +24,15 @@ def oak_config_file(tmp_path): def extraction_config_file(tmp_path): config_data = { "entity": "material entity", - "exclusions": [ + "term_exclusions": [ "biome", "environmental material", "chemical entity" ], + "text_exclusions": [ + "brackish", + "marine" + ], "output": str(tmp_path / "output.txt") } config_file = tmp_path / "extraction_config.yaml" @@ -42,22 +47,30 @@ def test_load_configs(oak_config_file, extraction_config_file): assert "envo" in oak_config["ontology_resources"] assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" assert extraction_config["entity"] == "material entity" + assert "term_exclusions" in extraction_config + assert "text_exclusions" in extraction_config assert extraction_config["output"].endswith("output.txt") def test_process_ontology(oak_config_file, extraction_config_file): _, extraction_config = load_configs(oak_config_file, extraction_config_file) - # Replace with a real test ontology and expected behavior if possible. + # Run the ontology processing process_ontology(oak_config_file, extraction_config) # Check if the output file is created and has content - assert extraction_config["output"] - with open(extraction_config["output"], 'r') as file: + output_file_path = extraction_config["output"] + assert os.path.exists(output_file_path), "Output file was not created" + + with open(output_file_path, 'r') as file: content = file.read() - print(content) assert len(content) > 0, "Output file is empty, expected some data." 
+ # You could also add assertions based on expected content + # For example, checking that excluded terms are not in the output + assert "biome" not in content + assert "brackish" not in content + def test_cli_runs_successfully(oak_config_file, extraction_config_file): runner = CliRunner() @@ -72,3 +85,7 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): with open(output_file, 'r') as file: content = file.read() assert len(content) > 0, "Output file is empty, expected some data." + + # Add additional assertions to check that the CLI correctly excluded terms + assert "biome" not in content + assert "brackish" not in content \ No newline at end of file From 9681ea1fc0529079e00b89226e9e27fce2de7314 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 18:24:06 -0700 Subject: [PATCH 07/17] use onto_query to get the terms --- .../env_local_scale_extraction.py | 111 ++++++------------ tests/test_env_local_scale_generator.py | 35 +++++- 2 files changed, 67 insertions(+), 79 deletions(-) diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index ebc6334..b096f87 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -1,3 +1,5 @@ +from typing import List + import yaml import click from oaklib import get_adapter @@ -12,7 +14,7 @@ def load_configs(oak_config_file, extraction_config_file): return oak_config, extraction_config -def create_exclusion_query(term_labels, adapter): +def create_exclusion_list(term_labels, adapter) -> List[str]: """ Creates a combined FunctionQuery to exclude specific terms and their descendants. @@ -20,29 +22,16 @@ def create_exclusion_query(term_labels, adapter): :param adapter: The ontology adapter. :return: Combined FunctionQuery to exclude all specified terms and their descendants. 
""" - exclusion_queries = [] - + all_ids_to_exclude = [] for label in term_labels: # Find the CURIE for the label term_curies = onto_query(SimpleQueryTerm(term=label), adapter) if term_curies: term_curie = term_curies[0] # Assuming one CURIE per label # Create a descendant exclusion query for the term - exclusion_query = FunctionQuery( - function=FunctionEnum.DESCENDANT, - argument=term_curie, - description=f"Descendants of {label}" - ) - exclusion_queries.append(exclusion_query) - - # Combine all exclusion queries into one using the OR (|) operator - if exclusion_queries: - combined_exclusion_query = exclusion_queries[0] - for query in exclusion_queries[1:]: - combined_exclusion_query = combined_exclusion_query | query - return combined_exclusion_query - else: - return None + list_to_exclude = onto_query([".desc//p=i", term_curie], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) def create_text_exclusion_query(text_exclusions, adapter): @@ -53,20 +42,25 @@ def create_text_exclusion_query(text_exclusions, adapter): :param adapter: The ontology adapter. :return: Combined FunctionQuery to exclude all specified text matches. 
""" - text_exclusion_queries = [] + + all_ids_to_exclude = [] for text in text_exclusions: - exclusion_query = SimpleQueryTerm(term=text) - text_exclusion_queries.append(exclusion_query) + # Find the CURIE for the label + list_to_exclude = onto_query(["l~"+text], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) - # Combine all exclusion queries into one using the OR (|) operator - if text_exclusion_queries: - combined_text_exclusion_query = text_exclusion_queries[0] - for query in text_exclusion_queries[1:]: - combined_text_exclusion_query = combined_text_exclusion_query | query - return combined_text_exclusion_query - else: - return None + +def exclude_terms(full_list, exclusion_list): + """ + Returns a list of items from the full list with the items in the exclusion list removed. + + :param full_list: List of items to be filtered. + :param exclusion_list: List of items to exclude from the full list. + :return: A list with items from exclusion_list removed. 
+ """ + return [item for item in full_list if item not in exclusion_list] def process_ontology(oak_config_file, extraction_config): @@ -75,51 +69,20 @@ def process_ontology(oak_config_file, extraction_config): # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] - initial_term_curies = onto_query(SimpleQueryTerm(term=initial_term_label), oak_adapter) - - if not initial_term_curies: - raise ValueError(f"Entity '{initial_term_label}' not found in the ontology.") - - initial_term_curie = initial_term_curies[0] - print("initial_term_curie", initial_term_curie) - - # Create exclusion queries from terms - term_exclusion_query = create_exclusion_query(extraction_config.get('term_exclusions', []), oak_adapter) - - # Create exclusion queries from text patterns - text_exclusion_query = create_text_exclusion_query(extraction_config.get('text_exclusions', []), oak_adapter) - - # Combine term and text exclusion queries - combined_exclusion_query = None - if term_exclusion_query and text_exclusion_query: - combined_exclusion_query = term_exclusion_query | text_exclusion_query - elif term_exclusion_query: - combined_exclusion_query = term_exclusion_query - elif text_exclusion_query: - combined_exclusion_query = text_exclusion_query - - # Main query for descendants of the specified entity - material_entity_query = FunctionQuery( - function=FunctionEnum.DESCENDANT, - argument=initial_term_curie, # Assuming one CURIE for the entity - description=f"Descendants of {initial_term_label}" - ) - - # Combine the main query with the exclusion query - if combined_exclusion_query: - final_query = material_entity_query - combined_exclusion_query - else: - final_query = material_entity_query - - # Execute the final query - result = onto_query(final_query, oak_adapter) - - # Write the results to the output file - with open(extraction_config['output'], 'w') as output_file: - for curie in result: - label = oak_adapter.label(curie) - 
output_file.write(f"{curie}: {label}\n") - print(curie, label) + initial_term_list = onto_query([".desc//p=i", initial_term_label], oak_adapter) + print("length of initial term list", len(initial_term_list)) + + exclusion_terms = extraction_config.get('term_exclusions', []) + exclusion_texts = extraction_config.get('text_exclusions', []) + + exclusion_terms_and_children = create_exclusion_list(exclusion_terms, oak_adapter) + exclusion_terms_from_text = create_text_exclusion_query(exclusion_texts, oak_adapter) + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + print("length of excluded terms", len(exclusion_terms_and_children)) + print("length of excluded terms from text", len(exclusion_terms_from_text)) + + remaining_items = exclude_terms(initial_term_list, exclusion_list) + print(len(remaining_items)) @click.command() diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index ced2226..f57235d 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -3,7 +3,8 @@ import os from click.testing import CliRunner from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology - +from oaklib.query import onto_query +from oaklib.selector import get_adapter @pytest.fixture def oak_config_file(tmp_path): @@ -25,9 +26,23 @@ def extraction_config_file(tmp_path): config_data = { "entity": "material entity", "term_exclusions": [ - "biome", - "environmental material", - "chemical entity" + "biome" + , "environmental material" + , "chemical entity" + , "organic material" + , "anatomical entity" + , "organism" + , "plant anatomical entity" + , "healthcare facility" + , "fluid layer" + , "interface layer" + , "manufactured product" + , "anatomical entity environment" + , "ecosystem" + , "area protected according to IUCN guidelines" + , "astronomical body" + , "astronomical object" + , "cloud" ], "text_exclusions": [ "brackish", @@ 
-64,6 +79,7 @@ def test_process_ontology(oak_config_file, extraction_config_file): with open(output_file_path, 'r') as file: content = file.read() + print(content) assert len(content) > 0, "Output file is empty, expected some data." # You could also add assertions based on expected content @@ -88,4 +104,13 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): # Add additional assertions to check that the CLI correctly excluded terms assert "biome" not in content - assert "brackish" not in content \ No newline at end of file + assert "brackish" not in content + + +def test_onto_query(): + adapter = get_adapter("sqlite:obo:envo") + # desc = onto_query([".desc//p=i", "material entity"], adapter) + # print(len(desc)) + + list_to_exclude = onto_query(["l~saline"], adapter, labels=True) + print(list_to_exclude) \ No newline at end of file From 2d6d2fbfa195099ed0783d505345134e9c551efa Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 19:43:37 -0700 Subject: [PATCH 08/17] add plain text to the config file --- config/env-local-scale-extraction-config.yaml | 78 +++++++++++-------- .../env_local_scale_extraction.py | 57 +++++++++++--- tests/test_env_local_scale_generator.py | 51 ++++++++++-- 3 files changed, 133 insertions(+), 53 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 987142d..43b60b0 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -7,52 +7,64 @@ text_exclusions: - "undersea" - "saline" - "brackish" -term_exclusions: - - "biome" - - "environmental material" - - "chemical entity" - - "organic material" +term_exlusions: + - "bridge" + - "road" + - "wildlife management area" +term_and_descendant_exclusions: + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "RO:0001025" + - "RO:0001025" + - "RO:0002473" + - "NCBITaxon:1" + 
- "administrative region" + - "aeroform" - "anatomical entity" - - "organism" - - "plant anatomical entity" - - "healthcare facility" - - "fluid layer" - - "interface layer" - - "manufactured product" - "anatomical entity environment" - - "ecosystem" - "area protected according to IUCN guidelines" - "astronomical body" - "astronomical object" + - "biome" + - "channel of a watercourse" + - "chemical entity" - "cloud" - "collection of organisms" - - "environmental system" + - "cryospheric layer" - "ecozone" - - "material isosurface" + - "ecosystem" + - "environmental material" + - "environmental monitoring area" + - "environmental system" - "environmental zone" - - "water current" + - "fluid layer" + - "healthcare facility" + - "ice" + - "interface layer" + - "manufactured product" + - "marine environmental zone" + - "marine littoral zone" - "mass of environmental material" - - "subatomic particle" + - "mass of liquid" + - "material isosurface" + - "material isosurface" + - "meteor" - "observing system" + - "organic material" + - "organism" - "particle" - "planetary structural layer" - "political entity" - - "meteor" + - "protected area" - "room" + - "saline water" + - "sea floor" + - "subatomic particle" - "transport feature" - - "mass of liquid" - - "RO:0001025 water body" - - "BFO:0000050 environmental monitoring area" - - "BFO:0000050 marine littoral zone" - - "BFO:0000050 marine environmental zone" - - "RO:0002473 sea floor" - - "BFO:0000050 saline water" - - "BFO:0000050 ice" - - "RO:0001025 water body" - - "administrative region" - - "protected area" - - "channel of a watercourse" - - "cryospheric layer" - - "material isosurface" - - "NCBITaxon:1" - - "aeroform" + - "water body" + - "water body" + - "water current" \ No newline at end of file diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index b096f87..7e6bb95 100755 --- 
a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -34,7 +34,7 @@ def create_exclusion_list(term_labels, adapter) -> List[str]: return list(set(all_ids_to_exclude)) -def create_text_exclusion_query(text_exclusions, adapter): +def create_text_exclusion_list(text_exclusions, adapter): """ Creates a combined FunctionQuery to exclude specific terms based on text matching. @@ -63,26 +63,59 @@ def exclude_terms(full_list, exclusion_list): return [item for item in full_list if item not in exclusion_list] -def process_ontology(oak_config_file, extraction_config): +def create_exclude_solo_terms(exlusion_terms: List[str], adapter) -> List[str]: + """ + Creates a list of CURIEs to exclude based on the provided list of terms. + + :param exlusion_terms: List of term labels to exclude. + :param envo: The ontology adapter. + + """ + + all_ids_to_exclude = [] + + for term_label in exlusion_terms: + # Find the CURIE for the label + list_to_exclude = onto_query([term_label], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) + pass + + +def extract_terms_to_file(oak_config_file, extraction_config): # Load the ontology using the get_adapter function - oak_adapter = get_adapter(oak_config_file) + envo = get_adapter(oak_config_file) # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] - initial_term_list = onto_query([".desc//p=i", initial_term_label], oak_adapter) + initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) print("length of initial term list", len(initial_term_list)) - exclusion_terms = extraction_config.get('term_exclusions', []) - exclusion_texts = extraction_config.get('text_exclusions', []) - exclusion_terms_and_children = create_exclusion_list(exclusion_terms, oak_adapter) - exclusion_terms_from_text = create_text_exclusion_query(exclusion_texts, oak_adapter) - exclusion_list = 
exclusion_terms_and_children + exclusion_terms_from_text + exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), + envo) + + exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), + envo) + exluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) + + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exluded_terms print("length of excluded terms", len(exclusion_terms_and_children)) print("length of excluded terms from text", len(exclusion_terms_from_text)) + print("length of excluded terms from solo terms", len(exluded_terms)) remaining_items = exclude_terms(initial_term_list, exclusion_list) - print(len(remaining_items)) + print("length of remaining items", len(remaining_items)) + + results = onto_query(remaining_items, envo, labels=True) + + # Write the results to the output file specified in the extraction config + output_file_path = extraction_config['output'] + with open(output_file_path, 'w') as output_file: + for curie, label in results: + output_file.write(f"{curie}: {label}\n") + + print(f"Results written to {output_file_path}") @click.command() @@ -93,8 +126,8 @@ def cli(extraction_config_file, oak_config_file): CLI tool to process an ontology based on the provided YAML configuration file. 
""" _, extraction_config = load_configs(oak_config_file, extraction_config_file) - process_ontology(oak_config_file, extraction_config) + extract_terms_to_file(oak_config_file, extraction_config) if __name__ == "__main__": - cli() \ No newline at end of file + cli() diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index f57235d..8eefecb 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -2,10 +2,11 @@ import yaml import os from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, process_ontology +from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, extract_terms_to_file from oaklib.query import onto_query from oaklib.selector import get_adapter + @pytest.fixture def oak_config_file(tmp_path): config_data = { @@ -43,12 +44,46 @@ def extraction_config_file(tmp_path): , "astronomical body" , "astronomical object" , "cloud" + , "collection of organisms" + , "environmental system" + , "ecozone" + , "material isosurface" + , "environmental zone" + , "water current" + , "mass of environmental material" + , "subatomic particle" + , "observing system" + , "particle" + , "planetary structural layer" + , "political entity" + , "meteor" + , "room" + , "transport feature" + , "mass of liquid" + , "RO:0001025 water body" + , "BFO:0000050 environmental monitoring area" + , "BFO:0000050 marine littoral zone" + , "BFO:0000050 marine environmental zone" + , "RO:0002473 sea floor" + , "BFO:0000050 saline water" + , "BFO:0000050 ice" + , "RO:0001025 water body" + , "administrative region" + , "protected area" + , "channel of a watercourse" + , "cryospheric layer" + , "material isosurface" + , "NCBITaxon:1" + , "aeroform" ], "text_exclusions": [ - "brackish", - "marine" + "gaseous" + , "marine" + , "undersea" + , "saline" + , "brackish" ], - "output": str(tmp_path / "output.txt") + "output": 
str(tmp_path / "environmental-materials-relationships.txt") } config_file = tmp_path / "extraction_config.yaml" with open(config_file, 'w') as file: @@ -71,7 +106,7 @@ def test_process_ontology(oak_config_file, extraction_config_file): _, extraction_config = load_configs(oak_config_file, extraction_config_file) # Run the ontology processing - process_ontology(oak_config_file, extraction_config) + extract_terms_to_file(oak_config_file, extraction_config) # Check if the output file is created and has content output_file_path = extraction_config["output"] @@ -79,13 +114,13 @@ def test_process_ontology(oak_config_file, extraction_config_file): with open(output_file_path, 'r') as file: content = file.read() - print(content) assert len(content) > 0, "Output file is empty, expected some data." # You could also add assertions based on expected content # For example, checking that excluded terms are not in the output assert "biome" not in content assert "brackish" not in content + assert "saline" not in content def test_cli_runs_successfully(oak_config_file, extraction_config_file): @@ -93,10 +128,9 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file', str(oak_config_file)]) assert result.exit_code == 0 - assert "material entity" in result.output or "ENVO:00000447" in result.output # Verify the output file exists and contains the expected results - output_file = extraction_config_file.parent / "output.txt" + output_file = extraction_config_file.parent / "environmental-materials-relationships.txt" assert output_file.exists() with open(output_file, 'r') as file: content = file.read() @@ -105,6 +139,7 @@ def test_cli_runs_successfully(oak_config_file, extraction_config_file): # Add additional assertions to check that the CLI correctly excluded terms assert "biome" not in content assert "brackish" not in content + assert "saline" not in content def 
test_onto_query(): From 7642befb2c1bf9265c508d5fd6c6d7182bda4c5b Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 19:58:30 -0700 Subject: [PATCH 09/17] passing tests --- config/env-local-scale-extraction-config.yaml | 6 ++++-- external_metadata_awareness/env_local_scale_extraction.py | 7 +++---- tests/test_env_local_scale_generator.py | 5 +++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 43b60b0..4c0aa1e 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -7,7 +7,8 @@ text_exclusions: - "undersea" - "saline" - "brackish" -term_exlusions: + - "undersea" +term_exclusions: - "bridge" - "road" - "wildlife management area" @@ -45,13 +46,14 @@ term_and_descendant_exclusions: - "healthcare facility" - "ice" - "interface layer" + - "island" + - "lake layer" - "manufactured product" - "marine environmental zone" - "marine littoral zone" - "mass of environmental material" - "mass of liquid" - "material isosurface" - - "material isosurface" - "meteor" - "observing system" - "organic material" diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/env_local_scale_extraction.py index 7e6bb95..93bef46 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/env_local_scale_extraction.py @@ -91,18 +91,17 @@ def extract_terms_to_file(oak_config_file, extraction_config): initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) print("length of initial term list", len(initial_term_list)) - exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), envo) exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), envo) - exluded_terms = 
create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) + excluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) - exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exluded_terms + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + excluded_terms print("length of excluded terms", len(exclusion_terms_and_children)) print("length of excluded terms from text", len(exclusion_terms_from_text)) - print("length of excluded terms from solo terms", len(exluded_terms)) + print("length of excluded terms from solo terms", len(excluded_terms)) remaining_items = exclude_terms(initial_term_list, exclusion_list) print("length of remaining items", len(remaining_items)) diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 8eefecb..3a1d482 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -27,6 +27,11 @@ def extraction_config_file(tmp_path): config_data = { "entity": "material entity", "term_exclusions": [ + "bridge", + "road", + "wildlife management area" + ], + "term_and_descendant_exclusions": [ "biome" , "environmental material" , "chemical entity" From ccb459d3a73b0fe2704792e2e70111a3d8127e69 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:14:15 -0700 Subject: [PATCH 10/17] print to logging --- config/env-local-scale-extraction-config.yaml | 2 +- env_triad.Makefile | 2 +- ...tion.py => envo_local_scale_extraction.py} | 28 +++++++++---------- tests/test_env_local_scale_generator.py | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) rename external_metadata_awareness/{env_local_scale_extraction.py => envo_local_scale_extraction.py} (84%) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 4c0aa1e..bb36c80 100755 --- a/config/env-local-scale-extraction-config.yaml +++ 
b/config/env-local-scale-extraction-config.yaml @@ -1,5 +1,5 @@ # env-local-scale-extraction-config.yaml -output: "local/environmental-materials-relationships.txt" +output: "local/env-local-scale-candidates.txt" entity: "material entity" text_exclusions: - "gaseous" diff --git a/env_triad.Makefile b/env_triad.Makefile index b1626be..585558f 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -177,7 +177,7 @@ local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/e generate-env-local-scale-candidates: # Ensure the poetry environment is activated and run the script with the specified config - $(RUN) python external_metadata_awareness/env_local_scale_extraction.py \ + $(RUN) python external_metadata_awareness/envo_local_scale_extraction.py \ --oak-config-file config/oak-config.yaml \ --extraction-config-file config/env-local-scale-extraction-config.yaml diff --git a/external_metadata_awareness/env_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py similarity index 84% rename from external_metadata_awareness/env_local_scale_extraction.py rename to external_metadata_awareness/envo_local_scale_extraction.py index 93bef46..5ce2f2d 100755 --- a/external_metadata_awareness/env_local_scale_extraction.py +++ b/external_metadata_awareness/envo_local_scale_extraction.py @@ -1,9 +1,12 @@ +import logging from typing import List - import yaml import click from oaklib import get_adapter -from oaklib.query import onto_query, FunctionQuery, FunctionEnum, SimpleQueryTerm +from oaklib.query import onto_query, SimpleQueryTerm + +# Configure logging +logging.basicConfig(level=logging.WARN, format='%(asctime)s - %(levelname)s - %(message)s') def load_configs(oak_config_file, extraction_config_file): @@ -63,23 +66,20 @@ def exclude_terms(full_list, exclusion_list): return [item for item in full_list if item not in exclusion_list] -def create_exclude_solo_terms(exlusion_terms: List[str], adapter) -> List[str]: +def 
create_exclude_solo_terms(exclusion_terms: List[str], adapter) -> List[str]: """ Creates a list of CURIEs to exclude based on the provided list of terms. - :param exlusion_terms: List of term labels to exclude. + :param exclusion_terms: List of term labels to exclude. :param envo: The ontology adapter. - """ - all_ids_to_exclude = [] - for term_label in exlusion_terms: + for term_label in exclusion_terms: # Find the CURIE for the label list_to_exclude = onto_query([term_label], adapter) all_ids_to_exclude.extend(list_to_exclude) return list(set(all_ids_to_exclude)) - pass def extract_terms_to_file(oak_config_file, extraction_config): @@ -89,7 +89,7 @@ def extract_terms_to_file(oak_config_file, extraction_config): # Get the entity and exclusions from the config initial_term_label = extraction_config['entity'] initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) - print("length of initial term list", len(initial_term_list)) + logging.info(f"Length of initial term list: {len(initial_term_list)}") exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), envo) @@ -99,12 +99,12 @@ def extract_terms_to_file(oak_config_file, extraction_config): excluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + excluded_terms - print("length of excluded terms", len(exclusion_terms_and_children)) - print("length of excluded terms from text", len(exclusion_terms_from_text)) - print("length of excluded terms from solo terms", len(excluded_terms)) + logging.info(f"Length of excluded terms and descendants: {len(exclusion_terms_and_children)}") + logging.info(f"Length of excluded terms from text: {len(exclusion_terms_from_text)}") + logging.info(f"Length of excluded terms from solo terms: {len(excluded_terms)}") remaining_items = exclude_terms(initial_term_list, exclusion_list) - print("length of 
remaining items", len(remaining_items)) + logging.info(f"Length of remaining items: {len(remaining_items)}") results = onto_query(remaining_items, envo, labels=True) @@ -114,7 +114,7 @@ def extract_terms_to_file(oak_config_file, extraction_config): for curie, label in results: output_file.write(f"{curie}: {label}\n") - print(f"Results written to {output_file_path}") + logging.info(f"Results written to {output_file_path}") @click.command() diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 3a1d482..9511feb 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -2,7 +2,7 @@ import yaml import os from click.testing import CliRunner -from external_metadata_awareness.env_local_scale_extraction import cli, load_configs, extract_terms_to_file +from external_metadata_awareness.envo_local_scale_extraction import cli, load_configs, extract_terms_to_file from oaklib.query import onto_query from oaklib.selector import get_adapter From d3c09d7cc4c5921d0ac731db17ae9066648c2476 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:30:11 -0700 Subject: [PATCH 11/17] add test target from makefile --- env_triad.Makefile | 2 ++ tests/test_env_local_scale_generator.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/env_triad.Makefile b/env_triad.Makefile index 585558f..2b6c04e 100644 --- a/env_triad.Makefile +++ b/env_triad.Makefile @@ -181,4 +181,6 @@ generate-env-local-scale-candidates: --oak-config-file config/oak-config.yaml \ --extraction-config-file config/env-local-scale-extraction-config.yaml +test: + $(RUN) pytest tests/* ###### END SIERRA's STUFF ####### diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 9511feb..2837553 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -104,7 +104,7 @@ def test_load_configs(oak_config_file, 
extraction_config_file): assert extraction_config["entity"] == "material entity" assert "term_exclusions" in extraction_config assert "text_exclusions" in extraction_config - assert extraction_config["output"].endswith("output.txt") + assert extraction_config["output"].endswith("environmental-materials-relationships.txt") def test_process_ontology(oak_config_file, extraction_config_file): From 6e7b38ef9fb854d5d6ecd2fbf85f5f78f8954749 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:31:41 -0700 Subject: [PATCH 12/17] add GH action to run the tests --- .github/workflows/main.yaml | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/main.yaml diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 0000000..4fd9e94 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,40 @@ +# This is a basic workflow to help you get started with Actions + +name: Run tests + +# Controls when the action will run. 
+on: + # Triggers the workflow on push or pull request events but only for the master branch + pull_request: + types: [opened, synchronize, reopened] + + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + strategy: + matrix: + python: [ "3.9", "3.10", "3.11" ] + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + name: setup python environment + with: + python-version: ${{ matrix.python }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install poetry + poetry install + + - name: Run tests + run: | + poetry run pytest tests/* From a3cab2a7bc76bf176e9a25e0446c1b508d5af5f9 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 29 Aug 2024 20:42:52 -0700 Subject: [PATCH 13/17] undo cborg_test change --- external_metadata_awareness/cborg_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_metadata_awareness/cborg_test.py b/external_metadata_awareness/cborg_test.py index 94f4607..c2cdf6b 100644 --- a/external_metadata_awareness/cborg_test.py +++ b/external_metadata_awareness/cborg_test.py @@ -3,7 +3,7 @@ from dotenv import load_dotenv # Load environment variables from local/.env -load_dotenv(os.path.join('../..', 'local', '.env')) +load_dotenv(os.path.join('..', 'local', '.env')) client = openai.OpenAI( api_key=os.environ.get('CBORG_API_KEY'), # Retrieve API key from environment variables From ec907d0f6617d32eec337cd295b3695854851db7 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Fri, 30 Aug 2024 16:13:46 -0700 Subject: [PATCH 14/17] add in ability to exclude single terms, as well as include term post-extraction --- 
config/env-local-scale-extraction-config.yaml | 98 ++++++++++++++++++- .../envo_local_scale_extraction.py | 32 +++--- tests/test_env_local_scale_generator.py | 3 +- 3 files changed, 119 insertions(+), 14 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index bb36c80..27111d3 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -8,7 +8,7 @@ text_exclusions: - "saline" - "brackish" - "undersea" -term_exclusions: +post_process_inclusion_single_terms: - "bridge" - "road" - "wildlife management area" @@ -69,4 +69,98 @@ term_and_descendant_exclusions: - "transport feature" - "water body" - "water body" - - "water current" \ No newline at end of file + - "water current" +"single_term_exclusions": + - anthropised terrestrial environmental zone + - anthropogenic contamination feature + - anthropogenic geographic feature + - area of attached faunal communities + - area of attached mussel assemblages + - area of developed space + - astronomical body part + - biosphere + - body of liquid + - carbonate system of ocean water + - cellular organisms + - child care facility + - cloud part + - compound astronomical body part + - construction + - conveyor system + - cryoform + - educational facility + - environmental zone + - environmental zone of processual equilibrium + - facility + - fiat object + - fiat part of an astronomical object + - floating ice mass + - fluid astronomical body part + - fresh water body + - gaseous astronomical body part + - gaseous part of an atmosphere + - geographic feature + - hail stone + - hydroform + - hydrographic feature + - hydrosphere + - ice decumulation zone + - landform + - layer + - liquid astronomical body part + - lotic water body + - marine hydrothermal vent + - marine reef + - marine water body + - marine water mass + - mass of compounded environmental materials + - mass of environmental material + - mass of 
solid material + - material accumulation zone + - material decumulation zone + - material entity + - object + - object aggregate + - ocean basin + - open cage mariculture facility + - organismal entity + - pedosphere + - planetary photic zone + - planetary subsurface zone + - pole + - polling place + - polling station + - processed material + - processing plant + - public infrastructure + - public transit system + - rapid transit system + - rain + - range of seamounts + - rocky reef + - root + - saline water body + - sea ice floe + - sea ice hummock + - sea ice mass + - seamount + - sleet pellet + - sleet pellet + - soil horizon + - soil layer + - solid astronomical body part + - solid layer + - subsurface landform + - subsurface zone of an astronomical body + - surface landform + - system + - Taylor column + - technosphere + - underground water body + - volcanic feature + - water body + - watercourse + - water mass + - water-based rain + + diff --git a/external_metadata_awareness/envo_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py index 5ce2f2d..7366547 100755 --- a/external_metadata_awareness/envo_local_scale_extraction.py +++ b/external_metadata_awareness/envo_local_scale_extraction.py @@ -66,20 +66,22 @@ def exclude_terms(full_list, exclusion_list): return [item for item in full_list if item not in exclusion_list] -def create_exclude_solo_terms(exclusion_terms: List[str], adapter) -> List[str]: +def retrieve_individual_terms(terms_to_retrieve: List[str], adapter) -> List[str]: """ - Creates a list of CURIEs to exclude based on the provided list of terms. + Creates a list of CURIEs based on the provided list of term labels. - :param exclusion_terms: List of term labels to exclude. + :param terms_to_retrieve: List of term labels. :param envo: The ontology adapter. 
""" - all_ids_to_exclude = [] + all_ids = [] - for term_label in exclusion_terms: + for term_label in terms_to_retrieve: # Find the CURIE for the label list_to_exclude = onto_query([term_label], adapter) - all_ids_to_exclude.extend(list_to_exclude) - return list(set(all_ids_to_exclude)) + print("term_label", term_label) + print("list_to_exclude", list_to_exclude) + all_ids.extend(list_to_exclude) + return list(set(all_ids)) def extract_terms_to_file(oak_config_file, extraction_config): @@ -96,17 +98,25 @@ def extract_terms_to_file(oak_config_file, extraction_config): exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), envo) - excluded_terms = create_exclude_solo_terms(extraction_config.get('term_exclusions', []), envo) - exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + excluded_terms + exclude_single_terms = retrieve_individual_terms(extraction_config.get('exclude_single_terms', []), envo) + solo_inclusion_terms = extraction_config.get('post_process_inclusion_single_terms', []) + logging.info("solo_inclusion_terms", solo_inclusion_terms) + post_process_inclusion_single_terms = retrieve_individual_terms(extraction_config.get('post_process_inclusion_single_terms', []), envo) + logging.info("post_process_inclusion_terms", post_process_inclusion_single_terms) + + + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exclude_single_terms logging.info(f"Length of excluded terms and descendants: {len(exclusion_terms_and_children)}") logging.info(f"Length of excluded terms from text: {len(exclusion_terms_from_text)}") - logging.info(f"Length of excluded terms from solo terms: {len(excluded_terms)}") + logging.info(f"Length of excluded terms from solo terms: {len(post_process_inclusion_single_terms)}") remaining_items = exclude_terms(initial_term_list, exclusion_list) logging.info(f"Length of remaining items: {len(remaining_items)}") - results = onto_query(remaining_items, 
envo, labels=True) + final_list_to_retrieve = post_process_inclusion_single_terms + remaining_items + + results = onto_query(final_list_to_retrieve, envo, labels=True) # Write the results to the output file specified in the extraction config output_file_path = extraction_config['output'] diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 2837553..21a6e7f 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -26,7 +26,7 @@ def oak_config_file(tmp_path): def extraction_config_file(tmp_path): config_data = { "entity": "material entity", - "term_exclusions": [ + "post_process_inclusion_single_terms": [ "bridge", "road", "wildlife management area" @@ -126,6 +126,7 @@ def test_process_ontology(oak_config_file, extraction_config_file): assert "biome" not in content assert "brackish" not in content assert "saline" not in content + assert "wildlife management area" in content def test_cli_runs_successfully(oak_config_file, extraction_config_file): From 33fb5fc4de38905dc68e8b999594b84cdefb822a Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Fri, 30 Aug 2024 16:17:49 -0700 Subject: [PATCH 15/17] fixing test data --- config/env-local-scale-extraction-config.yaml | 183 +++++++++--------- tests/test_env_local_scale_generator.py | 3 +- 2 files changed, 94 insertions(+), 92 deletions(-) diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml index 27111d3..c7fe387 100755 --- a/config/env-local-scale-extraction-config.yaml +++ b/config/env-local-scale-extraction-config.yaml @@ -71,96 +71,97 @@ term_and_descendant_exclusions: - "water body" - "water current" "single_term_exclusions": - - anthropised terrestrial environmental zone - - anthropogenic contamination feature - - anthropogenic geographic feature - - area of attached faunal communities - - area of attached mussel assemblages - - area of developed space - - astronomical 
body part - - biosphere - - body of liquid - - carbonate system of ocean water - - cellular organisms - - child care facility - - cloud part - - compound astronomical body part - - construction - - conveyor system - - cryoform - - educational facility - - environmental zone - - environmental zone of processual equilibrium - - facility - - fiat object - - fiat part of an astronomical object - - floating ice mass - - fluid astronomical body part - - fresh water body - - gaseous astronomical body part - - gaseous part of an atmosphere - - geographic feature - - hail stone - - hydroform - - hydrographic feature - - hydrosphere - - ice decumulation zone - - landform - - layer - - liquid astronomical body part - - lotic water body - - marine hydrothermal vent - - marine reef - - marine water body - - marine water mass - - mass of compounded environmental materials - - mass of environmental material - - mass of solid material - - material accumulation zone - - material decumulation zone - - material entity - - object - - object aggregate - - ocean basin - - open cage mariculture facility - - organismal entity - - pedosphere - - planetary photic zone - - planetary subsurface zone - - pole - - polling place - - polling station - - processed material - - processing plant - - public infrastructure - - public transit system - - rapid transit system - - rain - - range of seamounts - - rocky reef - - root - - saline water body - - sea ice floe - - sea ice hummock - - sea ice mass - - seamount - - sleet pellet - - sleet pellet - - soil horizon - - soil layer - - solid astronomical body part - - solid layer - - subsurface landform - - subsurface zone of an astronomical body - - surface landform - - system - - Taylor column - - technosphere - - underground water body - - volcanic feature - - water body - - watercourse - - water mass - - water-based rain + - "anthropised terrestrial environmental zone" + - "anthropogenic contamination feature" + - "anthropogenic geographic feature" 
+ - "area of attached faunal communities" + - "area of attached mussel assemblages" + - "area of developed space" + - "astronomical body part" + - "biosphere" + - "body of liquid" + - "carbonate system of ocean water" + - "cellular organisms" + - "child care facility" + - "cloud part" + - "compound astronomical body part" + - "construction" + - "conveyor system" + - "cryoform" + - "educational facility" + - "environmental zone" + - "environmental zone of processual equilibrium" + - "facility" + - "fiat object" + - "fiat part of an astronomical object" + - "floating ice mass" + - "fluid astronomical body part" + - "fresh water body" + - "gaseous astronomical body part" + - "gaseous part of an atmosphere" + - "geographic feature" + - "hail stone" + - "hydroform" + - "hydrographic feature" + - "hydrosphere" + - "ice decumulation zone" + - "landform" + - "layer" + - "liquid astronomical body part" + - "lotic water body" + - "marine hydrothermal vent" + - "marine reef" + - "marine water body" + - "marine water mass" + - "mass of compounded environmental materials" + - "mass of environmental material" + - "mass of solid material" + - "material accumulation zone" + - "material decumulation zone" + - "material entity" + - "object" + - "object aggregate" + - "ocean basin" + - "open cage mariculture facility" + - "organismal entity" + - "pedosphere" + - "planetary photic zone" + - "planetary subsurface zone" + - "pole" + - "polling place" + - "polling station" + - "processed material" + - "processing plant" + - "public infrastructure" + - "public transit system" + - "rapid transit system" + - "rain" + - "range of seamounts" + - "rocky reef" + - "root" + - "saline water body" + - "sea ice floe" + - "sea ice hummock" + - "sea ice mass" + - "seamount" + - "sleet pellet" + - "sleet pellet" + - "soil horizon" + - "soil layer" + - "solid astronomical body part" + - "solid layer" + - "subsurface landform" + - "subsurface zone of an astronomical body" + - "surface landform" + - 
"system" + - "Taylor column" + - "technosphere" + - "underground water body" + - "volcanic feature" + - "water body" + - "watercourse" + - "water mass" + - "water-based rain" + diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index 21a6e7f..b02bc20 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -102,7 +102,8 @@ def test_load_configs(oak_config_file, extraction_config_file): assert "envo" in oak_config["ontology_resources"] assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" assert extraction_config["entity"] == "material entity" - assert "term_exclusions" in extraction_config + assert "post_process_inclusion_single_terms" in extraction_config + assert "single_term_exclusions" in extraction_config assert "text_exclusions" in extraction_config assert extraction_config["output"].endswith("environmental-materials-relationships.txt") From a389de1b997b5fca2acc05bcc7245c6083f0c4da Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Fri, 30 Aug 2024 16:18:12 -0700 Subject: [PATCH 16/17] fixing test data --- tests/test_env_local_scale_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_env_local_scale_generator.py b/tests/test_env_local_scale_generator.py index b02bc20..d5c8b4e 100644 --- a/tests/test_env_local_scale_generator.py +++ b/tests/test_env_local_scale_generator.py @@ -103,7 +103,6 @@ def test_load_configs(oak_config_file, extraction_config_file): assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo" assert extraction_config["entity"] == "material entity" assert "post_process_inclusion_single_terms" in extraction_config - assert "single_term_exclusions" in extraction_config assert "text_exclusions" in extraction_config assert extraction_config["output"].endswith("environmental-materials-relationships.txt") From 24526e878cf980957af0be9ab1c87e365ad21e4c Mon Sep 17 00:00:00 2001 From: 
Sierra Taylor Moxon
Date: Thu, 14 Nov 2024 10:24:21 -0800
Subject: [PATCH 17/17] commit latest work

---
 external_metadata_awareness/envo_local_scale_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external_metadata_awareness/envo_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py
index 7366547..67cd74a 100755
--- a/external_metadata_awareness/envo_local_scale_extraction.py
+++ b/external_metadata_awareness/envo_local_scale_extraction.py
@@ -84,7 +84,7 @@ def retrieve_individual_terms(terms_to_retrieve: List[str], adapter) -> List[str
     return list(set(all_ids))
 
 
-def extract_terms_to_file(oak_config_file, extraction_config):
+def extract_terms_to_file(oak_config_file, extraction_config):
     # Load the ontology using the get_adapter function
     envo = get_adapter(oak_config_file)