diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 0000000..4fd9e94 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,40 @@ +# This is a basic workflow to help you get started with Actions + +name: Run tests + +# Controls when the action will run. +on: + # Triggers the workflow on pull request events (opened, synchronize, reopened) on any branch + pull_request: + types: [opened, synchronize, reopened] + + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + strategy: + matrix: + python: [ "3.9", "3.10", "3.11" ] + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + name: setup python environment + with: + python-version: ${{ matrix.python }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install poetry + poetry install + + - name: Run tests + run: | + poetry run pytest tests/* diff --git a/Makefile b/Makefile index 444c572..e179fa7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ WGET=wget RUN=poetry run +include Makefiles/env_broad_scale.Makefile #include Makefiles/env_broad_scale.Makefile #include Makefiles/soil-env_medium.Makefile include Makefiles/envo.Makefile @@ -23,6 +24,200 @@ include Makefiles/soil-env_medium.Makefile local/microbiomedata-repos.csv: .
./report-microbiomedata-repos.sh > $@ +# NMDC SCHEMA STUFF +downloads/nmdc_submission_schema.yaml: + wget -O $@ $(SUBMISSION_SCHEMA_URL) + +local/established-value-sets-from-submission-schema.json: downloads/nmdc_submission_schema.yaml + yq -o=json e '{"enums": {"EnvBroadScaleSoilEnum": .enums.EnvBroadScaleSoilEnum, "EnvLocalScaleSoilEnum": .enums.EnvLocalScaleSoilEnum, "EnvMediumSoilEnum": .enums.EnvMediumSoilEnum}}' $< | cat > $@ # ~ 48 + +local/nmdc-submission-schema-enums-keys.txt: downloads/nmdc_submission_schema.yaml + yq eval '.enums | keys | .[]' $< | sort > $@ + +local/EnvBroadScaleSoilEnum-pvs-keys.txt: downloads/nmdc_submission_schema.yaml + yq eval '.enums.EnvBroadScaleSoilEnum.permissible_values | keys | .[]' $< | cat > $@ + +local/EnvBroadScaleSoilEnum-pvs-keys-parsed.csv: local/EnvBroadScaleSoilEnum-pvs-keys.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv: local/EnvBroadScaleSoilEnum-pvs-keys-parsed.csv + cut -f3,4 -d, $< | head -n 1 > $<.header.csv + tail -n +2 $< | cut -f3,4 -d, | sort | uniq > $@.tmp + cat $<.header.csv $@.tmp > $@ + rm -rf $<.header.csv $@.tmp + +local/EnvBroadScaleSoilEnum.png: local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv + cat $< | tail -n +2 | cut -f1 -d, > $@.ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill [ .idfile $@.ids.txt ] .or biome + rm -rf $@.ids.txt + +local/EnvMediumSoilEnum-pvs-keys.txt: downloads/nmdc_submission_schema.yaml + yq eval '.enums.EnvMediumSoilEnum.permissible_values | keys | .[]' $< | cat > $@ + +local/EnvMediumSoilEnum-pvs-keys-parsed.csv: local/EnvMediumSoilEnum-pvs-keys.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv: local/EnvMediumSoilEnum-pvs-keys-parsed.csv + cut -f3,4 -d, $< | head -n 1 > $<.header.csv + tail -n +2 $< | cut -f3,4 -d, 
| sort | uniq > $@.tmp + cat $<.header.csv $@.tmp > $@ + rm -rf $<.header.csv $@.tmp + +local/EnvMediumSoilEnum.png: local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv + cat $< | tail -n +2 | cut -f1 -d, > $@.ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --gap-fill --no-view --output $@ .idfile $@.ids.txt + rm -rf $@.ids.txt + +# NMDC METADATA STUFF +downloads/nmdc-production-studies.json: + wget -O $@.bak https://api.microbiomedata.org/nmdcschema/study_set?max_page_size=999999 + yq '.resources' -o=json $@.bak | cat > $@ + rm -rf $@.bak + +downloads/nmdc-production-biosamples.json: + wget -O $@.bak https://api.microbiomedata.org/nmdcschema/biosample_set?max_page_size=999999 + yq '.resources' -o=json $@.bak | cat > $@ + rm -rf $@.bak + +local/nmdc-production-biosamples-5pct.json: downloads/nmdc-production-biosamples.json + $(RUN) random-sample-resources \ + --input-file $< \ + --output-file $@ \ + --sample-percentage 5 + +local/nmdc-production-biosamples-json-to-context.tsv: downloads/nmdc-production-biosamples.json + $(RUN) biosample-json-to-context-tsv \ + --input-file $< \ + --output-file $@ + +local/nmdc-production-biosamples-env-package.json: + curl -X 'GET' \ + 'https://api.microbiomedata.org/nmdcschema/biosample_set?max_page_size=999999&projection=env_package' \ + -H 'accept: application/json' > $@.bak + yq '.resources' -o=json $@.bak | cat > $@ # ENVO:00001998 is also soil + rm -rf $@.bak + +local/nmdc-production-studies-images.csv: downloads/nmdc-production-studies.json + $(RUN) python external_metadata_awareness/study-image-table.py \ + --input-file $< \ + --output-file $@ + +#### + +# biosamples that are part of a particular study +downloads/sty-11-ev70y104_biosamples.json: + wget -O $@.bak 'https://api.microbiomedata.org/nmdcschema/biosample_set?filter=%7B%22part_of%22%3A%20%22nmdc%3Asty-11-ev70y104%22%7D&max_page_size=999999' + yq -o=json e '.resources' $@.bak | cat > $@ + rm -rf $@.bak + +# metadata about a particular study 
+downloads/sty-11-ev70y104_study.json: + wget -O $@.bak 'https://api.microbiomedata.org/nmdcschema/ids/nmdc%3Asty-11-ev70y104' + yq -o=json e '.' $@.bak | cat > $@ + rm -rf $@.bak + +#### + +valid-env_broad_scale-biosample-all: valid-env_broad_scale-biosample-clean \ +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv \ +local/ncbi-biosamples-context-value-counts-failures.csv + +valid-env_broad_scale-biosample-clean: + rm -rf local/biome-info.txt \ + local/envo-info.csv \ + local/envo-info.txt \ + local/ncbi-biosamples-context-value-counts.csv \ + local/ncbi-biosamples-context-value-counts-normalized.csv \ + local/ncbi-biosamples-context-value-counts-real-labels.csv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated.tsv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-1.tsv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.tsv \ + local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv + +local/ncbi-biosamples-context-value-counts.csv: + $(RUN) count-biosample-context-vals-from-postgres \ + --output-file $@ \ + --min-count 2 + +local/ncbi-biosamples-context-value-counts-normalized.csv: local/ncbi-biosamples-context-value-counts.csv + $(RUN) normalize-envo-data \ + --count-col-name total_count \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ \ + --val-col-name value + +local/ncbi-biosamples-context-value-counts-failures.csv: local/ncbi-biosamples-context-value-counts-normalized.csv + $(RUN) find-envo-present-no-curie-extracted \ + --input-file $< \ + --output-file $@ + +local/envo-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i continuant > $@ # or .ALL + +local/envo-info.csv: local/envo-info.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels.csv: 
local/ncbi-biosamples-context-value-counts-normalized.csv local/envo-info.csv + $(RUN) merge-in-reference-data \ + --keep-file $(word 1,$^) \ + --keep-key normalized_curie \ + --reference-file $(word 2,$^) \ + --reference-key normalized_curie \ + --reference-addition normalized_label \ + --addition-rename real_label \ + --merged-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated.tsv: local/ncbi-biosamples-context-value-counts-real-labels.csv + date ; $(RUN) runoak \ + --input sqlite:obo:envo annotate \ + --matches-whole-text \ + --output-type tsv \ + --output $@ \ + --text-file $< \ + --match-column normalized_label ; date + +local/biome-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00000428 > $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-1.tsv: local/biome-info.txt \ +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated.tsv + $(RUN) detect-curies-in-subset \ + --tsv-file $(word 2,$^) \ + --class-info-file $(word 1,$^) \ + --tsv-column-name normalized_curie \ + --subset-label biome \ + --output-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.tsv: local/biome-info.txt \ +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-1.tsv + $(RUN) detect-curies-in-subset \ + --tsv-file $(word 2,$^) \ + --class-info-file $(word 1,$^) \ + --tsv-column-name matched_id \ + --subset-label biome \ + --output-file $@ + +local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv: local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.tsv + $(RUN) or-boolean-columns \ + --input-file $< \ + --output-file $@ \ + --column1 "normalized_curie_biome" \ + --column2 "matched_id_biome" + +detected-annotations-to-postgres: local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2-or-ed.tsv + $(RUN) load-tsv-into-postgres \ + --tsv-file $< \ + --table-name detected_annotations 
local/envo_goldterms.db: $(RUN) runoak --input sqlite:obo:envo ontology-metadata --all > /dev/null # ensure semsql file is cached $(RUN) runoak --input sqlite:obo:goldterms ontology-metadata --all > /dev/null # ensure semsql file is cached @@ -31,5 +226,3 @@ local/envo_goldterms.db: --primary-db local/envo.db \ --secondary-db ~/.data/oaklib/goldterms.db mv local/envo.db $@ - - diff --git a/config/env-local-scale-extraction-config.yaml b/config/env-local-scale-extraction-config.yaml new file mode 100755 index 0000000..c7fe387 --- /dev/null +++ b/config/env-local-scale-extraction-config.yaml @@ -0,0 +1,167 @@ +# env-local-scale-extraction-config.yaml +output: "local/env-local-scale-candidates.txt" +entity: "material entity" +text_exclusions: + - "gaseous" + - "marine" + - "undersea" + - "saline" + - "brackish" + - "undersea" +post_process_inclusion_single_terms: + - "bridge" + - "road" + - "wildlife management area" +term_and_descendant_exclusions: + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "BFO:0000050" + - "RO:0001025" + - "RO:0001025" + - "RO:0002473" + - "NCBITaxon:1" + - "administrative region" + - "aeroform" + - "anatomical entity" + - "anatomical entity environment" + - "area protected according to IUCN guidelines" + - "astronomical body" + - "astronomical object" + - "biome" + - "channel of a watercourse" + - "chemical entity" + - "cloud" + - "collection of organisms" + - "cryospheric layer" + - "ecozone" + - "ecosystem" + - "environmental material" + - "environmental monitoring area" + - "environmental system" + - "environmental zone" + - "fluid layer" + - "healthcare facility" + - "ice" + - "interface layer" + - "island" + - "lake layer" + - "manufactured product" + - "marine environmental zone" + - "marine littoral zone" + - "mass of environmental material" + - "mass of liquid" + - "material isosurface" + - "meteor" + - "observing system" + - "organic material" + - "organism" + - "particle" + - "planetary 
structural layer" + - "political entity" + - "protected area" + - "room" + - "saline water" + - "sea floor" + - "subatomic particle" + - "transport feature" + - "water body" + - "water body" + - "water current" +"single_term_exclusions": + - "anthropised terrestrial environmental zone" + - "anthropogenic contamination feature" + - "anthropogenic geographic feature" + - "area of attached faunal communities" + - "area of attached mussel assemblages" + - "area of developed space" + - "astronomical body part" + - "biosphere" + - "body of liquid" + - "carbonate system of ocean water" + - "cellular organisms" + - "child care facility" + - "cloud part" + - "compound astronomical body part" + - "construction" + - "conveyor system" + - "cryoform" + - "educational facility" + - "environmental zone" + - "environmental zone of processual equilibrium" + - "facility" + - "fiat object" + - "fiat part of an astronomical object" + - "floating ice mass" + - "fluid astronomical body part" + - "fresh water body" + - "gaseous astronomical body part" + - "gaseous part of an atmosphere" + - "geographic feature" + - "hail stone" + - "hydroform" + - "hydrographic feature" + - "hydrosphere" + - "ice decumulation zone" + - "landform" + - "layer" + - "liquid astronomical body part" + - "lotic water body" + - "marine hydrothermal vent" + - "marine reef" + - "marine water body" + - "marine water mass" + - "mass of compounded environmental materials" + - "mass of environmental material" + - "mass of solid material" + - "material accumulation zone" + - "material decumulation zone" + - "material entity" + - "object" + - "object aggregate" + - "ocean basin" + - "open cage mariculture facility" + - "organismal entity" + - "pedosphere" + - "planetary photic zone" + - "planetary subsurface zone" + - "pole" + - "polling place" + - "polling station" + - "processed material" + - "processing plant" + - "public infrastructure" + - "public transit system" + - "rapid transit system" + - "rain" + - "range of 
seamounts" + - "rocky reef" + - "root" + - "saline water body" + - "sea ice floe" + - "sea ice hummock" + - "sea ice mass" + - "seamount" + - "sleet pellet" + - "sleet pellet" + - "soil horizon" + - "soil layer" + - "solid astronomical body part" + - "solid layer" + - "subsurface landform" + - "subsurface zone of an astronomical body" + - "surface landform" + - "system" + - "Taylor column" + - "technosphere" + - "underground water body" + - "volcanic feature" + - "water body" + - "watercourse" + - "water mass" + - "water-based rain" + + + diff --git a/config/oak-config.yaml b/config/oak-config.yaml new file mode 100644 index 0000000..7b45869 --- /dev/null +++ b/config/oak-config.yaml @@ -0,0 +1,3 @@ +ontology_resources: + envo: + selector: sqlite:obo:envo \ No newline at end of file diff --git a/env_triad.Makefile b/env_triad.Makefile new file mode 100644 index 0000000..5bbd240 --- /dev/null +++ b/env_triad.Makefile @@ -0,0 +1,197 @@ +WGET=wget +RUN=poetry run + +# preferable to use a tagged release, but theres good stuff in this commit that hasn't been released yet +MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml +SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml + +# ENVO STUFF +# getting fragments of EnvO because the whole thing is too large to feed into an LLM +# our guideline is that env_broad_scale should be answered with an EnvO biome subclass + +# these OAK commands fetch the latest EnvO SQLite file from a BBOP S3 bucket +# it may be a few days behind the envo.owl file form the EnvO GH repo +# use `runoak cache-ls` to see where the SQLite files are cached + +local/biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00000428 > $@ + # !!! pivot? include entailment? 
--include-entailed / --no-include-entailed; --non-redundant-entailed / --no-non-redundant-entailed + # LLM web interfaces might want CSVs + +local/biome-relationships.csv: local/biome-relationships.tsv + sed 's/\t/,/g' $< > $@ + #awk 'BEGIN {FS="\t"; OFS=","} {print $$0}' $< > $@ + rm -rf $< + +local/biome-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00000428 > $@ + # !!! try different formats? or predicate list? + +local/biome-metadata.json: local/biome-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +# our guideline is that env_medium should be answered with an EnvO biome subclass +local/environmental-materials-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-relationships.csv: local/environmental-materials-relationships.tsv + sed 's/\t/,/g' $< > $@ + rm -rf $< + +local/environmental-materials-metadata.yaml: + $(RUN) runoak --input sqlite:obo:envo term-metadata .desc//p=i ENVO:00010483 > $@ + +local/environmental-materials-metadata.json: local/environmental-materials-metadata.yaml + yq ea '[.]' $< -o=json | cat > $@ + rm -rf $< + +local/environmental-material-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00010483 > $@ + +local/aquatic-biome-info.txt: + $(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00002030 > $@ # --output-type tsv has lots of info but wrapped in square brackets + +local/aquatic-biome-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships --output-type tsv --output $@ .desc//p=i ENVO:00002030 + +local/aquatic-biome.png: + $(RUN) runoak --input sqlite:obo:envo viz --no-view --output $@ --gap-fill .desc//p=i ENVO:00002030 + +local/soil-env_broad_scale-algebraic.txt: + $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ .desc//p=i biome .not .desc//p=i 'aquatic biome' ] .not .desc//p=i 'forest biome' ] .not .desc//p=i 'grassland biome' ] .not .desc//p=i 
'desert biome' ] .not biome ] .not 'cropland biome' > $@ + +local/soil-env_broad_scale-algebraic.csv: local/soil-env_broad_scale-algebraic.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + + +## for env medium +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv: local/environmental-material-info.txt \ +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-2.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name normalized-curie \ +# --subset-label environmental-material \ +# --output-file $@ +# +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-4.csv: local/environmental-material-info.txt \ +#local/ncbi-biosamples-context-value-counts-real-labels-only-annotated-3.csv +# $(RUN) detect-curies-in-subset \ +# --tsv-file $(word 2,$^) \ +# --class-info-file $(word 1,$^) \ +# --tsv-column-name matched_id \ +# --subset-label environmental_material \ +# --output-file $@ + + +# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE +# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions +local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql + $(RUN) sql-to-tsv \ + --sql-file $< \ + --output-file $@ + +#### + +local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ +local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ +local/biome-relationships.csv + $(RUN) build-prompt-from-template \ + --spec-file-path $(word 1,$^) \ + --output-file-path $@ + +# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest +# gemini models don't seem to take a temperature parameter +# cborg/claude-sonnet +local/unused-terrestrial-biomes-response.txt: 
local/unused-terrestrial-biomes-prompt.txt + cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ + +#### + +#local/env-local-scale-candidates.txt: +# $(RUN) runoak --input sqlite:obo:envo info [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ .desc//p=i 'material entity' ] .not .desc//p=i 'biome' ] .not .desc//p=i 'environmental material' ] .not .desc//p=i 'chemical entity' ] .not .desc//p=i 'organic material' ] .not .desc//p=i 'anatomical entity' ] .not .desc//p=i 'organism' ] .not .desc//p=i 'plant anatomical entity' ] .not .desc//p=i 'healthcare facility' ] .not .desc//p=i 'fluid layer' ] .not .desc//p=i 'interface layer' ] .not .desc//p=i 'manufactured product' ] .not .desc//p=i 'anatomical entity environment' ] .not .desc//p=i 'ecosystem' ] .not .desc//p=i 'area protected according to IUCN guidelines' ] .not .desc//p=i 'astronomical body' ] .not .desc//p=i 'astronomical object' ] .not .desc//p=i 'cloud' ] .not .desc//p=i 'collection of organisms' ] .not .desc//p=i 'environmental system' ] .not .desc//p=i 'ecozone' ] .not .desc//p=i 'environmental zone' ] .not .desc//p=i 'water current' ] .not .desc//p=i 'mass of environmental material' ] .not .desc//p=i 'subatomic particle' ] .not .desc//p=i 'observing system' ] .not .desc//p=i 'particle' ] .not .desc//p=i 'planetary structural layer' ] .not .desc//p=i 'political entity' ] .not .desc//p=i 'meteor' ] .not .desc//p=i 'room' ] .not .desc//p=i 'transport feature' ] .not .desc//p=i 'mass of liquid' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=BFO:0000050 'environmental monitoring area' ] .not .desc//p=BFO:0000050 'marine littoral zone' ] .not .desc//p=BFO:0000050 'marine environmental zone' ] .not .desc//p=RO:0002473 'sea floor' ] .not .desc//p=BFO:0000050 'saline water' ] .not .desc//p=BFO:0000050 'ice' ] .not .desc//p=RO:0001025 'water body' ] .not .desc//p=i 'administrative region' ] .not .desc//p=i 
'protected area' ] .not .desc//p=i 'channel of a watercourse' ] .not .desc//p=i 'cryospheric layer' ] .not 'l~gaseous' ] .not 'l~marine' ] .not .desc//p=i 'material isosurface' ] .not 'l~undersea' ] .not .desc//p=i NCBITaxon:1 ] .not 'l~saline' ] .not 'l~brackish' ] .not .desc//p=i 'aeroform' > $@ +# REPORT OF WHETHER A BIOSAMPLE USES A BIOME AS IT'S env_broad_scale VALUE +# joins pre-loaded (grouped) detected_annotations table with individual biosample env_broad_scale assertions +local/soil-water-env-broad-scale.tsv: sql/soil-water-env_broad_scale.sql + $(RUN) sql-to-tsv \ + --sql-file $< \ + --output-file $@ + +#### + +local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \ +local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \ +local/biome-relationships.csv + $(RUN) build-prompt-from-template \ + --spec-file-path $(word 1,$^) \ + --output-file-path $@ + +# suggested models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest +# gemini models don't seem to take a temperature parameter +# cborg/claude-sonnet +local/unused-terrestrial-biomes-response.txt: local/unused-terrestrial-biomes-prompt.txt + cat $(word 1,$^) | $(RUN) llm prompt --model claude-3.5-sonnet -o temperature 0.01 | tee $@ + +#### + +# remove .desc//p=BFO:0000050 'marine littoral zone' .or .desc//p=BFO:0000050 'saline water' .or .desc//p=RO:0001025 'water body' .or .desc//p=RO:0001025 'water body' .or .desc//p=RO:0002473 ' because of .or 'l~marine' +# remove .or .desc//p=i 'organic material' because of .or .desc//p=i 'environmental material' +# remove .or .desc//p=i 'mass of liquid' because of .or .desc//p=i 'mass of environmental material' +# removed .or .desc//p=i NCBITaxon:1 because of .or .desc//p=i 'organism' +# .or .desc//p=i 'organic material' (.desc//p=i 'environmental material') +# .or .desc//p=i 'gas planet' (.desc//p=i 'environmental material') + 
+local/env-local-scale-candidates-relationships.tsv: + $(RUN) runoak --input sqlite:obo:envo relationships .desc//p=i 'material entity' .not [ .desc//p=i 'biome' .or .desc//p=i 'environmental material' .or .desc//p=i 'meteorite' .or .desc//p=i 'chemical entity' .or .desc//p=i 'organic material' .or .desc//p=i 'anatomical entity' .or .desc//p=i 'organism' .or .desc//p=i 'plant anatomical entity' .or .desc//p=i 'healthcare facility' .or .desc//p=i 'fluid layer' .or .desc//p=i 'interface layer' .or .desc//p=i 'manufactured product' .or .desc//p=i 'anatomical entity environment' .or .desc//p=i 'ecosystem' .or .desc//p=i 'area protected according to IUCN guidelines' .or .desc//p=i 'astronomical body' .or .desc//p=i 'astronomical object' .or .desc//p=i 'cloud' .or .desc//p=i 'collection of organisms' .or .desc//p=i 'environmental system' .or .desc//p=i 'ecozone' .or .desc//p=i 'environmental zone' .or .desc//p=i 'water current' .or .desc//p=i 'mass of environmental material' .or .desc//p=i 'subatomic particle' .or .desc//p=i 'observing system' .or .desc//p=i 'particle' .or .desc//p=i 'planetary structural layer' .or .desc//p=i 'political entity' .or .desc//p=i 'meteor' .or .desc//p=i 'room' .or .desc//p=i 'transport feature' .or .desc//p=i 'mass of liquid' .or .desc//p=RO:0001025 'water body' .or .desc//p=BFO:0000050 'environmental monitoring area' .or .desc//p=BFO:0000050 'marine littoral zone' .or .desc//p=BFO:0000050 'marine environmental zone' .or .desc//p=RO:0002473 'sea floor' .or .desc//p=BFO:0000050 'saline water' .or .desc//p=BFO:0000050 'ice' .or .desc//p=RO:0001025 'water body' .or .desc//p=i 'administrative region' .or .desc//p=i 'protected area' .or .desc//p=i 'channel of a watercourse' .or .desc//p=i 'cryospheric layer' .or 'l~gaseous' .or 'l~marine' .or .desc//p=i 'material isosurface' .or 'l~undersea' .or .desc//p=i NCBITaxon:1 .or 'l~saline' .or 'l~brackish' .or .desc//p=i 'aeroform' ] > $@ + +local/envo-leaves.txt: + $(RUN) runoak --input 
sqlite:obo:envo leafs > $@ + +local/envo-leaf-ids.txt: local/envo-leaves.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-candidate-ids.txt: local/env-local-scale-candidates.txt + cut -f1 -d' ' $< > $@ + +local/env-local-scale-non-leaf.txt: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo info .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] > $@ + +local/env-local-scale-non-leaf.csv: local/env-local-scale-non-leaf.txt + $(RUN) normalize-envo-data \ + --input-file $< \ + --ontology-prefix ENVO \ + --output-file $@ + +local/env-local-scale-non-leaf.png: local/env-local-scale-candidates.txt local/envo-leaf-ids.txt + $(RUN) runoak --input sqlite:obo:envo viz --gap-fill .idfile $(word 1,$^) .not [ .idfile $(word 2,$^) ] + +local/goldData.xlsx: + wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel" + +local/goldData_biosamples.csv: local/goldData.xlsx # for counting biosamples with a path that corresponds to en env_local_scale from local/goldterms-env_local_scale-of-environmental-terrestrial-soil-counts.txt + $(RUN) python -c "import pandas as pd; import sys; pd.read_excel(sys.argv[1], sheet_name=sys.argv[3]).to_csv(sys.argv[2], index=False)" $< $@ Biosample + +local/goldterms-env_local_scale-of-environmental-terrestrial-soil-counts.txt: # counts by path, not by bold biosamples + $(RUN) runoak --input sqlite:obo:goldterms query --output $@.bak --query "SELECT s.object, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_local' group by s.object order by count(1) desc" + cut -f 1,3 $@.bak > $@ + rm -rf $@.bak + + +###### SIERRA's STUFF ####### + +# local/env-local-scale-candidates.txt: +# $(RUN) runoak --input sqlite:obo:envo info .desc//p=i 'material entity' .not [ .desc//p=i 'biome'.or .desc//p=i 'environmental material' .or .desc//p=i 'anatomical entity' .or 
.desc//p=i 'chemical entity' .or .desc//p=i 'environmental system' .or .desc//p=i 'administrative region' .or .desc//p=i 'aeroform' .or .desc//p=i 'anatomical entity environment' .or .desc//p=i 'area protected according to IUCN guidelines' .or .desc//p=i 'astronomical object' .or .desc//p=i 'building part' .or .desc//p=i 'channel of a watercourse' .or .desc//p=i 'collection of organisms' .or .desc//p=i 'cryospheric layer' .or .desc//p=i 'ecozone' .or .desc//p=i 'environmental monitoring area' .or .desc//p=i 'fluid layer' .or .desc//p=i 'healthcare facility' .or .desc//p=i 'interface layer' .or .desc//p=i 'manufactured product' .or .desc//p=i 'mass of biological material' .or .desc//p=i 'mass of fluid' .or .desc//p=i 'material isosurface' .or .desc//p=i 'meteor' .or .desc//p=i 'meteorite' .or .desc//p=i 'observing system' .or .desc//p=i 'organic object' .or .desc//p=i 'organism' .or .desc//p=i 'particle' .or .desc//p=i 'piece of plastic' .or .desc//p=i 'piece of rock' .or .desc//p=i 'planetary structural layer' .or .desc//p=i 'plant anatomical entity' .or .desc//p=i 'political entity' .or .desc//p=i 'protected area' .or .desc//p=i 'subatomic particle' .or .desc//p=i 'transport feature' .or .desc//p=i 'water current' .or .desc//p=i,p 'l~undersea' ] .or bridge .or road .or 'wildlife management area' .or .desc//p=i 'lake layer' .or .desc//p=i island > $@ + +generate-env-local-scale-candidates: + # Ensure the poetry environment is activated and run the script with the specified config + $(RUN) python external_metadata_awareness/envo_local_scale_extraction.py \ + --oak-config-file config/oak-config.yaml \ + --extraction-config-file config/env-local-scale-extraction-config.yaml + +test: + $(RUN) pytest tests/* +###### END SIERRA's STUFF ####### diff --git a/external_metadata_awareness/envo_local_scale_extraction.py b/external_metadata_awareness/envo_local_scale_extraction.py new file mode 100755 index 0000000..67cd74a --- /dev/null +++ 
b/external_metadata_awareness/envo_local_scale_extraction.py @@ -0,0 +1,142 @@ +import logging +from typing import List +import yaml +import click +from oaklib import get_adapter +from oaklib.query import onto_query, SimpleQueryTerm + +# Configure logging +logging.basicConfig(level=logging.WARN, format='%(asctime)s - %(levelname)s - %(message)s') + + +def load_configs(oak_config_file, extraction_config_file): + with open(oak_config_file, 'r') as file: + oak_config = yaml.safe_load(file) + with open(extraction_config_file, 'r') as file: + extraction_config = yaml.safe_load(file) + return oak_config, extraction_config + + +def create_exclusion_list(term_labels, adapter) -> List[str]: + """ + Creates a combined FunctionQuery to exclude specific terms and their descendants. + + :param term_labels: List of term labels to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified terms and their descendants. + """ + all_ids_to_exclude = [] + for label in term_labels: + # Find the CURIE for the label + term_curies = onto_query(SimpleQueryTerm(term=label), adapter) + if term_curies: + term_curie = term_curies[0] # Assuming one CURIE per label + # Create a descendant exclusion query for the term + list_to_exclude = onto_query([".desc//p=i", term_curie], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) + + +def create_text_exclusion_list(text_exclusions, adapter): + """ + Creates a combined FunctionQuery to exclude specific terms based on text matching. + + :param text_exclusions: List of text patterns to exclude. + :param adapter: The ontology adapter. + :return: Combined FunctionQuery to exclude all specified text matches. 
+ """ + + all_ids_to_exclude = [] + + for text in text_exclusions: + # Find the CURIE for the label + list_to_exclude = onto_query(["l~"+text], adapter) + all_ids_to_exclude.extend(list_to_exclude) + return list(set(all_ids_to_exclude)) + + +def exclude_terms(full_list, exclusion_list): + """ + Returns a list of items from the full list with the items in the exclusion list removed. + + :param full_list: List of items to be filtered. + :param exclusion_list: List of items to exclude from the full list. + :return: A list with items from exclusion_list removed. + """ + return [item for item in full_list if item not in exclusion_list] + + +def retrieve_individual_terms(terms_to_retrieve: List[str], adapter) -> List[str]: + """ + Creates a list of CURIEs based on the provided list of term labels. + + :param terms_to_retrieve: List of term labels. + :param adapter: The ontology adapter. + """ + all_ids = [] + + for term_label in terms_to_retrieve: + # Find the CURIE for the label + list_to_exclude = onto_query([term_label], adapter) + print("term_label", term_label) + print("list_to_exclude", list_to_exclude) + all_ids.extend(list_to_exclude) + return list(set(all_ids)) + + +def extract_terms_to_file(oak_config_file, extraction_config): + # Load the ontology using the get_adapter function + envo = get_adapter(oak_config_file) + + # Get the entity and exclusions from the config + initial_term_label = extraction_config['entity'] + initial_term_list = onto_query([".desc//p=i", initial_term_label], envo) + logging.info(f"Length of initial term list: {len(initial_term_list)}") + + exclusion_terms_and_children = create_exclusion_list(extraction_config.get('term_and_descendant_exclusions', []), + envo) + + exclusion_terms_from_text = create_text_exclusion_list(extraction_config.get('text_exclusions', []), + envo) + + exclude_single_terms = retrieve_individual_terms(extraction_config.get('single_term_exclusions', []), envo) + solo_inclusion_terms = 
extraction_config.get('post_process_inclusion_single_terms', []) + logging.info("solo_inclusion_terms", solo_inclusion_terms) + post_process_inclusion_single_terms = retrieve_individual_terms(extraction_config.get('post_process_inclusion_single_terms', []), envo) + logging.info("post_process_inclusion_terms", post_process_inclusion_single_terms) + + + exclusion_list = exclusion_terms_and_children + exclusion_terms_from_text + exclude_single_terms + logging.info(f"Length of excluded terms and descendants: {len(exclusion_terms_and_children)}") + logging.info(f"Length of excluded terms from text: {len(exclusion_terms_from_text)}") + logging.info(f"Length of excluded terms from solo terms: {len(post_process_inclusion_single_terms)}") + + remaining_items = exclude_terms(initial_term_list, exclusion_list) + logging.info(f"Length of remaining items: {len(remaining_items)}") + + final_list_to_retrieve = post_process_inclusion_single_terms + remaining_items + + results = onto_query(final_list_to_retrieve, envo, labels=True) + + # Write the results to the output file specified in the extraction config + output_file_path = extraction_config['output'] + with open(output_file_path, 'w') as output_file: + for curie, label in results: + output_file.write(f"{curie}: {label}\n") + + logging.info(f"Results written to {output_file_path}") + + +@click.command() +@click.option('--extraction-config-file', required=True, help='Path to the extraction YAML configuration file.') +@click.option('--oak-config-file', required=True, help='Path to the extraction YAML configuration file.') +def cli(extraction_config_file, oak_config_file): + """ + CLI tool to process an ontology based on the provided YAML configuration file. 
+ """ + _, extraction_config = load_configs(oak_config_file, extraction_config_file) + extract_terms_to_file(oak_config_file, extraction_config) + + +if __name__ == "__main__": + cli() diff --git a/ncbi.Makefile b/ncbi.Makefile new file mode 100644 index 0000000..b679ff1 --- /dev/null +++ b/ncbi.Makefile @@ -0,0 +1,160 @@ +WGET=wget +RUN=poetry run + +# preferable to use a tagged release, but theres good stuff in this commit that hasn't been released yet +MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml +SUBMISSION_SCHEMA_URL=https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml + + +## NCBI STUFF +# very complex documents; many are too large to load into a MongoDB document +downloads/bioproject.xml: + $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml" # ~ 3 GB August 2024 + +downloads/biosample_set.xml.gz: + $(WGET) -O $@ "https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz" # ~ 3 GB August 2024 + +local/biosample_set.xml: downloads/biosample_set.xml.gz + gunzip -c $< > $@ + +# for development +downloads/books.xml: + $(WGET) -O $@ "https://www.w3schools.com/xml/books.xml" + +# 8 years old. seems very incomplete. 
# NCBI's public biosample XSD (served through viewvc).
downloads/biosample.xsd:
	$(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co"

# find code for converting to table in other repos
# or convert to duckdb
# NCBI harmonized-attribute definitions, as XML.
downloads/ncbi-biosample-attributes.xml:
	$(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml"

# NCBI biosample package (checklist) definitions, as XML.
downloads/ncbi-biosample-packages.xml:
	$(WGET) -O $@ "https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml"

# Flatten the packages XML into a CSV report via the project CLI.
local/ncbi-biosample-packages.csv: downloads/ncbi-biosample-packages.xml
	$(RUN) ncbi-packages-csv-report \
		--xml-file $< \
		--output-file $@


# see also https://www.npmjs.com/package/mongodb-schema/v/12.2.0?activeTab=versions

#local/mongodb-paths-10pct.txt: # 450000 -> ~ 4 minutes # 4.5 M -> heavy load, never finishes. Use streaming approach?
#	$(RUN) list-mongodb-paths \
#	--db-name ncbi_metadata \
#	--collection samples \
#	--sample-size 4500000 > $@

#local/ncbi_biosamples_inferred_schema.json: # ~ 2 minutes for 410,000 (1%) # ~ 1 hour for 13 million ~ 30%
#	$(RUN) python external_metadata_awareness/infer_schema_with_batching.py \
#	--host localhost \
#	--port 27017 \
#	--database ncbi_metadata \
#	--collection samples \
#	--total-samples 13000000 \
#	--batch-size 50000 \
#	--output $@

.PHONY: load-biosamples-into-mongo

# Count records in the decompressed biosample XML.
# NOTE(review): the grep pattern is empty ("") — this counts every line, not
# BioSample records; it looks like an XML tag (e.g. "<BioSample") was lost. Confirm.
local/biosample-count-xml.txt: local/biosample_set.xml
	date && grep -c "" $< > $@ && date

# see also https://gitlab.com/wurssb/insdc_metadata
# Stream BioSample elements from the XML dump into a dev MongoDB collection.
load-biosamples-into-mongo: local/biosample_set.xml
	$(RUN) xml-to-mongo \
		--file-path $< \
		--node-type BioSample \
		--id-field id \
		--db-name biosamples_dev \
		--collection-name biosamples_dev \
		--max-elements 100000 \
		--anticipated-last-id 100000

# Document count straight from the production MongoDB collection.
local/biosample-count-mongodb.txt:
	date && mongosh --eval 'db.getSiblingDB("ncbi_metadata").samples.countDocuments()' > $@ && date # 1 minute

# Run a canned SQL query and save the result as TSV.
local/ncbi-biosamples-packages-counts.tsv: sql/packages-counts.sql
	$(RUN) sql-to-tsv \
		--sql-file $< \
		--output-file $@

# Export one top-level row per biosample from MongoDB into DuckDB.
ncbi-biosamples-duckdb-overview:
	$(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \
		--connection-string "mongodb://localhost:27017/" \
		--db-name ncbi_metadata \
		--collection-name samples \
		--limit 41000000 \
		--batch-size 100000 \
		--duckdb-file local/ncbi_biosamples.duckdb \
		--table-name overview # no path # 40462422 biosamples in ~ 50 minutes

# add counts from duckdb; need to compile duckdb or download binary

# Same export, but flattening the repeated Attribute elements.
ncbi-biosamples-duckdb-attributes:
	$(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \
		--connection-string "mongodb://localhost:27017/" \
		--db-name ncbi_metadata \
		--collection-name samples \
		--limit 41000000 \
		--batch-size 100000 \
		--duckdb-file local/ncbi_biosamples.duckdb \
		--table-name attributes \
		--path BioSample.Attributes.Attribute

# Same export for the Link elements.
ncbi-biosamples-duckdb-links:
	$(RUN) python external_metadata_awareness/first_n_attributes_duckdb.py \
		--connection-string "mongodb://localhost:27017/" \
		--db-name ncbi_metadata \
		--collection-name samples \
		--limit 41000000 \
		--batch-size 100000 \
		--duckdb-file local/ncbi_biosamples.duckdb \
		--table-name links \
		--path BioSample.Links.Link

	## @click.option('--path', default="BioSample.Links.Link", required=True,
	##               help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').")
	## @click.option('--path', default="BioSample.Ids.Id", required=True,
	##               help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').")
	## @click.option('--path', default="BioSample.Description.Organism", required=True,
	##               help="Path within the document to process (e.g., 'BioSample.Attributes.Attribute').")

NCBI_BIOSAMPLES_DUCKDB_PATH = local/ncbi_biosamples.duckdb

# Per-value counts of env_local_scale for MIMS soil packages, straight from DuckDB.
# NOTE(review): plain `echo` may not expand "\n" depending on the shell — confirm
# (printf, or `echo -e` under bash, would be unambiguous).
local/ncbi-mims-soil-biosamples-env_local_scale.csv:
	echo ".mode csv\nSELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_local_scale' AND package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@

# Normalize the raw values against ENVO CURIE conventions.
local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv: local/ncbi-mims-soil-biosamples-env_local_scale.csv
	$(RUN) normalize-envo-data \
		--count-col-name sample_count \
		--input-file $< \
		--ontology-prefix ENVO \
		--output-file $@ \
		--val-col-name content

# Rows that mention ENVO but from which no CURIE could be extracted.
local/ncbi-mims-soil-biosamples-env_local_scale-failures.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv
	$(RUN) find-envo-present-no-curie-extracted \
		--input-file $< \
		--output-file $@

# Join in the authoritative ENVO labels, renamed to real_label.
local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv: local/ncbi-mims-soil-biosamples-env_local_scale-normalized.csv local/envo-info.csv
	$(RUN) merge-in-reference-data \
		--keep-file $(word 1,$^) \
		--keep-key normalized_curie \
		--reference-file $(word 2,$^) \
		--reference-key normalized_curie \
		--reference-addition normalized_label \
		--addition-rename real_label \
		--merged-file $@

# Annotate the normalized labels against ENVO with OAK (whole-text matches only).
local/ncbi-mims-soil-biosamples-env_local_scale-annotated.tsv: local/ncbi-mims-soil-biosamples-env_local_scale-real-labels.csv
	date ; $(RUN) runoak \
		--input sqlite:obo:envo annotate \
		--matches-whole-text \
		--output-type tsv \
		--output $@ \
		--text-file $< \
		--match-column normalized_label ; date
tmp_path / "oak_config.yaml"
    with open(config_file, 'w') as file:
        yaml.dump(config_data, file)
    return config_file


@pytest.fixture
def extraction_config_file(tmp_path):
    """Write a realistic extraction config (root entity, exclusion rules, output path)
    to a temp YAML file and return its path."""
    config_data = {
        "entity": "material entity",
        # Terms forced back into the output after all exclusions are applied.
        "post_process_inclusion_single_terms": [
            "bridge",
            "road",
            "wildlife management area"
        ],
        # Each of these terms AND its descendants is removed from the result set.
        # NOTE(review): "RO:0001025 water body" and "material isosurface" appear
        # twice — harmless if the extractor de-duplicates, but confirm it's intended.
        "term_and_descendant_exclusions": [
            "biome"
            , "environmental material"
            , "chemical entity"
            , "organic material"
            , "anatomical entity"
            , "organism"
            , "plant anatomical entity"
            , "healthcare facility"
            , "fluid layer"
            , "interface layer"
            , "manufactured product"
            , "anatomical entity environment"
            , "ecosystem"
            , "area protected according to IUCN guidelines"
            , "astronomical body"
            , "astronomical object"
            , "cloud"
            , "collection of organisms"
            , "environmental system"
            , "ecozone"
            , "material isosurface"
            , "environmental zone"
            , "water current"
            , "mass of environmental material"
            , "subatomic particle"
            , "observing system"
            , "particle"
            , "planetary structural layer"
            , "political entity"
            , "meteor"
            , "room"
            , "transport feature"
            , "mass of liquid"
            , "RO:0001025 water body"
            , "BFO:0000050 environmental monitoring area"
            , "BFO:0000050 marine littoral zone"
            , "BFO:0000050 marine environmental zone"
            , "RO:0002473 sea floor"
            , "BFO:0000050 saline water"
            , "BFO:0000050 ice"
            , "RO:0001025 water body"
            , "administrative region"
            , "protected area"
            , "channel of a watercourse"
            , "cryospheric layer"
            , "material isosurface"
            , "NCBITaxon:1"
            , "aeroform"
        ],
        # Terms whose labels match any of these substrings are excluded.
        "text_exclusions": [
            "gaseous"
            , "marine"
            , "undersea"
            , "saline"
            , "brackish"
        ],
        "output": str(tmp_path / "environmental-materials-relationships.txt")
    }
    config_file = tmp_path / "extraction_config.yaml"
    with open(config_file, 'w') as file:
        yaml.dump(config_data, file)
    return config_file


def test_load_configs(oak_config_file, extraction_config_file):
    """load_configs should round-trip both YAML files into the expected dict shapes."""
    oak_config, extraction_config = load_configs(oak_config_file, extraction_config_file)
    assert "ontology_resources" in oak_config
    assert "envo" in oak_config["ontology_resources"]
    assert oak_config["ontology_resources"]["envo"]["selector"] == "sqlite:obo:envo"
    assert extraction_config["entity"] == "material entity"
    assert "post_process_inclusion_single_terms" in extraction_config
    assert "text_exclusions" in extraction_config
    assert extraction_config["output"].endswith("environmental-materials-relationships.txt")


def test_process_ontology(oak_config_file, extraction_config_file):
    """End-to-end extraction: output file exists, is non-empty, and respects the
    exclusion/inclusion rules. NOTE: downloads the ENVO sqlite build on first run."""
    _, extraction_config = load_configs(oak_config_file, extraction_config_file)

    # Run the ontology processing
    extract_terms_to_file(oak_config_file, extraction_config)

    # Check if the output file is created and has content
    output_file_path = extraction_config["output"]
    assert os.path.exists(output_file_path), "Output file was not created"

    with open(output_file_path, 'r') as file:
        content = file.read()
        assert len(content) > 0, "Output file is empty, expected some data."

    # You could also add assertions based on expected content
    # For example, checking that excluded terms are not in the output
    assert "biome" not in content
    assert "brackish" not in content
    assert "saline" not in content
    assert "wildlife management area" in content


def test_cli_runs_successfully(oak_config_file, extraction_config_file):
    """Invoke the click CLI with both config paths and verify it writes the output file."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--extraction-config-file', str(extraction_config_file), '--oak-config-file',
                                 str(oak_config_file)])
    assert result.exit_code == 0

    # Verify the output file exists and contains the expected results
    output_file = extraction_config_file.parent / "environmental-materials-relationships.txt"
    assert output_file.exists()
    with open(output_file, 'r') as file:
        content = file.read()
        assert len(content) > 0, "Output file is empty, expected some data."

    # Add additional assertions to check that the CLI correctly excluded terms
    assert "biome" not in content
    assert "brackish" not in content
    assert "saline" not in content


def test_onto_query():
    """Smoke test of onto_query label matching against the ENVO sqlite adapter.
    NOTE(review): prints but asserts nothing — consider asserting the result is non-empty."""
    adapter = get_adapter("sqlite:obo:envo")
    # desc = onto_query([".desc//p=i", "material entity"], adapter)
    # print(len(desc))

    # "l~saline" matches terms whose label contains "saline"
    list_to_exclude = onto_query(["l~saline"], adapter, labels=True)
    print(list_to_exclude)