Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Water broad #30

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ include Makefiles/nmdc_schema.Makefile
include Makefiles/soil-env_broad_scale.Makefile
include Makefiles/soil-env_local_scale.Makefile
include Makefiles/soil-env_medium.Makefile
include Makefiles/water-env_broad_scale.Makefile

# suggested LLM models: gpt-4, gpt-4o, gpt-4-turbo (?), claude-3-opus, claude-3.5-sonnet, gemini-1.5-pro-latest
# gemini models don't seem to take a temperature parameter
Expand Down
11 changes: 10 additions & 1 deletion Makefiles/envo.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,16 @@ local/envo-info.csv: local/envo-info.txt
--output-file $@

local/biome-info.txt:
$(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00000428 > $@
$(RUN) runoak --input sqlite:obo:envo info .desc//p=i biome > $@

# Keep only the first space-delimited field of each line of the biome info dump
# (presumably the term id; confirm against the `runoak info` output format).
local/biome-ids.tsv: local/biome-info.txt
	cut -d ' ' -f 1 < $< > $@

# Write the `runoak info` output for the query `.desc//p=i soil`, run against
# the sqlite:obo:envo input, to the target file.
local/soil-info.txt:
	$(RUN) runoak \
		--input sqlite:obo:envo \
		info .desc//p=i soil > $@

# Keep only the first space-delimited field of each line of the soil info dump
# (presumably the term id; confirm against the `runoak info` output format).
local/soil-ids.tsv: local/soil-info.txt
	cut -d ' ' -f 1 < $< > $@

local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \
local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \
Expand Down
29 changes: 20 additions & 9 deletions Makefiles/gold.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,34 +7,45 @@ local/goldData.xlsx:
local/goldData_biosamples.csv: local/goldData.xlsx # for counting biosamples with a path that corresponds to en env_local_scale from local/goldterms-env_local_scale-of-environmental-terrestrial-soil-counts.txt
$(RUN) python -c "import pandas as pd; import sys; df = pd.read_excel(sys.argv[1], sheet_name=sys.argv[3]); df['BIOSAMPLE ECOSYSTEM PATH ID'] = df['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int); df.to_csv(sys.argv[2], index=False)" $< $@ Biosample

local/goldterms-env_local_scale-of-environmental-terrestrial-soil.tsv: local/envo_goldterms.db # counts by path, not by bold biosamples
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_local_scale, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_local' group by s.object order by count(1) desc"
local/goldterms-env_broad_scale-of-environmental-terrestrial-soil.tsv: local/envo_goldterms.db # counts by path, not by bold biosamples
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_broad_scale, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_broad' group by s.object order by count(1) desc"
cut -f1,2,3,5,6 [email protected] > $@
rm -rf [email protected]

local/goldterms-env_broad_scale-of-environmental-terrestrial-soil.tsv: local/envo_goldterms.db # counts by path, not by bold biosamples
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_local_scale, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_broad' group by s.object order by count(1) desc"
local/goldterms-env_broad_scale-of-environmental-water.tsv: local/envo_goldterms.db # counts by path, not by bold biosamples
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_broad_scale, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:3984' AND s.predicate = 'mixs:env_broad' group by s.object order by count(1) desc"
cut -f1,2,3,5,6 [email protected] > $@
rm -rf [email protected]

local/goldterms-env_local_scale-of-environmental-terrestrial-soil.tsv: local/envo_goldterms.db # counts by path, not by bold biosamples
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_local_scale, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_local' group by s.object order by count(1) desc"
cut -f1,2,3,5,6 [email protected] > $@
rm -rf [email protected]

local/goldterms-env_medium-of-environmental-terrestrial-soil.tsv: local/envo_goldterms.db # counts by path, not by bold biosamples
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_local_scale, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_medium' group by s.object order by count(1) desc"
$(RUN) runoak --input $< query --output [email protected] --query "SELECT s.subject, SUBSTR(s.subject, INSTR(s.subject, ':') + 1) AS gold_path_id_int, s.object as inferred_env_medium, count(1) as path_count FROM entailed_edge ee JOIN statements s ON ee.subject = s.subject WHERE ee.predicate = 'rdfs:subClassOf' AND ee.object = 'GOLDTERMS:4212' AND s.predicate = 'mixs:env_medium' group by s.object order by count(1) desc"
cut -f1,2,3,5,6 [email protected] > $@
rm -rf [email protected]

local/goldData_biosamples-inferred-soil-env_local_scale.tsv: local/goldData_biosamples.csv local/goldterms-env_local_scale-of-environmental-terrestrial-soil.tsv
local/goldData_biosamples-inferred-soil-env_broad_scale.tsv: local/goldData_biosamples.csv local/goldterms-env_broad_scale-of-environmental-terrestrial-soil.tsv
$(RUN) python -c "import pandas as pd; import sys; df1 = pd.read_csv(sys.argv[1]); df2 = pd.read_csv(sys.argv[2], sep='\t'); merged_df = pd.merge(df1, df2, left_on='BIOSAMPLE ECOSYSTEM PATH ID', right_on='gold_path_id_int', how='left'); merged_df.to_csv(sys.argv[3], index=False, sep='\t')" $(word 1,$^) $(word 2,$^) $@

local/goldData_biosamples-inferred-soil-env_broad_scale.tsv: local/goldData_biosamples.csv local/goldterms-env_broad_scale-of-environmental-terrestrial-soil.tsv
local/goldData_biosamples-inferred-water-env_broad_scale.tsv: local/goldData_biosamples.csv local/goldterms-env_broad_scale-of-environmental-water.tsv
$(RUN) python -c "import pandas as pd; import sys; df1 = pd.read_csv(sys.argv[1]); df2 = pd.read_csv(sys.argv[2], sep='\t'); merged_df = pd.merge(df1, df2, left_on='BIOSAMPLE ECOSYSTEM PATH ID', right_on='gold_path_id_int', how='left'); merged_df.to_csv(sys.argv[3], index=False, sep='\t')" $(word 1,$^) $(word 2,$^) $@

local/goldData_biosamples-inferred-soil-env_local_scale.tsv: local/goldData_biosamples.csv local/goldterms-env_local_scale-of-environmental-terrestrial-soil.tsv
$(RUN) python -c "import pandas as pd; import sys; df1 = pd.read_csv(sys.argv[1]); df2 = pd.read_csv(sys.argv[2], sep='\t'); merged_df = pd.merge(df1, df2, left_on='BIOSAMPLE ECOSYSTEM PATH ID', right_on='gold_path_id_int', how='left'); merged_df.to_csv(sys.argv[3], index=False, sep='\t')" $(word 1,$^) $(word 2,$^) $@

local/goldData_biosamples-inferred-soil-env_medium.tsv: local/goldData_biosamples.csv local/goldterms-env_medium-of-environmental-terrestrial-soil.tsv
$(RUN) python -c "import pandas as pd; import sys; df1 = pd.read_csv(sys.argv[1]); df2 = pd.read_csv(sys.argv[2], sep='\t'); merged_df = pd.merge(df1, df2, left_on='BIOSAMPLE ECOSYSTEM PATH ID', right_on='gold_path_id_int', how='left'); merged_df.to_csv(sys.argv[3], index=False, sep='\t')" $(word 1,$^) $(word 2,$^) $@

local/goldData_biosamples-inferred-soil-env_local_scale-counts.tsv: local/goldData_biosamples-inferred-soil-env_local_scale.tsv
local/goldData_biosamples-inferred-soil-env_broad_scale-counts.tsv: local/goldData_biosamples-inferred-soil-env_broad_scale.tsv
cut -f19 $< | sed '1d' | sort | uniq -c | awk 'NR>1 {print $$2 "\t" $$1}' > $@ # NR>1 assumes there is an initial counts for an empty env_local_scale value

local/goldData_biosamples-inferred-soil-env_broad_scale-counts.tsv: local/goldData_biosamples-inferred-soil-env_broad_scale.tsv
local/goldData_biosamples-inferred-water-env_broad_scale-counts.tsv: local/goldData_biosamples-inferred-water-env_broad_scale.tsv
cut -f19 $< | sed '1d' | sort | uniq -c | awk 'NR>1 {print $$2 "\t" $$1}' > $@ # NR>1 assumes there is an initial counts for an empty env_local_scale value

local/goldData_biosamples-inferred-soil-env_local_scale-counts.tsv: local/goldData_biosamples-inferred-soil-env_local_scale.tsv
cut -f19 $< | sed '1d' | sort | uniq -c | awk 'NR>1 {print $$2 "\t" $$1}' > $@ # NR>1 assumes there is an initial counts for an empty env_local_scale value

local/goldData_biosamples-inferred-soil-env_medium-counts.tsv: local/goldData_biosamples-inferred-soil-env_medium.tsv
Expand Down
2 changes: 1 addition & 1 deletion Makefiles/mixs.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ RUN=poetry run
WGET=wget

# preferable to use a tagged release, but there's good stuff in this commit that hasn't been released yet
MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/b0b1e03b705cb432d08914c686ea820985b9cb20/src/mixs/schema/mixs.yaml
MIXS_YAML_URL=https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/3abc0096702ddbf453e7c115e978286f892edfe0/src/mixs/schema/mixs.yaml

# Download the MIxS schema pinned by MIXS_YAML_URL.
# Uses $(WGET) rather than a literal `wget` so that overriding WGET
# (e.g. `make WGET=/opt/bin/wget`) actually takes effect.
downloads/mixs.yaml:
	$(WGET) -O $@ $(MIXS_YAML_URL)
Expand Down
32 changes: 32 additions & 0 deletions Makefiles/ncbi_metadata.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ local/ncbi-mims-soil-biosamples-env_local_scale.csv:
# Count env_broad_scale attribute values for the MIMS.me.soil.6.0 package and
# write them as CSV, most frequent first.
# printf emits the dot-command and the query on separate lines regardless of
# shell; `echo "...\n..."` only does so where echo interprets backslash escapes.
local/ncbi-mims-soil-biosamples-env_broad_scale.csv:
	printf '%s\n' ".mode csv" "SELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_broad_scale' AND package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@


# Count env_broad_scale attribute values for the MIMS.me.water.6.0 package and
# write them as CSV, most frequent first.
# printf emits the dot-command and the query on separate lines regardless of
# shell; `echo "...\n..."` only does so where echo interprets backslash escapes.
local/ncbi-mims-water-biosamples-env_broad_scale.csv:
	printf '%s\n' ".mode csv" "SELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_broad_scale' AND package_content = 'MIMS.me.water.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@

# Count env_medium attribute values for the MIMS.me.soil.6.0 package and write
# them as CSV, most frequent first.
# printf emits the dot-command and the query on separate lines regardless of
# shell; `echo "...\n..."` only does so where echo interprets backslash escapes.
local/ncbi-mims-soil-biosamples-env_medium.csv:
	printf '%s\n' ".mode csv" "SELECT content, COUNT(1) AS sample_count FROM attributes WHERE harmonized_name = 'env_medium' AND package_content = 'MIMS.me.soil.6.0' GROUP BY content ORDER BY COUNT(1) DESC;" | duckdb $(NCBI_BIOSAMPLES_DUCKDB_PATH) > $@

Expand All @@ -141,6 +145,15 @@ local/ncbi-mims-soil-biosamples-env_broad_scale-normalized.csv: local/ncbi-mims-
--output-file $@ \
--val-col-name content


# Run normalize-envo-data over the raw water env_broad_scale counts: values are
# read from the `content` column, counts from `sample_count`, with ENVO as the
# ontology prefix.
local/ncbi-mims-water-biosamples-env_broad_scale-normalized.csv: local/ncbi-mims-water-biosamples-env_broad_scale.csv
	$(RUN) normalize-envo-data --count-col-name sample_count --input-file $(firstword $^) \
		--ontology-prefix ENVO --output-file $@ --val-col-name content

local/ncbi-mims-soil-biosamples-env_medium-normalized.csv: local/ncbi-mims-soil-biosamples-env_medium.csv
$(RUN) normalize-envo-data \
--count-col-name sample_count \
Expand Down Expand Up @@ -184,6 +197,16 @@ local/ncbi-mims-soil-biosamples-env_broad_scale-real-labels.csv: local/ncbi-mims
--addition-rename real_label \
--merged-file $@

# Merge the normalized_label column of the ENVO reference table (second
# prerequisite) into the normalized water counts (first prerequisite), joining
# on normalized_curie and renaming the added column to real_label.
local/ncbi-mims-water-biosamples-env_broad_scale-real-labels.csv: local/ncbi-mims-water-biosamples-env_broad_scale-normalized.csv local/envo-info.csv
	$(RUN) merge-in-reference-data \
		--keep-file $< \
		--keep-key normalized_curie \
		--reference-file $(lastword $^) \
		--reference-key normalized_curie \
		--reference-addition normalized_label \
		--addition-rename real_label \
		--merged-file $@

local/ncbi-mims-soil-biosamples-env_medium-real-labels.csv: local/ncbi-mims-soil-biosamples-env_medium-normalized.csv local/envo-info.csv
$(RUN) merge-in-reference-data \
--keep-file $(word 1,$^) \
Expand Down Expand Up @@ -212,6 +235,15 @@ local/ncbi-mims-soil-biosamples-env_broad_scale-annotated.tsv: local/ncbi-mims-s
--text-file $< \
--match-column normalized_label ; date

# Annotate the normalized_label column of the real-labels table with
# `runoak annotate` against sqlite:obo:envo, writing TSV to the target.
# `date` is printed before and after so the run time is visible in the log.
# The three commands are separate recipe lines (not joined with `;`) so that a
# failing runoak stops make instead of being masked by the trailing `date`.
local/ncbi-mims-water-biosamples-env_broad_scale-annotated.tsv: local/ncbi-mims-water-biosamples-env_broad_scale-real-labels.csv
	date
	$(RUN) runoak \
		--input sqlite:obo:envo annotate \
		--matches-whole-text \
		--output-type tsv \
		--output $@ \
		--text-file $< \
		--match-column normalized_label
	date

local/ncbi-mims-soil-biosamples-env_medium-annotated.tsv: local/ncbi-mims-soil-biosamples-env_medium-real-labels.csv
date ; $(RUN) runoak \
--input sqlite:obo:envo annotate \
Expand Down
7 changes: 7 additions & 0 deletions Makefiles/nmdc_metadata.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ local/nmdc-production-biosamples-env-context-authoritative-labels.tsv: local/nmd

local/nmdc-production-biosamples-env_package-predictions.tsv: local/nmdc-production-biosamples-env-context-authoritative-labels.tsv \
downloads/nmdc-production-studies.json
# local/env-package-heterogeneity.tsv is an output
# may also get printed to the console?
$(RUN) python external_metadata_awareness/predict_env_package_from_nmdc_context_authoritative_labels.py \
--input-file $(word 1,$^) \
--output-file $@ \
Expand All @@ -54,6 +56,11 @@ local/nmdc-production-biosamples-soil-env_broad_scale.tsv: local/nmdc-production
cut -f3 [email protected] | sed '1d' | sort | uniq -c | awk '{print $$2 "\t" $$1}' > $@
rm -rf [email protected]

local/nmdc-production-biosamples-water-env_broad_scale.tsv: local/nmdc-production-biosamples-env_package-predictions.tsv
$(RUN) python -c "import pandas as pd, sys; pd.read_csv(sys.argv[1], sep='\t').query('predicted_curated_env_package == \"water\"').to_csv(sys.argv[2], sep='\t', index=False)" $< [email protected]
cut -f3 [email protected] | sed '1d' | sort | uniq -c | awk '{print $$2 "\t" $$1}' > $@
rm -rf [email protected]

local/nmdc-production-biosamples-soil-env_medium.tsv: local/nmdc-production-biosamples-env_package-predictions.tsv
$(RUN) python -c "import pandas as pd, sys; pd.read_csv(sys.argv[1], sep='\t').query('predicted_curated_env_package == \"soil\"').to_csv(sys.argv[2], sep='\t', index=False)" $< [email protected]
cut -f7 [email protected] | sed '1d' | sort | uniq -c | awk '{print $$2 "\t" $$1}' > $@
Expand Down
1 change: 1 addition & 0 deletions Makefiles/soil-env_broad_scale.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ RUN=poetry run
WGET=wget

local/soil-env-broad-scale-evidence-table.tsv: config/soil-env_broad_scale-evidence-config.yaml \
local/biome-ids.tsv \
local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \
local/nmdc-production-biosamples-soil-env_broad_scale.tsv \
local/ncbi-mims-soil-biosamples-env_broad_scale-annotated.tsv \
Expand Down
3 changes: 2 additions & 1 deletion Makefiles/soil-env_medium.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ local/soil-env-medium-evidence-table.tsv: config/soil-env_medium-evidence-config
local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv \
local/nmdc-production-biosamples-soil-env_medium.tsv \
local/ncbi-mims-soil-biosamples-env_medium-annotated.tsv \
local/goldData_biosamples-inferred-soil-env_medium-counts.tsv
local/goldData_biosamples-inferred-soil-env_medium-counts.tsv \
local/soil-ids.tsv
$(RUN) python external_metadata_awareness/extract_value_set_evidence.py \
--config $< \
--downsample-uncounted \
Expand Down
22 changes: 22 additions & 0 deletions Makefiles/water-env_broad_scale.Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
RUN=poetry run
WGET=wget

# Build the water env_broad_scale evidence table. The YAML config is the first
# prerequisite and is passed via --config; the remaining prerequisites are
# listed so the table is rebuilt when any of them changes.
local/water-env-broad-scale-evidence-table.tsv: \
  config/water-env_broad_scale-evidence-config.yaml \
  local/biome-ids.tsv \
  local/nmdc-production-biosamples-water-env_broad_scale.tsv \
  local/ncbi-mims-water-biosamples-env_broad_scale-annotated.tsv \
  local/goldData_biosamples-inferred-water-env_broad_scale-counts.tsv
	$(RUN) python external_metadata_awareness/extract_value_set_evidence.py --config $(firstword $^) --output-file $@

# Delete the water evidence table together with the shared envo/GOLD/NMDC
# artifacts under local/ matched by the globs below.
# .PHONY must name THIS target (it previously named the soil cleanup target),
# otherwise a file called aggressive-water-env-broad-scale-cleanup would
# silently disable the cleanup.
.PHONY: aggressive-water-env-broad-scale-cleanup
aggressive-water-env-broad-scale-cleanup:
	rm -rf local/env-package-heterogeneity.tsv
	rm -rf local/envo-info*
	rm -rf local/goldData*
	rm -rf local/goldterms*
	# this glob also covers the -normalized, -real-labels and -annotated outputs
	rm -rf local/ncbi-mims-water-biosamples-env_broad_scale*
	rm -rf local/nmdc-production-biosamples*
	rm -rf local/water-env-broad-scale-evidence-table.tsv
4 changes: 4 additions & 0 deletions config/soil-env_broad_scale-evidence-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
# output_prefix: non_host_oak_queries
# header: false
# data_column_number: 1
- filename: local/biome-ids.tsv
output_prefix: all_biomes_oak
header: false
data_column_number: 1
- filename: local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv
output_prefix: historical_permissible_values
header: true
Expand Down
4 changes: 4 additions & 0 deletions config/soil-env_medium-evidence-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
# output_prefix: non_host_oak_queries
# header: false
# data_column_number: 1
- filename: local/soil-ids.tsv
output_prefix: all_soils_oak
header: false
data_column_number: 1
- filename: local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv
output_prefix: historical_permissible_values
header: true
Expand Down
25 changes: 25 additions & 0 deletions config/water-env_broad_scale-evidence-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Evidence sources for the water env_broad_scale evidence table.
# Each entry names an input file, the prefix used for its output, and which
# column(s) hold the values and, where present, their counts.
- filename: local/biome-ids.tsv
  output_prefix: all_biomes_oak
  header: false
  data_column_number: 1
# no legacy enum
- filename: local/nmdc-production-biosamples-water-env_broad_scale.tsv
  output_prefix: NMDC_water
  header: false
  data_column_number: 1
  count_column_number: 2
# The NCBI annotated file is read twice: once by its normalized_curie column,
# once by its matched_id column.
- filename: local/ncbi-mims-water-biosamples-env_broad_scale-annotated.tsv
  output_prefix: NCBI_mims_water_trusting_CURIe
  header: true
  data_column_name: normalized_curie
  count_column_name: count
- filename: local/ncbi-mims-water-biosamples-env_broad_scale-annotated.tsv
  output_prefix: NCBI_mims_water_trusting_labels
  header: true
  data_column_name: matched_id
  count_column_name: count
# GOLD water counts; the prefix was previously GOLD_env_terr_soil, copied from
# the soil config.
- filename: local/goldData_biosamples-inferred-water-env_broad_scale-counts.tsv
  output_prefix: GOLD_env_water
  header: false
  data_column_number: 1
  count_column_number: 2
Loading