Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

valueset-streamlit #29

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Makefiles/envo.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,16 @@ local/envo-info.csv: local/envo-info.txt
--output-file $@

local/biome-info.txt:
$(RUN) runoak --input sqlite:obo:envo info .desc//p=i ENVO:00000428 > $@
$(RUN) runoak --input sqlite:obo:envo info .desc//p=i biome > $@

local/biome-ids.tsv: local/biome-info.txt
cut -f1 -d' ' $< > $@

local/soil-info.txt:
$(RUN) runoak --input sqlite:obo:envo info .desc//p=i soil > $@

local/soil-ids.tsv: local/soil-info.txt
cut -f1 -d' ' $< > $@

local/unused-terrestrial-biomes-prompt.txt: prompt-templates/unused-terrestrial-biomes-prompt.yaml \
local/soil-env_broad_scale-algebraic.txt local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \
Expand Down
1 change: 1 addition & 0 deletions Makefiles/soil-env_broad_scale.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ RUN=poetry run
WGET=wget

local/soil-env-broad-scale-evidence-table.tsv: config/soil-env_broad_scale-evidence-config.yaml \
local/biome-ids.tsv \
local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv \
local/nmdc-production-biosamples-soil-env_broad_scale.tsv \
local/ncbi-mims-soil-biosamples-env_broad_scale-annotated.tsv \
Expand Down
3 changes: 2 additions & 1 deletion Makefiles/soil-env_medium.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ local/soil-env-medium-evidence-table.tsv: config/soil-env_medium-evidence-config
local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv \
local/nmdc-production-biosamples-soil-env_medium.tsv \
local/ncbi-mims-soil-biosamples-env_medium-annotated.tsv \
local/goldData_biosamples-inferred-soil-env_medium-counts.tsv
local/goldData_biosamples-inferred-soil-env_medium-counts.tsv \
local/soil-ids.tsv
$(RUN) python external_metadata_awareness/extract_value_set_evidence.py \
--config $< \
--downsample-uncounted \
Expand Down
4 changes: 4 additions & 0 deletions config/soil-env_broad_scale-evidence-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
# output_prefix: non_host_oak_queries
# header: false
# data_column_number: 1
- filename: local/biome-ids.tsv
output_prefix: all_biomes_oak
header: false
data_column_number: 1
- filename: local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv
output_prefix: historical_permissible_values
header: true
Expand Down
4 changes: 4 additions & 0 deletions config/soil-env_medium-evidence-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
# output_prefix: non_host_oak_queries
# header: false
# data_column_number: 1
- filename: local/soil-ids.tsv
output_prefix: all_soils_oak
header: false
data_column_number: 1
- filename: local/EnvMediumSoilEnum-pvs-keys-parsed-unique.csv
output_prefix: historical_permissible_values
header: true
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,009 changes: 1,009 additions & 0 deletions env-triad-voting-data/with-iaa/old/soil-ebs-with-iaa.csv

Large diffs are not rendered by default.

1,158 changes: 1,158 additions & 0 deletions env-triad-voting-data/with-iaa/old/soil-els-with-iaa.csv

Large diffs are not rendered by default.

735 changes: 735 additions & 0 deletions env-triad-voting-data/with-iaa/old/soil-em-with-iaa.csv

Large diffs are not rendered by default.

1,009 changes: 1,009 additions & 0 deletions env-triad-voting-data/with-iaa/soil-ebs-with-iaa.csv

Large diffs are not rendered by default.

1,158 changes: 1,158 additions & 0 deletions env-triad-voting-data/with-iaa/soil-els-with-iaa.csv

Large diffs are not rendered by default.

735 changes: 735 additions & 0 deletions env-triad-voting-data/with-iaa/soil-em-with-iaa.csv

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion external_metadata_awareness/extract_value_set_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ def extract_columns(config: str, output_file: str, downsample_uncounted: bool) -
'ENVO:00002030': 'is_aquatic_biome', # Biome
'ENVO:00000446': 'is_terrestrial_biome', # Biome
'ENVO:00010483': 'is_environmental_material', # Environmental material
'ENVO:00001998': 'is_soil',
'ENVO:00003082': 'is_enriched_soil',
}

# Add boolean columns based on ontology subclass relationships
Expand All @@ -224,7 +226,9 @@ def extract_columns(config: str, output_file: str, downsample_uncounted: bool) -
columns.insert(7, columns.pop(columns.index('is_aquatic_biome')))
columns.insert(8, columns.pop(columns.index('is_terrestrial_biome')))
columns.insert(9, columns.pop(columns.index('is_environmental_material')))
columns.insert(10, columns.pop(columns.index('obsolete')))
columns.insert(10, columns.pop(columns.index('is_soil')))
columns.insert(11, columns.pop(columns.index('is_enriched_soil')))
columns.insert(12, columns.pop(columns.index('obsolete')))
final_df_with_labels = final_df_with_labels[columns]

# Save the final DataFrame to the specified output file
Expand Down
32 changes: 32 additions & 0 deletions iaa.Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Runner prefix for Python commands; override on the command line
# (e.g. `make -f iaa.Makefile RUN=`) to use an already-activated environment.
RUN=poetry run

# Vote-column flags handed to iaa.py. The same five columns are used for every
# table below, so they are declared once. 'MAM vote' contains a space and must
# stay quoted so the shell passes it as a single argument.
IAA_VOTE_COLUMN_ARGS=--vote-columns CJM_Vote \
    --vote-columns 'MAM vote' \
    --vote-columns MLS_vote \
    --vote-columns NMW_vote \
    --vote-columns SM_vote

# Each rule reads its single prerequisite ($<) and writes the target ($@)
# via iaa.py.

env-triad-voting-data/with-iaa/soil-ebs-with-iaa.csv: env-triad-voting-data/consolidated-additional-columns/Consolidated_soil-env-broad-scale-evidence-table-additional-booleans.csv
	$(RUN) python iaa.py \
		--input-file $< \
		$(IAA_VOTE_COLUMN_ARGS) \
		--output-file $@

env-triad-voting-data/with-iaa/soil-els-with-iaa.csv: env-triad-voting-data/consolidated-additional-columns/Consolidated_soil-env-local-scale-evidence-table-additional-booleans.csv
	$(RUN) python iaa.py \
		--input-file $< \
		$(IAA_VOTE_COLUMN_ARGS) \
		--output-file $@

env-triad-voting-data/with-iaa/soil-em-with-iaa.csv: env-triad-voting-data/consolidated-additional-columns/Consolidated_soil-env-medium-evidence-table-additional-booleans.csv
	$(RUN) python iaa.py \
		--input-file $< \
		$(IAA_VOTE_COLUMN_ARGS) \
		--output-file $@

50 changes: 50 additions & 0 deletions iaa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd
import itertools
import click
from typing import List, Tuple


def safe_int_convert(x):
    """Convert *x* to an ``int``, returning 0 when it cannot be converted.

    Anything ``int()`` rejects maps to 0: non-numeric strings, empty strings,
    NaN, ``None`` and infinities.

    Args:
        x: The raw value to convert (string, number, NaN, ``None``, ...).

    Returns:
        ``int(x)`` when the conversion succeeds, otherwise 0.
    """
    try:
        return int(x)
    # ValueError: NaN and unparsable strings (pandas' IntCastingNaNError is a
    # ValueError subclass, so it is covered too).
    # TypeError: None and other non-numeric objects.
    # OverflowError: +/- infinity.
    except (TypeError, ValueError, OverflowError):
        return 0


def calculate_iaa(row: pd.Series) -> float:
    """Return the fraction of value pairs in *row* that are equal.

    Each cell is passed through ``safe_int_convert``; any result other than
    -1, 0 or 1 is then replaced by 0. The score is the number of unordered
    pairs holding equal values divided by the total number of unordered pairs.

    Args:
        row: One row of vote values, one entry per vote column.

    Returns:
        A value in [0, 1], or NaN when the row has fewer than two entries
        (there are no pairs, so the score is undefined).
    """
    values: List[int] = [v if v in (-1, 0, 1) else 0 for v in row.map(safe_int_convert)]
    pairs: List[Tuple[int, int]] = list(itertools.combinations(values, 2))
    if not pairs:
        # Zero or one entry: avoid dividing by zero and report "undefined".
        return float("nan")
    agreements: int = sum(1 for a, b in pairs if a == b)
    return agreements / len(pairs)


@click.command()
@click.option('--input-file', '-i', type=click.Path(exists=True), required=True, help='Path to input CSV file.')
@click.option('--output-file', '-o', type=click.Path(), required=True,
              help='Path where the output CSV file will be saved.')
@click.option('--vote-columns', '-v', multiple=True,
              default=['CJM_Vote', 'MAM vote', 'MLS_vote', 'NMW_vote', 'SM_vote'],
              help='Column names for votes.')
@click.option('--debug-toolbar/--no-debug-toolbar', default=False, help='Show or hide the Dash debug toolbar.')
def main(input_file: str, output_file: str, vote_columns: List[str], debug_toolbar: bool) -> None:
    """Add per-row agreement columns to a CSV of votes and write the result.

    For every vote column a ``<column>_safe_int`` column is added holding the
    value converted by ``safe_int_convert``. Two further columns are derived
    from those: ``IAA_score`` (via ``calculate_iaa``) and ``vote_sum`` (the
    row-wise sum). All rows and all original columns are kept in the output.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the augmented CSV is written to.
        vote_columns: Names of the vote columns in the input file.
        debug_toolbar: Accepted so existing command lines keep working; this
            command does not use it.

    Raises:
        click.BadParameter: If any requested vote column is absent from the
            input file.
    """
    df: pd.DataFrame = pd.read_csv(input_file)

    # Fail with a readable CLI error rather than a KeyError traceback.
    missing = [col for col in vote_columns if col not in df.columns]
    if missing:
        raise click.BadParameter(
            f"column(s) not found in {input_file}: {', '.join(missing)}",
            param_hint='--vote-columns',
        )

    # Step 1: normalise each vote column into an integer companion column.
    safe_int_vote_columns = [f"{col}_safe_int" for col in vote_columns]
    for col, safe_col in zip(vote_columns, safe_int_vote_columns):
        df[safe_col] = df[col].map(safe_int_convert)

    df['IAA_score'] = df[safe_int_vote_columns].apply(calculate_iaa, axis=1)
    df['vote_sum'] = df[safe_int_vote_columns].sum(axis=1)

    # Step 2: save the full augmented DataFrame (no rows are filtered out).
    df.to_csv(output_file, index=False)
    print(f"Updated CSV file with IAA scores has been created: {output_file}")


if __name__ == '__main__':
main()
Loading