diff --git a/assets/mixs_environments_env_materials_subsets.yaml.txt b/assets/mixs_environments_env_materials_subsets.yaml.txt new file mode 100644 index 0000000..912a6e0 --- /dev/null +++ b/assets/mixs_environments_env_materials_subsets.yaml.txt @@ -0,0 +1,139 @@ +"Here is the exhaustive YAML-formatted report of all environmental materials that\ + \ could reasonably be found in the Soil environment, with both the environmental\ + \ material id and label for each associated material:\n\nSoil:\n environmental\ + \ materials:\n - id: ENVO_00001998\n label: soil\n - id: ENVO_00002871\n\ + \ label: alluvial soil\n - id: ENVO_00003082\n label: enriched soil\n\ + \ - id: ENVO_00002229\n label: arenosol\n - id: ENVO_00002231\n \ + \ label: alisol\n - id: ENVO_00002232\n label: andosol\n - id: ENVO_00002233\n\ + \ label: albeluvisol\n - id: ENVO_01001397\n label: ultisol\n -\ + \ id: ENVO_00002235\n label: cambisol\n - id: ENVO_01001526\n label:\ + \ frozen soil\n - id: ENVO_00002237\n label: chernozem\n - id: ENVO_00002238\n\ + \ label: durisol\n - id: ENVO_00002239\n label: calcisol\n - id:\ + \ ENVO_00002240\n label: kastanozem\n - id: ENVO_00002241\n label:\ + \ leptosol\n - id: ENVO_00002242\n label: lixisol\n - id: ENVO_00002243\n\ + \ label: histosol\n - id: ENVO_00002244\n label: gleysol\n - id:\ + \ ENVO_00002245\n label: gypsisol\n - id: ENVO_00002246\n label: ferralsol\n\ + \ - id: ENVO_00002247\n label: nitisol\n - id: ENVO_00002248\n label:\ + \ luvisol\n - id: ENVO_00002249\n label: phaeozem\n - id: ENVO_00002250\n\ + \ label: plinthosol\n - id: ENVO_00002251\n label: planosol\n -\ + \ id: ENVO_00002252\n label: solonchak\n - id: ENVO_00002253\n label:\ + \ umbrisol\n - id: ENVO_00002254\n label: vertisol\n - id: ENVO_00002255\n\ + \ label: solonetz\n - id: ENVO_00002256\n label: regosol\n - id:\ + \ ENVO_00002257\n label: podzol\n - id: ENVO_00002258\n label: loam\n\ + \ - id: ENVO_00002259\n label: agricultural soil\n - id: ENVO_00005786\n\ + \ label: 
upland soil\n - id: ENVO_00002261\n label: forest soil\n \ + \ - id: ENVO_00002262\n label: clay soil\n - id: ENVO_00002263\n \ + \ label: garden soil\n - id: ENVO_00002273\n label: fluvisol\n - id:\ + \ ENVO_00002274\n label: stagnosol\n - id: ENVO_00002275\n label: technosol\n\ + \ - id: ENVO_00005755\n label: field soil\n - id: ENVO_00005741\n \ + \ label: alpine soil\n - id: ENVO_00005742\n label: arable soil\n -\ + \ id: ENVO_00005743\n label: roadside soil\n - id: ENVO_00005750\n \ + \ label: grassland soil\n - id: ENVO_00005747\n label: compost soil\n \ + \ - id: ENVO_00005748\n label: dry soil\n - id: ENVO_00005749\n label:\ + \ farm soil\n - id: ENVO_00005751\n label: jungle soil\n - id: ENVO_00005752\n\ + \ label: sawah soil\n - id: ENVO_00005754\n label: fertilized soil\n\ + \ - id: ENVO_00005756\n label: lawn soil\n - id: ENVO_00005760\n \ + \ label: burned soil\n - id: ENVO_00005761\n label: meadow soil\n -\ + \ id: ENVO_00005764\n label: pond soil\n - id: ENVO_00005766\n label:\ + \ limed soil\n - id: ENVO_00005767\n label: manured soil\n - id: ENVO_00005768\n\ + \ label: orchid soil\n - id: ENVO_00005771\n label: muddy soil\n \ + \ - id: ENVO_00005773\n label: pasture soil\n - id: ENVO_00005778\n \ + \ label: tropical soil\n - id: ENVO_00005780\n label: greenhouse soil\n\ + \ - id: ENVO_00005781\n label: heat stressed soil\n - id: ENVO_00005782\n\ + \ label: ornithogenic soil\n - id: ENVO_00005790\n label: red soil\n\ + \ - id: ENVO_00005802\n label: bulk soil\n - id: ENVO_01001185\n \ + \ label: acidic soil\n - id: ENVO_01001616\n label: bare soil\n - id:\ + \ ENVO_01001638\n label: frost-susceptible soil\n - id: ENVO_02000059\n\ + \ label: surface soil\n - id: ENVO_02000138\n label: mangrove biome\ + \ soil\n - id: ENVO_06105205\n label: compacted soil\n - id: ENVO_03600036\n\ + \ label: pathogen-suppressive soil\n - id: ENVO_01000018\n label: gravel\n\ + \ - id: ENVO_01001125\n label: ice\n - id: ENVO_00001995\n label:\ + \ rock\n - id: 
ENVO_00000194\n label: scree\n - id: ENVO_01000660\n \ + \ label: tephra\n - id: ENVO_01000256\n label: mineral material\n \ + \ - id: ENVO_00002008\n label: dust\n - id: ENVO_00002164\n label:\ + \ fossil material\n - id: ENVO_01000000\n label: humus\n - id: ENVO_02000090\n\ + \ label: ash\n - id: ENVO_01001646\n label: amorphous solid\n -\ + \ id: ENVO_01000480\n label: glass\n - id: ENVO_01000560\n label: charcoal\n\ + \ - id: ENVO_01000845\n label: crystal\n - id: ENVO_01001121\n label:\ + \ plant matter\n - id: ENVO_01001231\n label: kerogen\n - id: ENVO_01001525\n\ + \ label: hard-frozen soil\n - id: ENVO_01001528\n label: friable-frozen\ + \ soil\n - id: ENVO_01001561\n label: gel\n - id: ENVO_01001850\n \ + \ label: frost\n - id: ENVO_03501307\n label: ceramic\n - id: ENVO_00002122\n\ + \ label: arsenic-rich mud\n - id: ENVO_00002133\n label: anaerobic\ + \ mud\n - id: ENVO_00002160\n label: estuarine mud\n - id: ENVO_00005795\n\ + \ label: marine mud\n - id: ENVO_00005797\n label: lake bottom mud\n\ + \ - id: ENVO_02000019\n label: bodily fluid material\n - id: ENVO_04000008\n\ + \ label: soil organic matter\n - id: ENVO_01000063\n label: planktonic\ + \ material\n - id: ENVO_01000156\n label: biofilm material\n - id: ENVO_01000157\n\ + \ label: microbial mat material\n - id: ENVO_01001103\n label: detritus\n\ + \ - id: ENVO_01000628\n label: plant litter\n - id: ENVO_01000349\n \ + \ label: root matter\n - id: ENVO_02000008\n label: cell culture\n \ + \ - id: ENVO_01001395\n label: necromass\n - id: ENVO_03501304\n \ + \ label: cellophane\n - id: ENVO_03600084\n label: lichen material\n \ + \ - id: ENVO_1000746\n label: marine mucilage\n - id: ENVO_00002001\n \ + \ label: waste water\n - id: ENVO_02000022\n label: excreta material\n\ + \ - id: ENVO_00002267\n label: industrial waste material\n - id: ENVO_01000371\n\ + \ label: agricultural waste material\n - id: ENVO_01000372\n label:\ + \ household waste material\n - id: ENVO_03510070\n label: toxic 
waste\n\ + \ - id: ENVO_03600006\n label: food waste\n - id: ENVO_01000017\n \ + \ label: sand\n - id: ENVO_00002007\n label: sediment\n - id: ENVO_01001563\n\ + \ label: quicksand\n - id: ENVO_01000016\n label: silt\n - id: ENVO_00002982\n\ + \ label: clay\n - id: ENVO_01000436\n label: waterborne particulate\ + \ matter\n - id: ENVO_03000021\n label: soot\n - id: ENVO_03000038\n\ + \ label: cryoconite deposit\n - id: ENVO_04000012\n label: particulate\ + \ organic matter\n - id: ENVO_00002170\n label: compost\n - id: ENVO_00003031\n\ + \ label: animal manure\n - id: ENVO_00002985\n label: oil\n - id:\ + \ ENVO_01000554\n label: hydrocarbon gas\n - id: ENVO_01001139\n label:\ + \ methane ice\n - id: ENVO_01001238\n label: residual kerogen\n - id:\ + \ ENVO_00005739\n label: sea foam\n - id: ENVO_01001089\n label: aerosolised\ + \ solids\n - id: ENVO_01001088\n label: aerosolised liquids\n - id: ENVO_01001652\n\ + \ label: atmospheric aerosol\n - id: ENVO_01000797\n label: gaseous\ + \ environmental material\n - id: ENVO_01000815\n label: liquid environmental\ + \ material\n - id: ENVO_01000231\n label: lava\n - id: ENVO_01000648\n\ + \ label: magma\n - id: ENVO_01000798\n label: plasma\n - id: ENVO_03600007\n\ + \ label: formation fluid\n - id: ENVO_03600082\n label: crustal fluid\n\ + \ - id: ENVO_02000118\n label: paraffin wax\n - id: ENVO_02000117\n \ + \ label: natural wax\n - id: ENVO_01001155\n label: astrogeological\ + \ gas\n - id: ENVO_01001126\n label: astrogeological ice\n - id: ENVO_01001647\n\ + \ label: colloid suspended in a hydrosphere\n - id: ENVO_02000047\n \ + \ label: animal feed\n - id: ENVO_02000055\n label: plant feed\n - id:\ + \ FOODON_00001002\n label: food product\n - id: ENVO_03000076\n label:\ + \ slush ice\n - id: ENVO_03501259\n label: litter\n - id: ENVO_03501303\n\ + \ label: natural-based polymer\n - id: ENVO_03501306\n label: mylar\n\ + \ - id: ENVO_03510015\n label: xanthan gum\n - id: ENVO_03510016\n \ + \ label: tragacanth\n - 
id: ENVO_03510018\n label: resin\n - id: ENVO_03510023\n\ + \ label: shellac\n - id: ENVO_03510037\n label: radioactive material\n\ + \ - id: ENVO_03510038\n label: volume of cyanoacrylate\n - id: ENVO_03510064\n\ + \ label: emissions from petroleum combustion\n - id: ENVO_03600013\n \ + \ label: bituminous sand\n - id: ENVO_03600017\n label: slurry\n -\ + \ id: ENVO_06105003\n label: thermoplastic material\n - id: ENVO_06105101\n\ + \ label: plastic\n - id: ENVO_00000003\n label: mine tailing\n -\ + \ id: ENVO_00002044\n label: sludge\n - id: ENVO_00002230\n label:\ + \ anthrosol\n - id: ENVO_00002870\n label: adobe\n - id: ENVO_01000458\n\ + \ label: concrete\n - id: ENVO_01000462\n label: masonry cement\n \ + \ - id: ENVO_01000461\n label: refined asphalt\n - id: ENVO_01000474\n\ + \ label: brick material\n - id: ENVO_01000476\n label: plaster\n \ + \ - id: ENVO_01001869\n label: fracking liquid\n - id: ENVO_02000123\n\ + \ label: paint\n - id: ENVO_03501253\n label: gypsum\n - id: ENVO_03501324\n\ + \ label: latex\n - id: ENVO_03510055\n label: rubber cement\n -\ + \ id: ENVO_03510056\n label: methyl cellulose paste\n - id: ENVO_03510057\n\ + \ label: permanent hair dye\n - id: ENVO_03510058\n label: pharmaceutical\ + \ ink\n - id: ENVO_03510060\n label: gel ink\n - id: ENVO_03510061\n\ + \ label: soy ink\n - id: ENVO_03510062\n label: aqueous inkjet printer\ + \ ink\n - id: ENVO_01001876\n label: graupel\n - id: ENVO_03000000\n\ + \ label: neve\n - id: ENVO_03000027\n label: powdery snow\n - id:\ + \ ENVO_03000002\n label: firn\n - id: ENVO_03000108\n label: slab snow\n\ + \ - id: ENVO_01001614\n label: ice-bearing permafrost\n - id: ENVO_03000088\n\ + \ label: methane-laden permafrost\n - id: ENVO_06105274\n label: sandy\ + \ loam\n - id: ENVO_06105275\n label: silty loam\n - id: ENVO_06105277\n\ + \ label: clay loam\n - id: ENVO_01001841\n label: volcanic soil\n \ + \ - id: ENVO_00002234\n label: acrisol\n - id: ENVO_00002236\n label:\ + \ cryosol\n - id: 
ENVO_00005765\n label: frozen compost soil\n - id:\ + \ ENVO_01001527\n label: plastic-frozen soil\n - id: ENVO_00005774\n \ + \ label: peat soil\n - id: ENVO_00005740\n label: paddy field soil\n \ + \ - id: ENVO_00005789\n label: bluegrass field soil\n - id: ENVO_00002260\n\ + \ label: dune soil\n - id: ENVO_00005769\n label: mountain forest soil\n\ + \ - id: ENVO_00005770\n label: beech forest soil\n - id: ENVO_00005783\n\ + \ label: leafy wood soil\n - id: ENVO_00005784\n label: spruce forest\ + \ soil\n - id: ENVO_00005787\n label: eucalyptus forest soil\n - id:\ + \ ENVO_00005744\n " diff --git a/nmdc_ontology/mixs_environments_extensions_to_envo_envirnmental_materials_by_claude.py b/nmdc_ontology/mixs_environments_extensions_to_envo_envirnmental_materials_by_claude.py index d2df64e..d0f11a9 100644 --- a/nmdc_ontology/mixs_environments_extensions_to_envo_envirnmental_materials_by_claude.py +++ b/nmdc_ontology/mixs_environments_extensions_to_envo_envirnmental_materials_by_claude.py @@ -1,11 +1,18 @@ import os import time +import yaml + from anthropic import Anthropic from dotenv import load_dotenv -load_dotenv("../local/.env") +load_dotenv("local/.env") + +mixs_extension_report_file = "assets/extension_report.yaml" +envo_class_annotations_file = "assets/report_envo_environmental_material_annotations.tsv" +envo_classes_description = "environmental materials" +output_file = "assets/mixs_environments_env_materials_subsets.yaml.txt" api_key = os.environ["ANTHROPIC_API_KEY"] @@ -15,12 +22,13 @@ client = Anthropic(api_key=api_key) -with open('../assets/extension_report.yaml', 'r') as file: +with open(mixs_extension_report_file, 'r') as file: mixs_environments = file.read() -with open('../assets/report_envo_environmental_material_annotations.tsv', 'r') as file: +with open(envo_class_annotations_file, 'r') as file: envo_materials = file.read() + def get_completion(client, prompt): while True: try: @@ -37,16 +45,19 @@ def get_completion(client, prompt): time.sleep(5) - 
completion = get_completion(client, - f"""Here are the definitions of environments, according to MIxS: {mixs_environments} -and the definitions of environmental materials, according to EnvO: {envo_materials}. + f"""Here are the definitions of environments, according to MIxS: {mixs_environments} +and the definitions of {envo_classes_description}, according to EnvO: {envo_materials}. Generate an exhaustive YAML-formatted report of all environmental materials that could reasonably be found in the Soil environment. When associating an environmental material with an environment, report both the environmental material id and the environmental material label for every associated environmental material. -""" -) +""") + +# print(completion) -print(completion) +# Open the output file in write mode ('w') +with open(output_file, 'w') as outfile: + # Write the completion text as-is; yaml.dump() would re-encode it as one quoted scalar + outfile.write(completion)
diff --git a/nmdc_ontology/report_envo_biome_annotations.py b/nmdc_ontology/report_envo_biome_annotations.py
new file mode 100644
index 0000000..b27ac23
--- /dev/null
+++ b/nmdc_ontology/report_envo_biome_annotations.py
@@ -0,0 +1,80 @@
+"""Export annotation values of EnvO classes under ENVO:00000428 to a TSV via SPARQL."""
+
+import requests
+from collections import defaultdict
+import pandas as pd
+
+output_file = "assets/report_envo_biome_annotations.tsv"
+
+endpoint = "http://3.236.215.220/repositories/nmdc-knowledgegraph"
+# https://graphdb-dev.microbiomedata.org/repositories/nmdc-metadata
+query = """
+PREFIX rdfs:
+PREFIX ENVO:
+PREFIX oboInOwl:
+PREFIX IAO:
+PREFIX rdf:
+PREFIX xsd:
+
+SELECT ?s ?p ?o
+WHERE {
+  VALUES ?p {
+    IAO:0000115
+    IAO:0000116
+    oboInOwl:hasBroadSynonym
+    oboInOwl:hasExactSynonym
+    oboInOwl:hasNarrowSynonym
+    oboInOwl:hasRelatedSynonym
+    rdfs:comment
+    rdfs:label
+  }
+  GRAPH {
+    ?s rdfs:subClassOf* ENVO:00000428 ;
+       ?p ?o .
+  }
+  FILTER (datatype(?o) = rdf:langString || datatype(?o) = xsd:string)
+}
+"""
+
+# Set the Content-Type header to application/sparql-query
+headers = {
+    "Content-Type": "application/sparql-query",
+    "Accept": "application/sparql-results+json"
+}
+
+# POST the query; the timeout keeps a stalled endpoint from hanging the make target forever
+response = requests.post(endpoint, data=query, headers=headers, timeout=300)
+
+# Check the response status code
+if response.status_code == 200:
+    # Parse the JSON response
+    results = response.json()
+
+    # Create a dictionary to store the pivot table data
+    pivot_data = defaultdict(lambda: defaultdict(list))
+
+    # Process the query results and populate the pivot table data
+    for result in results["results"]["bindings"]:
+        s = result["s"]["value"]
+        p = result["p"]["value"]
+        o = result["o"]["value"]
+
+        # Keep only the IRI fragment (or last path segment) as the row / column key
+        s_curie = s.split("#")[-1] if "#" in s else s.split("/")[-1]
+        p_curie = p.split("#")[-1] if "#" in p else p.split("/")[-1]
+
+        pivot_data[s_curie][p_curie].append(o)
+
+    # Convert the pivot table data to a Pandas DataFrame
+    df = pd.DataFrame.from_dict(pivot_data, orient='index')
+
+    # Fill empty cells with an empty string
+    df.fillna('', inplace=True)
+
+    # Save the DataFrame to a TSV file
+    df.to_csv(output_file, sep="\t")
+
+    print(f"DataFrame saved to {output_file}")
+else:
+    # Exit non-zero (message goes to stderr) so make stops instead of continuing without the TSV
+    raise SystemExit(f"Query failed with status code: {response.status_code}")
diff --git a/nmdc_ontology/report_envo_environmental_material_annotations.py b/nmdc_ontology/report_envo_environmental_material_annotations.py
new file mode 100644
index 0000000..36c155d
--- /dev/null
+++ b/nmdc_ontology/report_envo_environmental_material_annotations.py
@@ -0,0 +1,81 @@
+"""Export annotation values of EnvO classes under ENVO:00010483 to a TSV via SPARQL."""
+
+import requests
+from collections import defaultdict
+import pandas as pd
+
+endpoint = "http://3.236.215.220/repositories/nmdc-knowledgegraph"
+# https://graphdb-dev.microbiomedata.org/repositories/nmdc-metadata
+
+output_file = "assets/report_envo_environmental_material_annotations.tsv"
+
+query = """
+PREFIX rdfs:
+PREFIX ENVO:
+PREFIX oboInOwl:
+PREFIX IAO:
+PREFIX rdf:
+PREFIX xsd:
+
+SELECT ?s ?p ?o
+WHERE {
+  VALUES ?p {
+    IAO:0000115
+    IAO:0000116
+    oboInOwl:hasBroadSynonym
+    oboInOwl:hasExactSynonym
+    oboInOwl:hasNarrowSynonym
+    oboInOwl:hasRelatedSynonym
+    rdfs:comment
+    rdfs:label
+  }
+  GRAPH {
+    ?s rdfs:subClassOf* ENVO:00010483 ;
+       ?p ?o .
+  }
+  FILTER (datatype(?o) = rdf:langString || datatype(?o) = xsd:string)
+}
+"""
+
+# Set the Content-Type header to application/sparql-query
+headers = {
+    "Content-Type": "application/sparql-query",
+    "Accept": "application/sparql-results+json"
+}
+
+# POST the query; the timeout keeps a stalled endpoint from hanging the make target forever
+response = requests.post(endpoint, data=query, headers=headers, timeout=300)
+
+# Check the response status code
+if response.status_code == 200:
+    # Parse the JSON response
+    results = response.json()
+
+    # Create a dictionary to store the pivot table data
+    pivot_data = defaultdict(lambda: defaultdict(list))
+
+    # Process the query results and populate the pivot table data
+    for result in results["results"]["bindings"]:
+        s = result["s"]["value"]
+        p = result["p"]["value"]
+        o = result["o"]["value"]
+
+        # Keep only the IRI fragment (or last path segment) as the row / column key
+        s_curie = s.split("#")[-1] if "#" in s else s.split("/")[-1]
+        p_curie = p.split("#")[-1] if "#" in p else p.split("/")[-1]
+
+        pivot_data[s_curie][p_curie].append(o)
+
+    # Convert the pivot table data to a Pandas DataFrame
+    df = pd.DataFrame.from_dict(pivot_data, orient='index')
+
+    # Fill empty cells with an empty string
+    df.fillna('', inplace=True)
+
+    # Save the DataFrame to a TSV file
+    df.to_csv(output_file, sep="\t")
+
+    print(f"DataFrame saved to {output_file}")
+else:
+    # Exit non-zero (message goes to stderr) so make stops instead of continuing without the TSV
+    raise SystemExit(f"Query failed with status code: {response.status_code}")
diff --git a/nmdc_ontology/report_mixs_extensions.py b/nmdc_ontology/report_mixs_extensions.py
new file mode 100644
index 0000000..5a4956d
--- /dev/null
+++ b/nmdc_ontology/report_mixs_extensions.py
@@ -0,0 +1,62 @@
+import csv
+import io
+
+import yaml
+import sys
+import requests
+
+# input_file = 'assets/class_summary_results.tsv'
+output_file = 'assets/extension_report.yaml'
+
+# Define the URL of the TSV file
+# hopefully this will get merged in soon
+# https://github.com/GenomicsStandardsConsortium/mixs/pull/769
+
+url = "https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/c196fef8d9864b15db1abc71e66c4c0ddd8bdcee/class_summary_results.tsv"
+
+# or download mixs.yaml and build here?
+# uses schemasheets and then runs that through a python filtering script.
+# probably don't want to have that filtering code in multiple places
+
+# Send a GET request to the URL (bounded, so a stalled download cannot hang the make target)
+response = requests.get(url, timeout=60)
+
+# Check if the request was successful
+if response.status_code == 200:
+    # Read the content of the response as text
+    data = response.text
+
+    # Use a StringIO object to treat the text data as a file-like object
+    file = io.StringIO(data)
+
+    # Use csv.DictReader to read the data
+    reader = csv.DictReader(file, delimiter='\t')
+
+    # Filter rows where is_a = "Extension"
+    extension_rows = [row for row in reader if row['is_a'] == 'Extension']
+else:
+    # Stop here: extension_rows is undefined on failure, so continuing would raise NameError
+    sys.exit(f"Error: Failed to retrieve data from {url}. Status code: {response.status_code}")
+
+# Create a dictionary to store the YAML data
+yaml_data = {}
+
+# Extract the desired fields for each Extension row
+for row in extension_rows:
+    extension_key = row['class'].strip()
+
+    yaml_data[extension_key] = {}
+
+    for key in ['title', 'description', 'comments', 'use_cases']:
+        value = (row[key] or '').strip()  # DictReader pads short rows with None
+        if '\u2019' in value:
+            print(f"Replacing '\\u2019' with ''' in field '{key}' of extension '{extension_key}'", file=sys.stderr)
+            value = value.replace('\u2019', "'")  # Replace \u2019 with ASCII equivalent of \u0027
+        if value:
+            yaml_data[extension_key][key] = value
+
+# Write the YAML data to a file
+with open(output_file, 'w') as file:
+    yaml.dump(yaml_data, file, default_flow_style=False)
+
+print(f"YAML data written to {output_file}")
diff --git a/qc.Makefile b/qc.Makefile
index 2fb60d7..837e6f3 100644
--- a/qc.Makefile
+++ b/qc.Makefile
@@ -136,3 +136,26 @@ src/ontology/imports/report-unlabelled-classes.txt: qc-reports/report-unlabelled
 	awk 'NR > 1 ' $< | tr -d '<>' > $@.tmp
 	cat assets/additional-extracts.txt $@.tmp | sort -u > $@
 	rm $@.tmp
+
+###
+
+.PHONY: envo_mixs_all
+envo_mixs_all: envo_mixs_clean assets/mixs_environments_env_materials_subsets.yaml.txt
+
+.PHONY: envo_mixs_clean
+envo_mixs_clean:
+	rm -rf assets/extension_report.yaml \
+		assets/report_envo_environmental_material_annotations.tsv \
+		assets/mixs_environments_env_materials_subsets.yaml.txt
+
+assets/extension_report.yaml:
+	$(RUN) python nmdc_ontology/report_mixs_extensions.py
+
+assets/report_envo_biome_annotations.tsv:
+	$(RUN) python nmdc_ontology/report_envo_biome_annotations.py
+
+assets/report_envo_environmental_material_annotations.tsv:
+	$(RUN) python nmdc_ontology/report_envo_environmental_material_annotations.py
+
+assets/mixs_environments_env_materials_subsets.yaml.txt: assets/extension_report.yaml assets/report_envo_environmental_material_annotations.tsv
+	date && time $(RUN) python nmdc_ontology/mixs_environments_extensions_to_envo_envirnmental_materials_by_claude.py
\ No newline at end of file