diff --git a/nmdc_runtime/site/changesheets/base.py b/nmdc_runtime/site/changesheets/base.py index 1f4ea745..60a8dd6c 100644 --- a/nmdc_runtime/site/changesheets/base.py +++ b/nmdc_runtime/site/changesheets/base.py @@ -4,14 +4,17 @@ """ import logging +import os import time from dataclasses import dataclass, field +from dotenv import load_dotenv from pathlib import Path import requests from typing import Any, ClassVar, Dict, TypeAlias, Optional -from nmdc_runtime.site.resources import RuntimeApiUserClient +from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient +load_dotenv() logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(" "message)s" ) @@ -83,3 +86,24 @@ def write_changesheet(self) -> None: f.write(self.header + "\n") for line_item in self.line_items: f.write(line_item.line + "\n") + + +def get_runtime_client(use_dev_api): + if use_dev_api: + base_url = os.getenv("API_HOST_DEV") + logging.info("using Dev API...") + else: + base_url = os.getenv("API_HOST") + logging.info("using prod API...") + return RuntimeApiUserClient( + base_url=base_url, username=os.getenv("API_QUERY_USER"), + password=os.getenv("API_QUERY_PASS"), ) + + + +def get_gold_client(): + return GoldApiClient( + base_url=os.getenv("GOLD_API_BASE_URL"), + username=os.getenv("GOLD_API_USERNAME"), + password=os.getenv("GOLD_API_PASSWORD"), ) + diff --git a/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py b/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py index 032b3d37..b1b33e56 100755 --- a/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +++ b/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py @@ -89,13 +89,7 @@ def gold_biosample_to_nmdc_biosamples_and_omics_processing_records( # Search for NMDC biosamples with by GOLD biosample ID nmdc_biosamples = [] logging.info(f"Searching for NMDC biosamples with {goldbs_id}...") - nmdcbs_response = 
runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id) - if nmdcbs_response.status_code != 200: - logging.error( - f"Failed to retrieve NMDC biosamples with {goldbs_id}: {nmdcbs_response.status_code}" - ) - - nmdcbs = nmdcbs_response.json()["cursor"]["firstBatch"] + nmdcbs = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id) logging.info(f"Found {len(nmdcbs)} NMDC biosamples with {goldbs_id}...") nmdc_biosamples.extend(nmdcbs) @@ -103,13 +97,7 @@ def gold_biosample_to_nmdc_biosamples_and_omics_processing_records( logging.info( f"Searching for NMDC omics processing name containing {goldbs_name_suffix}..." ) - omprc_response = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix) - if omprc_response.status_code != 200: - logging.error( - f"Failed to retrieve NMDC omics processing with {goldbs_name_suffix}: {omprc_response.status_code}" - ) - - omprc_records = omprc_response.json()["cursor"]["firstBatch"] + omprc_records = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix) for omprc in omprc_records: omprc_id = omprc["id"] logging.info(f"omprc_id: {omprc_id}") diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py new file mode 100644 index 00000000..b9870ea1 --- /dev/null +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +""" +neon_soils_add_ncbi_ids.py: Add NCBI biosample accessions to neon soils +biosamples, NCBI bioproject accessions to omics processing, and +NCBI Umbrella bioproject accession to neon soils study. 
+""" +import logging +import time + +import click +from dotenv import load_dotenv + +from nmdc_runtime.site.changesheets.base import (Changesheet, + ChangesheetLineItem, + get_gold_client, + get_runtime_client) + +load_dotenv() +NAME = "neon_soils_add_ncbi_ids" +NMDC_STUDY_ID = "nmdc:sty-11-34xj1150" +UMBRELLA_BIOPROJECT_ACCESSION = "PRJNA1029061" + +log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log" +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", + filename=log_filename, encoding="utf-8", filemode="w", ) + + +def _get_change_for_biosample(biosample, ncbi_biosample_accession): + """ + Get the change for the given biosample + :param biosample: dict - the biosample + :param ncbi_biosample_accession: str - the NCBI BioSample accession + :return: ChangesheetLineItem - the change, or None if the accession is already in insdc_biosample_identifiers + """ + ncbi_biosample_accessions = biosample.get("insdc_biosample_identifiers", []) + if ncbi_biosample_accession in ncbi_biosample_accessions: + return + biosample_id = biosample["id"] + logging.info(f"creating change for biosample_id: {biosample_id}") + return ChangesheetLineItem( + id=biosample["id"], action="insert", + attribute="insdc_biosample_identifiers", + value="biosample:" + ncbi_biosample_accession + "|", ) + +def _get_change_for_omics_processing(omics_processing_record, + ncbi_bioproject_accession): + """ + Get the change for the given omics_processing_record + :param omics_processing_record: dict - the omics processing record + :param ncbi_bioproject_accession: str - the NCBI BioProject accession + :return: ChangesheetLineItem - the change, or None if the accession is already in insdc_bioproject_identifiers + """ + ncbi_bioproject_accessions = omics_processing_record.get( + "insdc_bioproject_identifiers", []) + if ncbi_bioproject_accession in ncbi_bioproject_accessions: + return + omics_processing_id = omics_processing_record["id"] + logging.info(f"creating change for omics_processing_id: {omics_processing_id}") + return ChangesheetLineItem( + id=omics_processing_id, action="insert", + attribute="insdc_bioproject_identifiers", + value="bioproject:" + ncbi_bioproject_accession + "|", ) + + +@click.command() 
+@click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID") +@click.option( + "--use_dev_api", is_flag=True, default=True, help="Use the dev API" +) +def generate_changesheet(study_id, use_dev_api): + """ + Generate a changesheet for neon soils study and biosamples by: + 0. Changesheet line item: Umbrella BioProjectAccession to + study.insdc_bioproject_identifiers + 1. Retrieving all gold_study_identifiers for the neon soils study + 2. For each gold_study_identifier, retrieve the GOLD projects + 3. For each GOLD project, + A. retrieve the corresponding NMDC biosample(s). For each biosample, + - Changesheet line item: NCBI BioSampleAccession to + insdc_biosample_identifiers + B. Retrieve the corresponding NMDC omics_processing. For each, + - Changesheet line item: NCBI BioProjectAccession to + insdc_bioproject_identifiers + + WARNING: This script is not idempotent. It will generate a new changesheet + each time it is run. + Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output + + :param study_id: The NMDC study ID + :param use_dev_api: Use the dev API (default: True) + :return: + """ + start_time = time.time() + logging.info(f"Generating changesheet for {study_id}") + logging.info(f"Using dev API: {use_dev_api}") + + # Initialize the NMDC API + runtime_client = get_runtime_client(use_dev_api) + + # Initialize the GOLD API + gold_client = get_gold_client() + + # Initialize the changesheet + changesheet = Changesheet(name=NAME) + + # 1. 
Retrieve all gold_study_identifiers for the neon soils study + logging.info(f"Retrieving gold_study_identifiers for {study_id}") + res = runtime_client.request("GET", f"/studies/{study_id}") + nmdc_study = res.json() + changesheet.line_items.append( + ChangesheetLineItem( + id=study_id, action="insert", + attribute="insdc_bioproject_identifiers", + value="bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION + "|", ) + ) + + gold_study_identifiers = nmdc_study["gold_study_identifiers"] + logging.info(f"gold_study_identifiers: {gold_study_identifiers}") + gold_project_count = 0 + biosample_count = 0 + for gold_study_identifier in gold_study_identifiers: + + # 2. For each gold_study_identifier, retrieve the GOLD projects + if gold_study_identifier == 'gold:Gs0144570': + # TODO verify that this one has already been done + continue + logging.info( + f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}" + ) + projects = gold_client.fetch_projects_by_study(gold_study_identifier) + logging.info(f"Retrieved {len(projects)} projects") + + # 3. For each GOLD project, + for project in projects: + gold_project_count += 1 + project_gold_id = project["projectGoldId"] + biosample_gold_id = project["biosampleGoldId"] + ncbi_bioproject_accession = project["ncbiBioProjectAccession"] + ncbi_biosample_accession = project["ncbiBioSampleAccession"] + + # A. retrieve the corresponding NMDC biosample(s) + logging.info( + f"Retrieving NMDC biosamples for biosample_gold_id: {biosample_gold_id}" + ) + biosamples = runtime_client.get_biosamples_by_gold_biosample_id( + biosample_gold_id + ) + logging.info(f"Retrieved {len(biosamples)} biosamples") + for biosample in biosamples: + biosample_count += 1 + biosample_id = biosample["id"] + logging.info(f"biosample_id: {biosample_id}") + # NcbiBioSampleAccession to insdc_biosample_identifiers + change =_get_change_for_biosample( + biosample, ncbi_biosample_accession + ) + if change: + changesheet.line_items.append(change) + + # B. 
Retrieve the corresponding NMDC omics_processing + logging.info( + f"Retrieving NMDC omics_processing for project_gold_id: {project_gold_id}" + ) + omics_processing_records = ( + runtime_client.get_omics_processing_records_by_gold_project_id( + project_gold_id + )) + logging.info( + f"Retrieved {len(omics_processing_records)} omics_processings" + ) + for omics_processing in omics_processing_records: + omics_processing_id = omics_processing["id"] + logging.info( + f"omics_processing_id: {omics_processing_id}" + ) + # NcbiBioProjectAccession to insdc_bioproject_identifiers + change = _get_change_for_omics_processing( + omics_processing, ncbi_bioproject_accession + ) + if change: + changesheet.line_items.append(change) + + logging.info(f"gold_project_count: {gold_project_count}") + logging.info(f"biosample_count: {biosample_count}") + logging.info(f"changesheet has {len(changesheet.line_items)} line items") + + # Write the changesheet + changesheet.write_changesheet() + + # Validate the changesheet + if changesheet.validate_changesheet(runtime_client.base_url): + logging.info(f"Changesheet is valid") + else: + logging.error(f"Changesheet is invalid") + + logging.info(f"Completed in {time.time() - start_time} seconds") + + +if __name__ == "__main__": + generate_changesheet() diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 22b22d6d..a831da33 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -27,10 +27,12 @@ from nmdc_runtime.api.models.object import DrsObject, AccessURL, DrsObjectIn from nmdc_runtime.api.models.operation import ListOperationsResponse from nmdc_runtime.api.models.util import ListRequest +from nmdc_runtime.site.normalization.gold import normalize_gold_id from nmdc_runtime.util import unfreeze, nmdc_jsonschema_validator_noidpatterns from nmdc_schema import nmdc + class RuntimeApiClient: def __init__(self, base_url: str): self.base_url = base_url @@ -95,7 +97,8 @@ def get_run_info(self, 
run_id: str): return self.request("GET", f"/runs/{run_id}") def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str): - return self.request( + gold_biosample_id = normalize_gold_id(gold_biosample_id) + response = self.request( "POST", f"/queries:run", { @@ -107,9 +110,44 @@ def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str): }, }, ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] + + def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str): + gold_project_id = normalize_gold_id(gold_project_id) + response = self.request( + "POST", + f"/queries:run", + { + "find": "omics_processing_set", + "filter": { + "gold_sequencing_project_identifiers": { + "$elemMatch": {"$eq": gold_project_id} + } + }, + }, + ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] + + def get_biosamples_for_study(self, study_id: str): + response = self.request( + "POST", + f"/queries:run", + { + "find": "biosample_set", + "filter": { + "part_of": { + "$elemMatch": {"$eq": study_id} + } + }, + }, + ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] def get_omics_processing_by_name(self, name: str): - return self.request( + response = self.request( "POST", f"/queries:run", { @@ -117,6 +155,8 @@ def get_omics_processing_by_name(self, name: str): "filter": {"name": {"$regex": name, "$options": "i"}}, }, ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] class RuntimeApiSiteClient(RuntimeApiClient): @@ -320,6 +360,8 @@ def fetch_study(self, id: str) -> Union[Dict[str, Any], None]: return results[0] + + @resource( config_schema={ "base_url": StringSource, diff --git a/requirements/dev.txt b/requirements/dev.txt index 4120028f..b87ef35a 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 
3.11 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/dev.txt --strip-extras requirements/dev.in @@ -9,7 +9,7 @@ attrs==23.1.0 # -c requirements/main.txt # cattrs # requests-cache -black==23.10.1 +black==23.11.0 # via # -c requirements/main.txt # -r requirements/dev.in @@ -40,11 +40,6 @@ docutils==0.20.1 # via # -c requirements/main.txt # readme-renderer -exceptiongroup==1.1.3 - # via - # -c requirements/main.txt - # cattrs - # pytest flake8==6.1.0 # via -r requirements/dev.in idna==3.4 @@ -64,7 +59,7 @@ invoke==2.2.0 # via -r requirements/dev.in jaraco-classes==3.3.0 # via keyring -keyring==24.2.0 +keyring==24.3.0 # via twine markdown-it-py==3.0.0 # via @@ -98,7 +93,7 @@ pip-tools==7.3.0 # via -r requirements/dev.in pkginfo==1.9.6 # via twine -platformdirs==3.11.0 +platformdirs==4.0.0 # via # -c requirements/main.txt # black diff --git a/requirements/main.in b/requirements/main.in index 067940de..c5c46de0 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -30,6 +30,7 @@ pandas passlib[bcrypt] pymongo pydantic[email]>=1.10.0 +python-dotenv python-jose[cryptography] python-multipart pyyaml diff --git a/requirements/main.txt b/requirements/main.txt index 5fba6141..c9e99118 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/main.txt --strip-extras requirements/main.in @@ -56,20 +56,20 @@ base32-lib==1.0.2 # via -r requirements/main.in bcrypt==4.0.1 # via passlib -beanie==1.23.1 +beanie==1.23.6 # via -r requirements/main.in beautifulsoup4==4.12.2 # via # -r requirements/main.in # mkdocs-mermaid2-plugin # nbconvert -black==23.10.1 +black==23.11.0 # via shed bleach==6.1.0 # via nbconvert -boto3==1.28.80 +boto3==1.29.0 # via -r requirements/main.in -botocore==1.31.80 +botocore==1.32.0 # 
via # boto3 # s3transfer @@ -121,25 +121,25 @@ croniter==2.0.1 # via dagster cryptography==41.0.5 # via python-jose -curies==0.7.2 +curies==0.7.4 # via linkml-runtime -dagit==1.5.6 +dagit==1.5.7 # via -r requirements/main.in -dagster==1.5.6 +dagster==1.5.7 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.5.6 +dagster-graphql==1.5.7 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.5.6 +dagster-pipes==1.5.7 # via dagster -dagster-postgres==0.21.6 +dagster-postgres==0.21.7 # via -r requirements/main.in -dagster-webserver==1.5.6 +dagster-webserver==1.5.7 # via dagit debugpy==1.8.0 # via ipykernel @@ -169,17 +169,11 @@ email-validator==2.1.0.post1 # via pydantic et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.1.3 - # via - # anyio - # cattrs - # ipython - # pytest executing==2.0.1 # via stack-data fastapi==0.104.1 # via -r requirements/main.in -fastjsonschema==2.18.1 +fastjsonschema==2.19.0 # via # -r requirements/main.in # nbformat @@ -316,7 +310,7 @@ jsonschema==4.19.2 # linkml # linkml-runtime # nbformat -jsonschema-specifications==2023.7.1 +jsonschema-specifications==2023.11.1 # via jsonschema jupyter==1.0.0 # via -r requirements/main.in @@ -359,7 +353,7 @@ jupyterlab==4.0.8 # notebook jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.25.0 +jupyterlab-server==2.25.1 # via # jupyterlab # notebook @@ -377,13 +371,13 @@ linkml==1.6.2 # nmdc-schema linkml-dataops==0.1.0 # via linkml -linkml-runtime==1.6.1 +linkml-runtime==1.6.2 # via # -r requirements/main.in # linkml # linkml-dataops # nmdc-schema -mako==1.2.4 +mako==1.3.0 # via alembic markdown==3.5.1 # via @@ -460,7 +454,7 @@ notebook-shim==0.2.3 # via # jupyterlab # notebook -numpy==1.26.1 +numpy==1.26.2 # via pandas numpydoc==1.6.0 # via terminusdb-client @@ -487,7 +481,7 @@ packaging==23.2 # sphinx paginate==0.5.6 # via mkdocs-material -pandas==2.1.2 +pandas==2.1.3 # via -r requirements/main.in pandocfilters==1.5.0 
# via nbconvert @@ -505,7 +499,7 @@ pendulum==2.1.2 # via dagster pexpect==4.8.0 # via ipython -platformdirs==3.11.0 +platformdirs==4.0.0 # via # black # jupyter-core @@ -525,7 +519,7 @@ prefixmaps==0.1.5 # linkml-runtime prometheus-client==0.18.0 # via jupyter-server -prompt-toolkit==3.0.39 +prompt-toolkit==3.0.41 # via # ipython # jupyter-console @@ -549,7 +543,7 @@ pyasn1==0.5.0 # rsa pycparser==2.21 # via cffi -pydantic==2.4.2 +pydantic==2.5.0 # via # -r requirements/main.in # beanie @@ -560,7 +554,7 @@ pydantic==2.4.2 # linkml # linkml-runtime # pydantic -pydantic-core==2.10.1 +pydantic-core==2.14.1 # via pydantic pyflakes==3.1.0 # via autoflake @@ -578,7 +572,7 @@ pyjsg==0.11.10 # linkml # pyshexc # shexjsg -pymdown-extensions==10.3.1 +pymdown-extensions==10.4 # via # mkdocs-material # mkdocs-mermaid2-plugin @@ -611,6 +605,7 @@ python-dateutil==2.8.2 # pendulum python-dotenv==1.0.0 # via + # -r requirements/main.in # dagster # uvicorn python-jose==3.3.0 @@ -675,7 +670,7 @@ rdflib-shim==1.0.3 # pyshex # pyshexc # sparqlslurper -referencing==0.30.2 +referencing==0.31.0 # via # jsonschema # jsonschema-specifications @@ -804,7 +799,7 @@ tabulate==0.9.0 # numpydoc tenacity==8.2.3 # via -r requirements/main.in -terminado==0.17.1 +terminado==0.18.0 # via # jupyter-server # jupyter-server-terminals @@ -819,14 +814,7 @@ toml==0.10.2 # beanie # jupytext tomli==2.0.1 - # via - # autoflake - # black - # dagster - # jupyterlab - # numpydoc - # pytest - # setuptools-scm + # via dagster toolz==0.12.0 # via -r requirements/main.in toposort==1.10 @@ -868,10 +856,6 @@ types-python-dateutil==2.8.19.14 typing-extensions==4.8.0 # via # alembic - # async-lru - # beanie - # black - # cattrs # dagster # fastapi # libcst @@ -881,7 +865,6 @@ typing-extensions==4.8.0 # setuptools-scm # sqlalchemy # typing-inspect - # uvicorn typing-inspect==0.9.0 # via libcst tzdata==2023.3 @@ -912,7 +895,7 @@ watchdog==3.0.0 # mkdocs watchfiles==0.21.0 # via uvicorn -wcwidth==0.2.9 +wcwidth==0.2.10 # 
via prompt-toolkit webcolors==1.13 # via jsonschema @@ -926,7 +909,7 @@ websockets==12.0 # via uvicorn widgetsnbextension==4.0.9 # via ipywidgets -wrapt==1.15.0 +wrapt==1.16.0 # via deprecated xlrd==2.0.1 # via -r requirements/main.in