From 7500f5faacc19ca48be04fca8f96db014ebe436f Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Wed, 1 Nov 2023 10:24:36 -0700 Subject: [PATCH 1/8] add script and api function --- .../scripts/neon_soils_add_ncbi_ids.py | 90 +++++++++++++++++++ nmdc_runtime/site/resources.py | 14 +++ 2 files changed, 104 insertions(+) create mode 100644 nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py new file mode 100644 index 00000000..04bb6c10 --- /dev/null +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +""" +neon_soils_add_ncbi_ids.py: Add NCBI biosample IDs to neon soils biosamples, and +add NCBI study ID to neon soils study. +""" +import logging +import os +from pathlib import Path +import time + +import click +from dotenv import load_dotenv + +from nmdc_runtime.site.changesheets.base import ( + Changesheet, + ChangesheetLineItem, + JSON_OBJECT, +) + +from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient + +load_dotenv() +NAME = "neon_soils_add_ncbi_ids" +NMDC_STUDY_ID = "nmdc:sty-11-34xj1150" + +log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log" +logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", + filename=log_filename, encoding="utf-8", filemode="w", ) + + +@click.command() +@click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID") +@click.option( + "--use_dev_api", is_flag=True, default=False, help="Use the dev API" +) +def generate_changesheet(study_id, use_dev_api): + """ + Generate a changesheet for neon soils study and biosamples by: + 1. Retrieving all biosamples for neon soils study + 2. For each biosample, retrieve the corresponding GOLD biosample record + 3. Retrieve the NCBI biosample ID from the GOLD biosample record + 4. Generate a changesheet for the neon soils biosamples, adding the NCBI IDs + 5. Add changesheet line item for NCDB study ID + + WARNING: This script is not idempotent. It will generate a new changesheet + each time it is run. + Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output + + :param study_id: The NMDC study ID + :param use_dev_api: Use the dev API (default: False) + :return: + """ + start_time = time.time() + logging.info(f"Generating changesheet for {study_id}") + logging.info(f"Using dev API: {use_dev_api}") + + # Initialize the NMDC API + if use_dev_api: + base_url = os.getenv("API_HOST_DEV") + logging.info("using dev API...") + else: + base_url = os.getenv("API_HOST") + logging.info("using prod API...") + + runtime_api_user_client = RuntimeApiUserClient( + base_url=base_url, + username=os.getenv("API_QUERY_USER"), + password=os.getenv("API_QUERY_PASS"), + ) + logging.info("connected to NMDC API...") + + # Initialize the GOLD API + gold_api_client = GoldApiClient( + base_url=os.getenv("GOLD_API_BASE_URL"), + username=os.getenv("GOLD_API_USERNAME"), + password=os.getenv("GOLD_API_PASSWORD"), + ) + logging.info("connected to GOLD API...") + + # Retrieve all biosamples for the neon soils study + biosamples = runtime_api_user_client.get_biosamples_for_study(study_id) + logging.info(f"retrieved {len(biosamples)} biosamples for {study_id}") + + + +if __name__ == "__main__": + generate_changesheet() diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 79cae368..5af88f1f 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -108,6 +108,20 @@ def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str): }, ) + def get_biosamples_for_study(self, study_id: str): + return self.request( + "POST", + f"/queries:run", + { + "find": "biosample_set", + "filter": { + "part_of": { + "$elemMatch": {"$eq": study_id} + } + }, + }, + ) + def get_omics_processing_by_name(self, name: str): return self.request( "POST", From 58ddbfae78ad140e778ca4cec8bc830c1c3c3cd2 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Wed, 1 Nov 2023 14:07:00 -0700 Subject: [PATCH 2/8] update script --- .../scripts/neon_soils_add_ncbi_ids.py | 26 ++++++++++++++++++- nmdc_runtime/site/resources.py | 2 ++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py index 04bb6c10..f84c6c50 100644 --- a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -81,9 +81,33 @@ def generate_changesheet(study_id, use_dev_api): logging.info("connected to GOLD API...") # Retrieve all biosamples for the neon soils study - biosamples = runtime_api_user_client.get_biosamples_for_study(study_id) + res = runtime_api_user_client.get_biosamples_for_study(study_id) + if res.status_code != 200: + logging.error( + f"error retrieving biosamples for {study_id}: {res.status_code}" + ) + return + biosamples = res.json()["cursor"]["firstBatch"] logging.info(f"retrieved {len(biosamples)} biosamples for {study_id}") + changesheet = Changesheet(name=NAME) + # For each biosample, retrieve the corresponding GOLD biosample record + for biosample in biosamples: + logging.info(f"processing biosample {biosample['id']}") + for gold_biosample_identifier in biosample["gold_biosample_identifiers"]: + # Retrieve the GOLD biosample record + res = gold_api_client.request("/biosamples", params={ + "biosampleGoldId": gold_biosample_identifier}) + if res.status_code != 200: + logging.error( + f"error retrieving GOLD biosample record for " + f"{gold_biosample_identifier}: {res.status_code}" + ) + continue + # the /biosamples endpoint returns a list of records + gold_biosample_record = res.json()[0] + + if __name__ == "__main__": diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 5af88f1f..38e102cd 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -332,6 +332,8 @@ def fetch_study(self, id: str) -> Union[Dict[str, Any], None]: return results[0] + + @resource( config_schema={ "base_url": StringSource, From 623fcda5fa690ebfd15d54ebe1627555c6c1c297 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Fri, 3 Nov 2023 11:25:41 -0700 Subject: [PATCH 3/8] Refactor runtime client methods to raise for status and parse and return results --- nmdc_runtime/site/changesheets/base.py | 26 ++- .../missing_neon_soils_ecosystem_data.py | 16 +- .../scripts/neon_soils_add_ncbi_ids.py | 162 ++++++++++++------ nmdc_runtime/site/resources.py | 15 +- 4 files changed, 144 insertions(+), 75 deletions(-) diff --git a/nmdc_runtime/site/changesheets/base.py b/nmdc_runtime/site/changesheets/base.py index 1f4ea745..4c6443fb 100644 --- a/nmdc_runtime/site/changesheets/base.py +++ b/nmdc_runtime/site/changesheets/base.py @@ -4,14 +4,17 @@ """ import logging +import os import time from dataclasses import dataclass, field +from dotenv import load_dotenv from pathlib import Path import requests from typing import Any, ClassVar, Dict, TypeAlias, Optional -from nmdc_runtime.site.resources import RuntimeApiUserClient +from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient +load_dotenv() logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(" "message)s" ) @@ -83,3 +86,24 @@ def write_changesheet(self) -> None: f.write(self.header + "\n") for line_item in self.line_items: f.write(line_item.line + "\n") + + +def get_runtime_client(use_dev_api): + if use_dev_api: + base_url = os.getenv("API_HOST_NAPA") + logging.info("using Napa API...") + else: + base_url = os.getenv("API_HOST") + logging.info("using prod API...") + return RuntimeApiUserClient( + base_url=base_url, username=os.getenv("API_QUERY_USER"), + password=os.getenv("API_QUERY_PASS"), ) + + + +def get_gold_client(): + return GoldApiClient( + base_url=os.getenv("GOLD_API_BASE_URL"), + username=os.getenv("GOLD_API_USERNAME"), + password=os.getenv("GOLD_API_PASSWORD"), ) + diff --git a/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py b/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py index 032b3d37..b1b33e56 100755 --- a/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +++ b/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py @@ -89,13 +89,7 @@ def gold_biosample_to_nmdc_biosamples_and_omics_processing_records( # Search for NMDC biosamples with by GOLD biosample ID nmdc_biosamples = [] logging.info(f"Searching for NMDC biosamples with {goldbs_id}...") - nmdcbs_response = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id) - if nmdcbs_response.status_code != 200: - logging.error( - f"Failed to retrieve NMDC biosamples with {goldbs_id}: {nmdcbs_response.status_code}" - ) - - nmdcbs = nmdcbs_response.json()["cursor"]["firstBatch"] + nmdcbs = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id) logging.info(f"Found {len(nmdcbs)} NMDC biosamples with {goldbs_id}...") nmdc_biosamples.extend(nmdcbs) @@ -103,13 +97,7 @@ def gold_biosample_to_nmdc_biosamples_and_omics_processing_records( logging.info( f"Searching for NMDC omics processing name containing {goldbs_name_suffix}..." ) - omprc_response = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix) - if omprc_response.status_code != 200: - logging.error( - f"Failed to retrieve NMDC omics processing with {goldbs_name_suffix}: {omprc_response.status_code}" - ) - - omprc_records = omprc_response.json()["cursor"]["firstBatch"] + omprc_records = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix) for omprc in omprc_records: omprc_id = omprc["id"] logging.info(f"omprc_id: {omprc_id}") diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py index f84c6c50..01229090 100644 --- a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -6,24 +6,20 @@ add NCBI study ID to neon soils study. """ import logging -import os -from pathlib import Path import time import click from dotenv import load_dotenv -from nmdc_runtime.site.changesheets.base import ( - Changesheet, - ChangesheetLineItem, - JSON_OBJECT, -) - -from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient +from nmdc_runtime.site.changesheets.base import (Changesheet, + ChangesheetLineItem, + get_gold_client, + get_runtime_client) load_dotenv() NAME = "neon_soils_add_ncbi_ids" NMDC_STUDY_ID = "nmdc:sty-11-34xj1150" +UMBRELLA_BIOPROJECT_ACCESSION = "PRJNA1029061" log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log" logging.basicConfig( @@ -31,19 +27,43 @@ filename=log_filename, encoding="utf-8", filemode="w", ) +def _get_change_for_biosample(biosample, ncbi_biosample_accession): + """ + Get the changes for the given biosample + :param biosample: dict - the biosample + :param ncbi_biosample_accession: str - the NCBI BioSample accession + :return: list - the changes + """ + ncbi_biosample_accessions = biosample.get("insdc_biosample_identifiers", []) + if ncbi_biosample_accession in ncbi_biosample_accessions: + return + biosample_id = biosample["id"] + logging.info(f"creating change for biosample_id: {biosample_id}") + return ChangesheetLineItem( + id=biosample["id"], action="insert", + attribute="insdc_biosample_identifiers", + value=ncbi_biosample_accession, ) + + @click.command() @click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID") @click.option( - "--use_dev_api", is_flag=True, default=False, help="Use the dev API" + "--use_dev_api", is_flag=True, default=True, help="Use the dev API" ) def generate_changesheet(study_id, use_dev_api): """ Generate a changesheet for neon soils study and biosamples by: - 1. Retrieving all biosamples for neon soils study - 2. For each biosample, retrieve the corresponding GOLD biosample record - 3. Retrieve the NCBI biosample ID from the GOLD biosample record - 4. Generate a changesheet for the neon soils biosamples, adding the NCBI IDs - 5. Add changesheet line item for NCDB study ID + 0. Changesheet line item: Umbrella BioProjectAccession to + study.insdc_project_identifiers + 1. Retrieving all gold_study_identifiers for the neon soils study + 2. For each gold_study_identifier, retrieve the GOLD projects + 3. For each GOLD project, + A. retrieve the corresponding NMDC biosample(s). For each biosample, + - Changesheet line item:NCBI BioSampleAccession to + insdc_biosample_identifiers + B. Retrieve the corresponding NMDC omics_processing. For each, + - Changesheet line item:NCBI BioProjectAccession to + insdc_experiment_identifiers WARNING: This script is not idempotent. It will generate a new changesheet each time it is run. @@ -58,56 +78,84 @@ def generate_changesheet(study_id, use_dev_api): logging.info(f"Using dev API: {use_dev_api}") # Initialize the NMDC API - if use_dev_api: - base_url = os.getenv("API_HOST_DEV") - logging.info("using dev API...") - else: - base_url = os.getenv("API_HOST") - logging.info("using prod API...") - - runtime_api_user_client = RuntimeApiUserClient( - base_url=base_url, - username=os.getenv("API_QUERY_USER"), - password=os.getenv("API_QUERY_PASS"), - ) - logging.info("connected to NMDC API...") + runtime_client = get_runtime_client(use_dev_api) # Initialize the GOLD API - gold_api_client = GoldApiClient( - base_url=os.getenv("GOLD_API_BASE_URL"), - username=os.getenv("GOLD_API_USERNAME"), - password=os.getenv("GOLD_API_PASSWORD"), + gold_client = get_gold_client() + + # Initialize the changesheet + changesheet = Changesheet(name=NAME) + + # 1. Retrieve all gold_study_identifiers for the neon soils study + logging.info(f"Retrieving gold_study_identifiers for {study_id}") + res = runtime_client.request("GET", f"/studies/{study_id}") + nmdc_study = res.json() + changesheet.line_items.append( + ChangesheetLineItem( + id=study_id, action="insert", + attribute="insdc_bioproject_identifiers", + value=UMBRELLA_BIOPROJECT_ACCESSION, ) ) - logging.info("connected to GOLD API...") - # Retrieve all biosamples for the neon soils study - res = runtime_api_user_client.get_biosamples_for_study(study_id) - if res.status_code != 200: - logging.error( - f"error retrieving biosamples for {study_id}: {res.status_code}" + gold_study_identifiers = nmdc_study["gold_study_identifiers"] + logging.info(f"gold_study_identifiers: {gold_study_identifiers}") + gold_project_count = 0 + biosample_count = 0 + for gold_study_identifier in gold_study_identifiers: + + # 2. For each gold_study_identifier, retrieve the GOLD projects + if gold_study_identifier == 'gold:Gs0144570': + continue + logging.info( + f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}" ) - return - biosamples = res.json()["cursor"]["firstBatch"] - logging.info(f"retrieved {len(biosamples)} biosamples for {study_id}") - - changesheet = Changesheet(name=NAME) - # For each biosample, retrieve the corresponding GOLD biosample record - for biosample in biosamples: - logging.info(f"processing biosample {biosample['id']}") - for gold_biosample_identifier in biosample["gold_biosample_identifiers"]: - # Retrieve the GOLD biosample record - res = gold_api_client.request("/biosamples", params={ - "biosampleGoldId": gold_biosample_identifier}) - if res.status_code != 200: - logging.error( - f"error retrieving GOLD biosample record for " - f"{gold_biosample_identifier}: {res.status_code}" + projects = gold_client.fetch_projects_by_study(gold_study_identifier) + logging.info(f"Retrieved {len(projects)} projects") + + # 3. For each GOLD project, + for project in projects: + gold_project_count += 1 + project_gold_id = project["projectGoldId"] + biosample_gold_id = project["biosampleGoldId"] + ncbi_bioproject_accession = project["ncbiBioProjectAccession"] + ncbi_biosample_accession = project["ncbiBioSampleAccession"] + + # A. retrieve the corresponding NMDC biosample(s) + logging.info( + f"Retrieving NMDC biosamples for biosample_gold_id: {biosample_gold_id}" + ) + biosamples = runtime_client.get_biosamples_by_gold_biosample_id( + biosample_gold_id + ) + logging.info(f"Retrieved {len(biosamples)} biosamples") + for biosample in biosamples: + biosample_count += 1 + biosample_id = biosample["id"] + logging.info(f"biosample_id: {biosample_id}") + # NcbiBioSampleAccession to insdc_biosample_identifiers + changesheet.line_items.append( + _get_change_for_biosample( + biosample, ncbi_biosample_accession + ) ) - continue - # the /biosamples endpoint returns a list of records - gold_biosample_record = res.json()[0] + # B. Retrieve the corresponding NMDC omics_processing + + + logging.info(f"gold_project_count: {gold_project_count}") + logging.info(f"biosample_count: {biosample_count}") + logging.info(f"changesheet has {len(changesheet.line_items)} line items") + + # Write the changesheet + changesheet.write_changesheet() + + # Validate the changesheet + if changesheet.validate_changesheet(runtime_client.base_url): + logging.info(f"Changesheet is valid") + else: + logging.error(f"Changesheet is invalid") + logging.info(f"Completed in {time.time() - start_time} seconds") if __name__ == "__main__": diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 38e102cd..8cdf7b41 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -27,10 +27,12 @@ from nmdc_runtime.api.models.object import DrsObject, AccessURL, DrsObjectIn from nmdc_runtime.api.models.operation import ListOperationsResponse from nmdc_runtime.api.models.util import ListRequest +from nmdc_runtime.site.normalization.gold import normalize_gold_id from nmdc_runtime.util import unfreeze, nmdc_jsonschema_validator_noidpatterns from nmdc_schema import nmdc + class RuntimeApiClient: def __init__(self, base_url: str): self.base_url = base_url @@ -95,7 +97,8 @@ def get_run_info(self, run_id: str): return self.request("GET", f"/runs/{run_id}") def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str): - return self.request( + gold_biosample_id = normalize_gold_id(gold_biosample_id) + response = self.request( "POST", f"/queries:run", { @@ -107,9 +110,11 @@ def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str): }, }, ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] def get_biosamples_for_study(self, study_id: str): - return self.request( + response = self.request( "POST", f"/queries:run", { @@ -121,9 +126,11 @@ def get_biosamples_for_study(self, study_id: str): }, }, ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] def get_omics_processing_by_name(self, name: str): - return self.request( + response = self.request( "POST", f"/queries:run", { @@ -131,6 +138,8 @@ def get_omics_processing_by_name(self, name: str): "filter": {"name": {"$regex": name, "$options": "i"}}, }, ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] class RuntimeApiSiteClient(RuntimeApiClient): From a637157447c7361583c3af3c7c80989dd618a6aa Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Fri, 3 Nov 2023 12:39:50 -0700 Subject: [PATCH 4/8] handle omics processing records --- .../scripts/neon_soils_add_ncbi_ids.py | 50 +++++++++++++++++-- nmdc_runtime/site/resources.py | 17 +++++++ 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py index 01229090..c2537114 100644 --- a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -44,6 +44,25 @@ def _get_change_for_biosample(biosample, ncbi_biosample_accession): attribute="insdc_biosample_identifiers", value=ncbi_biosample_accession, ) +def _get_change_for_omics_processing(omics_processing_record, + ncbi_bioproject_accession): + """ + Get the changes for the given omics_processing_record + :param omics_processing_record: + :param ncbi_bioproject_accession: + :return: + """ + ncbi_bioproject_accessions = omics_processing_record.get( + "insdc_experiment_identifiers", []) + if ncbi_bioproject_accession in ncbi_bioproject_accessions: + return + omics_processing_id = omics_processing_record["id"] + logging.info(f"creating change for omics_processing_id: {omics_processing_id}") + return ChangesheetLineItem( + id=omics_processing_id, action="insert", + attribute="insdc_experiment_identifiers", + value=ncbi_bioproject_accession, ) + @click.command() @click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID") @@ -105,6 +124,7 @@ def generate_changesheet(study_id, use_dev_api): # 2. For each gold_study_identifier, retrieve the GOLD projects if gold_study_identifier == 'gold:Gs0144570': + # TODO verify that this one has already been done continue logging.info( f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}" @@ -133,14 +153,34 @@ def generate_changesheet(study_id, use_dev_api): biosample_id = biosample["id"] logging.info(f"biosample_id: {biosample_id}") # NcbiBioSampleAccession to insdc_biosample_identifiers - changesheet.line_items.append( - _get_change_for_biosample( + change =_get_change_for_biosample( biosample, ncbi_biosample_accession ) - ) - - # B. Retrieve the corresponding NMDC omics_processing + if change: + changesheet.line_items.append(change) + # B. Retrieve the corresponding NMDC omics_processing + logging.info( + f"Retrieving NMDC omics_processing for project_gold_id: {project_gold_id}" + ) + omics_processing_records = ( + runtime_client.get_omics_processing_records_by_gold_project_id( + project_gold_id + )) + logging.info( + f"Retrieved {len(omics_processing_records)} omics_processings" + ) + for omics_processing in omics_processing_records: + omics_processing_id = omics_processing["id"] + logging.info( + f"omics_processing_id: {omics_processing_id}" + ) + # NcbiBioProjectAccession to insdc_experiment_identifiers + change = _get_change_for_omics_processing( + omics_processing, ncbi_bioproject_accession + ) + if change: + changesheet.line_items.append(change) logging.info(f"gold_project_count: {gold_project_count}") logging.info(f"biosample_count: {biosample_count}") diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 8cdf7b41..f344d9e0 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -113,6 +113,23 @@ def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str): response.raise_for_status() return response.json()["cursor"]["firstBatch"] + def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str): + gold_project_id = normalize_gold_id(gold_project_id) + response = self.request( + "POST", + f"/queries:run", + { + "find": "omics_processing_set", + "filter": { + "gold_sequencing_project_identifiers": { + "$elemMatch": {"$eq": gold_project_id} + } + }, + }, + ) + response.raise_for_status() + return response.json()["cursor"]["firstBatch"] + def get_biosamples_for_study(self, study_id: str): response = self.request( "POST", From 2ad6bf494b42695be4f72f59f1c5764269688d76 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Fri, 3 Nov 2023 13:09:05 -0700 Subject: [PATCH 5/8] update docstring --- .../site/changesheets/scripts/neon_soils_add_ncbi_ids.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py index c2537114..774c66ca 100644 --- a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -2,8 +2,9 @@ # coding: utf-8 # nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py """ -neon_soils_add_ncbi_ids.py: Add NCBI biosample IDs to neon soils biosamples, and -add NCBI study ID to neon soils study. +neon_soils_add_ncbi_ids.py: Add NCBI biosample accessions to neon soils +biosamples, NCBI bioproject accessions to omics processing, and +NCBI Umbrella bioproject accession to neon soils study. """ import logging import time From dbc9189b809a8ed8bae24e836d6604b44cf0f96e Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Mon, 6 Nov 2023 16:08:59 -0800 Subject: [PATCH 6/8] update to include correct prefix --- .../site/changesheets/scripts/neon_soils_add_ncbi_ids.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py index 774c66ca..3ce64962 100644 --- a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -43,7 +43,7 @@ def _get_change_for_biosample(biosample, ncbi_biosample_accession): return ChangesheetLineItem( id=biosample["id"], action="insert", attribute="insdc_biosample_identifiers", - value=ncbi_biosample_accession, ) + value="biosample:" + ncbi_biosample_accession, ) def _get_change_for_omics_processing(omics_processing_record, ncbi_bioproject_accession): @@ -62,7 +62,7 @@ def _get_change_for_omics_processing(omics_processing_record, return ChangesheetLineItem( id=omics_processing_id, action="insert", attribute="insdc_experiment_identifiers", - value=ncbi_bioproject_accession, ) + value="bioproject:" + ncbi_bioproject_accession, ) @click.command() @@ -114,7 +114,7 @@ def generate_changesheet(study_id, use_dev_api): ChangesheetLineItem( id=study_id, action="insert", attribute="insdc_bioproject_identifiers", - value=UMBRELLA_BIOPROJECT_ACCESSION, ) + value="bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION, ) ) gold_study_identifiers = nmdc_study["gold_study_identifiers"] From 185f17dc88e8ec8ffb1fb78891e28948c1fd448e Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Tue, 14 Nov 2023 12:53:55 -0800 Subject: [PATCH 7/8] update to use use new insdc_bioproject_identifiers slot on omics_processing --- nmdc_runtime/site/changesheets/base.py | 4 ++-- .../changesheets/scripts/neon_soils_add_ncbi_ids.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/nmdc_runtime/site/changesheets/base.py b/nmdc_runtime/site/changesheets/base.py index 4c6443fb..60a8dd6c 100644 --- a/nmdc_runtime/site/changesheets/base.py +++ b/nmdc_runtime/site/changesheets/base.py @@ -90,8 +90,8 @@ def write_changesheet(self) -> None: def get_runtime_client(use_dev_api): if use_dev_api: - base_url = os.getenv("API_HOST_NAPA") - logging.info("using Napa API...") + base_url = os.getenv("API_HOST_DEV") + logging.info("using Dev API...") else: base_url = os.getenv("API_HOST") logging.info("using prod API...") diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py index 3ce64962..b9870ea1 100644 --- a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +++ b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py @@ -43,7 +43,7 @@ def _get_change_for_biosample(biosample, ncbi_biosample_accession): return ChangesheetLineItem( id=biosample["id"], action="insert", attribute="insdc_biosample_identifiers", - value="biosample:" + ncbi_biosample_accession, ) + value="biosample:" + ncbi_biosample_accession + "|", ) def _get_change_for_omics_processing(omics_processing_record, ncbi_bioproject_accession): @@ -54,15 +54,15 @@ def _get_change_for_omics_processing(omics_processing_record, :return: """ ncbi_bioproject_accessions = omics_processing_record.get( - "insdc_experiment_identifiers", []) + "insdc_bioproject_identifiers", []) if ncbi_bioproject_accession in ncbi_bioproject_accessions: return omics_processing_id = omics_processing_record["id"] logging.info(f"creating change for omics_processing_id: {omics_processing_id}") return ChangesheetLineItem( id=omics_processing_id, action="insert", - attribute="insdc_experiment_identifiers", - value="bioproject:" + ncbi_bioproject_accession, ) + attribute="insdc_bioproject_identifiers", + value="bioproject:" + ncbi_bioproject_accession + "|", ) @click.command() @@ -83,7 +83,7 @@ def generate_changesheet(study_id, use_dev_api): insdc_biosample_identifiers B. Retrieve the corresponding NMDC omics_processing. For each, - Changesheet line item:NCBI BioProjectAccession to - insdc_experiment_identifiers + insdc_bioproject_identifiers WARNING: This script is not idempotent. It will generate a new changesheet each time it is run. @@ -114,7 +114,7 @@ def generate_changesheet(study_id, use_dev_api): ChangesheetLineItem( id=study_id, action="insert", attribute="insdc_bioproject_identifiers", - value="bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION, ) + value="bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION + "|", ) ) gold_study_identifiers = nmdc_study["gold_study_identifiers"] From 2d3df792308c6f66d899d79d903107832f4f39a3 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Tue, 14 Nov 2023 13:07:26 -0800 Subject: [PATCH 8/8] added python-dotenv to main.in and ran make update-deps --- requirements/dev.txt | 29 +++-------- requirements/main.in | 1 + requirements/main.txt | 116 ++++++++++++++++++------------------------ 3 files changed, 56 insertions(+), 90 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 029c5a12..2a4d676b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/dev.txt --strip-extras requirements/dev.in @@ -9,7 +9,7 @@ attrs==23.1.0 # -c requirements/main.txt # cattrs # requests-cache -black==23.10.1 +black==23.11.0 # via # -c requirements/main.txt # -r requirements/dev.in @@ -23,7 +23,7 @@ certifi==2023.7.22 # via # -c requirements/main.txt # requests -charset-normalizer==3.3.1 +charset-normalizer==3.3.2 # via # -c requirements/main.txt # requests @@ -40,11 +40,6 @@ docutils==0.20.1 # via # -c requirements/main.txt # readme-renderer -exceptiongroup==1.1.3 - # via - # -c requirements/main.txt - # cattrs - # pytest flake8==6.1.0 # via -r requirements/dev.in idna==3.4 @@ -64,7 +59,7 @@ invoke==2.2.0 # via -r requirements/dev.in jaraco-classes==3.3.0 # via keyring -keyring==24.2.0 +keyring==24.3.0 # via twine markdown-it-py==3.0.0 # via @@ -99,7 +94,7 @@ pip-tools==7.3.0 # via -r requirements/dev.in pkginfo==1.9.6 # via twine -platformdirs==3.11.0 +platformdirs==4.0.0 # via # -c requirements/main.txt # black @@ -127,7 +122,7 @@ pytest==7.4.3 # -r requirements/dev.in # pytest-asyncio # pytest-cov -pytest-asyncio==0.22.0 +pytest-asyncio==0.21.1 # via -r requirements/dev.in pytest-cov==4.1.0 # via -r requirements/dev.in @@ -161,23 +156,11 @@ six==1.16.0 # -c requirements/main.txt # requests-mock # url-normalize -tomli==2.0.1 - # via - # -c requirements/main.txt - # black - # build - # coverage - # pip-tools - # pyproject-hooks - # pytest - # setuptools-scm twine==4.0.2 # via -r requirements/dev.in typing-extensions==4.8.0 # via # -c requirements/main.txt - # black - # cattrs # setuptools-scm url-normalize==1.4.3 # via diff --git a/requirements/main.in b/requirements/main.in index 487c608e..1a53fd78 100644 --- a/requirements/main.in +++ b/requirements/main.in @@ -30,6 +30,7 @@ pandas passlib[bcrypt] pymongo pydantic[email]>=1.10.0 +python-dotenv python-jose[cryptography] python-multipart pyyaml diff --git a/requirements/main.txt b/requirements/main.txt index b2ae0d97..c1b44f0b 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --output-file=requirements/main.txt --strip-extras requirements/main.in @@ -56,20 +56,20 @@ base32-lib==1.0.2 # via -r requirements/main.in bcrypt==4.0.1 # via passlib -beanie==1.23.1 +beanie==1.23.6 # via -r requirements/main.in beautifulsoup4==4.12.2 # via # -r requirements/main.in # mkdocs-mermaid2-plugin # nbconvert -black==23.10.1 +black==23.11.0 # via shed bleach==6.1.0 # via nbconvert -boto3==1.28.74 +boto3==1.29.0 # via -r requirements/main.in -botocore==1.31.74 +botocore==1.32.0 # via # boto3 # s3transfer @@ -87,7 +87,7 @@ chardet==5.2.0 # via # pyshex # pyshexc -charset-normalizer==3.3.1 +charset-normalizer==3.3.2 # via requests click==8.1.7 # via @@ -113,7 +113,7 @@ coloredlogs==14.0 # via dagster com2ann==0.3.0 # via shed -comm==0.1.4 +comm==0.2.0 # via # ipykernel # ipywidgets @@ -121,25 +121,25 @@ croniter==2.0.1 # via dagster cryptography==41.0.5 # via python-jose -curies==0.6.7 +curies==0.7.4 # via linkml-runtime -dagit==1.5.5 +dagit==1.5.7 # via -r requirements/main.in -dagster==1.5.5 +dagster==1.5.7 # via # -r requirements/main.in # dagster-graphql # dagster-postgres # dagster-webserver -dagster-graphql==1.5.5 +dagster-graphql==1.5.7 # via # -r requirements/main.in # dagster-webserver -dagster-pipes==1.5.5 +dagster-pipes==1.5.7 # via dagster -dagster-postgres==0.21.5 +dagster-postgres==0.21.7 # via -r requirements/main.in -dagster-webserver==1.5.5 +dagster-webserver==1.5.7 # via dagit debugpy==1.8.0 # via ipykernel @@ -169,17 +169,11 @@ email-validator==2.1.0.post1 # via pydantic et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.1.3 - # via - # anyio - # cattrs - # ipython - # pytest executing==2.0.1 # via stack-data fastapi==0.104.1 # via -r requirements/main.in -fastjsonschema==2.18.1 +fastjsonschema==2.19.0 # via # -r requirements/main.in # nbformat @@ -254,8 +248,6 @@ ipython==8.17.2 # ipykernel # ipywidgets # jupyter-console -ipython-genutils==0.2.0 - # via qtconsole ipywidgets==8.1.1 # via jupyter isodate==0.6.1 @@ -287,7 +279,7 @@ jmespath==1.0.1 # botocore jq==1.6.0 # via -r requirements/main.in -jsbeautifier==1.14.9 +jsbeautifier==1.14.11 # via mkdocs-mermaid2-plugin json-flattener==0.1.9 # via linkml-runtime @@ -318,11 +310,11 @@ jsonschema==4.19.2 # linkml # linkml-runtime # nbformat -jsonschema-specifications==2023.7.1 +jsonschema-specifications==2023.11.1 # via jsonschema jupyter==1.0.0 # via -r requirements/main.in -jupyter-client==8.5.0 +jupyter-client==8.6.0 # via # ipykernel # jupyter-console @@ -342,11 +334,11 @@ jupyter-core==5.5.0 # nbconvert # nbformat # qtconsole -jupyter-events==0.8.0 +jupyter-events==0.9.0 # via jupyter-server jupyter-lsp==2.2.0 # via jupyterlab -jupyter-server==2.9.1 +jupyter-server==2.10.0 # via # jupyter-lsp # jupyterlab @@ -355,13 +347,13 @@ jupyter-server==2.9.1 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.7 +jupyterlab==4.0.8 # via # -r requirements/main.in # notebook jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.25.0 +jupyterlab-server==2.25.1 # via # jupyterlab # notebook @@ -373,21 +365,21 @@ lazy-model==0.2.0 # via beanie libcst==1.1.0 # via shed -linkml==1.6.1 +linkml==1.6.2 # via # -r requirements/main.in # nmdc-schema linkml-dataops==0.1.0 # via linkml -linkml-runtime==1.6.0 +linkml-runtime==1.6.2 # via # -r requirements/main.in # linkml # linkml-dataops # nmdc-schema -mako==1.2.4 +mako==1.3.0 # via alembic -markdown==3.5 +markdown==3.5.1 # via # mkdocs # mkdocs-material @@ -421,7 +413,7 @@ mkdocs==1.5.3 # mkdocs-mermaid2-plugin mkdocs-jupyter==0.24.6 # via -r requirements/main.in -mkdocs-material==9.4.7 +mkdocs-material==9.4.8 # via # -r requirements/main.in # mkdocs-jupyter @@ -439,9 +431,9 @@ mypy-extensions==1.0.0 # via # black # typing-inspect -nbclient==0.8.0 +nbclient==0.9.0 # via nbconvert -nbconvert==7.10.0 +nbconvert==7.11.0 # via # jupyter # jupyter-server @@ -462,7 +454,7 @@ notebook-shim==0.2.3 # via # jupyterlab # notebook -numpy==1.26.1 +numpy==1.26.2 # via pandas numpydoc==1.6.0 # via terminusdb-client @@ -488,7 +480,7 @@ packaging==23.2 # sphinx paginate==0.5.6 # via mkdocs-material -pandas==2.1.2 +pandas==2.1.3 # via -r requirements/main.in pandocfilters==1.5.0 # via nbconvert @@ -506,7 +498,7 @@ pendulum==2.1.2 # via dagster pexpect==4.8.0 # via ipython -platformdirs==3.11.0 +platformdirs==4.0.0 # via # black # jupyter-core @@ -526,11 +518,11 @@ prefixmaps==0.1.5 # linkml-runtime prometheus-client==0.18.0 # via jupyter-server -prompt-toolkit==3.0.39 +prompt-toolkit==3.0.41 # via # ipython # jupyter-console -protobuf==4.24.4 +protobuf==4.25.0 # via # dagster # grpcio-health-checking @@ -550,7 +542,7 @@ pyasn1==0.5.0 # rsa pycparser==2.21 # via cffi -pydantic==2.4.2 +pydantic==2.5.0 # via # -r requirements/main.in # beanie @@ -561,7 +553,7 @@ pydantic==2.4.2 # linkml # linkml-runtime # pydantic -pydantic-core==2.10.1 +pydantic-core==2.14.1 # via pydantic pyflakes==3.1.0 # via autoflake @@ -579,11 +571,11 @@ pyjsg==0.11.10 # linkml # pyshexc # shexjsg -pymdown-extensions==10.3.1 +pymdown-extensions==10.4 # via # mkdocs-material # mkdocs-mermaid2-plugin -pymongo==4.5.0 +pymongo==4.6.0 # via # -r requirements/main.in # motor @@ -612,6 +604,7 @@ python-dateutil==2.8.2 # pendulum python-dotenv==1.0.0 # via + # -r requirements/main.in # dagster # uvicorn python-jose==3.3.0 @@ -656,7 +649,7 @@ pyzmq==25.1.1 # jupyter-console # jupyter-server # qtconsole -qtconsole==5.4.4 +qtconsole==5.5.0 # via jupyter qtpy==2.4.1 # via qtconsole @@ -676,7 +669,7 @@ rdflib-shim==1.0.3 # pyshex # pyshexc # sparqlslurper -referencing==0.30.2 +referencing==0.31.0 # via # jsonschema # jsonschema-specifications @@ -715,13 +708,13 @@ rfc3986-validator==0.1.1 # jupyter-events rfc3987==1.3.8 # via jsonschema -rpds-py==0.10.6 +rpds-py==0.12.0 # via # jsonschema # referencing rsa==4.9 # via python-jose -ruamel-yaml==0.18.3 +ruamel-yaml==0.18.5 # via linkml-dataops ruamel-yaml-clib==0.2.8 # via ruamel-yaml @@ -783,7 +776,7 @@ sphinxcontrib-qthelp==1.0.6 # via sphinx sphinxcontrib-serializinghtml==1.1.9 # via sphinx -sqlalchemy==2.0.22 +sqlalchemy==2.0.23 # via # alembic # dagster @@ -801,7 +794,7 @@ tabulate==0.9.0 # numpydoc tenacity==8.2.3 # via -r requirements/main.in -terminado==0.17.1 +terminado==0.18.0 # via # jupyter-server # jupyter-server-terminals @@ -816,13 +809,7 @@ toml==0.10.2 # beanie # jupytext tomli==2.0.1 - # via - # autoflake - # black - # dagster - # jupyterlab - # numpydoc - # pytest + # via dagster toolz==0.12.0 # via -r requirements/main.in toposort==1.10 @@ -864,10 +851,6 @@ types-python-dateutil==2.8.19.14 typing-extensions==4.8.0 # via # alembic - # async-lru - # beanie - # black - # cattrs # dagster # fastapi # libcst @@ -876,7 +859,6 @@ typing-extensions==4.8.0 # pydantic-core # sqlalchemy # typing-inspect - # uvicorn typing-inspect==0.9.0 # via libcst tzdata==2023.3 @@ -894,7 +876,7 @@ urllib3==1.26.18 # pyshex # requests # requests-cache -uvicorn==0.23.2 +uvicorn==0.24.0.post1 # via # -r requirements/main.in # dagster-webserver @@ -907,7 +889,7 @@ watchdog==3.0.0 # mkdocs watchfiles==0.21.0 # via uvicorn -wcwidth==0.2.9 +wcwidth==0.2.10 # via prompt-toolkit webcolors==1.13 # via jsonschema @@ -921,7 +903,7 @@ websockets==12.0 # via uvicorn widgetsnbextension==4.0.9 # via ipywidgets -wrapt==1.15.0 +wrapt==1.16.0 # via deprecated xlrd==2.0.1 # via -r requirements/main.in