Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

529 add ncbi biosample and project ids to neon soil #354

Merged
26 changes: 25 additions & 1 deletion nmdc_runtime/site/changesheets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
"""

import logging
import os
import time
from dataclasses import dataclass, field
from dotenv import load_dotenv
from pathlib import Path
import requests
from typing import Any, ClassVar, Dict, TypeAlias, Optional

from nmdc_runtime.site.resources import RuntimeApiUserClient
from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient

# Load environment settings via python-dotenv before any os.getenv() lookup.
load_dotenv()

# Root logger setup performed when this module is imported: INFO level,
# each record rendered as "<timestamp> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
Expand Down Expand Up @@ -83,3 +86,24 @@ def write_changesheet(self) -> None:
f.write(self.header + "\n")
for line_item in self.line_items:
f.write(line_item.line + "\n")


def get_runtime_client(use_dev_api):
    """
    Build a RuntimeApiUserClient for either the dev or the prod Runtime API.

    The host is read from the API_HOST_DEV (dev) or API_HOST (prod)
    environment variable; the credentials are read from API_QUERY_USER and
    API_QUERY_PASS.

    :param use_dev_api: truthy to target the dev API, falsy for prod
    :return: RuntimeApiUserClient configured from the environment
    """
    if use_dev_api:
        host_variable, notice = "API_HOST_DEV", "using Dev API..."
    else:
        host_variable, notice = "API_HOST", "using prod API..."

    api_host = os.getenv(host_variable)
    logging.info(notice)

    credentials = {
        "username": os.getenv("API_QUERY_USER"),
        "password": os.getenv("API_QUERY_PASS"),
    }
    return RuntimeApiUserClient(base_url=api_host, **credentials)



def get_gold_client():
    """
    Build a GoldApiClient from environment variables.

    Reads GOLD_API_BASE_URL, GOLD_API_USERNAME and GOLD_API_PASSWORD; a
    variable that is not set is passed through as None.

    :return: GoldApiClient configured from the environment
    """
    env_names = (
        ("base_url", "GOLD_API_BASE_URL"),
        ("username", "GOLD_API_USERNAME"),
        ("password", "GOLD_API_PASSWORD"),
    )
    settings = {kwarg: os.getenv(env_name) for kwarg, env_name in env_names}
    return GoldApiClient(**settings)

Original file line number Diff line number Diff line change
Expand Up @@ -89,27 +89,15 @@ def gold_biosample_to_nmdc_biosamples_and_omics_processing_records(
# Search for NMDC biosamples by GOLD biosample ID
nmdc_biosamples = []
logging.info(f"Searching for NMDC biosamples with {goldbs_id}...")
nmdcbs_response = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id)
if nmdcbs_response.status_code != 200:
logging.error(
f"Failed to retrieve NMDC biosamples with {goldbs_id}: {nmdcbs_response.status_code}"
)

nmdcbs = nmdcbs_response.json()["cursor"]["firstBatch"]
nmdcbs = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id)
logging.info(f"Found {len(nmdcbs)} NMDC biosamples with {goldbs_id}...")
nmdc_biosamples.extend(nmdcbs)

# Search for NMDC biosamples via omics processing name containing GOLD biosample name suffix
logging.info(
f"Searching for NMDC omics processing name containing {goldbs_name_suffix}..."
)
omprc_response = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix)
if omprc_response.status_code != 200:
logging.error(
f"Failed to retrieve NMDC omics processing with {goldbs_name_suffix}: {omprc_response.status_code}"
)

omprc_records = omprc_response.json()["cursor"]["firstBatch"]
omprc_records = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix)
for omprc in omprc_records:
omprc_id = omprc["id"]
logging.info(f"omprc_id: {omprc_id}")
Expand Down
203 changes: 203 additions & 0 deletions nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#!/usr/bin/env python3
# coding: utf-8
# nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py
"""
neon_soils_add_ncbi_ids.py: Add NCBI biosample accessions to neon soils
biosamples, NCBI bioproject accessions to omics processing, and
NCBI Umbrella bioproject accession to neon soils study.
"""
import logging
import time

import click
from dotenv import load_dotenv

from nmdc_runtime.site.changesheets.base import (Changesheet,
ChangesheetLineItem,
get_gold_client,
get_runtime_client)

load_dotenv()

# Name used for the changesheet and as the log-file prefix.
NAME = "neon_soils_add_ncbi_ids"
# Default value of the --study_id option.
NMDC_STUDY_ID = "nmdc:sty-11-34xj1150"
# Umbrella BioProject accession inserted on the study record.
UMBRELLA_BIOPROJECT_ACCESSION = "PRJNA1029061"

log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log"
# force=True is required: nmdc_runtime.site.changesheets.base (imported
# above) already calls logging.basicConfig() at import time, and a second
# basicConfig() call is a no-op once the root logger has handlers - without
# it the log file below would never be created.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    filename=log_filename,
    encoding="utf-8",
    filemode="w",
    force=True,
)


def _get_change_for_biosample(biosample, ncbi_biosample_accession):
    """
    Build the changesheet line item that adds an NCBI BioSample accession
    to a biosample's insdc_biosample_identifiers.

    :param biosample: dict - the NMDC biosample record; must contain "id"
    :param ncbi_biosample_accession: str - the NCBI BioSample accession,
        without the "biosample:" prefix
    :return: ChangesheetLineItem, or None when there is no accession to add
        or the biosample already carries it
    """
    if not ncbi_biosample_accession:
        # A missing accession (e.g. None) cannot be concatenated into a
        # value below; skip it instead of raising TypeError.
        logging.warning(
            f"no NCBI BioSample accession for biosample_id: {biosample.get('id')}"
        )
        return None

    prefixed_accession = "biosample:" + ncbi_biosample_accession
    existing = biosample.get("insdc_biosample_identifiers") or []
    # This script writes identifiers in prefixed form, so the "already
    # present" check has to look for the prefixed form; the bare accession
    # is still accepted in case a record stores it without the prefix.
    if prefixed_accession in existing or ncbi_biosample_accession in existing:
        return None

    biosample_id = biosample["id"]
    logging.info(f"creating change for biosample_id: {biosample_id}")
    return ChangesheetLineItem(
        id=biosample_id,
        action="insert",
        attribute="insdc_biosample_identifiers",
        value=prefixed_accession + "|",
    )

def _get_change_for_omics_processing(omics_processing_record,
                                     ncbi_bioproject_accession):
    """
    Build the changesheet line item that adds an NCBI BioProject accession
    to an omics processing record's insdc_bioproject_identifiers.

    :param omics_processing_record: dict - the NMDC omics processing record;
        must contain "id"
    :param ncbi_bioproject_accession: str - the NCBI BioProject accession,
        without the "bioproject:" prefix
    :return: ChangesheetLineItem, or None when there is no accession to add
        or the record already carries it
    """
    if not ncbi_bioproject_accession:
        # A missing accession (e.g. None) cannot be concatenated into a
        # value below; skip it instead of raising TypeError.
        logging.warning(
            "no NCBI BioProject accession for omics_processing_id: "
            f"{omics_processing_record.get('id')}"
        )
        return None

    prefixed_accession = "bioproject:" + ncbi_bioproject_accession
    existing = omics_processing_record.get("insdc_bioproject_identifiers") or []
    # This script writes identifiers in prefixed form, so the "already
    # present" check has to look for the prefixed form; the bare accession
    # is still accepted in case a record stores it without the prefix.
    if prefixed_accession in existing or ncbi_bioproject_accession in existing:
        return None

    omics_processing_id = omics_processing_record["id"]
    logging.info(f"creating change for omics_processing_id: {omics_processing_id}")
    return ChangesheetLineItem(
        id=omics_processing_id,
        action="insert",
        attribute="insdc_bioproject_identifiers",
        value=prefixed_accession + "|",
    )


@click.command()
@click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID")
@click.option(
    "--use_dev_api/--use_prod_api",
    default=True,
    help="Use the dev API (default) or the prod API",
)
def generate_changesheet(study_id, use_dev_api):
    """
    Generate a changesheet for neon soils study and biosamples by:
    0. Changesheet line item: Umbrella BioProjectAccession to
    study.insdc_bioproject_identifiers
    1. Retrieving all gold_study_identifiers for the neon soils study
    2. For each gold_study_identifier, retrieve the GOLD projects
    3. For each GOLD project,
        A. retrieve the corresponding NMDC biosample(s). For each biosample,
        - Changesheet line item:NCBI BioSampleAccession to
        insdc_biosample_identifiers
        B. Retrieve the corresponding NMDC omics_processing. For each,
        - Changesheet line item:NCBI BioProjectAccession to
        insdc_bioproject_identifiers

    WARNING: This script is not idempotent. It will generate a new changesheet
    each time it is run.
    Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output

    :param study_id: The NMDC study ID
    :param use_dev_api: Use the dev API (default: True; pass --use_prod_api
        to target the prod API)
    :return:
    """
    start_time = time.time()
    logging.info(f"Generating changesheet for {study_id}")
    logging.info(f"Using dev API: {use_dev_api}")

    # Initialize the NMDC API
    runtime_client = get_runtime_client(use_dev_api)

    # Initialize the GOLD API
    gold_client = get_gold_client()

    # Initialize the changesheet
    changesheet = Changesheet(name=NAME)

    # 1. Retrieve all gold_study_identifiers for the neon soils study
    logging.info(f"Retrieving gold_study_identifiers for {study_id}")
    res = runtime_client.request("GET", f"/studies/{study_id}")
    # Fail here on an HTTP error rather than with a confusing KeyError on
    # the error payload further down.
    res.raise_for_status()
    nmdc_study = res.json()

    # 0. Umbrella BioProject accession on the study itself
    changesheet.line_items.append(
        ChangesheetLineItem(
            id=study_id,
            action="insert",
            attribute="insdc_bioproject_identifiers",
            value="bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION + "|",
        )
    )

    gold_study_identifiers = nmdc_study["gold_study_identifiers"]
    logging.info(f"gold_study_identifiers: {gold_study_identifiers}")
    gold_project_count = 0
    biosample_count = 0
    for gold_study_identifier in gold_study_identifiers:

        # 2. For each gold_study_identifier, retrieve the GOLD projects
        if gold_study_identifier == "gold:Gs0144570":
            # TODO verify that this one has already been done
            continue
        logging.info(
            f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}"
        )
        projects = gold_client.fetch_projects_by_study(gold_study_identifier)
        logging.info(f"Retrieved {len(projects)} projects")

        # 3. For each GOLD project,
        for project in projects:
            gold_project_count += 1
            project_gold_id = project["projectGoldId"]
            biosample_gold_id = project["biosampleGoldId"]
            # NOTE(review): assumes GOLD may omit or null these fields for
            # projects without NCBI accessions - confirm. Such projects
            # are skipped with a warning instead of aborting the whole run.
            ncbi_bioproject_accession = project.get("ncbiBioProjectAccession")
            ncbi_biosample_accession = project.get("ncbiBioSampleAccession")
            if not ncbi_biosample_accession:
                logging.warning(
                    f"No NCBI BioSample accession for project_gold_id: {project_gold_id}"
                )
            if not ncbi_bioproject_accession:
                logging.warning(
                    f"No NCBI BioProject accession for project_gold_id: {project_gold_id}"
                )

            # A. retrieve the corresponding NMDC biosample(s)
            logging.info(
                f"Retrieving NMDC biosamples for biosample_gold_id: {biosample_gold_id}"
            )
            biosamples = runtime_client.get_biosamples_by_gold_biosample_id(
                biosample_gold_id
            )
            logging.info(f"Retrieved {len(biosamples)} biosamples")
            for biosample in biosamples:
                biosample_count += 1
                biosample_id = biosample["id"]
                logging.info(f"biosample_id: {biosample_id}")
                if not ncbi_biosample_accession:
                    continue
                # NcbiBioSampleAccession to insdc_biosample_identifiers
                change = _get_change_for_biosample(
                    biosample, ncbi_biosample_accession
                )
                if change:
                    changesheet.line_items.append(change)

            # B. Retrieve the corresponding NMDC omics_processing
            logging.info(
                f"Retrieving NMDC omics_processing for project_gold_id: {project_gold_id}"
            )
            omics_processing_records = (
                runtime_client.get_omics_processing_records_by_gold_project_id(
                    project_gold_id
                )
            )
            logging.info(
                f"Retrieved {len(omics_processing_records)} omics_processings"
            )
            for omics_processing in omics_processing_records:
                omics_processing_id = omics_processing["id"]
                logging.info(f"omics_processing_id: {omics_processing_id}")
                if not ncbi_bioproject_accession:
                    continue
                # NcbiBioProjectAccession to insdc_bioproject_identifiers
                change = _get_change_for_omics_processing(
                    omics_processing, ncbi_bioproject_accession
                )
                if change:
                    changesheet.line_items.append(change)

    logging.info(f"gold_project_count: {gold_project_count}")
    logging.info(f"biosample_count: {biosample_count}")
    logging.info(f"changesheet has {len(changesheet.line_items)} line items")

    # Write the changesheet
    changesheet.write_changesheet()

    # Validate the changesheet
    if changesheet.validate_changesheet(runtime_client.base_url):
        logging.info("Changesheet is valid")
    else:
        logging.error("Changesheet is invalid")

    logging.info(f"Completed in {time.time() - start_time} seconds")


if __name__ == "__main__":
generate_changesheet()
46 changes: 44 additions & 2 deletions nmdc_runtime/site/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@
from nmdc_runtime.api.models.object import DrsObject, AccessURL, DrsObjectIn
from nmdc_runtime.api.models.operation import ListOperationsResponse
from nmdc_runtime.api.models.util import ListRequest
from nmdc_runtime.site.normalization.gold import normalize_gold_id
from nmdc_runtime.util import unfreeze, nmdc_jsonschema_validator_noidpatterns
from nmdc_schema import nmdc



class RuntimeApiClient:
def __init__(self, base_url: str):
self.base_url = base_url
Expand Down Expand Up @@ -95,7 +97,8 @@ def get_run_info(self, run_id: str):
return self.request("GET", f"/runs/{run_id}")

def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str):
return self.request(
gold_biosample_id = normalize_gold_id(gold_biosample_id)
response = self.request(
"POST",
f"/queries:run",
{
Expand All @@ -107,16 +110,53 @@ def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str):
},
},
)
response.raise_for_status()
return response.json()["cursor"]["firstBatch"]

def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
    """
    Find omics processing records linked to a GOLD sequencing project.

    The id is passed through normalize_gold_id() and then matched against
    the elements of gold_sequencing_project_identifiers in
    omics_processing_set via a POST to /queries:run.

    :param gold_project_id: the GOLD project identifier
    :return: the "firstBatch" entry of the response's "cursor" object
    :raises: whatever response.raise_for_status() raises on an HTTP error
    """
    normalized_id = normalize_gold_id(gold_project_id)
    query = {
        "find": "omics_processing_set",
        "filter": {
            "gold_sequencing_project_identifiers": {
                "$elemMatch": {"$eq": normalized_id}
            }
        },
    }
    response = self.request("POST", "/queries:run", query)
    response.raise_for_status()
    payload = response.json()
    return payload["cursor"]["firstBatch"]

def get_biosamples_for_study(self, study_id: str):
    """
    Find the biosamples that belong to a study.

    POSTs a find on biosample_set to /queries:run, matching study_id
    against the elements of each biosample's part_of list.

    :param study_id: the study identifier to look for in part_of
    :return: the "firstBatch" entry of the response's "cursor" object
    :raises: whatever response.raise_for_status() raises on an HTTP error
    """
    part_of_filter = {"part_of": {"$elemMatch": {"$eq": study_id}}}
    query = {"find": "biosample_set", "filter": part_of_filter}
    response = self.request("POST", "/queries:run", query)
    response.raise_for_status()
    payload = response.json()
    return payload["cursor"]["firstBatch"]

def get_omics_processing_by_name(self, name: str):
    """
    Find omics processing records whose name matches a pattern.

    POSTs a find on omics_processing_set to /queries:run with a
    case-insensitive $regex filter on "name". The value of `name` is used
    as the regular expression as-is (it is not escaped).

    :param name: regular expression to match against the record name
    :return: the "firstBatch" entry of the response's "cursor" object
    :raises: whatever response.raise_for_status() raises on an HTTP error
    """
    response = self.request(
        "POST",
        "/queries:run",
        {
            "find": "omics_processing_set",
            "filter": {"name": {"$regex": name, "$options": "i"}},
        },
    )
    response.raise_for_status()
    return response.json()["cursor"]["firstBatch"]


class RuntimeApiSiteClient(RuntimeApiClient):
Expand Down Expand Up @@ -320,6 +360,8 @@ def fetch_study(self, id: str) -> Union[Dict[str, Any], None]:
return results[0]




@resource(
config_schema={
"base_url": StringSource,
Expand Down
Loading