Merge pull request #354 from microbiomedata/529_add_NCBI_biosample_and_project_ids_to_NEON_soil

529 add ncbi biosample and project ids to neon soil
microbiomedata · Nov 15, 2023 · f1e9caf · f1e9caf
2 parents 1eacc43 + e488932
commit f1e9caf
Show file tree

Hide file tree

Showing 7 changed files with 309 additions and 73 deletions.
diff --git a/nmdc_runtime/site/changesheets/base.py b/nmdc_runtime/site/changesheets/base.py
@@ -4,14 +4,17 @@
 """
 
 import logging
+import os
 import time
 from dataclasses import dataclass, field
+from dotenv import load_dotenv
 from pathlib import Path
 import requests
 from typing import Any, ClassVar, Dict, TypeAlias, Optional
 
-from nmdc_runtime.site.resources import RuntimeApiUserClient
+from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient
 
+load_dotenv()
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s %(levelname)s %(" "message)s"
 )
@@ -83,3 +86,24 @@ def write_changesheet(self) -> None:
             f.write(self.header + "\n")
             for line_item in self.line_items:
                 f.write(line_item.line + "\n")
+
+
+def get_runtime_client(use_dev_api):
+    """
+    Build a RuntimeApiUserClient from environment variables.
+
+    The base URL is read from API_HOST_DEV when use_dev_api is truthy and
+    from API_HOST otherwise; the credentials always come from API_QUERY_USER
+    and API_QUERY_PASS. A variable that is not set is passed on as None.
+
+    :param use_dev_api: truthy to target the dev API, falsy for prod
+    :return: RuntimeApiUserClient
+    """
+    if not use_dev_api:
+        host_variable = "API_HOST"
+        logging.info("using prod API...")
+    else:
+        host_variable = "API_HOST_DEV"
+        logging.info("using Dev API...")
+    credentials = {
+        "username": os.getenv("API_QUERY_USER"),
+        "password": os.getenv("API_QUERY_PASS"),
+    }
+    return RuntimeApiUserClient(base_url=os.getenv(host_variable), **credentials)
+
+
+
+def get_gold_client():
+    """
+    Build a GoldApiClient from environment variables.
+
+    Reads GOLD_API_BASE_URL, GOLD_API_USERNAME and GOLD_API_PASSWORD; a
+    variable that is not set is passed on as None.
+
+    :return: GoldApiClient
+    """
+    gold_settings = {
+        "base_url": os.getenv("GOLD_API_BASE_URL"),
+        "username": os.getenv("GOLD_API_USERNAME"),
+        "password": os.getenv("GOLD_API_PASSWORD"),
+    }
+    return GoldApiClient(**gold_settings)
+
diff --git a/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py b/nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py
@@ -89,27 +89,15 @@ def gold_biosample_to_nmdc_biosamples_and_omics_processing_records(
     # Search for NMDC biosamples with by GOLD biosample ID
     nmdc_biosamples = []
     logging.info(f"Searching for NMDC biosamples with {goldbs_id}...")
-    nmdcbs_response = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id)
-    if nmdcbs_response.status_code != 200:
-        logging.error(
-            f"Failed to retrieve NMDC biosamples with {goldbs_id}: {nmdcbs_response.status_code}"
-        )
-
-    nmdcbs = nmdcbs_response.json()["cursor"]["firstBatch"]
+    nmdcbs = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id)
     logging.info(f"Found {len(nmdcbs)} NMDC biosamples with {goldbs_id}...")
     nmdc_biosamples.extend(nmdcbs)
 
     # Search for NMDC biosamples via omics processing name containing GOLD biosample name suffix
     logging.info(
         f"Searching for NMDC omics processing name containing {goldbs_name_suffix}..."
     )
-    omprc_response = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix)
-    if omprc_response.status_code != 200:
-        logging.error(
-            f"Failed to retrieve NMDC omics processing with {goldbs_name_suffix}: {omprc_response.status_code}"
-        )
-
-    omprc_records = omprc_response.json()["cursor"]["firstBatch"]
+    omprc_records = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix)
     for omprc in omprc_records:
         omprc_id = omprc["id"]
         logging.info(f"omprc_id: {omprc_id}")

diff --git a/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py b/nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# coding: utf-8
+# nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py
+"""
+neon_soils_add_ncbi_ids.py: Add NCBI biosample accessions to neon soils
+biosamples, NCBI bioproject accessions to omics processing, and
+NCBI Umbrella bioproject accession to neon soils study.
+"""
+import logging
+import time
+
+import click
+from dotenv import load_dotenv
+
+from nmdc_runtime.site.changesheets.base import (Changesheet,
+                                                 ChangesheetLineItem,
+                                                 get_gold_client,
+                                                 get_runtime_client)
+
+# Load API hosts and credentials from a local .env file, if one exists.
+load_dotenv()
+NAME = "neon_soils_add_ncbi_ids"
+NMDC_STUDY_ID = "nmdc:sty-11-34xj1150"
+UMBRELLA_BIOPROJECT_ACCESSION = "PRJNA1029061"
+
+# One timestamped log file per run.
+log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log"
+# Importing nmdc_runtime.site.changesheets.base above already ran
+# logging.basicConfig(), so the root logger has a handler by now and a plain
+# basicConfig() call here would be ignored. force=True replaces that handler
+# so the messages really go to log_filename.
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s",
+    filename=log_filename, encoding="utf-8", filemode="w", force=True, )
+
+
+def _get_change_for_biosample(biosample, ncbi_biosample_accession):
+    """
+    Build the changesheet line item that adds an NCBI BioSample accession to
+    a biosample.
+
+    :param biosample: dict - the NMDC biosample record; must contain "id"
+    :param ncbi_biosample_accession: str - the NCBI BioSample accession,
+        without the "biosample:" prefix
+    :return: ChangesheetLineItem - the insert change, or None when there is
+        no accession to add or the biosample already carries it
+    """
+    if not ncbi_biosample_accession:
+        # "biosample:" + None would raise a TypeError; there is nothing to add.
+        logging.warning(
+            f"no NCBI BioSample accession for biosample_id: {biosample.get('id')}"
+        )
+        return None
+    insdc_identifier = "biosample:" + ncbi_biosample_accession
+    existing_identifiers = biosample.get("insdc_biosample_identifiers") or []
+    # The value written below carries the "biosample:" prefix, so an
+    # identifier that was inserted earlier is stored in that form. The bare
+    # accession is still checked, as before.
+    if (insdc_identifier in existing_identifiers
+            or ncbi_biosample_accession in existing_identifiers):
+        return None
+    biosample_id = biosample["id"]
+    logging.info(f"creating change for biosample_id: {biosample_id}")
+    return ChangesheetLineItem(
+        id=biosample_id, action="insert",
+        attribute="insdc_biosample_identifiers",
+        value=insdc_identifier + "|", )
+
+def _get_change_for_omics_processing(omics_processing_record,
+                                     ncbi_bioproject_accession):
+    """
+    Build the changesheet line item that adds an NCBI BioProject accession to
+    an omics_processing record.
+
+    :param omics_processing_record: dict - the NMDC omics_processing record;
+        must contain "id"
+    :param ncbi_bioproject_accession: str - the NCBI BioProject accession,
+        without the "bioproject:" prefix
+    :return: ChangesheetLineItem - the insert change, or None when there is
+        no accession to add or the record already carries it
+    """
+    if not ncbi_bioproject_accession:
+        # "bioproject:" + None would raise a TypeError; there is nothing to add.
+        logging.warning(
+            "no NCBI BioProject accession for omics_processing_id: "
+            f"{omics_processing_record.get('id')}"
+        )
+        return None
+    insdc_identifier = "bioproject:" + ncbi_bioproject_accession
+    existing_identifiers = omics_processing_record.get(
+        "insdc_bioproject_identifiers") or []
+    # The value written below carries the "bioproject:" prefix, so an
+    # identifier that was inserted earlier is stored in that form. The bare
+    # accession is still checked, as before.
+    if (insdc_identifier in existing_identifiers
+            or ncbi_bioproject_accession in existing_identifiers):
+        return None
+    omics_processing_id = omics_processing_record["id"]
+    logging.info(f"creating change for omics_processing_id: {omics_processing_id}")
+    return ChangesheetLineItem(
+        id=omics_processing_id, action="insert",
+        attribute="insdc_bioproject_identifiers",
+        value=insdc_identifier + "|", )
+
+
+@click.command()
+@click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID")
+@click.option(
+    "--use_dev_api/--use_prod_api", default=True,
+    help="Use the dev API (default) or the prod API",
+)
+def generate_changesheet(study_id, use_dev_api):
+    """
+    Generate a changesheet for neon soils study and biosamples by:
+    0. Changesheet line item: Umbrella BioProjectAccession to
+        study.insdc_bioproject_identifiers
+    1. Retrieving all gold_study_identifiers for the neon soils study
+    2. For each gold_study_identifier, retrieve the GOLD projects
+    3. For each GOLD project,
+        A. retrieve the corresponding NMDC biosample(s). For each biosample,
+            - Changesheet line item:NCBI BioSampleAccession to
+            insdc_biosample_identifiers
+        B. Retrieve the corresponding NMDC omics_processing. For each,
+            - Changesheet line item:NCBI BioProjectAccession to
+            insdc_bioproject_identifiers
+
+    WARNING: This script is not idempotent. It will generate a new changesheet
+    each time it is run.
+    Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output
+
+    :param study_id: The NMDC study ID
+    :param use_dev_api: Use the dev API (default: True); pass --use_prod_api
+        on the command line to target the prod API instead
+    :return:
+    """
+    start_time = time.time()
+    logging.info(f"Generating changesheet for {study_id}")
+    logging.info(f"Using dev API: {use_dev_api}")
+
+    # Initialize the NMDC API
+    runtime_client = get_runtime_client(use_dev_api)
+
+    # Initialize the GOLD API
+    gold_client = get_gold_client()
+
+    # Initialize the changesheet
+    changesheet = Changesheet(name=NAME)
+
+    # 1. Retrieve all gold_study_identifiers for the neon soils study
+    logging.info(f"Retrieving gold_study_identifiers for {study_id}")
+    res = runtime_client.request("GET", f"/studies/{study_id}")
+    # Fail here on an HTTP error instead of on a confusing KeyError below.
+    res.raise_for_status()
+    nmdc_study = res.json()
+
+    # 0. Umbrella BioProjectAccession to study.insdc_bioproject_identifiers,
+    # skipped when the study already carries it (same rule as the
+    # biosample / omics_processing helpers).
+    umbrella_identifier = "bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION
+    study_identifiers = nmdc_study.get("insdc_bioproject_identifiers") or []
+    if umbrella_identifier not in study_identifiers:
+        changesheet.line_items.append(
+            ChangesheetLineItem(
+                id=study_id, action="insert",
+                attribute="insdc_bioproject_identifiers",
+                value=umbrella_identifier + "|", )
+        )
+
+    gold_study_identifiers = nmdc_study["gold_study_identifiers"]
+    logging.info(f"gold_study_identifiers: {gold_study_identifiers}")
+    gold_project_count = 0
+    biosample_count = 0
+    for gold_study_identifier in gold_study_identifiers:
+
+        # 2. For each gold_study_identifier, retrieve the GOLD projects
+        if gold_study_identifier == 'gold:Gs0144570':
+            # TODO verify that this one has already been done
+            continue
+        logging.info(
+            f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}"
+        )
+        projects = gold_client.fetch_projects_by_study(gold_study_identifier)
+        logging.info(f"Retrieved {len(projects)} projects")
+
+        # 3. For each GOLD project,
+        for project in projects:
+            gold_project_count += 1
+            project_gold_id = project["projectGoldId"]
+            biosample_gold_id = project["biosampleGoldId"]
+            ncbi_bioproject_accession = project["ncbiBioProjectAccession"]
+            ncbi_biosample_accession = project["ncbiBioSampleAccession"]
+
+            # A. retrieve the corresponding NMDC biosample(s)
+            logging.info(
+                f"Retrieving NMDC biosamples for biosample_gold_id: {biosample_gold_id}"
+            )
+            biosamples = runtime_client.get_biosamples_by_gold_biosample_id(
+                biosample_gold_id
+            )
+            logging.info(f"Retrieved {len(biosamples)} biosamples")
+            for biosample in biosamples:
+                biosample_count += 1
+                biosample_id = biosample["id"]
+                logging.info(f"biosample_id: {biosample_id}")
+                # NcbiBioSampleAccession to insdc_biosample_identifiers
+                change = _get_change_for_biosample(
+                    biosample, ncbi_biosample_accession
+                )
+                if change:
+                    changesheet.line_items.append(change)
+
+            # B. Retrieve the corresponding NMDC omics_processing
+            logging.info(
+                f"Retrieving NMDC omics_processing for project_gold_id: {project_gold_id}"
+            )
+            omics_processing_records = (
+                runtime_client.get_omics_processing_records_by_gold_project_id(
+                    project_gold_id
+                )
+            )
+            logging.info(
+                f"Retrieved {len(omics_processing_records)} omics_processings"
+            )
+            for omics_processing in omics_processing_records:
+                omics_processing_id = omics_processing["id"]
+                logging.info(
+                    f"omics_processing_id: {omics_processing_id}"
+                )
+                # NcbiBioProjectAccession to insdc_bioproject_identifiers
+                change = _get_change_for_omics_processing(
+                    omics_processing, ncbi_bioproject_accession
+                )
+                if change:
+                    changesheet.line_items.append(change)
+
+    logging.info(f"gold_project_count: {gold_project_count}")
+    logging.info(f"biosample_count: {biosample_count}")
+    logging.info(f"changesheet has {len(changesheet.line_items)} line items")
+
+    # Write the changesheet
+    changesheet.write_changesheet()
+
+    # Validate the changesheet against the same API the data was read from
+    if changesheet.validate_changesheet(runtime_client.base_url):
+        logging.info("Changesheet is valid")
+    else:
+        logging.error("Changesheet is invalid")
+
+    logging.info(f"Completed in {time.time() - start_time} seconds")
+
+
+# Run the click command only when this file is executed as a script; click
+# reads --study_id and --use_dev_api from the command line.
+if __name__ == "__main__":
+    generate_changesheet()
diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py
@@ -27,10 +27,12 @@
 from nmdc_runtime.api.models.object import DrsObject, AccessURL, DrsObjectIn
 from nmdc_runtime.api.models.operation import ListOperationsResponse
 from nmdc_runtime.api.models.util import ListRequest
+from nmdc_runtime.site.normalization.gold import normalize_gold_id
 from nmdc_runtime.util import unfreeze, nmdc_jsonschema_validator_noidpatterns
 from nmdc_schema import nmdc
 
 
+
 class RuntimeApiClient:
     def __init__(self, base_url: str):
         self.base_url = base_url
@@ -95,7 +97,8 @@ def get_run_info(self, run_id: str):
         return self.request("GET", f"/runs/{run_id}")
 
     def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str):
-        return self.request(
+        gold_biosample_id = normalize_gold_id(gold_biosample_id)
+        response = self.request(
             "POST",
             f"/queries:run",
             {
@@ -107,16 +110,53 @@ def get_biosamples_by_gold_biosample_id(self, gold_biosample_id: str):
                 },
             },
         )
+        response.raise_for_status()
+        return response.json()["cursor"]["firstBatch"]
+
+    def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
+        """
+        Find omics_processing_set documents whose
+        gold_sequencing_project_identifiers contain the given GOLD project ID.
+
+        The ID is passed through normalize_gold_id before it is used in the
+        filter. Only the first batch of the query cursor is returned.
+
+        :param gold_project_id: the GOLD project ID to search for
+        :return: the "firstBatch" list from the /queries:run response
+        """
+        query = {
+            "find": "omics_processing_set",
+            "filter": {
+                "gold_sequencing_project_identifiers": {
+                    "$elemMatch": {"$eq": normalize_gold_id(gold_project_id)}
+                }
+            },
+        }
+        response = self.request("POST", "/queries:run", query)
+        # Surface HTTP errors before trying to read the cursor.
+        response.raise_for_status()
+        payload = response.json()
+        return payload["cursor"]["firstBatch"]
+
+    def get_biosamples_for_study(self, study_id: str):
+        """
+        Find biosample_set documents whose part_of list contains the given
+        study ID.
+
+        Only the first batch of the query cursor is returned.
+
+        :param study_id: the study ID to match against part_of
+        :return: the "firstBatch" list from the /queries:run response
+        """
+        query = {
+            "find": "biosample_set",
+            "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}},
+        }
+        response = self.request("POST", "/queries:run", query)
+        # Surface HTTP errors before trying to read the cursor.
+        response.raise_for_status()
+        payload = response.json()
+        return payload["cursor"]["firstBatch"]
 
     def get_omics_processing_by_name(self, name: str):
-        return self.request(
+        response = self.request(
             "POST",
             f"/queries:run",
             {
                 "find": "omics_processing_set",
                 "filter": {"name": {"$regex": name, "$options": "i"}},
             },
         )
+        response.raise_for_status()
+        return response.json()["cursor"]["firstBatch"]
 
 
 class RuntimeApiSiteClient(RuntimeApiClient):
@@ -320,6 +360,8 @@ def fetch_study(self, id: str) -> Union[Dict[str, Any], None]:
         return results[0]
 
 
+
+
 @resource(
     config_schema={
         "base_url": StringSource,