Skip to content

Commit

Permalink
Merge branch 'lipid_dev_metadat' into 'lipid_dev'
Browse files Browse the repository at this point in the history
Make small modifications to the lipid metadata generation scripts

See merge request mass-spectrometry/metams!11
  • Loading branch information
kheal committed Dec 19, 2024
2 parents 259cf3b + 95bc156 commit b379afa
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 25 deletions.
99 changes: 94 additions & 5 deletions metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,57 @@
import requests
from dataclasses import dataclass
import json
import logging

class ApiInfoRetriever:
class NMDCAPIInterface:
    """
    A generic interface for the NMDC runtime API.

    Attributes
    ----------
    base_url : str
        The base URL for the NMDC runtime API.

    Methods
    -------
    validate_json(json_path: str) -> None:
        Validates a json file using the NMDC json validate endpoint.
    """

    def __init__(self):
        self.base_url = "https://api.microbiomedata.org"

    def validate_json(self, json_path) -> None:
        """
        Validates a json file using the NMDC json validate endpoint.

        The file is parsed locally and its content is POSTed to
        ``{base_url}/metadata/json:validate``. If the validation passes,
        the method returns without any side effects.

        Parameters
        ----------
        json_path : str
            The path to the json file to be validated.

        Raises
        ------
        FileNotFoundError
            If ``json_path`` does not exist.
        json.JSONDecodeError
            If the file does not contain valid JSON.
        Exception
            If the request does not return HTTP 200, or the response body
            explicitly reports validation errors.
        """
        # JSON documents are UTF-8 by specification; do not rely on the
        # platform's default encoding when reading the dump.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        url = f"{self.base_url}/metadata/json:validate"
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
        response = requests.post(url, headers=headers, json=data)
        if response.status_code != 200:
            logging.error(
                "Request failed with status code %s", response.status_code
            )
            logging.error(response.text)
            raise Exception("Validation failed")

        # NOTE(review): the validate endpoint is understood to answer HTTP 200
        # with a body of the form {"result": "errors", "detail": ...} when the
        # document violates the schema - confirm against the NMDC runtime API
        # docs. Only an explicit "errors" result is treated as a failure, so
        # any other 200 response keeps the previous pass-through behaviour.
        try:
            body = response.json()
        except ValueError:
            body = None
        if isinstance(body, dict) and body.get("result") == "errors":
            logging.error("Validation reported errors: %s", response.text)
            raise Exception("Validation failed")


class ApiInfoRetriever(NMDCAPIInterface):
"""
A class to retrieve API information from a specified collection.
Expand All @@ -28,6 +78,7 @@ def __init__(self, collection_name: str):
collection_name : str
The name of the collection to be used for API queries.
"""
super().__init__()
self.collection_name = collection_name

def get_id_by_name_from_collection(self, name_field_value: str) -> str:
Expand Down Expand Up @@ -61,15 +112,53 @@ def get_id_by_name_from_collection(self, name_field_value: str) -> str:
filter_param = f'{{"name": "{name_field_value}"}}'
field = "id"

og_url = f'https://api.microbiomedata.org/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}'
og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}"

try:
resp = requests.get(og_url)
resp.raise_for_status() # Raises an HTTPError for bad responses
data = resp.json()
identifier = data['resources'][0]['id']
identifier = data["resources"][0]["id"]
return identifier
except requests.RequestException as e:
raise requests.RequestException(f"Error making API request: {e}")
except (KeyError, IndexError) as e:
raise IndexError(f"No matching entry found for '{name_field_value}': {e}")

def check_if_ids_exist(self, ids: list) -> bool:
    """
    Check if the IDs exist in the collection.

    This method de-duplicates the given IDs, queries the collection with
    an ``{"id": {"$in": [...]}}`` filter projected onto ``id``, and
    compares the number of returned resources with the number of unique
    IDs requested.

    Parameters
    ----------
    ids : list
        A list of IDs to check if they exist in the collection.

    Returns
    -------
    bool
        True if all IDs exist in the collection, False otherwise.
        An empty ``ids`` is vacuously True and triggers no request.

    Raises
    ------
    requests.RequestException
        If there's an error in making the API request.
    """
    # De-duplicate while keeping first-seen order so the generated query is
    # deterministic from run to run (set() ordering of strings is not).
    unique_ids = list(dict.fromkeys(ids))
    if not unique_ids:
        # Nothing to look up: "all of no IDs exist" needs no round trip.
        return True

    # Let json.dumps quote/escape the values instead of hand-assembling the
    # filter; rewriting '"' to "'" inside an ID would query a different ID.
    filter_param = json.dumps({"id": {"$in": unique_ids}})
    url = f"{self.base_url}/nmdcschema/{self.collection_name}"
    # Passing params lets requests URL-encode the JSON filter correctly.
    query = {"filter": filter_param, "projection": "id"}

    try:
        resp = requests.get(url, params=query)
        resp.raise_for_status()  # Raises an HTTPError for bad responses
        data = resp.json()
    except requests.RequestException as e:
        raise requests.RequestException(
            f"Error making API request: {e}"
        ) from e

    # NOTE(review): this compares against a single response page. If the
    # endpoint paginates (a default page size is likely), a large ID list
    # could be reported as missing - confirm and request a larger page or
    # chunk the IDs if so.
    return len(data["resources"]) == len(unique_ids)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Biosample Id,Associated Study,Processing Type,Raw Data File,Raw Data Object Alt Id,Processed Data Directory,mass spec configuration name,lc config name,instrument used,processing institution,instrument analysis start date,instrument analysis end date,execution resource
nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test1.raw,https://status.my.emsl.pnl.gov/view/1834807,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,15T FT-ICR MS ,EMSL,01/22/2023,03/04/2024,EMSL-RZR
nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test2.raw,https://status.my.emsl.pnl.gov/view/1834808,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,12T FT-ICR MS ,EMSL,01/22/2023,03/04/2024,EMSL-RZR
nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test3.raw,https://status.my.emsl.pnl.gov/view/1554801,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,VOrbiETD04,EMSL,01/22/2023,03/04/2024,EMSL-RZR
nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test4.raw,https://status.my.emsl.pnl.gov/view/1554809,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,Agilent GC-MS (2009),EMSL,01/22/2023,03/04/2024,EMSL-RZR
Biosample Id,Associated Study,Processing Type,Raw Data File,Processed Data Directory,mass spec configuration name,lc config name,instrument used,processing institution,instrument analysis start date,instrument analysis end date,execution resource
nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test1.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,15T FT-ICR MS ,EMSL,1/22/23,3/4/24,EMSL-RZR
nmdc:biosamp:12-233,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test2.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample1_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,12T FT-ICR MS ,EMSL,1/22/23,3/4/24,EMSL-RZR
nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test3.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms1_processed,"EMSL lipidomics DDA mass spectrometry method, positive",EMSL metabolomics GC/MS mass spectrometry method,VOrbiETD04,EMSL,1/22/23,3/4/24,EMSL-RZR
nmdc:biosamp:13-234,nmdc:sty-11-8fb6t785,MPLEX,/Users/zalm693/development/nmdc_ingest/example_lip_data/raw/raw_data_test4.raw,/Users/zalm693/development/nmdc_ingest/example_lip_data/processed/sample2_ms2_processed,"EMSL lipidomics DDA mass spectrometry method, negative",EMSL metabolomics GC/MS mass spectrometry method,Agilent GC-MS (2009),EMSL,1/22/23,3/4/24,EMSL-RZR
43 changes: 28 additions & 15 deletions metaMS/nmdc_lipidomics_metadata_generation/metadata_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import nmdc_schema.nmdc as nmdc
from linkml_runtime.dumpers import json_dumper
from api_info_retriever import ApiInfoRetriever
from api_info_retriever import ApiInfoRetriever, NMDCAPIInterface

# Configure logging
logging.basicConfig(
Expand Down Expand Up @@ -57,8 +57,6 @@ class WorkflowMetadata:
Directory containing processed data files.
raw_data_file : str
Path or name of the raw data file.
raw_data_object_alt_id : str
Alternative identifier for the raw data object.
mass_spec_config_name : str
Name of the mass spectrometry configuration used.
lc_config_name : str
Expand All @@ -74,7 +72,6 @@ class WorkflowMetadata:
"""
processed_data_dir: str
raw_data_file: str
raw_data_object_alt_id: str
mass_spec_config_name: str
lc_config_name: str
instrument_used: str
Expand Down Expand Up @@ -209,7 +206,7 @@ def __init__(
'processing institution'
]
self.mass_spec_desc = (
"Analysis of raw mass spectrometry data for the annotation of lipids."
"Generation of mass spectrometry data for the analysis of lipids."
)
self.mass_spec_eluent_intro = "liquid_chromatography"
self.analyte_category = "lipidome"
Expand Down Expand Up @@ -302,14 +299,15 @@ def run(self):
description=self.raw_data_obj_desc,
base_url=self.raw_data_url,
was_generated_by=mass_spec.id,
alternative_id=workflow_metadata_obj.raw_data_object_alt_id
)

metab_analysis = self.generate_metabolomics_analysis(
cluster_name=workflow_metadata_obj.execution_resource,
raw_data_name=Path(workflow_metadata_obj.raw_data_file).name,
raw_data_id=raw_data_object.id,
data_gen_id=mass_spec.id,
processed_data_id="nmdc:placeholder",
parameter_data_id="nmdc:placeholder",
processing_institution=group_metadata_obj.processing_institution
)

Expand All @@ -332,7 +330,7 @@ def run(self):
was_generated_by=metab_analysis.id
)
nmdc_database_inst.data_object_set.append(processed_data_object)
processed_data.append(processed_data_object.id)
parameter_data_id = processed_data_object.id

elif file_type == 'csv':
processed_data_object = self.generate_data_object(
Expand Down Expand Up @@ -369,6 +367,7 @@ def run(self):
mass_spec_obj=mass_spec,
analysis_obj=metab_analysis,
raw_data_obj=raw_data_object,
parameter_data_id=parameter_data_id,
processed_data_id_list=processed_data
)

Expand All @@ -377,14 +376,16 @@ def run(self):
nmdc_database_inst.workflow_execution_set.append(metab_analysis)

self.dump_nmdc_database(nmdc_database=nmdc_database_inst)
api_interface = NMDCAPIInterface()
api_interface.validate_json(self.database_dump_json_path)
logging.info("Metadata processing completed.")

def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy:
"""
Load and group workflow metadata from a CSV file.
This method reads the metadata CSV file, checks for uniqueness in
specified columns, and groups the data by biosample ID.
specified columns, checks that biosamples exist, and groups the data by biosample ID.
Returns
-------
Expand All @@ -396,7 +397,7 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy:
FileNotFoundError
If the `metadata_file` does not exist.
ValueError
If values in columns 'Raw Data File', 'Raw Data Object Alt Id',
If values in columns 'Raw Data File',
and 'Processed Data Directory' are not unique.
Notes
Expand All @@ -412,12 +413,18 @@ def load_metadata(self) -> pd.core.groupby.DataFrameGroupBy:
# Check for uniqueness in specified columns
columns_to_check = [
'Raw Data File',
'Raw Data Object Alt Id',
'Processed Data Directory'
]
for column in columns_to_check:
if not metadata_df[column].is_unique:
raise ValueError(f"Duplicate values found in column '{column}'.")

# Check that all biosamples exist
biosample_ids = metadata_df['Biosample Id'].unique()
api_biosample_getter = ApiInfoRetriever(collection_name="biosample_set")

if not api_biosample_getter.check_if_ids_exist(biosample_ids):
raise ValueError("Biosample IDs do not exist in the collection.")

# Group by Biosample
grouped = metadata_df.groupby('Biosample Id')
Expand Down Expand Up @@ -481,7 +488,6 @@ def create_workflow_metadata(
return WorkflowMetadata(
processed_data_dir=row['Processed Data Directory'],
raw_data_file=row['Raw Data File'],
raw_data_object_alt_id=row['Raw Data Object Alt Id'],
mass_spec_config_name=row['mass spec configuration name'],
lc_config_name=row['lc config name'],
instrument_used=row['instrument used'],
Expand Down Expand Up @@ -590,8 +596,6 @@ def generate_mass_spectrometry(
-----
This method uses the ApiInfoRetriever to fetch IDs for the instrument
and configurations. It also mints a new NMDC ID for the DataGeneration object.
TODO: Update docstring with new variables (e.g. analyte_category).
"""
nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MassSpectrometry)[0]

Expand Down Expand Up @@ -704,8 +708,10 @@ def generate_metabolomics_analysis(
self,
cluster_name: str,
raw_data_name: str,
raw_data_id: str,
data_gen_id: str,
processed_data_id: str,
parameter_data_id: str,
processing_institution: str
) -> nmdc.MetabolomicsAnalysis:
"""
Expand All @@ -720,10 +726,14 @@ def generate_metabolomics_analysis(
Name of the cluster or computing resource used for the analysis.
raw_data_name : str
Name of the raw data file that was analyzed.
raw_data_id : str
ID of the raw data object that was analyzed.
data_gen_id : str
ID of the DataGeneration object that generated the raw data.
processed_data_id : str
ID of the processed data resulting from the analysis.
parameter_data_id : str
ID of the parameter data object used for the analysis.
processing_institution : str
Name of the institution where the analysis was performed.
Expand All @@ -738,7 +748,8 @@ def generate_metabolomics_analysis(
placeholder values and should be updated with actual timestamps later
when the processed files are iterated over in the run method.
"""
nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MetabolomicsAnalysis)[0]
nmdc_id = self.mint_nmdc_id(nmdc_type=NmdcTypes.MetabolomicsAnalysis)[0]+".1"
#TODO: Update the minting to handle versioning in the future

data_dict = {
'id': nmdc_id,
Expand All @@ -749,7 +760,7 @@ def generate_metabolomics_analysis(
'git_url': self.workflow_git_url,
'version': self.workflow_version,
'was_informed_by': data_gen_id,
'has_input': [raw_data_name],
'has_input': [raw_data_id, parameter_data_id],
'has_output': [processed_data_id],
'started_at_time': 'placeholder',
'ended_at_time': 'placeholder',
Expand All @@ -765,6 +776,7 @@ def update_outputs(
mass_spec_obj: object,
analysis_obj: object,
raw_data_obj: object,
parameter_data_id: str,
processed_data_id_list: list
) -> None:
"""
Expand Down Expand Up @@ -798,6 +810,7 @@ def update_outputs(
- Sets `analysis_obj.has_output` to `processed_data_id_list`.
"""
mass_spec_obj.has_output = [raw_data_obj.id]
analysis_obj.has_input[1] = parameter_data_id
analysis_obj.has_output = processed_data_id_list

def start_nmdc_database(self) -> nmdc.Database:
Expand Down

0 comments on commit b379afa

Please sign in to comment.